Esempio n. 1
0
def single_input(args, config=None):
    """
    Convert one article, given by URL, DOI, or local XML path, into an EPUB.

    The kind of input is decided from ``args.input``: any value containing
    ``'http:'`` is handed to the URL fetcher, a value beginning with ``'doi:'``
    is handed to the DOI fetcher, and anything else is treated as a path to a
    local XML file.

    Attributes read from ``args``:
      input             -- the URL, DOI, or local path described above
      no_dtd_validation -- forwarded unchanged as ``validation`` to Article
      images            -- forwarded as the explicit image path to make_epub
      clean             -- when truthy, the output directory is deleted after
                           the EPUB has been made
      no_epubcheck      -- when truthy, epubcheck is run on the result
                           (NOTE(review): presumably a store_false flag, so the
                           name reads inverted -- confirm in the arg parser)

    If ``config`` is None it is obtained from get_config_module().
    """
    config = get_config_module() if config is None else config
    source = args.input

    # Work out the base name of the article and where its XML lives on disk
    if 'http:' in source:
        # Remote article addressed by URL
        raw_name = u_input.url_input(source)
        abs_input_path = os.path.join(LOCAL_DIR, raw_name + '.xml')
    elif source.startswith('doi:'):
        # Remote article addressed by DOI
        raw_name = u_input.doi_input(source)
        abs_input_path = os.path.join(LOCAL_DIR, raw_name + '.xml')
    else:
        # Article XML already present on the local filesystem
        abs_input_path = utils.get_absolute_path(source)
        raw_name = u_input.local_input(abs_input_path)

    # Every input type ends with the same parse of the resolved XML path
    parsed_article = Article(abs_input_path,
                             validation=args.no_dtd_validation)

    # The output directory is named after the article; the EPUB file name is
    # this same path with '.epub' appended
    output_name = os.path.join(utils.get_output_directory(args), raw_name)

    make_epub(parsed_article,
              outdirect=output_name,
              explicit_images=args.images,
              batch=False,
              config=config)

    # Optional cleanup (--clean / -c): drop the working directory, keep the
    # EPUB file that sits next to it
    if args.clean:
        shutil.rmtree(output_name)

    # Validation needs a local installation of java and epubcheck
    if args.no_epubcheck:
        epubcheck('{0}.epub'.format(output_name), config)
Esempio n. 2
0
def collection_input(args, config=None):
    """
    Collection Input Mode works to compile multiple articles into a single
    composite ePub. This is akin to such formats as Collections, Issues, and
    Omnibus; it may also be useful for those interested in the simple
    distribution of a reading list, personal publications, or topic reference.

    Collection Input Mode produces output that is necessarily unlike the output
    generated by Single or Batch (which is just sequential Single) input modes.
    The primary difference in output lies with the ePub metadata; as applying
    metadata from any single article to the whole would be inappropriate.

    Unlike other input modes, Collection Mode is strictly dependent on the
    local directory of execution. If there is a file named "order.txt" in the
    local directory, this file should contain the name of one input XML file
    on each line; the files will be added to the ePub output by line-order.
    If there is no "order.txt" file, Collection Mode falls back to
    list_xml_files() on the working directory, and the articles are added in
    whatever order that function returns them.

    The title of the collection is chosen as follows: when ``args.collection``
    is exactly ``True``, the first line of "title.txt" in the working directory
    is used, falling back to the name of the working directory if the file is
    missing or its first line is blank; otherwise the value of
    ``args.collection`` itself is used as the title.

    Images for each article are looked up first in a local directory named
    "images-<raw_name>", then in the configured image cache; if neither
    exists the user is asked whether to download them, and the program exits
    with status 1 if the answer is no.

    Collection Input Mode has default epubcheck behavior, it will place a system
    call to epubcheck unless specified otherwise (--no-epubcheck or -N flags).
    """
    if config is None:
        config = get_config_module()
    # Only the open() call is guarded, so a missing or unreadable order.txt is
    # the sole trigger for the fallback listing
    try:
        order = open('order.txt', 'r')
    except IOError:  # No order.txt
        xml_files = list_xml_files(dir=os.getcwd())
    else:
        #Add all nonempty lines, in order, to the xml_files list; the context
        #manager closes the file even if reading fails part-way
        with order:
            xml_files = [line.strip() for line in order if line.strip()]

    #The output name will be the same as the parent directory name
    #This will also serve as the fallback dc:title
    output_name = os.path.split(os.getcwd())[1]

    #The standard make_epub() method will not work for Collection Mode
    #So the work done here is an adaptation of it
    print('Processing output to {0}.epub'.format(output_name))
    #Copy files from base_epub to the new output
    if os.path.isdir(output_name):
        dir_exists(output_name)
    epub_base = os.path.join(CACHE_LOCATION, 'base_epub')
    shutil.copytree(epub_base, output_name)

    if args.collection is True:
        try:
            title_txt = open('title.txt', 'r')
        except IOError:  # No title.txt
            title = output_name
        else:
            with title_txt:
                title = title_txt.readline().strip()
            if not title:
                title = output_name
                print('title.txt was empty or title was not on first line!')
                print('Defaulting to name of parent directory. {0}'.format(title))
    else:
        title = args.collection

    toc = ncx.NCX(oae_version=__version__, location=output_name, collection_mode=True)
    myopf = opf.OPF(location=output_name, collection_mode=True, title=title)

    #Now it is time to operate on each of the xml files
    for xml_file in xml_files:
        #raw_name is needed below to locate a local "images-<raw_name>" folder
        raw_name = u_input.local_input(xml_file)
        parsed_article = Article(xml_file, validation=args.no_dtd_validation)
        toc.take_article(parsed_article)
        myopf.take_article(parsed_article)

        #Get the Digital Object Identifier
        #NOTE: the two-name unpacking requires the DOI to contain exactly one
        #'/'; any other shape raises ValueError here
        doi = parsed_article.get_DOI()
        journal_doi, article_doi = doi.split('/')

        #Check for images: local directory first, then the image cache, and
        #finally an interactive offer to download
        img_dir = os.path.join(output_name, 'OPS', 'images-{0}'.format(article_doi))
        expected_local = 'images-{0}'.format(raw_name)
        if os.path.isdir(expected_local):
            utils.images.local_images(expected_local, img_dir)
        else:
            article_cache = os.path.join(config.image_cache, journal_doi, article_doi)
            if os.path.isdir(article_cache):
                utils.images.image_cache(article_cache, img_dir)
            else:
                print('Images for {0} (DOI: {1}) could not be found!'.format(xml_file, doi))
                r = input('Try to download them? [Y/n]')
                #An empty reply counts as yes
                if r in ['y', 'Y', '']:
                    os.mkdir(img_dir)
                    utils.images.fetch_plos_images(article_doi, img_dir, parsed_article)
                    if config.use_image_cache:
                        utils.images.move_images_to_cache(img_dir, article_cache)
                else:
                    sys.exit(1)

        #TODO: Content stuff
        if journal_doi == '10.1371':  # PLoS's publisher DOI
            ops_doc = ops.OPSPLoS(parsed_article, output_name)
            #TODO: Workflow change, parse table of contents from OPS processed document

    toc.write()
    myopf.write()
    utils.epub_zip(output_name)

    #Running epubcheck on the output verifies the validity of the ePub,
    #requires a local installation of java and epubcheck.
    if args.no_epubcheck:
        epubcheck('{0}.epub'.format(output_name), config)
Esempio n. 3
0
def batch_input(args, config=None):
    """
    Batch Input Mode works to convert all of the article XML files in a
    specified directory into individual article EPUB files.

    Batch Input Mode is employed under a few simplifying assumptions: any
    pre-existing folder for article EPUB conversion will be eliminated without
    asking user permission, all output except the .epub and .log files
    will be removed, and image files in a custom directory are not being used.

    Every regular file directly inside ``args.batch`` whose extension is
    exactly '.xml' is processed; directories and other files are skipped.

    Failures are recorded in "batch_tracebacks.txt", which is created (or
    truncated) in the current working directory. A file whose input lookup or
    EPUB generation raises an Exception is logged with its path and traceback,
    and the batch moves on to the next file; no epubcheck is attempted for it.
    An exception raised while parsing the article XML is not caught and ends
    the batch. KeyboardInterrupt and SystemExit are likewise not caught, so
    the batch can still be aborted.

    Batch Input Mode has default epubcheck behavior, it will place a system
    call to epubcheck unless specified otherwise (--no-epubcheck or -N flags).
    """
    if config is None:
        config = get_config_module()
    #The context manager closes the traceback log even when an uncaught
    #exception ends the batch early
    with open('batch_tracebacks.txt', 'w') as error_file:
        #Iterate over all listed files in the batch directory
        for item in os.listdir(args.batch):
            item_path = os.path.join(args.batch, item)
            #Skip directories and files without .xml extension
            if not os.path.isfile(item_path):
                continue
            if os.path.splitext(item)[1] != '.xml':
                continue
            print(item_path)

            #Resolve the article name; without it nothing else can be done
            #for this file, so log the failure and move to the next one
            #rather than continuing with undefined or stale variables
            try:
                raw_name = u_input.local_input(item_path)
            except Exception:
                error_file.write(item_path + '\n')
                traceback.print_exc(file=error_file)
                continue

            #Parse the article
            parsed_article = Article(os.path.join(args.batch, raw_name + '.xml'),
                                     validation=args.no_dtd_validation)

            #Create the output name
            output_name = os.path.join(utils.get_output_directory(args), raw_name)

            #Make the EPUB
            try:
                make_epub(parsed_article,
                          outdirect=output_name,
                          explicit_images=None,   # No explicit image path
                          batch=True,
                          config=config)
            except Exception:
                error_file.write(item_path + '\n')
                traceback.print_exc(file=error_file)
                #The working directory may never have been created; only
                #remove it if it is there so the batch is not aborted
                if os.path.isdir(output_name):
                    shutil.rmtree(output_name)
                continue

            #Cleanup output directory, keeps EPUB and log
            shutil.rmtree(output_name)

            #Running epubcheck on the output verifies the validity of the
            #ePub, requires a local installation of java and epubcheck.
            if args.no_epubcheck:
                epubcheck('{0}.epub'.format(output_name), config)