Example #1
0
def do_single_file(file_name, print_xml, write_xml_dir):
    """Process one input file and optionally emit its references as XML.

    The arXiv identifier and version are obtained from
    ``arxivid.filenameToArxivAndVersion``; when no identifier is found a
    warning is printed and the placeholder id ``'<unknown>'`` with version
    0 is used.  Files for which ``file_has_suffix(file_name, '.pdf')`` is
    true are only counted and reported as successful.  Every other file is
    handed to ``process_article``; on success its references can be printed
    and/or written as an ``<article>`` XML document.

    Args:
        file_name: path of the file to process.
        print_xml: if true, print the serialised XML to stdout.
        write_xml_dir: directory in which to write the XML file; a file is
            only written when this is a ``str``.

    Returns:
        True for PDF files, otherwise the ``'success'`` value reported by
        the parser (False when ``process_article`` returns None).
    """
    arxiv_id, arxiv_version = arxivid.filenameToArxivAndVersion(file_name)
    if arxiv_id is None:
        print('WARN: could not determine arXiv identifier for', file_name)
        arxiv_id = '<unknown>'
        arxiv_version = 0

    Log.reset()
    Statistics.begin_item(arxiv_id)

    if file_has_suffix(file_name, '.pdf'):
        Statistics.count('1) pdf')
        succ = True
    else:
        Statistics.count('2) processed')

        parser = process_article(file_name)

        if parser is not None:
            succ = parser['success']
            bib_refs = parser['refs']
        else:
            succ = False
            bib_refs = []

        if str_contains_one_of(arxiv_id, ['gr-qc', 'hep-']):
            Statistics.count('hep-processed')
            if succ:
                Statistics.count('hep-success')
        if succ:
            print('-success--------')
            Statistics.count('3) success')
        else:
            print('-fail-----------')
            Statistics.count('4) fail')

        # Debugging switch: set to True to dump every reference to stdout.
        show_ref = False

        if succ and show_ref:
            for bib_ref in bib_refs:
                print(bib_ref.key, 'with', bib_ref.cite_count,
                      'citations in paper')
                if len(bib_ref.bib_info) == 0:
                    print('no reference')
                else:
                    print(bib_ref.bib_info_as_str(keep_comments=True))

        if succ and (print_xml or write_xml_dir):
            xml = Element('article')
            SubElement(xml, 'id').text = arxiv_id
            if arxiv_version > 0:
                SubElement(xml, 'version').text = str(arxiv_version)
            refs = SubElement(xml, 'refs')
            for bib_ref in bib_refs:
                bib_text = bib_ref.bib_info_as_str(keep_comments=True)
                if not bib_text:
                    # references without any text are left out of the XML
                    continue
                # a listed reference is reported with a frequency of at
                # least one, even if no citation of it was counted
                ncites = max(bib_ref.cite_count, 1)
                ref = SubElement(refs,
                                 'ref',
                                 order=str(bib_ref.ref_order_num),
                                 freq=str(ncites))
                ref.text = bib_text
            if print_xml:
                print(tostring(xml))
            if isinstance(write_xml_dir, str):
                if arxiv_id != '<unknown>':
                    # old-style ids contain a slash, which cannot be part
                    # of a file name
                    base_name = arxiv_id.replace('/', '')
                else:
                    # no arXiv id: fall back to the input file's own name
                    # with its last extension removed
                    base_name = os.path.split(file_name)[1]
                    dot_pos = base_name.rfind('.')
                    if dot_pos > 0:
                        base_name = base_name[:dot_pos]
                xml_file_name = os.path.join(write_xml_dir,
                                             base_name + '.xml')
                # the context manager closes the file even if write() fails
                with open(xml_file_name, 'wb') as file_obj:
                    file_obj.write(tostring(xml, encoding='utf-8'))

    Statistics.end_item()

    return succ
Example #2
0
def do_single_file(file_name, print_xml, write_xml_dir):
    """Parse a single file for references and optionally output them as XML.

    ``arxivid.filenameToArxivAndVersion`` supplies the arXiv id and version
    for ``file_name``.  If it yields no id, a warning is printed and the
    placeholder ``'<unknown>'`` (version 0) is used instead.  When
    ``file_has_suffix(file_name, '.pdf')`` is true the file is merely
    counted and treated as a success; otherwise ``process_article`` is run
    and, if it succeeds, the references are serialised into an
    ``<article>`` XML element.

    Args:
        file_name: path of the file to process.
        print_xml: if true, the XML is printed to stdout.
        write_xml_dir: output directory for the XML file; nothing is
            written unless this is a ``str``.

    Returns:
        True for PDF files, otherwise the parser's ``'success'`` entry
        (False if ``process_article`` returned None).
    """
    arxiv_id, arxiv_version = arxivid.filenameToArxivAndVersion(file_name)
    if arxiv_id is None:
        print('WARN: could not determine arXiv identifier for', file_name)
        arxiv_id = '<unknown>'
        arxiv_version = 0

    Log.reset()
    Statistics.begin_item(arxiv_id)

    if file_has_suffix(file_name, '.pdf'):
        Statistics.count('1) pdf')
        succ = True
    else:
        Statistics.count('2) processed')

        parser = process_article(file_name)

        if parser is not None:
            succ = parser['success']
            bib_refs = parser['refs']
        else:
            succ = False
            bib_refs = []

        if str_contains_one_of(arxiv_id, ['gr-qc', 'hep-']):
            Statistics.count('hep-processed')
            if succ:
                Statistics.count('hep-success')
        if succ:
            print('-success--------')
            Statistics.count('3) success')
        else:
            print('-fail-----------')
            Statistics.count('4) fail')

        # Flip to True while debugging to list each reference on stdout.
        show_ref = False

        if succ and show_ref:
            for bib_ref in bib_refs:
                print(bib_ref.key, 'with', bib_ref.cite_count, 'citations in paper')
                if len(bib_ref.bib_info) == 0:
                    print('no reference')
                else:
                    print(bib_ref.bib_info_as_str(keep_comments=True))

        if succ and (print_xml or write_xml_dir):
            xml = Element('article')
            SubElement(xml, 'id').text = arxiv_id
            if arxiv_version > 0:
                SubElement(xml, 'version').text = str(arxiv_version)
            refs = SubElement(xml, 'refs')
            for bib_ref in bib_refs:
                bib_text = bib_ref.bib_info_as_str(keep_comments=True)
                if not bib_text:
                    # skip references that produced no text at all
                    continue
                # never report a frequency below one for a listed reference
                ncites = max(bib_ref.cite_count, 1)
                ref = SubElement(refs, 'ref', order=str(bib_ref.ref_order_num), freq=str(ncites))
                ref.text = bib_text
            if print_xml:
                print(tostring(xml))
            if isinstance(write_xml_dir, str):
                if arxiv_id != '<unknown>':
                    # drop the slash of old-style ids so the id is usable
                    # as a file name
                    base_name = arxiv_id.replace('/', '')
                else:
                    # without an arXiv id, name the output after the input
                    # file, minus its last extension
                    base_name = os.path.split(file_name)[1]
                    dot_pos = base_name.rfind('.')
                    if dot_pos > 0:
                        base_name = base_name[:dot_pos]
                xml_file_name = os.path.join(write_xml_dir, base_name + '.xml')
                # "with" guarantees the handle is closed if write() raises
                with open(xml_file_name, 'wb') as file_obj:
                    file_obj.write(tostring(xml, encoding='utf-8'))

    Statistics.end_item()

    return succ
Example #3
0
        metavar='<dir>',
        help='destination directory to write XML output files')
    cmd_parser.add_argument('--failed',
                            metavar='<file>',
                            help='output file to write list of failed files')
    cmd_parser.add_argument('files', nargs='+', help='input files')
    args = cmd_parser.parse_args()

    # print date stamp
    timeStart = datetime.datetime.now()
    print('[ptex] started processing at', str(timeStart))

    print('given', len(args.files), 'files, first file:', args.files[0])
    print('================')

    Statistics.clear('article')

    # build list of files to process
    file_list = buildFileList(args.filelist, args.files)

    # Ensure the destination directory exists.  It is created only when it
    # is missing; a failure is reported but does not abort the run, so the
    # remaining (non-writing) work can still proceed.
    if args.write_xml is not None and not os.path.isdir(args.write_xml):
        try:
            os.makedirs(args.write_xml)
        except OSError as err:
            print('WARN: could not create directory', args.write_xml,
                  '-', err)

    # process the files
    failed_files = []
    for file_name in file_list:
        success = do_single_file(file_name, args.print_xml, args.write_xml)
Example #4
0
    # command-line interface
    cmd_parser = argparse.ArgumentParser(
        description='Parse TeX/LaTeX to find references.')
    cmd_parser.add_argument(
        '--filelist',
        action='store_true',
        help='file names on the command line each contain a list of files to process')
    cmd_parser.add_argument(
        '--print-xml',
        action='store_true',
        help='print XML output to stdout')
    cmd_parser.add_argument(
        '--write-xml',
        metavar='<dir>',
        help='destination directory to write XML output files')
    cmd_parser.add_argument(
        '--failed',
        metavar='<file>',
        help='output file to write list of failed files')
    cmd_parser.add_argument('files', nargs='+', help='input files')
    args = cmd_parser.parse_args()

    # print date stamp
    timeStart = datetime.datetime.now()
    print('[ptex] started processing at', str(timeStart))

    print('given', len(args.files), 'files, first file:', args.files[0])
    print('================')

    Statistics.clear('article')

    # build list of files to process
    file_list = buildFileList(args.filelist, args.files)

    # Ensure the destination directory exists.  Create it only when it is
    # not there yet; if creation fails, warn and carry on rather than
    # aborting the whole run.
    if args.write_xml is not None and not os.path.isdir(args.write_xml):
        try:
            os.makedirs(args.write_xml)
        except OSError as err:
            print('WARN: could not create directory', args.write_xml,
                  '-', err)

    # process the files
    failed_files = []
    for file_name in file_list:
        success = do_single_file(file_name, args.print_xml, args.write_xml)