def do_single_file(file_name, print_xml, write_xml_dir):
    """Process a single source file, update statistics and optionally emit XML.

    Returns True if the file was processed successfully.
    """
    arxiv_id, arxiv_version = arxivid.filenameToArxivAndVersion(file_name)
    if arxiv_id is None:
        print('WARN: could not determine arXiv identifier for', file_name)
        arxiv_id = '<unknown>'
        arxiv_version = 0

    Log.reset()
    Statistics.begin_item(arxiv_id)

    if file_has_suffix(file_name, '.pdf'):
        Statistics.count('1) pdf')
        succ = True
        # PDFs are only counted, not parsed; keep bib_refs defined so the
        # XML step below does not raise a NameError
        bib_refs = []
    else:
        Statistics.count('2) processed')
        parser = process_article(file_name)
        if parser is not None:
            succ = parser['success']
            bib_refs = parser['refs']
        else:
            succ = False
            bib_refs = []

    if str_contains_one_of(arxiv_id, ['gr-qc', 'hep-']):
        Statistics.count('hep-processed')
        if succ:
            Statistics.count('hep-success')

    if succ:
        print('-success--------')
        Statistics.count('3) success')
    else:
        print('-fail-----------')
        Statistics.count('4) fail')

    show_ref = False
    if succ and show_ref:
        for bib_ref in bib_refs:
            print(bib_ref.key, 'with', bib_ref.cite_count, 'citations in paper')
            if len(bib_ref.bib_info) == 0:
                print('no reference')
            else:
                print(bib_ref.bib_info_as_str(keep_comments=True))

    if succ and (print_xml or write_xml_dir):
        # build the <article> element with its identifier, version and references
        xml = Element('article')
        SubElement(xml, 'id').text = arxiv_id
        if arxiv_version > 0:
            SubElement(xml, 'version').text = str(arxiv_version)
        refs = SubElement(xml, 'refs')
        for bib_ref in bib_refs:
            bib_text = bib_ref.bib_info_as_str(keep_comments=True)
            if len(bib_text) != 0:
                # a reference that appears in the bibliography is cited at least once
                ncites = bib_ref.cite_count
                if ncites < 1:
                    ncites = 1
                ref = SubElement(refs, 'ref',
                                 order=str(bib_ref.ref_order_num),
                                 freq=str(ncites))
                ref.text = bib_text
        if print_xml:
            print(tostring(xml))
        if isinstance(write_xml_dir, str):
            if arxiv_id != '<unknown>':
                xml_file_name = os.path.join(
                    write_xml_dir, arxiv_id.replace('/', '') + '.xml')
            else:
                # fall back to the input file name, minus its extension
                fname = os.path.split(file_name)[1]
                if fname.rfind('.') > 0:
                    fname = fname[:fname.rfind('.')]
                xml_file_name = os.path.join(write_xml_dir, fname + '.xml')
            with open(xml_file_name, 'wb') as file_obj:
                file_obj.write(tostring(xml, encoding='utf-8'))

    Statistics.end_item()

    return succ
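
# Illustrative sketch of the XML emitted above for a successfully parsed
# article (the identifier and reference text are made-up examples): each
# <ref> carries 'order', the reference's position in the bibliography, and
# 'freq', the number of times it is cited in the paper (at least 1).
#
#   <article>
#     <id>hep-th/0101001</id>
#     <version>2</version>
#     <refs>
#       <ref order="1" freq="3">A. Author, "Some title", J. Phys. ...</ref>
#       <ref order="2" freq="1">B. Author, "Another title", ...</ref>
#     </refs>
#   </article>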
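
# Hypothetical invocations (the script and file names are examples only,
# assuming this module is run directly as e.g. ptex.py), using the command
# line options defined below:
#
#   python ptex.py --print-xml paper1.tex paper2.tex
#   python ptex.py --filelist --write-xml out --failed failed.txt filelist.txt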
cmd_parser = argparse.ArgumentParser(
    description='Parse TeX/LaTeX to find references.')
cmd_parser.add_argument('--filelist', action='store_true',
                        help='file names on the command line each contain a list of files to process')
cmd_parser.add_argument('--print-xml', action='store_true',
                        help='print XML output to stdout')
cmd_parser.add_argument('--write-xml', metavar='<dir>',
                        help='destination directory to write XML output files')
cmd_parser.add_argument('--failed', metavar='<file>',
                        help='output file to write list of failed files')
cmd_parser.add_argument('files', nargs='+', help='input files')
args = cmd_parser.parse_args()

# print date stamp
timeStart = datetime.datetime.now()
print('[ptex] started processing at', str(timeStart))
print('given', len(args.files), 'files, first file:', args.files[0])
print('================')

Statistics.clear('article')

# build list of files to process
file_list = buildFileList(args.filelist, args.files)

# ensure the destination directory exists
if args.write_xml is not None and not os.path.exists(args.write_xml):
    try:
        os.makedirs(args.write_xml)
    except OSError:
        # another process may have created the directory in the meantime
        pass

# process the files
failed_files = []
for file_name in file_list:
    success = do_single_file(file_name, args.print_xml, args.write_xml)