def do_single_file(file_name, print_xml, write_xml_dir):
    """Process one article file and optionally emit its references as XML.

    The arXiv id and version are derived from ``file_name`` via
    ``arxivid.filenameToArxivAndVersion``; when no id can be determined a
    warning is printed and the placeholder id ``'<unknown>'`` (version 0)
    is used instead.

    Files with a ``.pdf`` suffix are only counted and reported as
    successful; they are not parsed and contribute no references.  Every
    other file is passed to ``process_article``.

    Args:
        file_name: Path of the article file to handle.
        print_xml: When truthy, print the serialized XML for a successful
            article.
        write_xml_dir: When a ``str``, the directory into which the XML
            file is written.  Any other truthy value causes the XML tree
            to be built but nothing to be written.

    Returns:
        ``True`` for ``.pdf`` files; otherwise ``parser['success']`` from
        ``process_article``, or ``False`` when it returned ``None``.

    NOTE(review): this function is defined a second time further down in
    this file; the later definition is the one in effect after import.
    """
    arxiv_id, arxiv_version = arxivid.filenameToArxivAndVersion(file_name)
    if arxiv_id is None:
        print('WARN: could not determine arXiv identifier for', file_name)
        arxiv_id = '<unknown>'
        arxiv_version = 0

    Log.reset()
    Statistics.begin_item(arxiv_id)

    # Bind the reference list up front: the PDF branch never parses
    # anything, yet the XML stage below iterates over bib_refs whenever
    # succ is true.  Without this default a .pdf input combined with
    # print_xml/write_xml_dir raised UnboundLocalError.
    bib_refs = []

    if file_has_suffix(file_name, '.pdf'):
        Statistics.count('1) pdf')
        succ = True
    else:
        # NOTE(review): the hep/success/fail bookkeeping is assumed to
        # belong to this non-PDF branch only -- confirm against history.
        Statistics.count('2) processed')
        parser = process_article(file_name)
        if parser is not None:
            succ = parser['success']
            bib_refs = parser['refs']
        else:
            succ = False

        if str_contains_one_of(arxiv_id, ['gr-qc', 'hep-']):
            Statistics.count('hep-processed')
            if succ:
                Statistics.count('hep-success')

        if succ:
            print('-success--------')
            Statistics.count('3) success')
        else:
            print('-fail-----------')
            Statistics.count('4) fail')

    # Debugging switch: set to True to dump every parsed reference.
    show_ref = False

    if succ and show_ref:
        for bib_ref in bib_refs:
            print(bib_ref.key, 'with', bib_ref.cite_count,
                  'citations in paper')
            if len(bib_ref.bib_info) == 0:
                print('no reference')
            else:
                print(bib_ref.bib_info_as_str(keep_comments=True))

    if succ and (print_xml or write_xml_dir):
        xml = Element('article')
        SubElement(xml, 'id').text = arxiv_id
        if arxiv_version > 0:
            SubElement(xml, 'version').text = str(arxiv_version)
        refs = SubElement(xml, 'refs')
        for bib_ref in bib_refs:
            bib_text = bib_ref.bib_info_as_str(keep_comments=True)
            if len(bib_text) != 0:
                # A reference that made it into the list is reported with
                # a frequency of at least one.
                ncites = max(bib_ref.cite_count, 1)
                ref = SubElement(refs, 'ref',
                                 order=str(bib_ref.ref_order_num),
                                 freq=str(ncites))
                ref.text = bib_text

        if print_xml:
            # NOTE(review): on Python 3 tostring() without an encoding
            # returns bytes, so this prints a b'...' literal -- confirm
            # that is what consumers of the output expect.
            print(tostring(xml))

        if isinstance(write_xml_dir, str):
            if arxiv_id != '<unknown>':
                # Old-style ids contain a '/', which cannot appear in a
                # file name, so it is dropped.
                xml_base = arxiv_id.replace('/', '')
            else:
                # No id available: fall back to the input file's base
                # name with its last extension removed.
                xml_base = os.path.split(file_name)[1]
                dot_pos = xml_base.rfind('.')
                if dot_pos > 0:
                    xml_base = xml_base[:dot_pos]
            # Both cases now go through os.path.join (the fallback used to
            # concatenate with a literal '/').
            xml_file_name = os.path.join(write_xml_dir, xml_base + '.xml')
            # The context manager closes the file even if write() raises.
            with open(xml_file_name, 'wb') as file_obj:
                file_obj.write(tostring(xml, encoding='utf-8'))

    Statistics.end_item()
    return succ
def do_single_file(file_name, print_xml, write_xml_dir):
    """Process one article file and optionally emit its references as XML.

    The arXiv id and version are derived from ``file_name`` via
    ``arxivid.filenameToArxivAndVersion``; when no id can be determined a
    warning is printed and the placeholder id ``'<unknown>'`` (version 0)
    is used instead.

    Files with a ``.pdf`` suffix are only counted and reported as
    successful; they are not parsed and contribute no references.  Every
    other file is passed to ``process_article``.

    Args:
        file_name: Path of the article file to handle.
        print_xml: When truthy, print the serialized XML for a successful
            article.
        write_xml_dir: When a ``str``, the directory into which the XML
            file is written.  Any other truthy value causes the XML tree
            to be built but nothing to be written.

    Returns:
        ``True`` for ``.pdf`` files; otherwise ``parser['success']`` from
        ``process_article``, or ``False`` when it returned ``None``.

    NOTE(review): this re-defines a function of the same name that appears
    earlier in this file; being the later definition, this is the one in
    effect after import.
    """
    arxiv_id, arxiv_version = arxivid.filenameToArxivAndVersion(file_name)
    if arxiv_id is None:
        print('WARN: could not determine arXiv identifier for', file_name)
        arxiv_id = '<unknown>'
        arxiv_version = 0

    Log.reset()
    Statistics.begin_item(arxiv_id)

    # Bind the reference list up front: the PDF branch never parses
    # anything, yet the XML stage below iterates over bib_refs whenever
    # succ is true.  Without this default a .pdf input combined with
    # print_xml/write_xml_dir raised UnboundLocalError.
    bib_refs = []

    if file_has_suffix(file_name, '.pdf'):
        Statistics.count('1) pdf')
        succ = True
    else:
        # NOTE(review): the hep/success/fail bookkeeping is assumed to
        # belong to this non-PDF branch only -- confirm against history.
        Statistics.count('2) processed')
        parser = process_article(file_name)
        if parser is not None:
            succ = parser['success']
            bib_refs = parser['refs']
        else:
            succ = False

        if str_contains_one_of(arxiv_id, ['gr-qc', 'hep-']):
            Statistics.count('hep-processed')
            if succ:
                Statistics.count('hep-success')

        if succ:
            print('-success--------')
            Statistics.count('3) success')
        else:
            print('-fail-----------')
            Statistics.count('4) fail')

    # Debugging switch: set to True to dump every parsed reference.
    show_ref = False

    if succ and show_ref:
        for bib_ref in bib_refs:
            print(bib_ref.key, 'with', bib_ref.cite_count,
                  'citations in paper')
            if len(bib_ref.bib_info) == 0:
                print('no reference')
            else:
                print(bib_ref.bib_info_as_str(keep_comments=True))

    if succ and (print_xml or write_xml_dir):
        xml = Element('article')
        SubElement(xml, 'id').text = arxiv_id
        if arxiv_version > 0:
            SubElement(xml, 'version').text = str(arxiv_version)
        refs = SubElement(xml, 'refs')
        for bib_ref in bib_refs:
            bib_text = bib_ref.bib_info_as_str(keep_comments=True)
            if len(bib_text) != 0:
                # A reference that made it into the list is reported with
                # a frequency of at least one.
                ncites = max(bib_ref.cite_count, 1)
                ref = SubElement(refs, 'ref',
                                 order=str(bib_ref.ref_order_num),
                                 freq=str(ncites))
                ref.text = bib_text

        if print_xml:
            # NOTE(review): on Python 3 tostring() without an encoding
            # returns bytes, so this prints a b'...' literal -- confirm
            # that is what consumers of the output expect.
            print(tostring(xml))

        if isinstance(write_xml_dir, str):
            if arxiv_id != '<unknown>':
                # Old-style ids contain a '/', which cannot appear in a
                # file name, so it is dropped.
                xml_base = arxiv_id.replace('/', '')
            else:
                # No id available: fall back to the input file's base
                # name with its last extension removed.
                xml_base = os.path.split(file_name)[1]
                dot_pos = xml_base.rfind('.')
                if dot_pos > 0:
                    xml_base = xml_base[:dot_pos]
            # Both cases now go through os.path.join (the fallback used to
            # concatenate with a literal '/').
            xml_file_name = os.path.join(write_xml_dir, xml_base + '.xml')
            # The context manager closes the file even if write() raises.
            with open(xml_file_name, 'wb') as file_obj:
                file_obj.write(tostring(xml, encoding='utf-8'))

    Statistics.end_item()
    return succ