Example #1
0
def do_single_file(file_name, print_xml, write_xml_dir):
    """Process one article file and optionally emit its references as XML.

    The arXiv identifier and version are derived from ``file_name``; when
    no identifier can be determined a warning is printed and the
    placeholder id ``'<unknown>'`` (version 0) is used instead.

    Files for which ``file_has_suffix(file_name, '.pdf')`` is true are only
    counted and reported as successful.  Every other file is handed to
    ``process_article``; its ``'success'`` flag and ``'refs'`` list drive
    the statistics counters and the XML output.

    Args:
        file_name: Path of the article file to process.
        print_xml: If truthy, print the serialized XML of a successfully
            processed article to stdout.
        write_xml_dir: If a ``str``, directory into which the XML of a
            successfully processed article is written.  The file is named
            after the arXiv id (slashes removed), or after ``file_name``
            without its last extension when the id is unknown.

    Returns:
        ``True`` for ``.pdf`` files; otherwise the parser's ``'success'``
        value, or ``False`` when ``process_article`` returned ``None``.
    """
    arxiv_id, arxiv_version = arxivid.filenameToArxivAndVersion(file_name)
    if arxiv_id is None:
        print('WARN: could not determine arXiv identifier for', file_name)
        arxiv_id = '<unknown>'
        arxiv_version = 0

    Log.reset()
    Statistics.begin_item(arxiv_id)

    if file_has_suffix(file_name, '.pdf'):
        # PDF sources are not parsed; they are only counted.
        Statistics.count('1) pdf')
        succ = True
    else:
        Statistics.count('2) processed')

        parser = process_article(file_name)

        if parser is not None:
            succ = parser['success']
            bib_refs = parser['refs']
        else:
            succ = False
            bib_refs = []

        if str_contains_one_of(arxiv_id, ['gr-qc', 'hep-']):
            Statistics.count('hep-processed')
            if succ:
                Statistics.count('hep-success')
        if succ:
            print('-success--------')
            Statistics.count('3) success')
        else:
            print('-fail-----------')
            Statistics.count('4) fail')

        # Debug switch: set to True to dump every extracted reference.
        show_ref = False

        if succ and show_ref:
            for bib_ref in bib_refs:
                print(bib_ref.key, 'with', bib_ref.cite_count,
                      'citations in paper')
                if len(bib_ref.bib_info) == 0:
                    print('no reference')
                else:
                    print(bib_ref.bib_info_as_str(keep_comments=True))

        if succ and (print_xml or write_xml_dir):
            xml = Element('article')
            SubElement(xml, 'id').text = arxiv_id
            if arxiv_version > 0:
                SubElement(xml, 'version').text = str(arxiv_version)
            refs = SubElement(xml, 'refs')
            for bib_ref in bib_refs:
                bib_text = bib_ref.bib_info_as_str(keep_comments=True)
                if len(bib_text) != 0:
                    # A reference that made it into the list is reported
                    # with a frequency of at least one.
                    ncites = bib_ref.cite_count
                    if ncites < 1:
                        ncites = 1
                    ref = SubElement(refs,
                                     'ref',
                                     order=str(bib_ref.ref_order_num),
                                     freq=str(ncites))
                    ref.text = bib_text
            if print_xml:
                # NOTE(review): tostring() without an encoding argument may
                # return bytes on Python 3, which prints as b'...' - confirm
                # whether that is the intended console output.
                print(tostring(xml))
            if isinstance(write_xml_dir, str):
                if arxiv_id != '<unknown>':
                    xml_file_name = os.path.join(
                        write_xml_dir,
                        arxiv_id.replace('/', '') + '.xml')
                else:
                    # Fall back to the input file's base name, dropping its
                    # last extension (a leading dot is not an extension).
                    fname = os.path.split(file_name)[1]
                    dot_pos = fname.rfind('.')
                    if dot_pos > 0:
                        fname = fname[:dot_pos]
                    # Use os.path.join like the branch above, so an empty
                    # directory does not silently become the root '/'.
                    xml_file_name = os.path.join(write_xml_dir,
                                                 fname + '.xml')
                # The context manager closes the file even if serializing
                # or writing raises.
                with open(xml_file_name, 'wb') as file_obj:
                    file_obj.write(tostring(xml, encoding='utf-8'))

    Statistics.end_item()

    return succ
Example #2
0
def do_single_file(file_name, print_xml, write_xml_dir):
    """Process a single article file, update statistics, and export XML.

    The arXiv id and version come from
    ``arxivid.filenameToArxivAndVersion(file_name)``.  If no id is found a
    warning is printed and ``'<unknown>'`` / version 0 are substituted.

    Files matching ``file_has_suffix(file_name, '.pdf')`` are counted and
    treated as successful without further work.  All other files go through
    ``process_article``; on success the extracted references can be printed
    and/or written as an ``<article>`` XML document.

    Args:
        file_name: Path of the article file to process.
        print_xml: If truthy, print the serialized XML to stdout.
        write_xml_dir: If a ``str``, directory that receives the XML file.

    Returns:
        ``True`` for ``.pdf`` files; otherwise the ``'success'`` value from
        ``process_article``, or ``False`` when it returned ``None``.
    """
    arxiv_id, arxiv_version = arxivid.filenameToArxivAndVersion(file_name)
    if arxiv_id is None:
        print('WARN: could not determine arXiv identifier for', file_name)
        arxiv_id = '<unknown>'
        arxiv_version = 0

    Log.reset()
    Statistics.begin_item(arxiv_id)

    if file_has_suffix(file_name, '.pdf'):
        # PDF inputs are only counted, never parsed.
        Statistics.count('1) pdf')
        succ = True
    else:
        Statistics.count('2) processed')

        parser = process_article(file_name)

        if parser is not None:
            succ = parser['success']
            bib_refs = parser['refs']
        else:
            succ = False
            bib_refs = []

        if str_contains_one_of(arxiv_id, ['gr-qc', 'hep-']):
            Statistics.count('hep-processed')
            if succ:
                Statistics.count('hep-success')
        if succ:
            print('-success--------')
            Statistics.count('3) success')
        else:
            print('-fail-----------')
            Statistics.count('4) fail')

        # Debug switch: flip to True to list each extracted reference.
        show_ref = False

        if succ and show_ref:
            for bib_ref in bib_refs:
                print(bib_ref.key, 'with', bib_ref.cite_count,
                      'citations in paper')
                if len(bib_ref.bib_info) == 0:
                    print('no reference')
                else:
                    print(bib_ref.bib_info_as_str(keep_comments=True))

        if succ and (print_xml or write_xml_dir):
            xml = _build_article_xml(arxiv_id, arxiv_version, bib_refs)
            if print_xml:
                # NOTE(review): tostring() without an encoding argument may
                # return bytes on Python 3 (printed as b'...') - confirm
                # that this is the intended console output.
                print(tostring(xml))
            if isinstance(write_xml_dir, str):
                xml_file_name = _xml_output_path(write_xml_dir, arxiv_id,
                                                 file_name)
                # 'with' guarantees the handle is closed even when
                # serialization or the write itself fails.
                with open(xml_file_name, 'wb') as file_obj:
                    file_obj.write(tostring(xml, encoding='utf-8'))

    Statistics.end_item()

    return succ


def _build_article_xml(arxiv_id, arxiv_version, bib_refs):
    """Return an ``<article>`` element holding the id, version and refs."""
    xml = Element('article')
    SubElement(xml, 'id').text = arxiv_id
    if arxiv_version > 0:
        SubElement(xml, 'version').text = str(arxiv_version)
    refs = SubElement(xml, 'refs')
    for bib_ref in bib_refs:
        bib_text = bib_ref.bib_info_as_str(keep_comments=True)
        if not bib_text:
            # References without any text are left out of the XML.
            continue
        # Every exported reference is reported with a frequency >= 1.
        ncites = max(bib_ref.cite_count, 1)
        ref = SubElement(refs,
                         'ref',
                         order=str(bib_ref.ref_order_num),
                         freq=str(ncites))
        ref.text = bib_text
    return xml


def _xml_output_path(write_xml_dir, arxiv_id, file_name):
    """Return the path of the XML file to write inside ``write_xml_dir``."""
    if arxiv_id != '<unknown>':
        stem = arxiv_id.replace('/', '')
    else:
        # No id available: reuse the input's base name without its last
        # extension (a leading dot does not count as an extension).
        stem = os.path.split(file_name)[1]
        dot_pos = stem.rfind('.')
        if dot_pos > 0:
            stem = stem[:dot_pos]
    # os.path.join for both cases, so an empty directory never turns the
    # target into an absolute '/name.xml' path.
    return os.path.join(write_xml_dir, stem + '.xml')