Python ArxivParser Examples, pyingest.parsers.arxiv.ArxivParser Python Examples

Example #1

0

Show file

File: test_integration.py Project: seasidesparrow/adsabs-pyingest

 def test_arxiv_to_classic(self):
     """Check the classic tagged output for every arXiv fixture.

     Each ``oai*`` file under ``data/arxiv.test`` is parsed, written through
     the classic ``Tagged`` serializer and compared with the content of its
     ``.tagged`` reference file.
     """
     running_py3 = sys.version_info > (3, )
     # Fixtures are opened in binary mode on Python 3 and with universal
     # newlines otherwise; reference files are always read as text.
     source_mode = 'rb' if running_py3 else 'rU'
     reference_mode = 'r' if running_py3 else 'rU'

     pattern = os.path.join(os.path.dirname(__file__), 'data/arxiv.test/oai*')
     for source_path in glob.glob(pattern):
         reference_path = (
             source_path.replace('/oai', '/tagged/oai') + '.tagged')
         # Python 3 orders the properties dictionary differently, so prefer
         # the Python-3-specific reference file whenever one exists.
         py3_reference = reference_path.replace(
             '/tagged/oai', '/tagged/python3/oai')
         if running_py3 and os.path.exists(py3_reference):
             reference_path = py3_reference

         with open(source_path, source_mode) as source:
             serializer = classic.Tagged()
             buf = StringIO()
             document = arxiv.ArxivParser().parse(source)
             serializer.write(document, buf)
             produced = buf.getvalue()
             buf.close()
             with open(reference_path, reference_mode) as reference:
                 self.assertEqual(produced, reference.read())

Example #2

0

Show file

 def test_old_style_subjects(self):
     """Each listed fixture must parse to the bibcode paired with it."""
     # (fixture path relative to this test module, expected bibcode)
     cases = (
         ('data/arxiv.test/oai_ArXiv.org_astro-ph_9501013', u'1995astro.ph..1013H'),
         ('data/arxiv.test/oai_ArXiv.org_math_0306266', u'2003math......6266C'),
         ('data/arxiv.test/oai_ArXiv.org_hep-th_0408048', u'2004hep.th....8048S'),
         ('data/arxiv.test/oai_ArXiv.org_cond-mat_9706061', u'1997cond.mat..6061A'),
     )
     here = os.path.dirname(__file__)
     for relative_path, bibcode in cases:
         with open(os.path.join(here, relative_path), 'rU') as record:
             document = arxiv.ArxivParser().parse(record)
             self.assertEqual(document['bibcode'], bibcode)

Example #3

0

Show file

 def test_parsing(self):
     """Parse one fixture and compare its authors, title and bibcode.

     Also asserts that an unrelated title does not equal the parsed one.
     """
     fixture = os.path.join(os.path.dirname(__file__), 'data/arxiv.test/oai_ArXiv.org_1711_05739')
     expected = {
         'authors': u'Luger, Rodrigo; Lustig-Yaeger, Jacob; Agol, Eric',
         'title': u'Planet-Planet Occultations in TRAPPIST-1 and Other Exoplanet Systems',
         'bibcode': u'2017arXiv171105739L',
     }
     with open(fixture, 'rU') as record:
         document = arxiv.ArxivParser().parse(record)

     for field, value in expected.items():
         self.assertEqual(value, document[field])

     # Negative check: a different title must not compare equal.
     unrelated_title = 'Paper that has nothing to do with TRAPPIST-1'
     self.assertNotEqual(unrelated_title, document['title'])

Example #4

0

Show file

File: test_integration.py Project: kelockhart/adsabs-pyingest

 def test_arxiv_to_classic(self):
     """Compare the classic tagged output of every arXiv fixture with its
     stored ``.tagged`` reference file.
     """
     pattern = os.path.join(os.path.dirname(__file__), 'data/arxiv.test/oai*')
     for source_path in glob.glob(pattern):
         # The reference lives in the sibling 'tagged' directory.
         reference_path = source_path.replace('/oai', '/tagged/oai') + '.tagged'
         with open(source_path, 'rU') as source:
             serializer = classic.Tagged()
             buf = cStringIO.StringIO()
             document = arxiv.ArxivParser().parse(source)
             serializer.write(document, buf)
             produced = buf.getvalue()
             buf.close()
             with open(reference_path, 'rU') as reference:
                 self.assertEqual(produced, reference.read())

Example #5

0

Show file

#            print(document)
##           for k in document.keys():
##               print k,type(document[k])
#
#            serializer = pyingest.serializers.classic.Tagged()
#            outputfp = open('aps.tag','a')
#            serializer.write(document,outputfp)
#            outputfp.close()
#    except:
#        print "ERROR!\n%s\n"%f
#        traceback.print_exc()
#        pass
#    else:
#        pass
#        print "OK:",f

# Parse every arXiv OAI fixture and append its classic tagged serialization
# to 'arxiv.tag'. A failure on one file is reported and the loop moves on to
# the next file.
testfile = glob.glob('test_data/arxiv.test/oai*')
for f in testfile:
    try:
        with open(f, 'rU') as fp:
            parser = arxiv.ArxivParser()
            document = parser.parse(fp)
            serializer = pyingest.serializers.classic.Tagged()
            # The context manager closes the output file even when
            # serialization raises, so no handle is leaked.
            with open('arxiv.tag', 'a') as outputfp:
                serializer.write(document, outputfp)
    except Exception:
        # Catch Exception instead of using a bare except so that
        # KeyboardInterrupt and SystemExit can still stop the script.
        # A parenthesized single-argument print produces the same output
        # on Python 2 and Python 3.
        print("ERROR!\n%s\n" % f)
        traceback.print_exc()

Example #6

0

Show file

 def test_unicode_init(self):
     """The ``oai_ArXiv.org_0901_2443`` fixture must yield the expected bibcode."""
     expected_bibcode = u'2009arXiv0901.2443O'
     fixture = 'test_data/arxiv.test/oai_ArXiv.org_0901_2443'
     with open(fixture, 'rU') as record:
         document = arxiv.ArxivParser().parse(record)
         self.assertEqual(document['bibcode'], expected_bibcode)

Example #7

0

Show file

 def test_bad_xml(self):
     """Parsing ``readme.txt`` must raise ``arxiv.EmptyParserException``."""
     readme = 'test_data/arxiv.test/readme.txt'
     with self.assertRaises(arxiv.EmptyParserException):
         with open(readme, 'rU') as handle:
             arxiv.ArxivParser().parse(handle)

Example #8

0

Show file

 def test_bad_xml(self):
     """Parsing ``readme.txt`` must raise ``arxiv.EmptyParserException``."""
     readme = os.path.join(os.path.dirname(__file__), 'data/arxiv.test/readme.txt')
     with self.assertRaises(arxiv.EmptyParserException):
         with open(readme, 'rU') as handle:
             arxiv.ArxivParser().parse(handle)

Example #9

0

Show file

File: test_parsers.py Project: golnazads/adsabs-pyingest

 def test_unicode_init(self):
     """The ``oai_ArXiv.org_0901_2443`` fixture must yield the expected bibcode."""
     expected_bibcode = u'2009arXiv0901.2443O'
     fixture = os.path.join(os.path.dirname(__file__), 'data/arxiv.test/oai_ArXiv.org_0901_2443')
     # open_mode_u is defined outside this method.
     with open(fixture, open_mode_u) as record:
         document = arxiv.ArxivParser().parse(record)
         self.assertEqual(document['bibcode'], expected_bibcode)

Example #10

0

Show file

File: run.py Project: marblestation/myADSPipeline

def _arxiv_ingest_complete(date=None, sleep_delay=60, sleep_timeout=7200):
    """
    Check if new arXiv records are in Solr - run before running myADS processing

    The record names listed in the day's UpdateAgent output file are sorted,
    and the highest one whose second-to-last path component is numeric is
    parsed to obtain a test bibcode. Solr is then polled for that bibcode
    until it is found or the timeout is reached.

    :param date: date to check arXiv records for; default is set by days-delta from today in config (times in local time)
    :param sleep_delay: number of seconds to sleep between retries
    :param sleep_timeout: number of seconds to retry in total before timing out completely
    :return: test bibcode or None
    """

    if not date:
        date = (datetime.datetime.today() - datetime.timedelta(
            days=config.get('ARXIV_TIMEDELTA_DAYS'))).strftime('%Y-%m-%d')
    else:
        date = get_date(date).strftime('%Y-%m-%d')

    arxiv_file = config.get(
        'ARXIV_UPDATE_AGENT_DIR') + '/UpdateAgent.out.' + date + '.gz'

    arxiv_records = []
    try:
        with gzip.open(arxiv_file, 'r') as flist:
            for line in flist:
                # sample line: oai/arXiv.org/0706/2491 2018-06-13T01:00:29
                fields = line.split()
                # skip blank lines instead of failing on fields[0]
                if fields:
                    arxiv_records.append(fields[0])
    except IOError:
        logger.warning('arXiv ingest file not found. Exiting.')
        return None

    # get the highest numbered ID: walk the records from the highest-sorting
    # one down and stop at the first whose second-to-last path component
    # parses as a number
    last_record = None
    for candidate in sorted(arxiv_records, reverse=True):
        try:
            float(candidate.split('/')[-2])
        except (ValueError, IndexError):
            # non-numeric component, or a record without enough '/' parts
            continue
        last_record = candidate
        break

    if last_record is None:
        # an empty ingest file, or one without any numeric-ID record, used
        # to surface as an IndexError from popping an empty list
        logger.warning(
            'No new-style arXiv records found in ingest file. Exiting.')
        return None

    # get the most recent record, convert to a filename
    last_file = config.get('ARXIV_INCOMING_ABS_DIR') + '/' + last_record

    arxiv_parser = arxiv.ArxivParser()
    try:
        with open(last_file, 'rU') as fp:
            try:
                arxiv_record = arxiv_parser.parse(fp)
            except Exception:
                # could also try to parse another record instead of failing
                logger.exception('Bad arXiv record: {0}'.format(last_file))
                return None
    except IOError:
        logger.warning('Individual arXiv ingest file not found. Exiting.')
        return None

    try:
        last_bibc = arxiv_record.get('bibcode')
    except Exception:
        # could also try to parse another record instead of failing
        logger.exception(
            'No bibcode found in arXiv record: {0}'.format(arxiv_record))
        return None

    if not last_bibc:
        # NOTE(review): if the parsed record is dict-like, .get() returns
        # None for a missing key rather than raising, so the handler above
        # never fires; without this check Solr would be polled for a
        # meaningless identifier until the timeout.
        logger.error(
            'No bibcode found in arXiv record: {0}'.format(arxiv_record))
        return None

    total_delay = 0
    while total_delay < sleep_timeout:
        total_delay += sleep_delay
        r = app.client.get(
            '{0}?q=identifier:{1}&fl=bibcode,identifier,entry_date'.format(
                config.get('API_SOLR_QUERY_ENDPOINT'), last_bibc),
            headers={'Authorization': 'Bearer ' + config.get('API_TOKEN')})
        if r.status_code != 200:
            time.sleep(sleep_delay)
            logger.error(
                'Error retrieving bibcode {0} from Solr ({1} {2}), retrying'.
                format(last_bibc, r.status_code, r.text))
            continue

        numfound = r.json()['response']['numFound']
        if numfound == 0:
            # nothing found, try again after a sleep
            time.sleep(sleep_delay)
            logger.info(
                'arXiv ingest not complete (test arXiv bibcode: {0}). Sleeping {1}s, for a total delay of {2}s.'
                .format(last_bibc, sleep_delay, total_delay))
            continue
        if numfound > 1:
            # returning this as true for now, since technically something was found
            logger.error(
                'Too many records returned for bibcode {0}'.format(last_bibc))

        logger.info(
            'Numfound: {0} for test bibcode {1}. Response: {2}. URL: {3}'.
            format(numfound, last_bibc, json.dumps(r.json()), r.url))

        # check number of bibcodes from ingest; on a Monday (weekday 0) look
        # back three days instead of one
        if get_date().weekday() == 0:
            start_date = (get_date() - datetime.timedelta(days=3)).date()
        else:
            start_date = (get_date() - datetime.timedelta(days=1)).date()
        beg_pubyear = (get_date() - datetime.timedelta(days=180)).year
        q = app.client.get(
            '{0}?q={1}'.format(
                config.get('API_SOLR_QUERY_ENDPOINT'),
                urllib.quote_plus('bibstem:arxiv entdate:["{0}Z00:00" TO NOW] '
                                  'pubdate:[{1}-00 TO *]'.format(
                                      start_date, beg_pubyear))),
            headers={'Authorization': 'Bearer ' + config.get('API_TOKEN')})
        logger.info('Total number of arXiv bibcodes ingested: {}'.format(
            q.json()['response']['numFound']))

        return last_bibc

    logger.warning(
        'arXiv ingest did not complete within the {0}s timeout limit. Exiting.'
        .format(sleep_timeout))

    return None