def test_arxiv_to_classic(self):
    """Serialize every arXiv fixture to classic tagged format and compare
    the result with the stored ``.tagged`` expectation file."""
    running_py3 = sys.version_info > (3, )
    # On Python 3 the parser input is opened as bytes, while the expected
    # output is opened as text; Python 2 uses universal-newline mode for both.
    source_mode = 'rb' if running_py3 else 'rU'
    expected_mode = 'r' if running_py3 else 'rU'

    fixture_pattern = os.path.join(
        os.path.dirname(__file__), 'data/arxiv.test/oai*')
    for source_path in glob.glob(fixture_pattern):
        expected_path = source_path.replace('/oai', '/tagged/oai') + '.tagged'

        # Python 3 orders the properties dictionary differently, so prefer
        # a Python 3 specific expectation file when one exists.
        py3_expected_path = expected_path.replace(
            '/tagged/oai', '/tagged/python3/oai')
        if running_py3 and os.path.exists(py3_expected_path):
            expected_path = py3_expected_path

        with open(source_path, source_mode) as source_fp:
            tagged_writer = classic.Tagged()
            buffer_fp = StringIO()
            record = arxiv.ArxivParser().parse(source_fp)
            tagged_writer.write(record, buffer_fp)
            actual_text = buffer_fp.getvalue()
            buffer_fp.close()

            with open(expected_path, expected_mode) as expected_fp:
                self.assertEqual(actual_text, expected_fp.read())
def test_old_style_subjects(self):
    """Check the bibcodes produced for records with old-style
    (subject-class based) arXiv identifiers."""
    import sys  # local import: keeps this block independent of file-level imports

    # The 'U' flag is only needed on Python 2; it was removed from open()
    # in Python 3.11, where universal newlines are already the default.
    open_mode = 'rU' if sys.version_info < (3, ) else 'r'

    data_dir = os.path.join(os.path.dirname(__file__), 'data/arxiv.test')
    # (fixture file name, expected bibcode) pairs kept side by side so a
    # fixture can never be matched against the wrong expectation.
    cases = [
        ('oai_ArXiv.org_astro-ph_9501013', u'1995astro.ph..1013H'),
        ('oai_ArXiv.org_math_0306266', u'2003math......6266C'),
        ('oai_ArXiv.org_hep-th_0408048', u'2004hep.th....8048S'),
        ('oai_ArXiv.org_cond-mat_9706061', u'1997cond.mat..6061A'),
    ]

    for filename, expected_bibcode in cases:
        with open(os.path.join(data_dir, filename), open_mode) as fp:
            document = arxiv.ArxivParser().parse(fp)
        self.assertEqual(document['bibcode'], expected_bibcode)
def test_parsing(self):
    """Parse the 1711.05739 fixture and check its authors, title and
    bibcode, plus a negative check on the title."""
    import sys  # local import: keeps this block independent of file-level imports

    # The 'U' flag is only needed on Python 2; it was removed from open()
    # in Python 3.11, where universal newlines are already the default.
    open_mode = 'rU' if sys.version_info < (3, ) else 'r'

    shouldbe = {
        'authors': u'Luger, Rodrigo; Lustig-Yaeger, Jacob; Agol, Eric',
        'title': u'Planet-Planet Occultations in TRAPPIST-1 and Other Exoplanet Systems',
        'bibcode': u'2017arXiv171105739L',
    }
    record_path = os.path.join(
        os.path.dirname(__file__), 'data/arxiv.test/oai_ArXiv.org_1711_05739')

    with open(record_path, open_mode) as fp:
        document = arxiv.ArxivParser().parse(fp)

    for key, expected in shouldbe.items():
        self.assertEqual(expected, document[key])

    # Sanity check that the comparison can fail: an unrelated title must
    # not match. The expectation dict itself is left untouched.
    self.assertNotEqual('Paper that has nothing to do with TRAPPIST-1',
                        document['title'])
def test_arxiv_to_classic(self):
    """Serialize every arXiv fixture to classic tagged format and compare
    the result with the stored ``.tagged`` expectation file."""
    fixture_pattern = os.path.join(
        os.path.dirname(__file__), 'data/arxiv.test/oai*')

    for source_path in glob.glob(fixture_pattern):
        # The expectation lives in the sibling "tagged" directory.
        expected_path = source_path.replace('/oai', '/tagged/oai') + '.tagged'

        with open(source_path, 'rU') as source_fp:
            tagged_writer = classic.Tagged()
            buffer_fp = cStringIO.StringIO()
            record = arxiv.ArxivParser().parse(source_fp)
            tagged_writer.write(record, buffer_fp)
            actual_text = buffer_fp.getvalue()
            buffer_fp.close()

        with open(expected_path, 'rU') as expected_fp:
            self.assertEqual(actual_text, expected_fp.read())
# print(document)
## for k in document.keys():
## print k,type(document[k])
#
# serializer = pyingest.serializers.classic.Tagged()
# outputfp = open('aps.tag','a')
# serializer.write(document,outputfp)
# outputfp.close()
# except:
# print "ERROR!\n%s\n"%f
# traceback.print_exc()
# pass
# else:
# pass
# print "OK:",f

# Parse every arXiv fixture and append its classic tagged serialization to
# 'arxiv.tag'. A failure on one file is reported and the loop continues.
testfile = glob.glob('test_data/arxiv.test/oai*')
for f in testfile:
    try:
        with open(f, 'rU') as fp:
            parser = arxiv.ArxivParser()
            document = parser.parse(fp)
        serializer = pyingest.serializers.classic.Tagged()
        # Context manager so the output handle is closed even when
        # serialization fails part-way through.
        with open('arxiv.tag', 'a') as outputfp:
            serializer.write(document, outputfp)
    except Exception:
        # Best-effort per file, but Ctrl-C / SystemExit must still be able
        # to stop the run, hence no bare "except:".
        print("ERROR!\n%s\n" % f)
        traceback.print_exc()
def test_unicode_init(self):
    """Parse the 0901.2443 fixture and check the bibcode it yields."""
    expected_bibcode = u'2009arXiv0901.2443O'
    record_path = 'test_data/arxiv.test/oai_ArXiv.org_0901_2443'

    with open(record_path, 'rU') as fp:
        document = arxiv.ArxivParser().parse(fp)

    self.assertEqual(document['bibcode'], expected_bibcode)
def test_bad_xml(self):
    """Feeding the readme.txt fixture to the parser must raise
    EmptyParserException."""
    bad_input = 'test_data/arxiv.test/readme.txt'
    # assertRaises stays the outer context so that only the parser's
    # exception type satisfies the test.
    with self.assertRaises(arxiv.EmptyParserException), \
            open(bad_input, 'rU') as fp:
        arxiv.ArxivParser().parse(fp)
def test_bad_xml(self):
    """Feeding the readme.txt fixture to the parser must raise
    EmptyParserException."""
    import sys  # local import: keeps this block independent of file-level imports

    # The 'U' flag is only needed on Python 2; it was removed from open()
    # in Python 3.11, where it would raise ValueError before the parser
    # is ever reached and make this test error out for the wrong reason.
    open_mode = 'rU' if sys.version_info < (3, ) else 'r'
    bad_input = os.path.join(
        os.path.dirname(__file__), 'data/arxiv.test/readme.txt')

    with self.assertRaises(arxiv.EmptyParserException):
        with open(bad_input, open_mode) as fp:
            arxiv.ArxivParser().parse(fp)
def test_unicode_init(self):
    """Parse the 0901.2443 fixture and check the bibcode it yields."""
    expected_bibcode = u'2009arXiv0901.2443O'
    record_path = os.path.join(
        os.path.dirname(__file__), 'data/arxiv.test/oai_ArXiv.org_0901_2443')

    # open_mode_u is the module-level read mode defined elsewhere in the file.
    with open(record_path, open_mode_u) as fp:
        document = arxiv.ArxivParser().parse(fp)

    self.assertEqual(document['bibcode'], expected_bibcode)
def _arxiv_ingest_complete(date=None, sleep_delay=60, sleep_timeout=7200):
    """
    Check if new arXiv records are in Solr - run before running myADS processing

    The highest-sorting record with a numeric ID prefix is taken from the
    day's UpdateAgent log, parsed to obtain its bibcode, and Solr is then
    polled for that bibcode until it is found or the timeout is reached.

    :param date: date to check arXiv records for; default is set by days-delta from today in config (times in local time)
    :param sleep_delay: number of seconds to sleep between retries
    :param sleep_timeout: number of seconds to retry in total before timing out completely
    :return: test bibcode or None
    """
    if not date:
        date = (datetime.datetime.today() - datetime.timedelta(
            days=config.get('ARXIV_TIMEDELTA_DAYS'))).strftime('%Y-%m-%d')
    else:
        date = get_date(date).strftime('%Y-%m-%d')

    arxiv_file = config.get(
        'ARXIV_UPDATE_AGENT_DIR') + '/UpdateAgent.out.' + date + '.gz'

    arxiv_records = []
    try:
        with gzip.open(arxiv_file, 'r') as flist:
            for line in flist:
                # sample line: oai/arXiv.org/0706/2491 2018-06-13T01:00:29
                fields = line.split()
                if fields:  # tolerate blank lines in the log
                    arxiv_records.append(fields[0])
    except IOError:
        logger.warning('arXiv ingest file not found. Exiting.')
        return None

    arxiv_records.sort()

    # get the highest numbered ID: walk down from the end of the sorted list
    # until an entry whose second-to-last path segment is numeric is found
    # (e.g. "0706" in the sample line above).
    last_record = None
    while arxiv_records:
        candidate = arxiv_records.pop()
        try:
            float(candidate.split('/')[-2])
        except (ValueError, IndexError):
            # non-numeric segment, or an entry without any '/' at all
            continue
        last_record = candidate
        break

    if last_record is None:
        logger.warning(
            'No usable arXiv record found in ingest file {0}. Exiting.'.format(
                arxiv_file))
        return None

    # get the most recent record, convert to a filename
    last_file = config.get('ARXIV_INCOMING_ABS_DIR') + '/' + last_record

    arxiv_parser = arxiv.ArxivParser()
    try:
        with open(last_file, 'rU') as fp:
            try:
                arxiv_record = arxiv_parser.parse(fp)
            except Exception:
                # could also try to parse another record instead of failing
                logger.exception('Bad arXiv record: {0}'.format(last_file))
                return None
    except IOError:
        logger.warning('Individual arXiv ingest file not found. Exiting.')
        return None

    try:
        last_bibc = arxiv_record.get('bibcode')
    except Exception:
        # could also try to parse another record instead of failing
        logger.exception(
            'No bibcode found in arXiv record: {0}'.format(arxiv_record))
        return None

    if not last_bibc:
        # .get() returns None silently for a missing key; without this guard
        # Solr would be polled for "identifier:None" until the timeout.
        logger.error(
            'No bibcode found in arXiv record: {0}'.format(arxiv_record))
        return None

    total_delay = 0
    while total_delay < sleep_timeout:
        # counted up-front so that every attempt, successful or not, is
        # charged against the timeout budget
        total_delay += sleep_delay
        r = app.client.get(
            '{0}?q=identifier:{1}&fl=bibcode,identifier,entry_date'.format(
                config.get('API_SOLR_QUERY_ENDPOINT'), last_bibc),
            headers={'Authorization': 'Bearer ' + config.get('API_TOKEN')})
        if r.status_code != 200:
            time.sleep(sleep_delay)
            logger.error(
                'Error retrieving bibcode {0} from Solr ({1} {2}), retrying'.
                format(last_bibc, r.status_code, r.text))
            continue

        numfound = r.json()['response']['numFound']
        if numfound == 0:
            # nothing found, try again after a sleep
            time.sleep(sleep_delay)
            logger.info(
                'arXiv ingest not complete (test arXiv bibcode: {0}). Sleeping {1}s, for a total delay of {2}s.'
                .format(last_bibc, sleep_delay, total_delay))
            continue
        if numfound > 1:
            # returning this as true for now, since technically something was found
            logger.error(
                'Too many records returned for bibcode {0}'.format(last_bibc))

        logger.info(
            'Numfound: {0} for test bibcode {1}. Response: {2}. URL: {3}'.
            format(numfound, last_bibc, json.dumps(r.json()), r.url))

        # check number of bibcodes from ingest
        # (on weekday() == 0, i.e. Monday, look back three days instead of one)
        now = get_date()
        if now.weekday() == 0:
            start_date = (now - datetime.timedelta(days=3)).date()
        else:
            start_date = (now - datetime.timedelta(days=1)).date()
        beg_pubyear = (now - datetime.timedelta(days=180)).year
        q = app.client.get(
            '{0}?q={1}'.format(
                config.get('API_SOLR_QUERY_ENDPOINT'),
                urllib.quote_plus('bibstem:arxiv entdate:["{0}Z00:00" TO NOW] '
                                  'pubdate:[{1}-00 TO *]'.format(
                                      start_date, beg_pubyear))),
            headers={'Authorization': 'Bearer ' + config.get('API_TOKEN')})
        logger.info('Total number of arXiv bibcodes ingested: {}'.format(
            q.json()['response']['numFound']))

        return last_bibc

    logger.warning(
        'arXiv ingest did not complete within the {0}s timeout limit. Exiting.'
        .format(sleep_timeout))

    return None