def test_canonicalize_records(self):
    from collections import OrderedDict
    from lib import ReadRecords
    records = OrderedDict([
        ('2014arXiv1401.2993T', 'b'),   # This is an alternate to 'f'
        ('2014MNRAS.439.1884T', 'f'),   # This is the canonical of 'b'
        ('2013MNRAS.434.1889H', 'd'),   # This is the canonical of 'g'
        ('2013arXiv1306.3186H', 'g'),   # This is the alternate of 'd'
        ('1978Natur.275..624M', 'c'),   # No alternates, already canonical
        ('1988ESASP.281b.287G', 'x1'),  # Canonical, the following are alternates
        ('1988IUE88...2..287G', 'a1'),
        ('1988IUES....1..287G', 'a2'),
        ('1988uvai....2..287G', 'a3'),
        ('2014PhRvD..90d4013F', 'h'),   # This is the canonical of 'k'
        ('2013arXiv1311.6899F', 'k'),   # This is the alternate of 'h'
    ])
    expected = [
        ('2014MNRAS.439.1884T', 'b;f'),
        ('2013MNRAS.434.1889H', 'd;g'),
        ('1978Natur.275..624M', 'c'),
        ('1988ESASP.281b.287G', 'a1;a2;a3;x1'),
        ('2014PhRvD..90d4013F', 'h;k'),
    ]
    # Pass a fresh copy so the fixture itself is not mutated
    results = ReadRecords.canonicalize_records(OrderedDict((k, v) for k, v in records.iteritems()))
    self.assertEqual(results, expected)
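# A minimal sketch of the grouping contract the test above exercises, for
# illustration only. The explicit `canonical_of` mapping is hypothetical:
# the real ReadRecords.canonicalize_records resolves alternate bibcodes
# internally (via ADS lookup data) rather than taking such a mapping.
def _canonicalize_sketch(records, canonical_of):
    from collections import OrderedDict
    grouped = OrderedDict()
    for bibcode, fingerprint in records.items():
        # Already-canonical bibcodes map to themselves
        canonical = canonical_of.get(bibcode, bibcode)
        grouped.setdefault(canonical, []).append(fingerprint)
    # Join each group's fingerprints with ';' in sorted order, matching the
    # `expected` fixture above (e.g. 'a1;a2;a3;x1')
    return [(c, ';'.join(sorted(fps))) for c, fps in grouped.items()]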
def main(MONGO=MONGO, *args):
    if args:
        sys.argv.extend(*args)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--target-bibcodes',
        nargs='*',
        default=[],
        dest='targetBibcodes',
        help='Only analyze the specified bibcodes, and ignore their JSON fingerprints. '
             'Only works when --async is not set. Use the syntax @filename.txt to read '
             'these from a file (one bibcode per line)'
    )
    parser.add_argument(
        '--async',
        default=False,
        action='store_true',
        dest='async',
        help='start in async mode'
    )
    parser.add_argument(
        '--dont-init-lookers-cache',
        default=False,
        action='store_true',
        dest='dont_init_lookers_cache',
        help="don't call ADSExports2.init_lookers_cache()"
    )
    parser.add_argument(
        '--load-records-from-pickle',
        nargs='*',
        default=None,
        dest='load_records_from_pickle',
        help='Load XML records from a pickle instead of ADSExports',
    )
    parser.add_argument(
        '--dump-output-to-file',
        nargs=1,
        type=str,
        default=None,
        dest='outfile',
        help='Output records to a file'
    )
    parser.add_argument(
        '--ignore-json-fingerprints',
        default=False,
        action='store_true',
        dest='ignore_json_fingerprints',
        help='ignore json fingerprints when finding new records to update (i.e., force update)',
    )
    parser.add_argument(
        '--process-deletions',
        default=False,
        action='store_true',
        dest='process_deletions',
        help='Find orphaned bibcodes in the mongodb, then send these bibcodes to delete '
             'via rabbitMQ. No updates will be processed when this flag is set.',
    )
    parser.add_argument(
        '--max-deletions',
        default=2000,
        type=int,
        dest='max_deletions',
        help='Maximum number of deletions to attempt; if over this limit, exit and log an error',
    )
    args = parser.parse_args()

    if not args.dont_init_lookers_cache:
        start = time.time()
        logger.info("Calling init_lookers_cache()")
        ReadRecords.INIT_LOOKERS_CACHE()
        logger.info("init_lookers_cache() returned in %0.1f sec" % (time.time() - start))

    records = readBibcodesFromFile(BIBCODE_FILES)
    targets = None
    if args.targetBibcodes:
        if args.targetBibcodes[0].startswith('@'):
            # @filename.txt syntax: read one bibcode per line, skipping comments
            with open(args.targetBibcodes[0].replace('@', '')) as fp:
                targetBibcodes = deque([L.strip() for L in fp.readlines() if L and not L.startswith('#')])
        else:
            targetBibcodes = args.targetBibcodes
        targets = {bibcode: records[bibcode] for bibcode in targetBibcodes}

    records = deque(ReadRecords.canonicalize_records(records, targets))
    total = float(len(records))  # Save the total count for progress logging later

    if args.ignore_json_fingerprints:
        records = deque([(r[0], 'ignore') for r in records])

    if args.process_deletions:
        start = time.time()
        logger.info("Processing deletions. This will block for several hours until the database is compared, then exit.")
        logger.warning("No updates will be processed when --process-deletions is set")
        mongo = MongoConnection.PipelineMongoConnection(**MONGO)
        results = mongo.getAllBibcodes()
        if len(results) != mongo.db[mongo.collection].count():
            logger.warning("len getAllBibcodes (%s) != len count (%s). Continuing anyway." % (len(results), mongo.db[mongo.collection].count()))
        mongo.close()
        records = [i[0] for i in records]
        payload = list(set(results).difference(set(records)))
        if len(payload) > args.max_deletions:
            logger.critical("|".join(payload))
            logger.critical("Too many deletions: {} > {}".format(len(payload), args.max_deletions))
            sys.exit(1)
        w = RabbitMQWorker()
        w.connect(psettings.RABBITMQ_URL)
        publish(w, payload, routing_key='DeletionRoute')
        logger.info("Found %s orphaned bibcodes in %0.1f seconds." % (len(payload), time.time() - start))
        sys.exit(0)

    if not args.async:
        mongo = MongoConnection.PipelineMongoConnection(**MONGO)
        records = mongo.findNewRecords(records)
        if args.load_records_from_pickle:
            records = ReadRecords.readRecordsFromPickles(records, args.load_records_from_pickle)
        else:
            records = ReadRecords.readRecordsFromADSExports(records)
        merged = UpdateRecords.mergeRecords(records)
        if args.outfile:
            with open(args.outfile[0], 'w') as fp:
                r = {'merged': merged, 'nonmerged': records}
                json.dump(r, fp, indent=1)
        else:
            bibcodes = mongo.upsertRecords(merged)
            #SolrUpdater.solrUpdate(bibcodes)
    elif args.async:
        w = RabbitMQWorker()
        w.connect(psettings.RABBITMQ_URL)
        lastLogged = None
        while records:
            # Drain up to BIBCODES_PER_JOB records into a single payload
            payload = []
            while len(payload) < BIBCODES_PER_JOB:
                try:
                    payload.append(records.popleft())
                except IndexError:
                    break
            # Log progress once at each 5% milestone
            percent = round((1 - len(records) / total) * 100.0)
            if not percent % 5 and percent != lastLogged:
                lastLogged = percent
                logger.info("There are %s records left (%0.1f%% completed)" % (len(records), percent))
            publish(w, payload)
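# A minimal entry point, assuming this module is meant to be executed
# directly; the guard and the script name below are assumptions, not
# confirmed by the code above. Example invocations, using only the flags
# defined in main():
#
#   python run.py --target-bibcodes @bibcodes.txt
#   python run.py --async
#   python run.py --process-deletions --max-deletions 500
if __name__ == '__main__':
    main()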