def readRecords(records, LOGGER=settings.LOGGER):
    '''
    records: [(bibcode, JSON_fingerprint), ...]
    Returns (parsed_records, targets): the xmltodict-parsed record dicts and
    the bibcode -> JSON_fingerprint mapping.
    '''
    h = hash(json.dumps(records))
    if not records:
        LOGGER.debug("No records given")
        return [], {}  # keep the (records, targets) return contract on empty input
    targets = dict(records)

    s = time.time()
    records = ADSRecords('full', 'XML')
    failures = []
    for bibcode in targets.keys():
        try:
            records.addCompleteRecord(bibcode)
        except KeyboardInterrupt:
            raise
        except Exception:
            failures.append(bibcode)
            LOGGER.warning("[%s] ADSRecords failed" % bibcode)
    records = records.export()
    if not records.content:
        return [], {}
    ttc = time.time() - s
    rate = len(targets) / ttc
    if failures:
        LOGGER.warning('ADSRecords failed to retrieve %s records' % len(failures))
    LOGGER.info('ADSRecords took %0.1fs to query %s records (%0.1f rec/s)\t[%s]' % (ttc, len(targets), rate, h))

    records = ensureList(xmltodict.parse(str(records))['records']['record'])
    assert len(records) == len(targets) - len(failures)
    # with open('%s.pickle' % uuid.uuid4(), 'w') as fp:
    #     pickle.dump(records, fp)
    return records, targets
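# A minimal usage sketch for readRecords (illustrative only; the bibcodes and
# fingerprints below are placeholders, and a working ADSRecords backend plus a
# configured settings.LOGGER are assumed):
#
#   pairs = [('1999ApJ...517..565P', 'fingerprint-1'),
#            ('1998AJ....116.1009R', 'fingerprint-2')]
#   parsed, targets = readRecords(pairs)
#   for rec in parsed:
#       print rec['@bibcode']  # xmltodict exposes XML attributes with an '@' prefix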
def updateRecords(records, LOGGER=settings.LOGGER):
    if not records:
        LOGGER.debug("No records given")
        return []
    targets = dict(records)

    s = time.time()
    records = ADSRecords('full', 'XML')
    failures = []
    for bibcode in targets.keys():
        try:
            records.addCompleteRecord(bibcode)
        except KeyboardInterrupt:
            raise
        except Exception:
            failures.append(bibcode)
            LOGGER.warning("[%s] ADSRecords failed" % bibcode)
    records = records.export()
    if not records.content:
        return []
    ttc = time.time() - s
    rate = len(targets) / ttc
    if failures:
        LOGGER.warning('ADSRecords failed to retrieve %s records' % len(failures))
    LOGGER.info('ADSRecords took %0.1fs to query %s records (%0.1f rec/s)' % (ttc, len(targets), rate))

    records = ensureList(xmltodict.parse(str(records))['records']['record'])
    with open('raw.txt', 'a') as fp:
        for r in records:
            fp.write('%s' % r)
            fp.write('\n\n')
    assert len(records) == len(targets) - len(failures)

    # Could send these tasks out on a queue
    completeRecords = []
    for r in records:
        # Define the top-level schema that will go in mongo
        cr = {
            'bibcode': r['@bibcode'],
            'JSON_fingerprint': targets[r['@bibcode']],
            'metadata': {},
        }

        # Find metadata blocks that need merging: any @type defined more than once
        metadataCounter = collections.Counter([entry['@type'] for entry in r['metadata']])
        needsMerging = dict([(k, []) for k, v in metadataCounter.iteritems() if v > 1])

        # Iterate over metadata blocks; directly insert singly defined blocks
        # and build up the 'needsMerging' lists to merge in the next step
        for metadataBlock in r['metadata']:
            for field, data in metadataBlock.iteritems():
                if field in NORMALIZE_SCHEMA:
                    metadataBlock[field] = NORMALIZE_SCHEMA[field](data)
            if metadataBlock['@type'] not in needsMerging:
                cr['metadata'].update({metadataBlock['@type']: metadataBlock})
            else:
                # If a block type shows up more than once, it needs merging
                needsMerging[metadataBlock['@type']].append(metadataBlock)

        # Now merge the multiply defined metadata blocks
        for entryType, data in needsMerging.iteritems():
            cr['metadata'].update({entryType: merge(data, r['@bibcode'], entryType, LOGGER)})

        # Finally, we have a complete record
        completeRecords.append(enforceSchema(cr))

    LOGGER.info('Added %s complete records' % len(completeRecords))
    return completeRecords
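# A self-contained sketch of the duplicate-detection step in updateRecords,
# using made-up metadata blocks (the '@type' values and fields are
# hypothetical). Types that occur once go straight into the record; repeated
# types are accumulated and later collapsed into one block by merge():
#
#   import collections
#   blocks = [{'@type': 'general',    'title': 'a'},
#             {'@type': 'general',    'title': 'b'},
#             {'@type': 'properties', 'refereed': 'true'}]
#   counts = collections.Counter([b['@type'] for b in blocks])
#   needsMerging = dict([(k, []) for k, v in counts.iteritems() if v > 1])
#   # counts == Counter({'general': 2, 'properties': 1})
#   # needsMerging == {'general': []}; both 'general' blocks are appended to
#   # that list in the main loop and then merged into a single block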