Example #1
import json
import time

import xmltodict

# ADSRecords, ensureList, and settings are provided by the surrounding module.
def readRecords(records, LOGGER=settings.LOGGER):
  '''
  records: [(bibcode, JSON_fingerprint), ...]

  Fetches the complete XML record for each bibcode and parses it into a
  dict. Returns (parsed_records, targets), where targets maps bibcode to
  JSON_fingerprint; returns [] if no records were given or the export
  produced no content.
  '''
  if not records:
    LOGGER.debug("No records given")
    return []

  # Hash the input payload so this batch can be identified in the logs.
  h = hash(json.dumps(records))
  targets = dict(records)

  s = time.time()
  # Accumulate the full XML export for every requested bibcode.
  records = ADSRecords('full', 'XML')
  failures = []
  for bibcode in targets.keys():
    try:
      records.addCompleteRecord(bibcode)
    except KeyboardInterrupt:
      raise
    except Exception: # one bad bibcode should not abort the whole batch
      failures.append(bibcode)
      LOGGER.warning("[%s] ADSRecords failed" % bibcode)
  records = records.export()
  if not records.content:
    return []
  ttc = time.time()-s
  rate = len(targets)/ttc
  if failures:
    LOGGER.warning('ADSRecords failed to retrieve %s records' % len(failures))
  LOGGER.info('ADSRecords took %0.1fs to query %s records (%0.1f rec/s)\t[%s]' % (ttc,len(targets),rate,h))

  # xmltodict returns a bare dict when only one <record> is present;
  # ensureList normalizes that to a list so the check below always applies.
  records = ensureList(xmltodict.parse(str(records))['records']['record'])
  assert len(records) == len(targets) - len(failures)

  # Optional debugging dump of the parsed records:
  # with open('%s.pickle' % uuid.uuid4(), 'w') as fp:
  #   pickle.dump(records, fp)
  return records, targets
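
A minimal usage sketch for readRecords. The bibcodes and fingerprints are hypothetical placeholders, and settings.LOGGER is assumed to be configured by the surrounding module:

# Hypothetical (bibcode, JSON_fingerprint) pairs; the fingerprint is carried
# through unchanged so the caller can map results back to their inputs.
batch = [
  ('2000ApJ...000...00A', 'fingerprint-a'),
  ('2000ApJ...000...00B', 'fingerprint-b'),
]
result = readRecords(batch)
if result:
  parsed, targets = result
  for r in parsed:
    print('%s -> %s' % (r['@bibcode'], targets[r['@bibcode']]))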
Example #2
import collections
import time

import xmltodict

# ADSRecords, ensureList, merge, enforceSchema, NORMALIZE_SCHEMA, and
# settings are provided by the surrounding module.
def updateRecords(records, LOGGER=settings.LOGGER):
  '''
  records: [(bibcode, JSON_fingerprint), ...]

  Fetches the complete record for each bibcode, normalizes and merges its
  metadata blocks, and returns a list of complete records ready for mongo;
  returns [] if no records were given or the export produced no content.
  '''
  if not records:
    LOGGER.debug("No records given")
    return []

  targets = dict(records)

  s = time.time()
  # Accumulate the full XML export for every requested bibcode.
  records = ADSRecords('full', 'XML')
  failures = []
  for bibcode in targets.keys():
    try:
      records.addCompleteRecord(bibcode)
    except KeyboardInterrupt:
      raise
    except Exception: # one bad bibcode should not abort the whole batch
      failures.append(bibcode)
      LOGGER.warning("[%s] ADSRecords failed" % bibcode)
  records = records.export()
  if not records.content:
    return []
  ttc = time.time()-s
  rate = len(targets)/ttc
  if failures:
    LOGGER.warning('ADSRecords failed to retrieve %s records' % len(failures))
  LOGGER.info('ADSRecords took %0.1fs to query %s records (%0.1f rec/s)' % (ttc,len(targets),rate))

  # xmltodict returns a bare dict when only one <record> is present;
  # ensureList normalizes that to a list so the check below always applies.
  records = ensureList(xmltodict.parse(str(records))['records']['record'])
  # Debugging dump of the raw parsed records.
  with open('raw.txt', 'a') as fp:
    for r in records:
      fp.write('%s' % r)
      fp.write('\n\n')
  assert len(records) == len(targets) - len(failures)

  # Each record is processed independently; these tasks could be fanned out
  # on a queue.
  completeRecords = []
  for r in records:
    # Define the top-level schema that will go into mongo.
    cr = {
      'bibcode': r['@bibcode'],
      'JSON_fingerprint': targets[r['@bibcode']],
      'metadata': {},
    }

    # Count metadata blocks by @type; any type defined more than once needs
    # merging. ensureList guards against xmltodict collapsing a single
    # metadata block to a bare dict.
    metadataBlocks = ensureList(r['metadata'])
    metadataCounter = collections.Counter(entry['@type'] for entry in metadataBlocks)
    needsMerging = dict((k, []) for k, v in metadataCounter.items() if v > 1)

    # Iterate over the metadata blocks: normalize known fields, input singly
    # defined blocks directly, and collect multiply defined blocks into
    # needsMerging for the merge step below.
    for metadataBlock in metadataBlocks:
      for field, data in metadataBlock.items():
        if field in NORMALIZE_SCHEMA:
          metadataBlock[field] = NORMALIZE_SCHEMA[field](data)
      if metadataBlock['@type'] not in needsMerging:
        cr['metadata'][metadataBlock['@type']] = metadataBlock
      else: # A type that appears more than once needs merging.
        needsMerging[metadataBlock['@type']].append(metadataBlock)
    # Now merge the multiply defined metadata blocks.
    for entryType, data in needsMerging.items():
      cr['metadata'][entryType] = merge(data, r['@bibcode'], entryType, LOGGER)

    # Finally, we have a complete record.
    completeRecords.append(enforceSchema(cr))

  LOGGER.info('Added %s complete records' % len(completeRecords))
  return completeRecords
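
A minimal usage sketch for updateRecords, mirroring the one above. Again the input pairs are hypothetical, and merge, enforceSchema, and NORMALIZE_SCHEMA are assumed to behave as the surrounding module defines them:

# Hypothetical pairs for records whose fingerprints have gone stale.
stale = [
  ('2000ApJ...000...00A', 'fingerprint-a'),
]
for cr in updateRecords(stale):
  # Each complete record carries its merged, normalized metadata blocks
  # keyed by @type, plus the fingerprint it was requested with.
  print('%s: %s' % (cr['bibcode'], sorted(cr['metadata'].keys())))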