Example #1
def mergeRecords(records):
    completeRecords = []
    e = enforce_schema.Enforcer() # TODO: no need to create new instances?
    for r in copy.deepcopy(records):
        r['text'] = Merger().mergeText(r['text'])
        blocks = e.ensureList(r['metadata'])
        #Multiply defined blocks need merging.
        metadatablockCounter = collections.Counter([i['tempdata']['type'] for i in blocks])
        needsMerging = dict([(k,[]) for k,v in metadatablockCounter.iteritems() if v>1])
    
        completeMetadata = {}
        #First pass: Add the singly defined blocks to the complete record
        for b in blocks:
            _type = b['tempdata']['type']
            if _type not in needsMerging:
                completeMetadata[_type] = b
            else:
                needsMerging[_type].append(b)
    
        #Second pass: Merge the multiply defined blocks
        for _type,blocks in needsMerging.iteritems():
            m = Merger(blocks)
            m.merge()
            completeMetadata.update({
                _type: m.block,
            })

        #Finally, we have a complete record
        r['metadata'] = completeMetadata
        completeRecords.append(e.finalPassEnforceSchema(r))
    return completeRecords
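The function above expects each record's 'metadata' to be a list of blocks, each tagged with a tempdata['type']. As a rough illustration, a minimal input might look like the sketch below; any field name beyond those read by mergeRecords is an assumption, not the real ADS schema.

# Hypothetical record shape, inferred from how mergeRecords reads its input.
record = {
    'text': {},  # handed to Merger().mergeText
    'metadata': [
        {'tempdata': {'type': 'general'}, 'title': 'A'},
        {'tempdata': {'type': 'general'}, 'title': 'B'},  # duplicate type: merged in the second pass
        {'tempdata': {'type': 'properties'}},             # unique type: copied through in the first pass
    ],
}
merged = mergeRecords([record])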
Example #2
def __init__(self,
             blocks=None,
             logger=None,
             merger_rules=_config['MERGER_RULES'],
             priorities=_config['PRIORITIES'],
             references_always_append=_config['REFERENCES_ALWAYS_APPEND']
             ):
    self.blocks = blocks
    self.logger = logger
    self.block = {}
    self.altpublications = []
    self.eL = enforce_schema.Enforcer().ensureList
    self.merger_rules = merger_rules
    self.priorities = priorities
    self.references_always_append = references_always_append

    if blocks:
        #Assert that there is only one block type being merged
        assert len(set([i['tempdata']['type'] for i in blocks])) == 1
        self.blocktype = blocks[0]['tempdata']['type']
    if not self.logger:
        self.logger = utils.setup_logging('merger')
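Example #1 above shows how this constructor is driven: blocks sharing a single tempdata['type'] go in, and the merged block comes out of m.block after merge(). A minimal usage sketch (the block contents are hypothetical):

blocks = [
    {'tempdata': {'type': 'general'}, 'title': 'A'},
    {'tempdata': {'type': 'general'}, 'title': 'B'},
]
m = Merger(blocks)  # the assert passes: exactly one block type, 'general'
m.merge()
result = m.block    # the merged metadata block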
Example #3
def readRecordsFromPickles(records, files):
    '''
    records: [(bibcode,JSON_fingerprint),...]
    '''
    if not records:
        return []
    targets = dict(records)
    records = []

    for file_ in files:
        with open(file_) as fp:
            recs = pickle.load(fp)
        #Keep only the records whose bibcode was requested
        records.extend([r for r in recs if r['@bibcode'] in targets])

    e = enforce_schema.Enforcer()
    for r in records:
        r = e.enforceTopLevelSchema(record=r,
                                    JSON_fingerprint=targets[r['@bibcode']])
        r['metadata'] = e.enforceMetadataSchema(r['metadata'])
        #r['text'] = e.enforceTextSchema() TODO, once implemented in ADSExports
    logger.info('readRecordsFromPickles: Read %s records from %s files' %
                (len(records), len(files)))
    return records
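A hedged usage sketch for this reader; the bibcode, fingerprint, and file names are hypothetical:

records = readRecordsFromPickles(
    [('2015ApJ...800....1A', 'fingerprint-abc')],  # (bibcode, JSON_fingerprint) pairs
    ['batch_0.pickle', 'batch_1.pickle'],          # pickles holding lists of raw records
)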
Example #4
def setUp(self):
    self.e = enforce_schema.Enforcer()
    self.general = self.e._generalEnforcer(stubdata.GENERAL)
    self.properties = self.e._propertiesEnforcer(stubdata.PROPERTIES)
    self.references = self.e._referencesEnforcer(stubdata.REFERENCES)
    self.relations = self.e._relationsEnforcer(stubdata.RELATIONS)
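Every example on this page funnels data through enforce_schema.Enforcer. Its ensureList helper, used in Examples #1 and #5 to cope with XML-derived JSON where a single element arrives as a dict rather than a list, plausibly behaves like the sketch below; this is an assumption based on usage, not the actual implementation.

def ensureList(item):
    # Wrap a bare item in a list so callers can iterate uniformly;
    # pass lists through unchanged.
    return item if isinstance(item, list) else [item]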
Example #5
        except Exception as err:
            failures.append(bibcode)
            logger.warning('ADSExports failed: %s (%s)' % (bibcode, err))

    logger.debug("Calling ADSRecords.export()")
    adsrecords = adsrecords.export()
    logger.debug("...ADSRecords.export() returned.")
    if not adsrecords.content:
        logger.warning(
            'Received %s records, but ADSExports didn\'t return anything!' %
            len(records))
        return []
    ttc = time.time() - s
    rate = len(targets) / ttc

    e = enforce_schema.Enforcer()
    logger.debug("Calling xml_to_dict")
    try:
        json_dict = xml_to_dict(adsrecords)
        logger.debug("...xml_to_dict returned.")
        adsrecords = e.ensureList(json_dict['records']['record'])
    except timeout_decorator.timeout_decorator.TimeoutError:
        logger.warning("xml_to_dict timed while processing bibcodes: %s" %
                       '|'.join(bibcodes))
        failures.extend(bibcodes)
        adsrecords = []
    except xml.parsers.expat.ExpatError:
        logger.warning("XML parsing error while processing bibcodes: %s" %
                       '|'.join(bibcodes))
        failures.extend(bibcodes)
        adsrecords = []
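The TimeoutError branch above implies that xml_to_dict is wrapped with the timeout_decorator package. A minimal sketch of that wiring; the 120-second limit and the function body are assumptions:

import timeout_decorator

@timeout_decorator.timeout(120)  # assumed limit; raises TimeoutError when exceeded
def xml_to_dict(adsrecords):
    # Parse the exported ADSRecords XML into a dict (details elided).
    ...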