def makeEntityId(self, entityType, entity):
    """Build the URN that identifies a schema.org entity.

    Ids have the form ``urn:schema-org:<entityType>:<key>:<value>``. How
    ``<key>`` and ``<value>`` are chosen depends on ``entityType``:

    * ``'creative_work'``: key is ``'url'`` and value is the first ``url``
      property passed through ``helperfunctions.scrubUrl``.
    * ``'alignment_object'``: key is ``'hash'`` and value is the MD5 hex
      digest of the ``'|'``-joined values of
      ``self.alignmentObjectProperties`` (absent properties contribute an
      empty slot).
    * anything else: the first name in ``self.potentialEntityIdProperties``
      whose property is a non-empty list with a non-empty first element.
      The key is ``self.makeLowercaseUnderscore(name)``; the value is that
      first element, scrubbed with ``helperfunctions.scrubUrl`` when the
      property name ends in "url" (case-insensitive).

    Args:
        entityType: Entity type name embedded in the URN.
        entity: Mapping with a ``'properties'`` entry that maps property
            names to lists of values.

    Returns:
        The entity id string.

    Raises:
        Exception: An alignment-object property is present but is not a
            single-element list holding a string.
        ValueError: No candidate property produced an id.
    """
    entityIdTemplate = 'urn:schema-org:{0}:{{0}}:{{1}}'.format(entityType)

    if entityType == 'creative_work':
        return entityIdTemplate.format(
            'url', helperfunctions.scrubUrl(entity['properties']['url'][0]))

    # AlignmentObject is unique. It is essentially a hyperedge and is defined
    # by the combination of its properties.
    if entityType == 'alignment_object':
        # Byte and unicode strings on Python 2; both entries are ``str`` on
        # Python 3.
        textTypes = (type(''), type(u''))
        parts = []
        for prop in self.alignmentObjectProperties:
            properties = entity['properties']
            if prop not in properties:
                # Keep an empty slot so each value stays at the position of
                # its property in the configured list.
                parts.append('')
                continue
            values = properties[prop]
            if not (isinstance(values, list) and
                    len(values) == 1 and
                    isinstance(values[0], textTypes)):
                raise Exception(
                    'Unexpected property in AlignmentObject. '
                    'name: "{0}", value: "{1}"'.format(prop, values))
            parts.append(values[0])
        propCollection = '|'.join(parts)
        # md5 needs bytes: hashing text directly fails for non-ASCII unicode
        # on Python 2 and for every str on Python 3. UTF-8 leaves the digest
        # of ASCII-only input unchanged.
        if not isinstance(propCollection, bytes):
            propCollection = propCollection.encode('utf-8')
        return entityIdTemplate.format(
            'hash', hashlib.md5(propCollection).hexdigest())

    # The 'id' field coming in from the JSON is deliberately never trusted,
    # so the id is always derived from the entity's properties.
    for prop in self.potentialEntityIdProperties:
        if prop not in entity['properties']:
            continue
        values = entity['properties'][prop]
        if (isinstance(values, list) and
                len(values) > 0 and
                len(values[0]) > 0):
            value = values[0]
            if prop.lower().endswith("url"):
                value = helperfunctions.scrubUrl(value)
            return entityIdTemplate.format(
                self.makeLowercaseUnderscore(prop), value)

    raise ValueError('unable to create entityId')
def insertIntoElasticSearch(self, envelope):
    """Index the first item of an envelope into Elasticsearch, best effort.

    The item is ``envelope['resource_data']['items'][0]``. It is indexed
    under the index and type named by ``self.config['lris']``, using the
    URL-quoted, scrubbed first ``url`` property of the item as the document
    id. Any failure is logged together with the envelope's ``doc_ID`` and
    then suppressed, so the caller never sees an exception from this step.

    Args:
        envelope: Mapping with ``'doc_ID'`` and
            ``'resource_data' -> 'items'`` entries.
    """
    # Captured once so the error handler can report it without repeating a
    # lookup that may itself be the thing that failed.
    docId = None
    try:
        docId = envelope['doc_ID']
        logging.debug('insertIntoElasticSearch with %s', docId)
        item = envelope['resource_data']['items'][0]
        lrisConfig = self.config['lris']
        es = pyes.ES('{0}:{1}'.format(lrisConfig['host'], lrisConfig['port']))
        documentId = urllib.quote_plus(
            helperfunctions.scrubUrl(item['properties']['url'][0]))
        es.index(item, lrisConfig['index'], lrisConfig['index_type'],
                 documentId)
    except Exception:
        # Deliberately best effort: log with traceback and carry on, but let
        # KeyboardInterrupt/SystemExit propagate.
        logging.exception('doc_ID: %s', docId)