def create_constraints(self):
    """Creates constraints on identifiers in Neo4j.

    Ensures the ORI identifier is unique on `Hot` nodes and the source
    locator is unique on `Live` nodes.
    """
    # BUG FIX: the adjacent string literals previously concatenated without
    # a separating space, producing invalid Cypher like "(x:Hot)ASSERT ...".
    self.session.run('CREATE CONSTRAINT ON (x:Hot) '
                     'ASSERT x.`{}` IS UNIQUE'.format(
                         Uri(Mapping, 'ori/identifier')))
    self.session.run('CREATE CONSTRAINT ON (x:Live) '
                     'ASSERT x.`{}` IS UNIQUE'.format(
                         Uri(Mapping, 'ori/sourceLocator')))
def enrich_item(self, item):
    """Classifies `item.text` into themes via the ORI classifier service and
    stores the ranked hits as an rdf:Seq of TagHit entries on `item.tags`.

    Returns early (leaving the item untouched) when the classifier is not
    configured, the item has no usable text, or the service is unreachable.
    """
    if not ORI_CLASSIFIER_HOST or not ORI_CLASSIFIER_PORT:
        # Skip classifier if no host is specified
        return

    ori_classifier_url = 'http://{}:{}/classificeer'.format(
        ORI_CLASSIFIER_HOST, ORI_CLASSIFIER_PORT)

    if not hasattr(item, 'text'):
        return

    text = item.text
    if isinstance(text, list):
        text = ' '.join(text)

    # Texts shorter than 76 characters are skipped — presumably too short to
    # classify reliably (threshold inherited from the original code).
    if not text or len(text) < 76:
        return

    identifier_key = 'result'
    request_json = {
        'ori_identifier': identifier_key,  # not being used
        'name': text
    }

    try:
        response = self.http_session.post(ori_classifier_url,
                                          json=request_json)
        # NOTE(review): raise_for_status() raises requests.HTTPError, which
        # is not caught below — a non-2xx response propagates to the caller.
        response.raise_for_status()
    except requests.ConnectionError:
        # Return if no connection can be made
        log.warning('No connection to theme classifier')
        return

    theme_classifications = response.json().get(identifier_key, [])

    # Build an rdf:Seq (_0, _1, ...) of TagHits ordered by descending score.
    tags = {
        '@id': '%s#tags' % item.get_ori_identifier(),
        '@type': str(Uri(Rdf, 'Seq'))
    }
    ranked = sorted(theme_classifications.items(),
                    key=operator.itemgetter(1),
                    reverse=True)
    for i, (name, value) in enumerate(ranked):
        tags[str(Uri(Rdf, '_%s' % i))] = {
            '@id': '%s#tags_%s' % (item.get_ori_identifier(), i),
            '@type': str(Uri(MeetingNS, 'TagHit')),
            str(Uri(MeetingNS, 'tag')): name,
            str(Uri(MeetingNS, 'score')): value,
        }

    item.tags = tags
def generate_ori_identifier(self, iri):
    """
    Generates a Resource with an ORI identifier and adds the IRI as a Source
    if it does not already exist.
    """
    session = self.Session()
    new_id = self.engine.execute(Sequence('ori_id_seq'))
    new_identifier = Uri(Ori, new_id)

    try:
        # If the resource already exists, create the source as a child of
        # the resource.
        resource = session.query(Source).filter(
            Source.iri == iri).one().resource
        resource.sources.append(Source(iri=iri))
        # BUG FIX: this branch previously only flush()ed; the appended
        # Source was rolled back when the session closed. Commit it.
        session.commit()
        # NOTE(review): a fresh identifier is still returned here even
        # though the existing resource keeps its own ori_id — confirm this
        # is intended (get_ori_identifier normally returns early when the
        # resource already exists, so this branch should be rare).
    except NoResultFound:
        # If the resource does not exist, create resource and source together
        resource = Resource(ori_id=new_id,
                            iri=new_identifier,
                            sources=[Source(iri=iri)])
        session.add(resource)
        session.commit()
    finally:
        session.close()

    return new_identifier
def __init__(self, source_id=None, organization=None, source=None,
             source_id_key=None):
    """Set up default state and derive the primary-source URI.

    With a source_id, the organization, source and source_id_key are all
    required and the primary source becomes a Mapping URI of the form
    https://argu.co/voc/mapping/<organization>/<source>/<source_id_key>/<source_id>
    (i.e. https://argu.co/voc/mapping/nl/ggm/vrsnummer/6655476). Without
    one, the definition itself is used as primary source, because
    individuals also need a primary source or some queries will fail.
    """
    self.skip_validation = None
    self.values = dict()

    if not source_id:
        self.had_primary_source = self.absolute_uri()
        return

    assert organization
    assert source
    assert source_id_key
    mapping_path = '{}/{}/{}/{}'.format(
        organization, source, source_id_key, source_id)
    self.had_primary_source = Uri(Mapping, mapping_path)
    self._source = source
def __init__(self, source_id=None, source=None, supplier=None,
             collection=None, merge_into=None):
    """Set up default state, validate merge_into, and derive the
    primary-source URI.

    merge_into, when given, must be a (predicate, column, value) tuple.
    With a source_id, the source, supplier and collection are all required
    and the primary source becomes a Mapping URI of the form
    https://argu.co/voc/mapping/<source>/<supplier>/<collection>/<source_id>
    (i.e. https://argu.co/voc/mapping/nl/ggm/vrsnummer/6655476).
    """
    self.skip_validation = None
    self.values = dict()

    self.merge_into = None
    if merge_into:
        if not isinstance(merge_into, tuple) or len(merge_into) != 3:
            raise ValueError(
                'merge_into requires a tuple with 3 elements: (predicate, column, value)'
            )
        self.merge_into = merge_into

    if source_id:
        assert source
        assert supplier
        assert collection
        self.had_primary_source = Uri(
            Mapping,
            '{}/{}/{}/{}'.format(source, supplier, collection,
                                 slugify(source_id)))
def get_ori_identifier(self, iri):
    """
    Retrieves a Resource-based ORI identifier from the database. If no
    corresponding Resource exists, a new one is created.
    """
    session = self.Session()
    try:
        resource = (session.query(Resource)
                    .join(Source)
                    .filter(Source.iri == iri)
                    .one())
    except NoResultFound:
        # Unknown IRI: mint a fresh Resource and identifier for it.
        return self.generate_ori_identifier(iri=iri)
    except MultipleResultsFound:
        raise MultipleResultsFound('Multiple resources found for IRI %s' % iri)
    else:
        return Uri(Ori, resource.ori_id)
    finally:
        session.close()
def __init__(self, source_id=None, source=None, supplier=None,
             collection=None, merge_into=None, cached_path=None,
             canonical_iri=None):
    """Set up default state, validate merge_into, resolve the canonical
    IRI, and derive the source Mapping URI.

    merge_into, when given, must be a (predicate, column, value) tuple.
    canonical_iri may be a plain value or a callable taking the instance.
    With a source_id, the source, supplier and collection are all required
    and the source IRI becomes a Mapping URI of the form
    https://argu.co/voc/mapping/<source>/<supplier>/<collection>/<source_id>
    (i.e. https://argu.co/voc/mapping/nl/ggm/vrsnummer/6655476).
    """
    self.skip_validation = None
    self.values = dict()
    # Snapshot the class-level enricher_task onto this instance.
    self.enricher_task = self.enricher_task

    self.merge_into = None
    if merge_into:
        if not isinstance(merge_into, tuple) or len(merge_into) != 3:
            raise ValueError(
                'merge_into requires a tuple with 3 elements: (predicate, column, value)'
            )
        self.merge_into = merge_into

    self.canonical_id = source_id
    self.cached_path = cached_path

    try:
        # if canonical_iri is a lambda function
        # NOTE(review): a TypeError raised *inside* a callable
        # canonical_iri is also swallowed here, storing the callable
        # itself instead — confirm this is acceptable.
        self.canonical_iri = canonical_iri(self)
    except TypeError:
        self.canonical_iri = canonical_iri

    if source_id:
        assert source
        assert supplier
        assert collection
        self.source_iri = Uri(
            Mapping,
            '{}/{}/{}/{}'.format(source, supplier, collection,
                                 slugify(source_id)))
class Neo4jDatabase(object):
    """Database implementation for Neo4j graph database.

    Provides methods for model operations to process ETL data for new and
    existing nodes. When the class is initialized, it reuses the driver if
    it has been used before.
    """

    _driver = None

    # Node labels partitioning the graph: the current working set
    # (Hot/Cold) and historical versions (Archive).
    HOT = 'Hot'
    COLD = 'Cold'
    ARCHIVE = 'Archive'

    # Pre-escaped property/relation names shared by all queries below.
    default_params = {
        'was_revision_of': cypher_escape(Uri(Prov, 'wasRevisionOf')),
        'was_derived_from': cypher_escape(Uri(Prov, 'wasDerivedFrom')),
        'had_primary_source': cypher_escape(Uri(Prov, 'hadPrimarySource')),
        'provided_by': cypher_escape(Uri(Pav, 'providedBy')),
        'ori_identifier': cypher_escape(Uri(Mapping, 'ori/identifier')),
    }

    def __init__(self, serializer):
        self.serializer = serializer
        if not self._driver:
            # Set driver on the class so all instances use the same driver
            type(self)._driver = GraphDatabase.driver(NEO4J_URL, auth=(
                NEO4J_USER,
                NEO4J_PASSWORD,
            ))
        self.session = self._driver.session()
        self.tx = None

    def query(self, query, **params):
        """Executes a query and returns the result"""
        cursor = self.session.run(query, **params)
        result = cursor.data()
        return result

    def transaction_query(self, query, **params):
        """Adds a query to be executed as a transaction.

        All queries called with this method will be in the same transaction
        until `transaction_commit` is called.
        """
        if not self.tx:
            self.tx = self.session.begin_transaction()
        self.tx.run(query, **params)

    def transaction_commit(self):
        """Commits all queries that are added by `transaction_query`."""
        if self.tx:
            result = self.tx.commit()
            self.tx = None  # Make sure the tx is reset
            return result

    def create_constraints(self):
        """Creates constraints on identifiers in Neo4j"""
        # BUG FIX: the adjacent string literals previously concatenated
        # without a separating space, producing invalid Cypher like
        # "(x:Hot)ASSERT ...".
        self.session.run('CREATE CONSTRAINT ON (x:Hot) '
                         'ASSERT x.`{}` IS UNIQUE'.format(
                             Uri(Mapping, 'ori/identifier')))
        self.session.run('CREATE CONSTRAINT ON (x:Live) '
                         'ASSERT x.`{}` IS UNIQUE'.format(
                             Uri(Mapping, 'ori/sourceLocator')))

    def get_identifier_by_source_id(self, model_object, source_id):
        """Returns the ori identifier based on the specified source
        identifier.

        The ori identifier on a `Hot` node is queried by looking for the
        source identifier on `Cold` nodes. Should return exactly one int or
        a QueryResultError exception.
        """
        fmt = AQuoteFormatter()
        label = self.serializer.label(model_object)

        params = {
            'n1_labels': u':'.join([self.HOT, cypher_escape(label)]),
            'n2_labels': u':'.join([self.COLD, cypher_escape(label)]),
        }
        params.update(self.default_params)

        clauses = [
            u'MATCH (n2 :«n2_labels» {«had_primary_source»: $had_primary_source})<--(n1 :«n1_labels»)',
            u'RETURN n1.«ori_identifier» AS ori_identifier',
        ]

        result = self.query(fmt.format(u'\n'.join(clauses), **params),
                            had_primary_source=source_id)

        if not result:
            raise MissingProperty('Does not exist')
        if len(result) > 1:
            raise QueryResultError(
                'The number of results is greater than one!')
        return result[0]['ori_identifier']

    def replace(self, model_object):
        """Replaces or creates nodes based on the model object.

        Existing nodes are replaced by the deflated model object and new
        ones are created when they do not exist. Three queries are run
        sequentially until one of them yields a result. The first will add
        a new version if an older version exists on a node, the second will
        add a new version when no older version exists, the third will
        create new nodes if the nodes do not yet exist. If the third query
        fails, an QueryResultError is raised. The first and second query
        will match the `Cold` node based on the source_id.
        """
        fmt = AQuoteFormatter()
        label = self.serializer.label(model_object)
        n2_props = self.serializer.deflate(model_object, props=True,
                                           rels=False)

        params = {
            'n1_labels': u':'.join([self.HOT, cypher_escape(label)]),
            'n2_labels': u':'.join([self.COLD, cypher_escape(label)]),
            'n3_labels': self.ARCHIVE,
            'n4_labels': self.ARCHIVE,
            'n5_labels': u':'.join(
                [self.ARCHIVE, cypher_escape(Uri(Prov, 'SoftwareAgent'))]),
        }
        params.update(self.default_params)

        # The versioning queries bind $name from `_source`, so they only
        # run when the model carries a source.
        if hasattr(model_object, '_source'):
            # Add a new version if an older version already exists
            clauses = [
                u'MATCH (n1 :«n1_labels»)--(n2 :«n2_labels» {«had_primary_source»: $had_primary_source})-[r2 :«was_revision_of»]-(n3 :«n3_labels»)',
                u'MERGE (n2)-[:«was_revision_of»]->(n4 :«n4_labels»)-[:«was_revision_of»]->(n3)',
                u'MERGE (n2)-[:«provided_by»]->(n5 :«n5_labels» {name: $name})',
                u'SET n4 = n2',
                u'SET n2 = $n2_props',
                u'DELETE r2',
            ]
            cursor = self.session.run(
                fmt.format(u'\n'.join(clauses), **params),
                n2_props=n2_props,
                had_primary_source=model_object.had_primary_source,
                name=model_object._source,
            )
            summary = cursor.summary()
            if summary.counters.relationships_deleted > 0:
                return

            # Add a new version if no older version exists
            clauses = [
                u'MATCH (n1 :«n1_labels»)--(n2 :«n2_labels» {«had_primary_source»: $had_primary_source})',
                u'MERGE (n2)-[:«was_revision_of»]->(n4 :«n4_labels»)',
                u'MERGE (n2)-[:«provided_by»]->(n5 :«n5_labels» {name: $name})',
                u'SET n4 = n2',
                u'SET n2 = $n2_props',
            ]
            cursor = self.session.run(
                fmt.format(u'\n'.join(clauses), **params),
                n2_props=n2_props,
                had_primary_source=model_object.had_primary_source,
                name=model_object._source,
            )
            summary = cursor.summary()
            if summary.counters.nodes_created > 0:
                return

        # Probe for an existing Hot node; only mint a new ori identifier
        # when none exists yet.
        clauses = [
            u'MATCH (n1 :«n1_labels» {«had_primary_source»: $had_primary_source})',
            u'RETURN n1',
        ]
        cursor = self.session.run(
            fmt.format(u'\n'.join(clauses), **params),
            had_primary_source=model_object.had_primary_source)

        n1_props = copy(n2_props)
        if len(cursor.data()) == 0:
            # n1_props = n2_props + ori_identifier
            n1_props[str(Uri(Mapping, 'ori/identifier'))] = \
                model_object.generate_ori_identifier()

        # Create a new entity when no matching node seems to exist
        clauses = [
            u'MERGE (n1 :«n1_labels» {«had_primary_source»: $had_primary_source})-[:«was_derived_from»]->(n2 :«n2_labels»)',
        ]

        bound_params = {}
        if hasattr(model_object, '_source'):
            clauses.extend([
                u'MERGE (n5 :«n5_labels» {name: $name})',
                u'MERGE (n2)-[:«provided_by»]->(n5)',
            ])
            bound_params['name'] = model_object._source

        clauses.extend([
            u'SET n1 = $n1_props',
            u'SET n2 = $n2_props',
            u'RETURN n1.«ori_identifier» AS ori_identifier',
        ])

        cursor = self.session.run(
            fmt.format(u'\n'.join(clauses), **params),
            n1_props=n1_props,
            n2_props=n2_props,
            had_primary_source=model_object.had_primary_source,
            **bound_params)

        result = cursor.data()
        if len(result) > 0:
            model_object.ori_identifier = result[0]['ori_identifier']
            return

        raise QueryEmptyResult('No ori_identifier was returned')

    def attach(self, this_object, that_object, rel_type):
        """Attaches this_object to that_object model.

        The query will match the `Cold` node based on the source_id of the
        models. If available it will set `r1_props` on the relation between
        the nodes.
        """
        # Local import to avoid a circular dependency with the model module.
        from .model import Model, Relationship
        fmt = AQuoteFormatter()

        r1_props = dict()
        if isinstance(that_object, Relationship):
            # Unwrap the Relationship: its payload becomes the relation
            # properties and its model becomes the target node.
            r1_props = that_object.rel
            that_object = that_object.model
            if isinstance(r1_props, Model):
                r1_props = r1_props.serializer.deflate(props=True, rels=True)

        this_label = self.serializer.label(this_object)
        that_label = self.serializer.label(that_object)

        params = {
            'n2_labels': u':'.join([self.COLD, cypher_escape(this_label)]),
            'n3_labels': u':'.join([self.COLD, cypher_escape(that_label)]),
            'r1_labels': cypher_escape(rel_type),
        }
        params.update(self.default_params)

        clauses = [
            u'MATCH (n2 :«n2_labels» {«had_primary_source»: $had_primary_source1})',
            u'MATCH (n3 :«n3_labels» {«had_primary_source»: $had_primary_source2})',
            u'MERGE (n2)-[r1 :«r1_labels»]->(n3)',
            u'SET r1 = $r1_props',
        ]

        self.query(fmt.format(u'\n'.join(clauses), **params),
                   had_primary_source1=this_object.had_primary_source,
                   had_primary_source2=that_object.had_primary_source,
                   r1_props=r1_props)

    def copy_relations(self):
        """Copies the relations from Cold->Cold nodes to Hot->Hot nodes.

        All relations between these nodes that do not already exist are
        copied. Only direct relations between `Cold` nodes are matched.
        """
        fmt = AQuoteFormatter()

        params = {
            'labels': self.COLD,
            'n1_labels': self.HOT,
            'n2_labels': self.COLD,
            'n3_labels': self.HOT,
        }
        params.update(self.default_params)

        clauses = [
            u'MATCH (n1 :«n1_labels»)-[:«was_derived_from»]->(n2 :«n2_labels»)-[r]->(:«labels»)<-[:«was_derived_from»]-(n3 :«n3_labels»)',
            u'WHERE NOT (n1)--(n3)',
            u'RETURN id(n1) AS id1, id(n2) as id2, id(n3) AS id3, type(r) AS rel, id(startNode(r)) AS start',
        ]

        for result in self.query(fmt.format(u'\n'.join(clauses), **params)):
            # Mirror each missing relation between the corresponding Hot
            # nodes; MERGE keeps the copy idempotent.
            clauses = [
                u'MATCH (n1), (n3)',
                u'WHERE id(n1) = $id1',
                u'AND id(n3) = $id3',
                u'MERGE (n1)-[:«rel»]->(n3)'
            ]
            self.query(fmt.format(u'\n'.join(clauses),
                                  rel=cypher_escape(result['rel']),
                                  **params),
                       id1=result['id1'],
                       id3=result['id3'])
def replace(self, model_object):
    """Replaces or creates nodes based on the model object.

    Existing nodes are replaced by the deflated model object and new ones
    are created when they do not exist. Three queries are run sequentially
    until one of them yields a result. The first will add a new version if
    an older version exists on a node, the second will add a new version
    when no older version exists, the third will create new nodes if the
    nodes do not yet exist. If the third query fails, an QueryResultError
    is raised. The first and second query will match the `Cold` node based
    on the source_id.
    """
    fmt = AQuoteFormatter()
    label = self.serializer.label(model_object)
    # Deflate the model to a plain property dict (no relations).
    n2_props = self.serializer.deflate(model_object, props=True, rels=False)

    params = {
        'n1_labels': u':'.join([self.HOT, cypher_escape(label)]),
        'n2_labels': u':'.join([self.COLD, cypher_escape(label)]),
        'n3_labels': self.ARCHIVE,
        'n4_labels': self.ARCHIVE,
        'n5_labels': u':'.join(
            [self.ARCHIVE, cypher_escape(Uri(Prov, 'SoftwareAgent'))]),
    }
    params.update(self.default_params)

    # The first two (versioning) queries bind $name from `_source`, so
    # they can only run when the model carries a source.
    if hasattr(model_object, '_source'):
        # Keep it readable
        # Expand labels
        # Same name variables
        # Escaping some variables
        # Parameters

        # Add a new version if an older version already exists
        clauses = [
            u'MATCH (n1 :«n1_labels»)--(n2 :«n2_labels» {«had_primary_source»: $had_primary_source})-[r2 :«was_revision_of»]-(n3 :«n3_labels»)',
            u'MERGE (n2)-[:«was_revision_of»]->(n4 :«n4_labels»)-[:«was_revision_of»]->(n3)',
            u'MERGE (n2)-[:«provided_by»]->(n5 :«n5_labels» {name: $name})',
            u'SET n4 = n2',
            u'SET n2 = $n2_props',
            u'DELETE r2',
        ]

        cursor = self.session.run(
            fmt.format(u'\n'.join(clauses), **params),
            n2_props=n2_props,
            had_primary_source=model_object.had_primary_source,
            name=model_object._source,
        )
        # A deleted relation proves the first query matched and versioned.
        summary = cursor.summary()
        if summary.counters.relationships_deleted > 0:
            return

        # Add a new version if no older version exists
        clauses = [
            u'MATCH (n1 :«n1_labels»)--(n2 :«n2_labels» {«had_primary_source»: $had_primary_source})',
            u'MERGE (n2)-[:«was_revision_of»]->(n4 :«n4_labels»)',
            u'MERGE (n2)-[:«provided_by»]->(n5 :«n5_labels» {name: $name})',
            u'SET n4 = n2',
            u'SET n2 = $n2_props',
        ]

        cursor = self.session.run(
            fmt.format(u'\n'.join(clauses), **params),
            n2_props=n2_props,
            had_primary_source=model_object.had_primary_source,
            name=model_object._source,
        )
        # A created node proves the second query archived a version.
        summary = cursor.summary()
        if summary.counters.nodes_created > 0:
            return

    # Probe for an existing Hot node; only mint a new ori identifier when
    # none exists yet.
    clauses = [
        u'MATCH (n1 :«n1_labels» {«had_primary_source»: $had_primary_source})',
        u'RETURN n1',
    ]
    cursor = self.session.run(
        fmt.format(u'\n'.join(clauses), **params),
        had_primary_source=model_object.had_primary_source)

    n1_props = copy(n2_props)
    if len(cursor.data()) == 0:
        # n1_props = n2_props + ori_identifier
        n1_props[str(Uri(Mapping, 'ori/identifier'))] = \
            model_object.generate_ori_identifier()

    # Create a new entity when no matching node seems to exist
    clauses = [
        u'MERGE (n1 :«n1_labels» {«had_primary_source»: $had_primary_source})-[:«was_derived_from»]->(n2 :«n2_labels»)',
    ]

    bound_params = {}
    if hasattr(model_object, '_source'):
        clauses.extend([
            u'MERGE (n5 :«n5_labels» {name: $name})',
            u'MERGE (n2)-[:«provided_by»]->(n5)',
        ])
        bound_params['name'] = model_object._source

    clauses.extend([
        u'SET n1 = $n1_props',
        u'SET n2 = $n2_props',
        u'RETURN n1.«ori_identifier» AS ori_identifier',
    ])

    cursor = self.session.run(
        fmt.format(u'\n'.join(clauses), **params),
        n1_props=n1_props,
        n2_props=n2_props,
        had_primary_source=model_object.had_primary_source,
        **bound_params)

    result = cursor.data()
    if len(result) > 0:
        model_object.ori_identifier = result[0]['ori_identifier']
        return

    raise QueryEmptyResult('No ori_identifier was returned')
def annotate_document(self, doc, municipality_code):
    """Annotates `doc` with district/neighborhood references found by the
    LocLinkVis service in its text fields, and attaches neighborhood
    polygon geometry when available.

    Returns the (possibly modified) document, or None when a text field
    turns out empty or the annotation service cannot be reached.
    """
    # They're sets because we want to keep duplicates away
    municipal_refs = {
        'districts': set(),
        'neighborhoods': set(),
    }

    # Only MediaObject documents carry annotatable text for now.
    field_keys = []
    if isinstance(doc, schema.MediaObject):
        field_keys = ['text']

    if not field_keys:
        return doc

    for field_key in field_keys:
        text = getattr(doc, field_key, '')
        if isinstance(text, list):
            text = ' '.join(text)

        if not text:
            # NOTE(review): bails out of the whole method (returning None)
            # instead of skipping this field — confirm callers expect this.
            return

        # Rejoin words that were hyphenated across line breaks.
        clean_text = text.replace('-\n', '')
        if clean_text:
            setattr(doc, field_key, clean_text)
        else:
            continue

        url = '{}/annotate'.format(self.loclinkvis_url)
        try:
            resp = self.http_session.post(url, json={
                'municipality_code': municipality_code,
                'text': clean_text
            })
        except requests.ConnectionError:
            # Return if no connection can be made
            log.warning('No connection to LocLinkVis: %s' % url)
            return

        if not resp.ok:
            error_dict = {
                'ori_identifier': doc.get_ori_identifier(),
                'doc_type': type(doc),
                'field_key': field_key,
                'municipality_code': municipality_code,
                'status_code': resp.status_code,
                'time': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            }
            if resp.status_code == 500:
                # Include the text so server errors can be reproduced.
                error_dict['text'] = clean_text
            log.warning(error_dict)
            continue

        data = resp.json()
        if not data['districts'] and not data['neighborhoods']:
            # No annotations were found, continue
            continue

        municipal_refs['districts'].update(data['districts'])
        municipal_refs['neighborhoods'].update(data['neighborhoods'])

    doc.districts = list(municipal_refs.get('districts'))

    neighborhood_coordinates = list()
    for neighborhood in municipal_refs.get('neighborhoods', []):
        # NOTE(review): reassigned with the same value every iteration;
        # kept as-is so `doc.neighborhoods` stays unset when the set is
        # empty, matching the original behavior.
        doc.neighborhoods = list(municipal_refs['neighborhoods'])

        url = '{}/municipal/{}'.format(self.loclinkvis_url, neighborhood)
        try:
            resp = self.http_session.get(url)
            resp.raise_for_status()
        except requests.ConnectionError:
            # Return if no connection can be made
            log.warning('No connection to LocLinkVis: %s' % url)
            continue

        json_response = resp.json()
        neighborhood_coordinates.append(
            json_response['geometry']['coordinates'])

    if neighborhood_coordinates:
        doc.neighborhood_polygons = {
            'type': 'multipolygon',
            'coordinates': neighborhood_coordinates,
        }

        # Convert raw coordinate lists into NeoGeo polygon structures;
        # polygon[0] is taken as the exterior ring.
        polygons = list()
        for polygon in neighborhood_coordinates:
            pos_list = [{
                str(Uri(Geo, 'lat')): coordinates[1],
                str(Uri(Geo, 'long')): coordinates[0],
            } for coordinates in polygon[0]]
            polygons.append({
                '@type': str(Uri(NeoGeo, 'Polygon')),
                str(Uri(NeoGeo, 'exterior')): {
                    '@type': str(Uri(NeoGeo, 'LinearRing')),
                    str(Uri(NeoGeo, 'posList')): pos_list
                }
            })

        doc.geometry = {
            '@type': str(Uri(NeoGeo, 'MultiPolygon')),
            str(Uri(NeoGeo, 'polygonMember')): polygons,
        }

    return doc
# NOTE(review): the five attributes below appear to belong to an enclosing
# vote-related class whose `class` header is outside this chunk; the
# original indentation is not visible, so they are kept at this level —
# confirm against the full file.
role = StringProperty(Opengov, 'role')
voter = Relation(Schema, 'agent')
vote_event = Relation(Opengov, 'voteEvent')
option = Relation(Opengov, 'voteOption')
weight = IntegerProperty(Opengov, 'weight')


class ContactDetail(Opengov, owl.Thing):
    """Generic contact point: a typed value with label, note and an
    optional validity window."""
    type = StringProperty(Rdf, 'type')
    value = StringProperty(Rdf, 'value')
    label = StringProperty(Rdfs, 'label')
    valid_from = DateTimeProperty(Schema, 'validFrom')
    note = StringProperty(Skos, 'note')
    valid_until = DateTimeProperty(Opengov, 'validUntil')


class Result(Opengov, owl.Thing):
    """Outcome of a vote event, described by a free-text result."""
    text = StringProperty(Schema, 'text')
    vote_event = Relation(Opengov, 'voteEvent')


# Opengov URIs for the closed sets of vote results and vote options.
ResultFailed = Uri(Opengov, "ResultFailed")
ResultPassed = Uri(Opengov, "ResultPassed")
VoteOptionYes = Uri(Opengov, "VoteOptionYes")
VoteOptionNo = Uri(Opengov, "VoteOptionNo")
VoteOptionAbstain = Uri(Opengov, "VoteOptionAbstain")
VoteOptionAbsent = Uri(Opengov, "VoteOptionAbsent")
VoteOptionNotVoting = Uri(Opengov, "VoteOptionNotVoting")
VoteOptionPaired = Uri(Opengov, "VoteOptionPaired")
def generate_ori_identifier(self):
    """Allocate the next ORI identifier from the Celery backend's
    auto-increment counter, cache it on the instance, and return it."""
    next_id = celery_app.backend.increment("ori_identifier_autoincrement")
    self.ori_identifier = Uri(Ori, next_id)
    return self.ori_identifier
# NOTE(review): the attributes below appear to belong to an enclosing
# meeting/agenda-item class whose `class` header is outside this chunk;
# the original indentation is not visible, so they are kept at this
# level — confirm against the full file.
motion = Relation(Opengov, 'motion')
description = StringProperty(Schema, 'description')
name = StringProperty(Schema, 'name', required=True)
position = IntegerProperty(Schema, 'position')
parent = Relation(Schema, 'superEvent', required=True)
vote_event = Relation(Opengov, 'voteEvent')
attendee = Relation(Schema, 'attendee')
absentee = Relation(Schema, 'absentee')
agenda = Relation(MeetingNS, 'agenda')
# Presumably excluded from delta loads so they don't clobber the last
# recorded discussion timestamp — TODO confirm DeltaLoader semantics.
last_discussed_at = DateTimeProperty(MeetingNS, 'lastDiscussedAt',
                                     ignore_for_loader=[
                                         DeltaLoader,
                                     ])


class Committee(MeetingNS, org.Organization):
    """A committee; carries no properties beyond org:Organization."""
    pass


# MeetingNS URIs for agenda-item results and event statuses.
ResultKept = Uri(MeetingNS, "ResultKept")
ResultPostponed = Uri(MeetingNS, "ResultPostponed")
ResultWithdrawn = Uri(MeetingNS, "ResultWithdrawn")
ResultExpired = Uri(MeetingNS, "ResultExpired")
ResultDiscussed = Uri(MeetingNS, "ResultDiscussed")
ResultPublished = Uri(MeetingNS, "ResultPublished")
EventCompleted = Uri(MeetingNS, "EventCompleted")
EventConfirmed = Uri(MeetingNS, "EventConfirmed")
EventUnconfirmed = Uri(MeetingNS, "EventUnconfirmed")
class ImageObject(Schema, owl.Thing):
    """schema:ImageObject with basic file and EXIF metadata; populated by
    the 'image_metadata' enricher task."""
    content_url = URLProperty(Schema, 'contentUrl')
    is_based_on = URLProperty(Schema, 'isBasedOn')
    file_format = StringProperty(Schema, 'fileFormat')
    content_size = StringProperty(Schema, 'contentSize')
    encoding_format = StringProperty(Schema, 'encodingFormat')
    exif_data = ArrayProperty(Schema, 'exifData')
    width = StringProperty(Schema, 'width')
    height = StringProperty(Schema, 'height')

    # Name of the enricher task that fills in the metadata above.
    enricher_task = 'image_metadata'


class PropertyValue(Schema, owl.Thing):
    """Generic schema:PropertyValue name/value pair."""
    name = StringProperty(Schema, 'name')
    value = StringProperty(Schema, 'value')


class Place(Schema, owl.Thing):
    """schema:Place marker class; no extra properties."""
    pass


class VideoObject(Schema, owl.Thing):
    """schema:VideoObject identified by its content URL."""
    content_url = URLProperty(Schema, 'contentUrl')


# schema.org event-status URIs.
EventScheduled = Uri(Schema, "EventScheduled")
EventRescheduled = Uri(Schema, "EventRescheduled")
EventCancelled = Uri(Schema, "EventCancelled")
EventPostponed = Uri(Schema, "EventPostponed")