def create_collection(self, graph):
    """Create the configured collection and merge its node into ``graph``.

    The ``collection`` entry of ``self.config`` is used both as the foreign
    id and as the label of the new, managed ``Collection`` record. The
    database session is committed before the graph is touched.

    Returns whatever the ``Collection`` node type's ``merge`` returns for
    the given graph.
    """
    spec = {
        'foreign_id': self.config.get('collection'),
        'label': self.config.get('collection'),
        'managed': True,
    }
    record = Collection.create(spec)
    db.session.commit()

    node_type = NodeType.get('Collection')
    return node_type.merge(
        graph,
        name=record.label,
        fingerprint=record.foreign_id,
        alephCollection=record.id,
    )
def generate_paths(graph, entity, ignore_types=SKIP_TYPES):
    """Rebuild the stored paths from ``entity`` into other collections.

    Existing paths for the entity are always deleted first. Nothing is
    regenerated when ``graph`` is ``None`` or the entity is not in its
    active state. Otherwise a Cypher query looks for paths of one to three
    relationships between the entity and entities belonging to a different
    collection, ignoring any path that uses a relationship type listed in
    ``ignore_types``. Each result row is stored via ``Path.from_data``,
    except rows whose only relationship type is ``AKA``.
    """
    # Old paths are dropped up front, even if nothing is regenerated below.
    Path.delete_by_entity(entity.id)
    if graph is None:
        return
    if entity.state != entity.STATE_ACTIVE:
        return

    log.info("Generating graph path cache: %r", entity)
    # TODO: should max path length be configurable?
    query = (
        "MATCH pth = (start:Aleph:Entity)-[*1..3]-(end:Aleph:Entity) "
        "MATCH (start)-[startpart:PART_OF]->(startcoll:Collection) "
        "MATCH (end)-[endpart:PART_OF]->(endcoll:Collection) "
        "WHERE start.fingerprint = {entity_fp} AND "
        "startpart.alephCanonical = {entity_id} AND "
        "startcoll.alephCollection <> endcoll.alephCollection AND "
        "all(r IN relationships(pth) WHERE NOT type(r) IN {ignore_types}) "
        "WITH DISTINCT start, end, "
        " COLLECT(DISTINCT extract(x IN nodes(pth) | x.id)) AS paths, "
        " COLLECT(DISTINCT extract(x IN nodes(pth) | labels(x))) AS labels, "
        " COLLECT(DISTINCT extract(r IN relationships(pth) | type(r))) AS types, "
        " COLLECT(DISTINCT endcoll.alephCollection) AS end_collection_id "
        "RETURN start, end, paths, types, labels, end_collection_id "
    )
    params = {
        'entity_id': entity.id,
        'entity_fp': entity.fingerprint,
        'ignore_types': ignore_types,
    }

    generated = 0
    for record in graph.run(query, **params):
        node_labels = unwind(record.get('labels'))
        node_labels = [name for name in node_labels if name != BASE_NODE]
        rel_types = unwind(record.get('types'))
        # Rows connected by nothing but a single AKA type are not stored.
        if len(rel_types) != 1 or 'AKA' not in rel_types:
            end_collections = record.get('end_collection_id')
            node_paths = record.get('paths')
            start_node = NodeType.dict(record.get('start'))
            end_node = NodeType.dict(record.get('end'))
            Path.from_data(entity, end_collections, node_paths, rel_types,
                           node_labels, start_node, end_node)
            generated += 1

    db.session.commit()
    # TODO: send email to collection owners?
    log.info("Generated %s paths for %r", generated, entity)
def load(self):
    """Generate query rows and load them into the graph.

    Creates (and commits) the managed ``Collection`` named by the
    ``collection`` entry of ``self.config``, merges the matching
    ``Collection`` node into the graph, then executes ``self.query`` on
    ``self.engine`` and feeds every result row to ``self.update``.

    Rows are fetched in batches of 10000; each non-empty batch is written
    inside its own graph transaction, which is committed once the batch
    has been processed. Progress is logged every 1000 rows.
    """
    collection = Collection.create({
        'foreign_id': self.config.get('collection'),
        'label': self.config.get('collection'),
        'managed': True
    })
    db.session.commit()

    graph = get_graph()
    coll_type = NodeType.get('Collection')
    collection = coll_type.merge(graph, name=collection.label,
                                 fingerprint=collection.foreign_id,
                                 alephCollection=collection.id)

    begin_time = time()
    rp = self.engine.execute(self.query)
    log.debug("Query time: %.5fms", (time() - begin_time) * 1000)

    # 'nodes' and 'rels' are presumably incremented by self.update, which
    # receives this dict -- confirm against its implementation.
    stats = {'rows': 0, 'nodes': 0, 'rels': 0}
    while True:
        # Fetch before opening a transaction: once the result set is
        # exhausted we must not leave a begun-but-never-finished graph
        # transaction behind.
        rows = rp.fetchmany(10000)
        if not rows:
            break

        graphtx = graph.begin()
        for row in rows:
            stats['rows'] += 1
            self.update(graphtx, collection, dict(row.items()), stats)

            if stats['rows'] % 1000 == 0:
                elapsed = (time() - begin_time)
                # max(..., 1) keeps the rate non-zero while no nodes
                # have been counted yet.
                stats['per_node'] = max(stats['nodes'], 1) / elapsed
                log.info("Loaded: %(rows)s [%(nodes)s nodes, "
                         "%(rels)s edges], %(per_node).5f n/s", stats)
        graphtx.commit()

    log.info("Done. Loaded %(rows)s rows, %(nodes)s nodes, "
             "%(rels)s edges.", stats)
import logging

from aleph.graph.nodes import NodeType
from aleph.graph.edges import EdgeType

log = logging.getLogger(__name__)

# Node types used in the graph. The first argument is the type name;
# ``indices`` lists the property names passed to NodeType for that type
# (presumably the properties to index -- confirm in aleph.graph.nodes).
EntityNode = NodeType('Entity', indices=['alephEntity'])
PhoneNode = NodeType('Phone')
EmailNode = NodeType('Email')
AddressNode = NodeType('Address')
# NOTE(review): the meaning of ``node=False`` is defined by NodeType and is
# not visible here -- confirm before relying on it.
CollectionNode = NodeType('Collection', indices=['alephCollection'],
                          node=False)
DocumentNode = NodeType('Document', indices=['alephDocument'])

# Edge (relationship) types used in the graph.
LOCATED_AT = EdgeType('LOCATED_AT')
CONTACT_FOR = EdgeType('CONTACT_FOR')
MENTIONS = EdgeType('MENTIONS')
# PART_OF is declared hidden; the exact effect is defined by EdgeType.
PART_OF = EdgeType('PART_OF', hidden=True)
# AKA edges are declared with ``alephId`` as their key property.
AKA = EdgeType('AKA', key='alephId')
import logging

from aleph.graph.nodes import NodeType
from aleph.graph.edges import EdgeType

log = logging.getLogger(__name__)

# Node types used in the graph. The first argument is the type name;
# ``key`` names the property passed to NodeType as that type's key
# (presumably its identifying property -- confirm in aleph.graph.nodes).
# Types declared without ``key`` rely on NodeType's default.
EntityNode = NodeType('Entity', key='alephEntity')
PhoneNode = NodeType('Phone')
EmailNode = NodeType('Email')
AddressNode = NodeType('Address')
CollectionNode = NodeType('Collection', key='alephCollection')
DocumentNode = NodeType('Document', key='alephDocument')

# Edge (relationship) types used in the graph.
LOCATED_AT = EdgeType('LOCATED_AT')
CONTACT_FOR = EdgeType('CONTACT_FOR')
MENTIONS = EdgeType('MENTIONS')
# PART_OF is declared hidden; the exact effect is defined by EdgeType.
PART_OF = EdgeType('PART_OF', hidden=True)
# AKA edges are declared with ``alephId`` as their key property.
AKA = EdgeType('AKA', key='alephId')