Example No. 1
def create_collection(self, graph):
    """Create the managed aleph collection for this mapping and merge a
    matching Collection node into the graph."""
    collection = Collection.create({
        'foreign_id': self.config.get('collection'),
        'label': self.config.get('collection'),
        'managed': True
    })
    db.session.commit()
    coll_type = NodeType.get('Collection')
    return coll_type.merge(graph, name=collection.label,
                           fingerprint=collection.foreign_id,
                           alephCollection=collection.id)
Example No. 2
def generate_paths(graph, entity, ignore_types=SKIP_TYPES):
    """Generate all possible paths which end in a different collection."""
    Path.delete_by_entity(entity.id)
    if graph is None or entity.state != entity.STATE_ACTIVE:
        return
    log.info("Generating graph path cache: %r", entity)
    # TODO: should max path length be configurable?
    q = "MATCH pth = (start:Aleph:Entity)-[*1..3]-(end:Aleph:Entity) " \
        "MATCH (start)-[startpart:PART_OF]->(startcoll:Collection) " \
        "MATCH (end)-[endpart:PART_OF]->(endcoll:Collection) " \
        "WHERE start.fingerprint = {entity_fp} AND " \
        "startpart.alephCanonical = {entity_id} AND " \
        "startcoll.alephCollection <> endcoll.alephCollection AND " \
        "all(r IN relationships(pth) WHERE NOT type(r) IN {ignore_types}) " \
        "WITH DISTINCT start, end, " \
        " COLLECT(DISTINCT extract(x IN nodes(pth) | x.id)) AS paths, " \
        " COLLECT(DISTINCT extract(x IN nodes(pth) | labels(x))) AS labels, " \
        " COLLECT(DISTINCT extract(r IN relationships(pth) | type(r))) AS types, " \
        " COLLECT(DISTINCT endcoll.alephCollection) AS end_collection_id " \
        "RETURN start, end, paths, types, labels, end_collection_id "
    count = 0
    for row in graph.run(q,
                         entity_id=entity.id,
                         entity_fp=entity.fingerprint,
                         ignore_types=ignore_types):
        labels = unwind(row.get('labels'))
        labels = [l for l in labels if l != BASE_NODE]
        types = unwind(row.get('types'))
        if len(types) == 1 and 'AKA' in types:
            continue
        Path.from_data(entity, row.get('end_collection_id'), row.get('paths'),
                       types, labels, NodeType.dict(row.get('start')),
                       NodeType.dict(row.get('end')))
        count += 1
    db.session.commit()
    # TODO: send email to collection owners?
    log.info("Generated %s paths for %r", count, entity)
Example No. 3
def create_collection(self, graph):
    collection = Collection.create({
        'foreign_id': self.config.get('collection'),
        'label': self.config.get('collection'),
        'managed': True
    })
    db.session.commit()
    coll_type = NodeType.get('Collection')
    return coll_type.merge(graph,
                           name=collection.label,
                           fingerprint=collection.foreign_id,
                           alephCollection=collection.id)
Example No. 4
    def load(self):
        """Generate query rows and load them into the graph."""
        collection = Collection.create({
            'foreign_id': self.config.get('collection'),
            'label': self.config.get('collection'),
            'managed': True
        })
        db.session.commit()

        graph = get_graph()
        coll_type = NodeType.get('Collection')
        collection = coll_type.merge(graph, name=collection.label,
                                     fingerprint=collection.foreign_id,
                                     alephCollection=collection.id)
        begin_time = time()
        rp = self.engine.execute(self.query)

        log.debug("Query time: %.5fms", (time() - begin_time) * 1000)
        stats = {'rows': 0, 'nodes': 0, 'rels': 0}
        while True:
            graphtx = graph.begin()
            rows = rp.fetchmany(10000)
            if not len(rows):
                break
            for row in rows:
                stats['rows'] += 1
                self.update(graphtx, collection, dict(row.items()), stats)

                if stats['rows'] % 1000 == 0:
                    elapsed = (time() - begin_time)
                    stats['per_node'] = max(stats['nodes'], 1) / elapsed
                    log.info("Loaded: %(rows)s [%(nodes)s nodes, "
                             "%(rels)s edges], %(per_node).5f n/s", stats)
            graphtx.commit()
        log.info("Done. Loaded %(rows)s rows, %(nodes)s nodes, "
                 "%(rels)s edges.", stats)
Example No. 5
import logging

from aleph.graph.nodes import NodeType
from aleph.graph.edges import EdgeType

log = logging.getLogger(__name__)

EntityNode = NodeType('Entity', indices=['alephEntity'])
PhoneNode = NodeType('Phone')
EmailNode = NodeType('Email')
AddressNode = NodeType('Address')
CollectionNode = NodeType('Collection',
                          indices=['alephCollection'],
                          node=False)
DocumentNode = NodeType('Document', indices=['alephDocument'])

LOCATED_AT = EdgeType('LOCATED_AT')
CONTACT_FOR = EdgeType('CONTACT_FOR')
MENTIONS = EdgeType('MENTIONS')
PART_OF = EdgeType('PART_OF', hidden=True)
AKA = EdgeType('AKA', key='alephId')
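
These module-level declarations are what the loaders above resolve at runtime: create_collection() and load() look up the Collection type via NodeType.get() and merge a node keyed on alephCollection. A minimal sketch tying the declaration to that usage, assuming the same graph handle and Collection model as in the earlier examples:

from aleph.graph.nodes import NodeType

def merge_collection_node(graph, collection):
    # Resolve the Collection type declared above and merge a node for it,
    # mirroring create_collection() and load() in the earlier examples.
    coll_type = NodeType.get('Collection')
    return coll_type.merge(graph, name=collection.label,
                           fingerprint=collection.foreign_id,
                           alephCollection=collection.id)
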
Example No. 6
import logging

from aleph.graph.nodes import NodeType
from aleph.graph.edges import EdgeType

log = logging.getLogger(__name__)

EntityNode = NodeType('Entity', key='alephEntity')
PhoneNode = NodeType('Phone')
EmailNode = NodeType('Email')
AddressNode = NodeType('Address')
CollectionNode = NodeType('Collection', key='alephCollection')
DocumentNode = NodeType('Document', key='alephDocument')

LOCATED_AT = EdgeType('LOCATED_AT')
CONTACT_FOR = EdgeType('CONTACT_FOR')
MENTIONS = EdgeType('MENTIONS')
PART_OF = EdgeType('PART_OF', hidden=True)
AKA = EdgeType('AKA', key='alephId')