def construct_traversals(root, node, visited, path):
    """Record the paths from ``root`` that lead to ``node`` and beyond.

    Recursively follows the edges that leave and enter ``node`` and, for
    every class reached, adds the dot-joined list of association names to
    the module-level mapping ``traversals[root][<label>]`` (a set of path
    strings).

    :param root: key of ``traversals`` under which the paths are stored.
    :param node: the Node subclass currently being visited.
    :param list visited: classes already visited on the current path.
    :param list path: association names followed so far.
    :returns: None; ``traversals[root]`` is mutated.
    :raises IndexError: if an edge names a class that is not among
        ``Node.get_subclasses()``.
    """
    # Fetch the class list once per call rather than once per edge.
    node_classes = Node.get_subclasses()

    def class_named(name):
        # First match wins; an unknown name raises IndexError.
        return [n for n in node_classes if n.__name__ == name][0]

    def should_recurse(neighbor):
        # no backtracking
        if not neighbor or neighbor in visited or neighbor == node:
            return False
        # With a non-empty path the last association name is inspected,
        # otherwise the neighbor's label is.
        probe = path[-1] if path else neighbor.label
        # no traveling THROUGH terminal nodes
        if probe in terminal_nodes:
            return False
        return not probe.startswith('_related')

    for edge in Edge._get_edges_with_src(node.__name__):
        neighbor = class_named(edge.__dst_class__)
        if should_recurse(neighbor):
            construct_traversals(root, neighbor, visited + [node],
                                 path + [edge.__src_dst_assoc__])

    for edge in Edge._get_edges_with_dst(node.__name__):
        neighbor = class_named(edge.__src_class__)
        if should_recurse(neighbor):
            construct_traversals(root, neighbor, visited + [node],
                                 path + [edge.__dst_src_assoc__])

    # Recorded after the recursion, as before.
    traversals[root].setdefault(node.label, set()).add('.'.join(path))
def construct_traversals(root, node, visited, path):
    """Collect traversal paths starting at ``root`` that pass through ``node``.

    Edges are followed in both directions (edges whose source is ``node``
    first, then edges whose destination is ``node``). For each class reached
    the dot-joined association names are added to the module-level mapping
    ``traversals[root][<label>]``.

    :param root: key of ``traversals`` receiving the paths.
    :param node: Node subclass being visited.
    :param list visited: classes already on the current path.
    :param list path: association names walked so far.
    :returns: None; ``traversals[root]`` is mutated.
    :raises IndexError: if an edge references a class name with no match in
        ``Node.get_subclasses()``.
    """
    # One fetch per call instead of one per edge.
    node_classes = Node.get_subclasses()

    def should_recurse(neighbor):
        # no backtracking
        if not neighbor or neighbor in visited or neighbor == node:
            return False
        # The last association name is checked when a path exists,
        # otherwise the neighbor's own label.
        probe = path[-1] if path else neighbor.label
        # no traveling THROUGH terminal nodes
        return (probe not in terminal_nodes
                and not probe.startswith('_related'))

    # (edge finder, attribute naming the neighbor class,
    #  attribute naming the association to append to the path)
    directions = (
        (Edge._get_edges_with_src, '__dst_class__', '__src_dst_assoc__'),
        (Edge._get_edges_with_dst, '__src_class__', '__dst_src_assoc__'),
    )
    for find_edges, neighbor_attr, assoc_attr in directions:
        for edge in find_edges(node.__name__):
            neighbor_name = getattr(edge, neighbor_attr)
            # First class with a matching name; IndexError if none.
            neighbor = [n for n in node_classes
                        if n.__name__ == neighbor_name][0]
            if should_recurse(neighbor):
                construct_traversals(
                    root, neighbor, visited + [node],
                    path + [getattr(edge, assoc_attr)])

    traversals[root].setdefault(node.label, set()).add('.'.join(path))
def grant_graph_permissions(engine, roles, grant_users):
    """Grant ``roles`` on every Node and Edge table to each given user.

    One ``GRANT`` statement is printed and executed per (user, table) pair,
    each wrapped in its own ``BEGIN;``/``COMMIT;``.

    :param engine: object whose ``execute`` accepts a ``text()`` clause.
    :param roles: privilege list, interpolated verbatim after ``GRANT``.
    :param grant_users: iterable of user/role names to grant to.
    """
    # NOTE(review): roles, table and user are interpolated unescaped into
    # the SQL; callers must only pass trusted values.
    graph_classes = Node.get_subclasses() + Edge.get_subclasses()
    for grant_user in grant_users:
        for cls in graph_classes:
            stmt = "GRANT {roles} ON TABLE {table} TO {user};".format(
                roles=roles, table=cls.__tablename__, user=grant_user)
            # Parenthesised form prints the same on Python 2 and Python 3.
            print(stmt.strip())
            engine.execute(text("BEGIN;" + stmt + "COMMIT;"))
def drop_all_tables(cls):
    """Drop the table of every Node subclass on a best-effort basis.

    Each ``DROP TABLE ... CASCADE`` is issued through ``cls.engine``. Any
    exception is logged as a warning on ``cls.logger`` and the remaining
    tables are still attempted.
    """
    for node_cls in Node.get_subclasses():
        try:
            statement = "DROP TABLE {} CASCADE".format(
                node_cls.__tablename__)
            cls.engine.execute(statement)
        except Exception as err:
            # Deliberately non-fatal: keep dropping the other tables.
            cls.logger.warning(err)
def create_indexes(host, user, password, database):
    """Create the column and GIN indexes on every Node and Edge table.

    Node tables get an index on ``node_id`` plus GIN indexes on ``_sysan``,
    ``_props`` and ``(_sysan, _props)``. Edge tables get indexes on
    ``src_id``, ``dst_id`` and ``(dst_id, src_id)``.

    :param host: database host.
    :param user: database user.
    :param password: database password.
    :param database: database name.
    """
    print("Creating indexes")
    connection_url = "postgres://{user}:{pwd}@{host}/{db}".format(
        user=user, host=host, pwd=password, db=database)
    engine = create_engine(connection_url)

    def column_index_statements(table, columns):
        # One CREATE INDEX statement per entry of ``columns``.
        return ["CREATE INDEX ON {} ({})".format(table, column)
                for column in columns]

    for node_cls in Node.get_subclasses():
        tablename = node_cls.__tablename__
        node_statements = column_index_statements(tablename, ["node_id"])
        node_statements += [
            "CREATE INDEX ON {} USING gin (_sysan)".format(tablename),
            "CREATE INDEX ON {} USING gin (_props)".format(tablename),
            "CREATE INDEX ON {} USING gin (_sysan, _props)".format(
                tablename),
        ]
        for statement in node_statements:
            engine.execute(statement)

    for edge_cls in Edge.get_subclasses():
        edge_statements = column_index_statements(
            edge_cls.__tablename__, ["src_id", "dst_id", "dst_id, src_id"])
        for statement in edge_statements:
            engine.execute(statement)
def down_transaction(connection):
    """Drop the indexes of the async-transactions migration.

    Every secondary-key index of every Node subclass is logged by name and
    dropped, then ``TX_LOG_PROJECT_ID_IDX`` is dropped.

    :param connection: passed through to each index's ``drop``.
    """
    logger.info('Migrating async-transactions: down')
    # Lazy, so indexes of one class are dropped before the next is queried.
    secondary_indexes = (
        idx
        for node_cls in Node.get_subclasses()
        for idx in get_secondary_key_indexes(node_cls)
    )
    for idx in secondary_indexes:
        logger.info('Dropping %s', idx.name)
        idx.drop(connection)
    TX_LOG_PROJECT_ID_IDX.drop(connection)
def up_transaction(connection):
    """Create the indexes of the async-transactions migration.

    Every secondary-key index of every Node subclass is logged by name and
    created, then ``TX_LOG_PROJECT_ID_IDX`` is created.

    :param connection: passed through to each index's ``create``.
    """
    logger.info('Migrating async-transactions: up')

    def create_logged(secondary_index):
        # Log first so a failing create is preceded by its index name.
        logger.info('Creating %s', secondary_index.name)
        secondary_index.create(connection)

    for node_cls in Node.get_subclasses():
        for secondary_index in get_secondary_key_indexes(node_cls):
            create_logged(secondary_index)

    TX_LOG_PROJECT_ID_IDX.create(connection)
def test_node_subclasses(client, submitter, pg_driver_clean, cgci_blgsp):
    """Check that every Node subclass label can be queried.

    After posting the example entities, a ``query Test { <label> { id } }``
    request is posted for each subclass and the label must appear in the
    ``data`` of the JSON response.
    """
    # cgci_blgsp is not referenced in the body; it is only requested as an
    # argument.
    post_example_entities_together(client, pg_driver_clean, submitter)
    for cls in Node.get_subclasses():
        # Parenthesised print works on both Python 2 and Python 3.
        print(cls)
        data = json.dumps(
            {'query': """query Test {{ {} {{ id }}}}""".format(cls.label)})
        r = client.post(path, headers=submitter, data=data)
        print(r.data)
        assert cls.label in r.json['data'], r.data
def export_to_csv(self, data_dir, silent=False):
    """Export nodes and edges of the graph to CSV files in ``data_dir``.

    Every node whose system annotations do not have ``to_delete`` set is
    converted, written through ``node_to_csv`` and given a sequential integer
    id. Every edge whose source and destination were both exported is then
    written to ``rels.csv`` as ``<start>\\t<end>\\t<label>``.

    :param data_dir: directory in which ``rels.csv`` is created; also
        passed to ``create_node_files``.
    :param bool silent: when true, skip the count queries, the printed
        summaries and the progress bars.
    """
    node_ids = dict()
    if not silent:
        i = 0
        node_count = self.psqlgraphDriver.nodes().not_sysan({
            'to_delete': True
        }).count()
        print("Exporting {n} nodes:".format(n=node_count))
        if node_count != 0:
            pbar = self.start_pbar(node_count)

    # ``with`` guarantees rels.csv is closed even if the export fails.
    with open(os.path.join(data_dir, 'rels.csv'), 'w') as edge_file:
        print('start\tend\ttype\t', file=edge_file)
        self.create_node_files(data_dir)
        batch_size = 1000
        id_count = 0
        for node_type in Node.get_subclasses():
            nodes = self.psqlgraphDriver.nodes(node_type).not_sysan({
                'to_delete': True
            }).yield_per(batch_size)
            for node in nodes:
                self.convert_node(node)
                self.node_to_csv(str(id_count), node)
                node_ids[node.node_id] = id_count
                id_count += 1
                if not silent and node_count != 0:
                    i = self.update_pbar(pbar, i)
        if not silent and node_count != 0:
            self.update_pbar(pbar, node_count)
        self.close_files()

        if not silent:
            i = 0
            edge_count = self.psqlgraphDriver.get_edge_count()
            print("Exporting {n} edges:".format(n=edge_count))
            if edge_count != 0:
                # Sized by the edge count (was mistakenly the node count).
                pbar = self.start_pbar(edge_count)
        for edge_type in Edge.get_subclasses():
            edges = self.psqlgraphDriver.edges(edge_type).yield_per(
                batch_size)
            for edge in edges:
                src = node_ids.get(edge.src_id, '')
                dst = node_ids.get(edge.dst_id, '')
                # Skip edges touching a node that was not exported.
                if src != '' and dst != '':
                    edge_file.write(
                        str(src) + '\t' + str(dst) + '\t' + edge.label +
                        '\n')
                if not silent and edge_count != 0:
                    i = self.update_pbar(pbar, i)

    if not silent and edge_count != 0:
        self.update_pbar(pbar, edge_count)
def construct_traversals_from_node(root_node):
    """Map node labels to the set of paths reaching them from ``root_node``.

    Edges are followed recursively in both directions starting at
    ``root_node``; each path is the dot-joined list of association names
    walked. The root's own entry contains the empty path ``''``.

    :param root_node: Node subclass to start from.
    :returns: dict ``{label: set of path strings}`` holding only labels with
        at least one path.
    :raises KeyError: if an edge names a class that is not among
        ``Node.get_subclasses()``.
    """
    node_subclasses = Node.get_subclasses()
    traversals = {node.label: set() for node in node_subclasses}

    # Name -> class lookup built once instead of scanning the class list for
    # every edge. setdefault keeps the first class registered under a name.
    subclass_by_name = {}
    for subclass in node_subclasses:
        subclass_by_name.setdefault(subclass.__name__, subclass)

    def recursively_construct_traversals(node, visited, path):
        traversals[node.label].add('.'.join(path))

        def should_recurse_on(neighbor):
            """Check whether to recurse on a path."""
            return (
                neighbor
                # no backtracking:
                and neighbor not in visited
                # No 0 length edges:
                and neighbor != node
                # Don't walk back up the tree:
                and is_valid_direction(root_node.label, node, visited, path)
                # no traveling THROUGH terminal nodes:
                and (
                    path[-1] not in terminal_nodes
                    if path
                    else neighbor.label not in terminal_nodes
                )
            )

        for edge in Edge._get_edges_with_src(node.__name__):
            neighbor = subclass_by_name[edge.__dst_class__]
            if should_recurse_on(neighbor):
                recursively_construct_traversals(
                    neighbor, visited + [node], path + [edge.__src_dst_assoc__]
                )

        for edge in Edge._get_edges_with_dst(node.__name__):
            neighbor = subclass_by_name[edge.__src_class__]
            if should_recurse_on(neighbor):
                recursively_construct_traversals(
                    neighbor, visited + [node], path + [edge.__dst_src_assoc__]
                )

    # Build up the traversals dictionary recursively.
    recursively_construct_traversals(root_node, [root_node], [])

    # Remove empty entries. ``items`` works on both Python 2 and Python 3.
    return {label: paths for label, paths in traversals.items() if paths}
def tearDownClass(cls):
    """Restore the schema so that subsequent tests find a full database.

    Recreates all tables, then grants all privileges on every Node and Edge
    table to the ``test`` user, each grant in its own ``BEGIN``/``COMMIT``.
    """
    cls.create_all_tables()

    # Re-grant permissions to test user
    graph_classes = Node.get_subclasses() + Edge.get_subclasses()
    grant_statements = (
        "GRANT ALL PRIVILEGES ON TABLE {} TO test".format(
            graph_cls.__tablename__)
        for graph_cls in graph_classes
    )
    for grant in grant_statements:
        cls.engine.execute('BEGIN; %s; COMMIT;' % grant)
def make_graph_traversal_dict(app, preload=False):
    """Set up ``app.graph_traversals`` and optionally fill it eagerly.

    ``app.graph_traversals`` is kept if it already exists and otherwise
    initialised to an empty dict. The paths from every Node subclass are
    then loaded via ``_get_paths_from`` when ``preload`` is true or when the
    ``USE_LAZY_TRAVERSE`` config value (default ``True``) is falsy.

    :param app: application object with a ``config`` mapping.
    :param bool preload: force the eager load regardless of configuration.
    """
    if not hasattr(app, "graph_traversals"):
        app.graph_traversals = {}
    else:
        app.graph_traversals = app.graph_traversals

    # Lazy mode and no explicit request: leave the dict to be filled later.
    if not preload and app.config.get("USE_LAZY_TRAVERSE", True):
        return

    for node_cls in Node.get_subclasses():
        _get_paths_from(node_cls, app)
def construct_traversals_from_node(root_node, app):
    """Map node labels to the paths reaching them from ``root_node``.

    Iterative walk over the edge graph using an explicit stack. Each path is
    the dot-joined list of association names followed from the root; a path
    already recorded for a label is not expanded again.

    :param root_node: Node subclass to start from.
    :param app: object on which the name -> class lookup is cached as
        ``name_to_subclass``.
    :returns: dict ``{label: list of path strings}`` holding only labels
        with at least one path.
    """
    traversals = {node.label: set() for node in Node.get_subclasses()}

    # Loop-invariant: resolve (and cache on the app) the class lookup once.
    name_to_subclass = getattr(app, 'name_to_subclass', None)
    if name_to_subclass is None:
        name_to_subclass = app.name_to_subclass = {
            n.__name__: n for n in Node.get_subclasses()
        }

    to_visit = [(root_node, [], [])]
    while to_visit:
        node, path, visited = to_visit.pop()
        if path:
            path_string = '.'.join(path)
            if path_string in traversals[node.label]:
                continue
            traversals[node.label].add(path_string)
            # stop at terminal nodes
            if path[-1] in terminal_nodes:
                continue
        # Don't walk back up the tree
        # (for the root itself ``visited`` is empty, hence the fallback)
        if not is_valid_direction(node, visited or [root_node]):
            continue

        neighbors_dst = {
            (name_to_subclass[edge.__dst_class__], edge.__src_dst_assoc__)
            for edge in Edge._get_edges_with_src(node.__name__)
            if name_to_subclass[edge.__dst_class__]
        }
        neighbors_src = {
            (name_to_subclass[edge.__src_class__], edge.__dst_src_assoc__)
            for edge in Edge._get_edges_with_dst(node.__name__)
            if name_to_subclass[edge.__src_class__]
        }
        to_visit.extend([
            (neighbor, path + [edge], visited + [node])
            for neighbor, edge in neighbors_dst.union(neighbors_src)
            if neighbor not in visited
        ])

    # ``items`` works on both Python 2 and Python 3.
    return {
        label: list(paths)
        for label, paths in traversals.items()
        if paths
    }
def _queries():
    """Build the list of count-query schema entries.

    Contains one ``graphene.Int`` entry per Node subclass, followed by one
    entry for the transaction log named ``_<TransactionLogQuery.name>_count``.

    :returns: list of the values returned by ``Query.schema``.
    """
    count_queries = []
    for node_cls in Node.get_subclasses():
        node_count_query = Query.schema(
            args=ns.NodeSubclassQuery.get_node_query_args(node_cls),
            name=NodeCountQuery._query_name(node_cls),
            type=graphene.Int,
        )
        count_queries.append(node_count_query)

    transaction_log_count_query = Query.schema(
        args=transaction.TransactionLogQuery._args(),
        name="_{}_count".format(transaction.TransactionLogQuery.name),
        type=graphene.Int,
    )
    count_queries.append(transaction_log_count_query)
    return count_queries
def inject_pg_edges():
    """Add a dict of ALL the links, to and from, each class

    .. code-block::

        { <link name>: {'backref': <backref name>, 'type': <target type> } }

    Every Node subclass has its ``_pg_edges`` filled first from its
    ``_pg_links`` (outgoing) and then from its ``_pg_backrefs`` (incoming).
    """

    def find_backref(link, src_cls):
        """Given the JSON link definition and a source class
        :param:`src_cls`, return the name of the backref

        :returns: the matching key of ``link['dst_type']._pg_backrefs``, or
            None when no backref has ``src_cls`` as its ``src_type``.
        """
        # Compare against the argument; this previously read the enclosing
        # loop variable ``cls`` and only worked because both were the same.
        for prop, backref in link['dst_type']._pg_backrefs.items():
            if backref['src_type'] == src_cls:
                return prop
        return None

    def cls_inject_forward_edges(cls):
        """We should have already added the links that go OUT from this
        class, so let's add them to `_pg_edges`

        :returns: None, cls is mutated
        """
        for name, link in cls._pg_links.items():
            cls._pg_edges[name] = {
                'backref': find_backref(link, cls),
                'type': link['dst_type'],
            }

    def cls_inject_backward_edges(cls):
        """We should have already added the links that go INTO this class,
        so let's add them to `_pg_edges`

        :returns: None, cls is mutated
        """
        for name, backref in cls._pg_backrefs.items():
            cls._pg_edges[name] = {
                'backref': backref['name'],
                'type': backref['src_type'],
            }

    for cls in Node.get_subclasses():
        cls_inject_forward_edges(cls)
        cls_inject_backward_edges(cls)
def export_to_csv(self, data_dir, silent=False):
    """Write the graph's nodes and edges as CSV files under ``data_dir``.

    Nodes whose system annotations do not have ``to_delete`` set are
    converted, passed to ``node_to_csv`` and numbered sequentially. Edges
    whose two endpoints were both exported are appended to ``rels.csv`` as
    ``<start>\\t<end>\\t<label>``.

    :param data_dir: directory in which ``rels.csv`` is created; also
        passed to ``create_node_files``.
    :param bool silent: when true, no counts are queried or printed and no
        progress bar is shown.
    """
    node_ids = dict()

    # True only when a progress bar was actually started for that phase.
    node_progress = False
    edge_progress = False

    if not silent:
        i = 0
        node_count = self.psqlgraphDriver.nodes().not_sysan(
            {'to_delete': True}).count()
        print("Exporting {n} nodes:".format(n=node_count))
        node_progress = node_count != 0
        if node_progress:
            pbar = self.start_pbar(node_count)

    # The context manager closes rels.csv even when the export raises.
    with open(os.path.join(data_dir, 'rels.csv'), 'w') as edge_file:
        print('start\tend\ttype\t', file=edge_file)
        self.create_node_files(data_dir)
        batch_size = 1000
        id_count = 0
        for node_type in Node.get_subclasses():
            nodes = self.psqlgraphDriver.nodes(node_type).not_sysan(
                {'to_delete': True}).yield_per(batch_size)
            for node in nodes:
                self.convert_node(node)
                self.node_to_csv(str(id_count), node)
                node_ids[node.node_id] = id_count
                id_count += 1
                if node_progress:
                    i = self.update_pbar(pbar, i)
        if node_progress:
            self.update_pbar(pbar, node_count)
        self.close_files()

        if not silent:
            i = 0
            edge_count = self.psqlgraphDriver.get_edge_count()
            print("Exporting {n} edges:".format(n=edge_count))
            edge_progress = edge_count != 0
            if edge_progress:
                # Fixed: the bar was previously sized with node_count.
                pbar = self.start_pbar(edge_count)
        for edge_type in Edge.get_subclasses():
            edges = self.psqlgraphDriver.edges(edge_type).yield_per(
                batch_size)
            for edge in edges:
                src = node_ids.get(edge.src_id, '')
                dst = node_ids.get(edge.dst_id, '')
                # Only edges between two exported nodes are written.
                if src != '' and dst != '':
                    edge_file.write(
                        str(src)+'\t'+str(dst)+'\t'+edge.label+'\n')
                if edge_progress:
                    i = self.update_pbar(pbar, i)

    if edge_progress:
        self.update_pbar(pbar, edge_count)
def load_edges():
    """Add a dictionary of outgoing links to every node class.

    For each entry of ``dictionary.schema`` the matching class gets one
    ``_pg_links`` item per link:

    .. code-block::

        { <link name>: {'edge_out': <edge name>, 'dst_type': <target class> } }

    where the edge name is the value returned by ``parse_edge``. Afterwards
    ``parse_edge`` is also called with a ``relates_to`` link targeting
    ``case`` for every class whose dictionary category is in
    ``RELATED_CASES_CATEGORIES`` or whose label is ``annotation``.

    :raises RuntimeError: if a schema label has no Node subclass.
    """
    # ``items`` works on both Python 2 and Python 3.
    for src_label, subschema in dictionary.schema.items():
        src_cls = Node.get_subclass(src_label)
        if not src_cls:
            raise RuntimeError('No source class labeled {}'.format(src_label))

        for name, link in get_links(subschema).items():
            edge_name = parse_edge(
                src_label, name, link['label'], subschema, link)
            src_cls._pg_links[link['name']] = {
                'edge_out': edge_name,
                'dst_type': Node.get_subclass(link['target_type'])
            }

    for src_cls in Node.get_subclasses():
        cache_case = (
            src_cls._dictionary['category'] in RELATED_CASES_CATEGORIES
            or src_cls.label in ['annotation']
        )
        if not cache_case:
            continue

        link = {
            'name': RELATED_CASES_LINK_NAME,
            'multiplicity': 'many_to_one',
            'required': False,
            'target_type': 'case',
            'label': 'relates_to',
            'backref': '_related_{}'.format(src_cls.label),
        }

        # NOTE(review): the return value is not used here; presumably the
        # call matters for a side effect of parse_edge - confirm.
        parse_edge(
            src_cls.label,
            link['name'],
            'relates_to',
            {'id': src_cls.label},
            link,
        )
def load_edges():
    """Record the outgoing links of each node class in ``_pg_links``.

    Every link found in ``dictionary.schema`` is stored on its source class
    as

    .. code-block::

        { <link name>: {'edge_out': <edge name>, 'dst_type': <target class> } }

    with the edge name being whatever ``parse_edge`` returns. In a second
    pass ``parse_edge`` is called with a ``relates_to`` link to ``case`` for
    each class whose dictionary category is in ``RELATED_CASES_CATEGORIES``
    or whose label is ``annotation``.

    :raises RuntimeError: if a schema label has no Node subclass.
    """
    # ``items`` is available on both Python 2 and Python 3 dicts.
    for src_label, subschema in dictionary.schema.items():
        src_cls = Node.get_subclass(src_label)
        if not src_cls:
            raise RuntimeError('No source class labeled {}'.format(src_label))

        for name, link in get_links(subschema).items():
            src_cls._pg_links[link['name']] = {
                'edge_out': parse_edge(
                    src_label, name, link['label'], subschema, link),
                'dst_type': Node.get_subclass(link['target_type'])
            }

    for src_cls in Node.get_subclasses():
        in_related_category = (
            src_cls._dictionary['category'] in RELATED_CASES_CATEGORIES)
        if not (in_related_category or src_cls.label in ['annotation']):
            continue

        link = {
            'name': RELATED_CASES_LINK_NAME,
            'multiplicity': 'many_to_one',
            'required': False,
            'target_type': 'case',
            'label': 'relates_to',
            'backref': '_related_{}'.format(src_cls.label),
        }

        # NOTE(review): result intentionally not bound - nothing reads it.
        # Presumably parse_edge is called for a side effect; confirm.
        parse_edge(
            src_cls.label,
            link['name'],
            'relates_to',
            {'id': src_cls.label},
            link,
        )
def update_case_cache_append_only(graph):
    """Server-side, append-only refresh of the case cache.

    1) ``seed_level_1`` is run for every Node subclass.

    2) The levels returned by ``get_levels`` are sorted and, skipping the
       first two, ``append_cache_from_parents`` is run for every class of
       each remaining level, in order.

    :param graph: passed through to the seeding and appending helpers.
    """
    classes_by_level = get_levels()

    for node_cls in Node.get_subclasses():
        seed_level_1(graph, node_cls)

    ordered_levels = sorted(classes_by_level)
    for level in ordered_levels[2:]:
        print("\n\nLevel:", level)
        for level_cls in classes_by_level[level]:
            append_cache_from_parents(graph, level_cls)
def create_indexes(host, user, password, database):
    """Create the column and GIN indexes on every Node and Edge table.

    Node tables get an index on ``node_id`` plus GIN indexes on ``_sysan``,
    ``_props`` and ``(_sysan, _props)``. Edge tables get indexes on
    ``src_id``, ``dst_id`` and ``(dst_id, src_id)``.

    :param host: database host.
    :param user: database user.
    :param password: database password.
    :param database: database name.
    """
    print('Creating indexes')
    engine = create_engine("postgres://{user}:{pwd}@{host}/{db}".format(
        user=user, host=host, pwd=password, db=database))

    def index(table, columns):
        # One CREATE INDEX statement per entry of ``columns``.
        return ["CREATE INDEX ON {} ({})".format(table, column)
                for column in columns]

    def execute_all(statements):
        # Explicit loop: a bare ``map`` is lazy on Python 3 and would never
        # run the statements.
        for statement in statements:
            engine.execute(statement)

    for scls in Node.get_subclasses():
        tablename = scls.__tablename__
        execute_all(index(tablename, [
            'node_id',
        ]))
        execute_all([
            "CREATE INDEX ON {} USING gin (_sysan)".format(tablename),
            "CREATE INDEX ON {} USING gin (_props)".format(tablename),
            "CREATE INDEX ON {} USING gin (_sysan, _props)".format(tablename),
        ])

    for scls in Edge.get_subclasses():
        execute_all(index(scls.__tablename__, [
            'src_id',
            'dst_id',
            'dst_id, src_id',
        ]))
def create_indexes(host, port, user, password, database, use_ssl=False):
    """Create the column and GIN indexes on every Node and Edge table.

    Node tables get an index on ``node_id`` plus GIN indexes on ``_sysan``,
    ``_props`` and ``(_sysan, _props)``. Edge tables get indexes on
    ``src_id``, ``dst_id`` and ``(dst_id, src_id)``.

    :param host: database host.
    :param port: database port.
    :param user: database user.
    :param password: database password.
    :param database: database name.
    :param bool use_ssl: when true, connect with ``sslmode=require``.
    """
    print("Creating indexes")

    # added for Postgresql SSL
    connect_args = {"sslmode": "require"} if use_ssl else {}

    connection_string = _get_connection_string(
        user=user, password=password, host=host, port=port,
        database=database)
    engine = create_engine(connection_string, connect_args=connect_args)

    def column_index_statements(table, columns):
        # One CREATE INDEX statement per entry of ``columns``.
        statements = []
        for column in columns:
            statements.append("CREATE INDEX ON {} ({})".format(table, column))
        return statements

    for node_cls in Node.get_subclasses():
        tablename = node_cls.__tablename__
        for statement in column_index_statements(tablename, ["node_id"]):
            engine.execute(statement)
        gin_statements = [
            "CREATE INDEX ON {} USING gin (_sysan)".format(tablename),
            "CREATE INDEX ON {} USING gin (_props)".format(tablename),
            "CREATE INDEX ON {} USING gin (_sysan, _props)".format(
                tablename),
        ]
        for statement in gin_statements:
            engine.execute(statement)

    edge_columns = ["src_id", "dst_id", "dst_id, src_id"]
    for edge_cls in Edge.get_subclasses():
        for statement in column_index_statements(
                edge_cls.__tablename__, edge_columns):
            engine.execute(statement)
password='******') ``` """ import logging from sqlalchemy import not_, or_, and_ from psqlgraph import Node, PsqlGraphDriver from gdcdatamodel import models as md from multiprocessing import Process, cpu_count, Queue from collections import namedtuple CLS_WITH_PROJECT_ID = { cls for cls in Node.get_subclasses() if 'project_id' in cls.__pg_properties__ } CLS_WITH_STATE = { cls for cls in Node.get_subclasses() if 'state' in cls.__pg_properties__ } CLS_TO_UPDATE = CLS_WITH_PROJECT_ID & CLS_WITH_STATE # Determines state and file_state based on existing state STATE_MAP = { None: { 'state': 'submitted', 'file_state': None },
cache_related_cases_on_insert, cache_related_cases_on_delete, related_cases_from_cache, related_cases_from_parents, ) logger = get_logger('gdcdatamodel') # These are properties that are defined outside of the JSONB column in # the database, inform later code to skip these excluded_props = ['id', 'type'] # At module load time, evaluate which classes have already been # registered as subclasses of the abstract bases Node and Edge to # prevent double-registering loaded_nodes = [c.__name__ for c in Node.get_subclasses()] loaded_edges = [c.__name__ for c in Edge.get_subclasses()] def remove_spaces(s): """Returns a stripped string with all of the spaces removed. :param str s: String to remove spaces from """ return s.replace(' ', '') def register_class(cls): """Register a class in `globals`. This allows us to import the ORM classes from :mod:`gdcdatamodel.models`
path + [edge.__src_dst_assoc__]) for edge in Edge._get_edges_with_dst(node.__name__): neighbor = [ n for n in Node.get_subclasses() if n.__name__ == edge.__src_class__ ][0] if recurse(neighbor): construct_traversals(root, neighbor, visited + [node], path + [edge.__dst_src_assoc__]) traversals[root][node.label] = traversals[root].get(node.label) or set() traversals[root][node.label].add('.'.join(path)) for node in Node.get_subclasses(): traversals[node.label] = {} construct_traversals(node.label, node, [node], []) def union_subq_without_path(q, *args, **kwargs): return q.except_(union_subq_path(q, *args, **kwargs)) def union_subq_path(q, dst_label, post_filters=[]): src_label = q.entity().label if not traversals.get(src_label, {}).get(dst_label, {}): return q paths = list(traversals[src_label][dst_label]) base = q.subq_path(paths.pop(), post_filters) while paths:
if recurse(neighbor): construct_traversals( root, neighbor, visited+[node], path+[edge.__src_dst_assoc__]) for edge in Edge._get_edges_with_dst(node.__name__): neighbor = [n for n in Node.get_subclasses() if n.__name__ == edge.__src_class__][0] if recurse(neighbor): construct_traversals( root, neighbor, visited+[node], path+[edge.__dst_src_assoc__]) traversals[root][node.label] = traversals[root].get(node.label) or set() traversals[root][node.label].add('.'.join(path)) for node in Node.get_subclasses(): traversals[node.label] = {} construct_traversals(node.label, node, [node], []) def union_subq_without_path(q, *args, **kwargs): return q.except_(union_subq_path(q, *args, **kwargs)) def union_subq_path(q, dst_label, post_filters=[]): src_label = q.entity().label if not traversals.get(src_label, {}).get(dst_label, {}): return q paths = list(traversals[src_label][dst_label]) base = q.subq_path(paths.pop(), post_filters) while paths:
cache_related_cases_on_delete, related_cases_from_cache, related_cases_from_parents, ) logger = get_logger('gdcdatamodel') # These are properties that are defined outside of the JSONB column in # the database, inform later code to skip these excluded_props = ['id', 'type'] # At module load time, evaluate which classes have already been # registered as subclasses of the abstract bases Node and Edge to # prevent double-registering loaded_nodes = [c.__name__ for c in Node.get_subclasses()] loaded_edges = [c.__name__ for c in Edge.get_subclasses()] def remove_spaces(s): """Returns a stripped string with all of the spaces removed. :param str s: String to remove spaces from """ return s.replace(' ', '') def register_class(cls): """Register a class in `globals`. This allows us to import the ORM classes from :mod:`gdcdatamodel.models`
def construct_traversals_for_all_nodes():
    """Fill the module-level ``traversals`` mapping for every Node subclass.

    Each subclass's label is reset to an empty dict and then populated by
    ``construct_traversals`` with that subclass as the starting point.
    """
    for root_cls in Node.get_subclasses():
        root_label = root_cls.label
        traversals[root_label] = {}
        construct_traversals(root_label, root_cls, [root_cls], [])
def make_graph_traversal_dict():
    """Build the traversal mapping for every Node subclass.

    :returns: dict keyed by subclass label whose values are the results of
        ``construct_traversals_from_node`` for that subclass.
    """
    traversal_dict = {}
    for node_cls in Node.get_subclasses():
        traversal_dict[node_cls.label] = construct_traversals_from_node(
            node_cls)
    return traversal_dict