def __init__(self, graph=None, host=None, ports=None, username=None, password=None, **args):
    """
    Initialize an instance of NeoTransformer.

    Parameters
    ----------
    graph:
        Graph passed through to the parent initializer
    host: str
        Neo4j host; only used when ``ports`` is given
    ports: dict
        Mapping of protocol name to port. Only the ``'http'`` entry is read.
        When ``None``, host and port are read from the ``neo4j`` section of
        ``config.yml`` in the current working directory.
    username: str
        Username for authentication
    password: str
        Password for authentication
    **args:
        Accepted for call compatibility; not used by this initializer

    """
    super(NeoTransformer, self).__init__(graph)

    # Stays None when no HTTP port can be determined.
    self.http_driver = None

    if ports is None:
        # read from config
        with open('config.yml', 'r') as ymlfile:
            # safe_load: yaml.load() without an explicit Loader can construct
            # arbitrary Python objects and is rejected by PyYAML >= 6.
            cfg = yaml.safe_load(ymlfile)

        neo4j_cfg = cfg['neo4j']
        if 'http_port' in neo4j_cfg:
            http_uri = "http://{}:{}".format(neo4j_cfg['host'], neo4j_cfg['http_port'])
            logging.debug("Initializing http driver with URI: %s", http_uri)
            self.http_driver = http_gdb(http_uri, username=username, password=password)
    elif 'http' in ports:
        http_uri = "http://{}:{}".format(host, ports['http'])
        self.http_driver = http_gdb(http_uri, username=username, password=password)
def __init__(self, graph: nx.MultiDiGraph = None, uri: str = None, username: str = None, password: str = None):
    """
    Set up a NeoTransformer and its Neo4j HTTP driver.

    Parameters
    ----------
    graph: nx.MultiDiGraph
        Graph passed through to the parent initializer
    uri: str
        Address handed unchanged to ``http_gdb``
    username: str
        Username for authentication
    password: str
        Password for authentication

    """
    super(NeoTransformer, self).__init__(graph)

    # The attribute exists (as None) before the driver is constructed.
    self.http_driver = None
    credentials = {'username': username, 'password': password}
    self.http_driver = http_gdb(uri, **credentials)
def __init__(self, graph=None, host=None, port=None, username=None, password=None):
    """
    Set up a NeoTransformer and its Neo4j HTTP driver.

    Parameters
    ----------
    graph:
        Graph passed through to the parent initializer
    host: str
        Host part of the ``http://host:port`` address
    port:
        Port part of the ``http://host:port`` address
    username: str
        Username for authentication
    password: str
        Password for authentication

    """
    super(NeoTransformer, self).__init__(graph)

    # The attribute exists (as None) before the driver is constructed.
    self.http_driver = None
    endpoint = 'http://{}:{}'.format(host, port)
    self.http_driver = http_gdb(endpoint, username=username, password=password)
def test_neo_to_graph_download():
    """
    downloads a neo4j graph

    The test is currently disabled by the unconditional ``return`` at the top
    of the body; nothing below it runs.
    """
    # NOTE(review): presumably disabled because the body needs a live Neo4j
    # instance at http://localhost:7474 -- confirm before removing the return.
    return

    subject_label = 'gene'
    object_label = None
    edge_type = None
    stop_after = 100

    output_transformer = JsonTransformer()
    G = output_transformer.graph

    driver = http_gdb('http://localhost:7474', username='', password='')

    # Only string labels are turned into a Cypher label/type filter.
    subject_label = ':`{}`'.format(subject_label) if isinstance(subject_label, str) else ''
    object_label = ':`{}`'.format(object_label) if isinstance(object_label, str) else ''
    edge_type = ':`{}`'.format(edge_type) if isinstance(edge_type, str) else ''

    match = 'match (n{})-[e{}]->(m{})'.format(subject_label, edge_type, object_label)

    results = driver.query('{} return count(*)'.format(match))

    print('Using cypher query: {} return n, e, m'.format(match))

    # Default to 0 so an empty count result cannot leave ``size`` unbound.
    size = next((row[0] for row in results), 0)

    if size == 0:
        print('No data available')
        # Return rather than quit(): a test must not terminate the interpreter.
        return

    page_size = 1_000
    skip_flag = False
    limit_reached = False

    for i in range(0, size, page_size):
        q = '{} return n, e, m skip {} limit {}'.format(match, i, page_size)
        results = driver.query(q)

        for n, e, m in results:
            subject_attr = n['data']
            object_attr = m['data']
            edge_attr = e['data']

            if 'id' not in subject_attr or 'id' not in object_attr:
                if not skip_flag:
                    # Report skipped records only once.
                    print('Skipping records that have no id attribute')
                    skip_flag = True
                continue

            s = subject_attr['id']
            o = object_attr['id']

            # Fall back to Neo4j metadata when the properties are missing.
            if 'edge_label' not in edge_attr:
                edge_attr['edge_label'] = e['metadata']['type']

            if 'category' not in subject_attr:
                subject_attr['category'] = n['metadata']['labels']

            if 'category' not in object_attr:
                object_attr['category'] = m['metadata']['labels']

            if s not in G:
                G.add_node(s, **subject_attr)
            if o not in G:
                G.add_node(o, **object_attr)

            G.add_edge(s, o, key=edge_attr['edge_label'], **edge_attr)

            if stop_after is not None and G.number_of_edges() > stop_after:
                limit_reached = True
                break

        # Leave the paging loop too; otherwise every remaining page is still queried.
        if limit_reached:
            break
def neo4j_download(config: dict, address: str, username: str, password: str, output: str, output_type: str, subject_label: str, object_label: str, edge_label: str, directed: bool, page_size: int, stop_after: int):
    """
    Download nodes and edges from Neo4j database.
    \f

    Parameters
    ----------
    config: dict
        A dictionary containing the configuration for kgx.cli
    address: str
        The full HTTP address for Neo4j database
    username: str
        Username for authentication
    password: str
        Password for authentication
    output: str
        Where to write the output (stdout, by default)
    output_type: str
        The output type (``csv``, by default)
    subject_label: str
        The label for subject node in an association
    object_label: str
        The label for object node in an association
    edge_label: str
        The label for the edge in an association
    directed: bool
        Whether or not the edge is supposed to be directed (``true``, by default)
    stop_after: int
        The max number of edges to fetch
    page_size: int
        The page size to use while fetching associations from Neo4j (``10000``, by default).
        Falls back to ``1000`` when missing or not positive.

    """
    if not is_writable(output):
        try:
            # Probe by opening the target for writing (creates/truncates it).
            with open(output, 'w+'):
                pass
        except (OSError, TypeError, ValueError):
            error(f'Cannot write to {output}')

    output_transformer = get_transformer(output_type)()
    G = output_transformer.graph

    driver = http_gdb(address, username=username, password=password)

    # Only string labels are turned into a Cypher label/type filter.
    subject_label = ':`{}`'.format(subject_label) if isinstance(subject_label, str) else ''
    object_label = ':`{}`'.format(object_label) if isinstance(object_label, str) else ''
    edge_label = ':`{}`'.format(edge_label) if isinstance(edge_label, str) else ''

    if directed:
        query = 'match (n{})-[e{}]->(m{})'.format(subject_label, edge_label, object_label)
    else:
        query = 'match (n{})-[e{}]-(m{})'.format(subject_label, edge_label, object_label)

    results = driver.query('{} return count(*)'.format(query))
    # Default to 0 so an empty count result is reported as "no records"
    # instead of raising IndexError.
    size = next((row[0] for row in results), 0)
    print("SIZE: {}".format(size))

    if size == 0:
        click.echo('No records found.')
        return

    click.echo('Using cypher query: {} return n, e, m'.format(query))

    # Honor the caller's page size; it used to be overwritten unconditionally.
    if page_size is None or page_size < 1:
        page_size = 1_000

    skip_flag = False
    limit_reached = False

    with click.progressbar(list(range(0, size, page_size)), label='Downloading {} many edges'.format(size)) as bar:
        for i in bar:
            q = '{} return n, e, m skip {} limit {}'.format(query, i, page_size)
            results = driver.query(q)

            for n, e, m in results:
                subject_attr = n['data']
                object_attr = m['data']
                edge_attr = e['data']

                if 'id' not in subject_attr or 'id' not in object_attr:
                    if not skip_flag:
                        # Report skipped records only once.
                        click.echo('Skipping records that have no id attribute')
                        skip_flag = True
                    continue

                s = subject_attr['id']
                o = object_attr['id']

                # Fall back to Neo4j metadata when the properties are missing.
                if 'edge_label' not in edge_attr:
                    edge_attr['edge_label'] = e['metadata']['type']

                if 'category' not in subject_attr:
                    subject_attr['category'] = n['metadata']['labels']

                if 'category' not in object_attr:
                    object_attr['category'] = m['metadata']['labels']

                if s not in G:
                    G.add_node(s, **subject_attr)
                if o not in G:
                    G.add_node(o, **object_attr)

                G.add_edge(s, o, key=edge_attr['edge_label'], **edge_attr)

                # ``>=`` keeps the graph at no more than ``stop_after`` edges,
                # matching the documented "max number of edges".
                if stop_after is not None and G.number_of_edges() >= stop_after:
                    limit_reached = True
                    break

            # Leave the paging loop too; otherwise every remaining page is
            # still queried after the limit was hit.
            if limit_reached:
                break

    output_transformer.save(output, extension=output_type)
def neo4j_edge_summary(config: dict, address: str, username: str, password: str, output: str = None):
    """
    Get a summary of all the edges in a Neo4j database.
    \f

    Parameters
    ----------
    config: dict
        A dictionary containing the configuration for kgx.cli
    address: str
        The full HTTP address for Neo4j database
    username: str
        Username for authentication
    password: str
        Password for authentication
    output: str
        Where to write the output (stdout, by default)

    """
    if output is not None and not is_writable(output):
        error(f'Cannot write to {output}')

    http_driver = http_gdb(address, username=username, password=password)

    query = """
    MATCH (x) RETURN DISTINCT x.category AS category
    """

    records = http_driver.query(query)
    categories = set()

    # ``category`` may be a single string or a collection of strings per node.
    for record in records:
        category = record[0]
        if isinstance(category, str):
            categories.add(category)
        elif isinstance(category, (list, set, tuple)):
            categories.update(category)
        elif category is None:
            continue
        else:
            error('Unrecognized value for node.category: {}'.format(category))

    categories = list(categories)

    query = """
    MATCH (n)-[r]-(m)
    WHERE
        (n.category = {category1} OR {category1} IN n.category) AND
        (m.category = {category2} OR {category2} IN m.category)
    RETURN DISTINCT
        {category1} AS subject_category,
        {category2} AS object_category,
        type(r) AS edge_type,
        split(n.id, ':')[0] AS subject_prefix,
        split(m.id, ':')[0] AS object_prefix,
        COUNT(*) AS frequency
    ORDER BY subject_category, object_category, frequency DESC;
    """

    # Every ordered (subject category, object category) pair, self-pairs included.
    combinations = [(c1, c2) for c1 in categories for c2 in categories]

    rows = []
    with click.progressbar(combinations, length=len(combinations)) as bar:
        for category1, category2 in bar:
            # category1 must be bound to category1; it used to be bound to
            # category2, so only self-pairs were ever counted.
            records = http_driver.query(query, params={
                'category1': category1,
                'category2': category2
            })

            # NOTE(review): r[2] (edge_type) is returned by the query but not
            # reported; left out to keep the output columns unchanged.
            for r in records:
                rows.append({
                    'subject_category': r[0],
                    'object_category': r[1],
                    'subject_prefix': r[3],
                    'object_prefix': r[4],
                    'frequency': r[5]
                })

    columns = [
        'subject_category', 'subject_prefix',
        'object_category', 'object_prefix',
        'frequency'
    ]
    # Passing ``columns`` fixes the column order and keeps an empty result
    # from raising KeyError on column selection.
    df = pd.DataFrame(rows, columns=columns)

    if output is None:
        # Show the full table instead of pandas' truncated preview.
        with pd.option_context('display.max_rows', None, 'display.max_columns', None):
            click.echo(df)
    else:
        df.to_csv(output, sep='|', header=True)
        click.echo('Saved report to {}'.format(output))
def neo4j_node_summary(config: dict, address: str, username: str, password: str, output: str = None):
    """
    Get a summary of all the nodes in a Neo4j database.
    \f

    Parameters
    ----------
    config: dict
        A dictionary containing the configuration for kgx.cli
    address: str
        The full HTTP address for Neo4j database
    username: str
        Username for authentication
    password: str
        Password for authentication
    output: str
        Where to write the output (stdout, by default)

    """
    if output is not None and not is_writable(output):
        error(f'Cannot write to {output}')

    http_driver = http_gdb(address, username=username, password=password)

    query = """
    MATCH (x) RETURN DISTINCT x.category AS category
    """

    records = http_driver.query(query)
    categories = set()

    # ``category`` may be a single string or a collection of strings per node.
    for record in records:
        category = record[0]
        if isinstance(category, str):
            categories.add(category)
        elif isinstance(category, (list, set, tuple)):
            categories.update(category)
        elif category is None:
            continue
        else:
            error('Unrecognized value for node.category: {}'.format(category))

    # The category is bound as a query parameter (same placeholder syntax as
    # the edge summary) instead of being spliced into the Cypher text, so
    # values containing quotes can neither break nor alter the query.
    query = """
    MATCH (x) WHERE x.category = {category} OR {category} IN x.category
    RETURN DISTINCT
        {category} AS category,
        split(x.id, ':')[0] AS prefix,
        COUNT(*) AS frequency
    ORDER BY category, frequency DESC;
    """

    rows = []
    with click.progressbar(categories, length=len(categories)) as bar:
        for category in bar:
            records = http_driver.query(query, params={'category': category})

            for record in records:
                rows.append({
                    'category': record[0],
                    'prefix': record[1],
                    'frequency': record[2]
                })

    # Passing ``columns`` fixes the column order and keeps an empty result
    # from raising KeyError on column selection.
    df = pd.DataFrame(rows, columns=['category', 'prefix', 'frequency'])

    if output is None:
        # Show the full table instead of pandas' truncated preview.
        with pd.option_context('display.max_rows', None, 'display.max_columns', None):
            click.echo(df)
    else:
        df.to_csv(output, sep='|', header=True)
        click.echo('Saved report to {}'.format(output))
def neo4j_download(config, page_size, stop_after, subject_label, object_label, edge_type, address, username, password, output, output_type):
    """
    Download nodes and edges from a Neo4j database and save them to ``output``.

    Parameters
    ----------
    config:
        Not used by this function
    page_size: int
        Number of associations fetched per query; falls back to ``1000``
        when missing or not positive
    stop_after: int
        Stop downloading once the graph holds more than this many edges
        (``None`` disables the limit)
    subject_label: str
        The label for subject node in an association
    object_label: str
        The label for object node in an association
    edge_type: str
        The relationship type for the edge in an association
    address: str
        The HTTP address passed to ``http_gdb``
    username: str
        Username for authentication
    password: str
        Password for authentication
    output: str
        Where to write the output; the transformer is chosen from ``get_type(output)``
    output_type: str
        Not used by this function

    """
    if not is_writable(output):
        try:
            # Probe by opening the target for writing (creates/truncates it).
            with open(output, 'w+'):
                pass
        except (OSError, TypeError, ValueError):
            error(f'Cannot write to {output}')

    output_transformer = get_transformer(get_type(output))()
    G = output_transformer.graph

    driver = http_gdb(address, username=username, password=password)

    # Only string labels are turned into a Cypher label/type filter.
    subject_label = ':`{}`'.format(subject_label) if isinstance(subject_label, str) else ''
    object_label = ':`{}`'.format(object_label) if isinstance(object_label, str) else ''
    edge_type = ':`{}`'.format(edge_type) if isinstance(edge_type, str) else ''

    match = 'match (n{})-[e{}]->(m{})'.format(subject_label, edge_type, object_label)

    results = driver.query('{} return count(*)'.format(match))

    click.echo('Using cypher query: {} return n, e, m'.format(match))

    # Default to 0 so an empty count result cannot leave ``size`` unbound.
    size = next((row[0] for row in results), 0)

    if size == 0:
        click.echo('No data available')
        # Return instead of quit(): quit() is a site-module helper meant for
        # interactive sessions and would also kill any caller.
        return

    # Honor the caller's page size; it used to be overwritten unconditionally.
    if page_size is None or page_size < 1:
        page_size = 1_000

    skip_flag = False
    limit_reached = False

    with click.progressbar(list(range(0, size, page_size)), label='Downloading {} many edges'.format(size)) as bar:
        for i in bar:
            q = '{} return n, e, m skip {} limit {}'.format(match, i, page_size)
            results = driver.query(q)

            for n, e, m in results:
                subject_attr = n['data']
                object_attr = m['data']
                edge_attr = e['data']

                if 'id' not in subject_attr or 'id' not in object_attr:
                    if not skip_flag:
                        # Report skipped records only once.
                        click.echo('Skipping records that have no id attribute')
                        skip_flag = True
                    continue

                s = subject_attr['id']
                o = object_attr['id']

                # Fall back to Neo4j metadata when the properties are missing.
                if 'edge_label' not in edge_attr:
                    edge_attr['edge_label'] = e['metadata']['type']

                if 'category' not in subject_attr:
                    subject_attr['category'] = n['metadata']['labels']

                if 'category' not in object_attr:
                    object_attr['category'] = m['metadata']['labels']

                if s not in G:
                    G.add_node(s, **subject_attr)
                if o not in G:
                    G.add_node(o, **object_attr)

                G.add_edge(s, o, key=edge_attr['edge_label'], **edge_attr)

                if stop_after is not None and G.number_of_edges() > stop_after:
                    limit_reached = True
                    break

            # Leave the paging loop too; otherwise every remaining page is
            # still queried after the limit was hit.
            if limit_reached:
                break

    output_transformer.save(output)