def main(input_path, output_path, biolink_model_only):
    """
    Uses ontobio to load ontologies and choose the best biolink model term
    for a node category or edge label.
    """
    input_transformer = get_transformer(get_type(input_path))()
    output_transformer = get_transformer(get_type(output_path))()
    input_transformer.parse(input_path)
    graph = input_transformer.graph

    # Pass 1: record every CURIE prefix seen in node categories and edge
    # labels into the module-level `ontologies` dict so each ontology is
    # loaded exactly once below.
    for _, attrs in graph.nodes(data=True):
        categories = attrs.get('category')
        if isinstance(categories, (tuple, list, set)):
            for category in categories:
                if ':' in category:
                    prefix, _ = make_curie(category).lower().rsplit(':', 1)
                    ontologies[prefix] = None

    for _, _, attrs in graph.edges(data=True):
        if 'edge_label' in attrs and ':' in attrs['edge_label']:
            prefix, _ = make_curie(attrs['edge_label']).lower().rsplit(':', 1)
            ontologies[prefix] = None

    print(ontologies)
    for key in ontologies:
        print(key)
        ontologies[key] = get_ontology(key)

    # Pass 2: map each node category to its best biolink term; plain
    # (non-CURIE) categories pass through untouched, unresolved terms drop.
    with click.progressbar(graph.nodes(data=True)) as bar:
        for _, attrs in bar:
            categories = attrs.get('category')
            if isinstance(categories, (list, set, tuple)):
                terms = [
                    get_term(make_curie(c), biolink_model_only)
                    for c in categories if ':' in c
                ]
                terms += [c for c in categories if ':' not in c]
                attrs['category'] = [
                    t.replace('_', ' ') for t in terms if t is not None
                ]
            elif 'category' not in attrs:
                attrs['category'] = ['named thing']

    # Map each edge label the same way, falling back to 'related_to'.
    with click.progressbar(graph.edges(data=True)) as bar:
        for _, _, attrs in bar:
            if 'edge_label' in attrs and ':' in attrs['edge_label']:
                curie = make_curie(attrs['edge_label'])
                attrs['edge_label'] = get_term(curie, biolink_model_only)
                attrs['valid_edge_label'] = bmt.get_predicate(
                    attrs['edge_label']) is not None
            if 'edge_label' not in attrs or attrs['edge_label'] is None:
                attrs['edge_label'] = 'related_to'
            attrs['edge_label'] = attrs['edge_label'].replace(' ', '_')

    output_transformer.graph = graph
    print('Saving to {}'.format(output_path))
    output_transformer.save(output_path)
def validate(config: dict, path: str, output: str, output_dir: str, format: str):
    """
    Run KGX validation on an input file to check for BioLink Model compliance.
    \f
    Parameters
    ----------
    config: dict
        A dictionary containing the configuration for kgx.cli
    path: str
        Path to input file
    output: str
        Path to output file
    output_dir: str
        Path to a directory
    format: str
        The input format
    """
    # Prefer the explicitly supplied format; otherwise infer the transformer
    # type from the input path's file extension.
    if format:
        t = get_transformer(format)()
    else:
        t = get_transformer(get_type(path))()
    t.parse(path, input_format=format)
    validator = Validator()
    errors = validator.validate(t.graph)
    # Context manager ensures the report file handle is closed (the original
    # passed an unclosed open(output, 'w') handle and leaked it).
    with open(output, 'w') as report_file:
        validator.write_report(errors, report_file)
def build_transformer(path: str, input_type: str = None) -> Transformer:
    """
    Instantiate a Transformer suited to the given file.

    When input_type is None it is inferred from the path's file extension.
    Calls error() when no transformer is registered for the type.
    """
    resolved_type = get_type(path) if input_type is None else input_type
    constructor = get_transformer(resolved_type)
    if constructor is None:
        error('File does not have a recognized type: ' + str(get_file_types()))
    return constructor()
def load_transformer(input_paths: List[str], input_type: str = None) -> Transformer:
    """
    Creates a transformer for the appropriate file type and loads the data
    into it from file.
    """
    if input_type is None:
        # All inputs must share one file type; error out on the first mismatch.
        detected_types = [get_type(path) for path in input_paths]
        for detected in detected_types:
            if detected != detected_types[0]:
                error(
                    """
                    Each input file must have the same file type.
                    Try setting the --input-type parameter to enforce a single
                    type.
                    """
                )
        input_type = detected_types[0]

    transformer_constructor = get_transformer(input_type)
    if transformer_constructor is None:
        error('Inputs do not have a recognized type: ' + str(get_file_types()))

    transformer = transformer_constructor()
    for path in input_paths:
        transformer.parse(path, input_type)
    transformer.report()
    return transformer
def transform_and_save(t: Transformer, output_path: str, output_type: str = None):
    """
    Build a transformer of the appropriate output type around the given
    transformer's graph, save the graph to output_path, and echo where the
    file landed.
    """
    if output_type is None:
        output_type = get_type(output_path)

    constructor = get_transformer(output_type)
    if constructor is None:
        error('Output does not have a recognized type: ' + str(get_file_types()))

    # NOTE(review): 'extention' (sic) looks like the keyword the save()
    # implementation expects — confirm against the transformer API before
    # correcting the spelling.
    kwargs = {'extention': output_type}
    writer = constructor(t.graph)
    result_path = writer.save(output_path, **kwargs)

    # save() may return the actual path written (which can differ from the
    # requested one); report whichever file really exists.
    if result_path is not None and os.path.isfile(result_path):
        click.echo("File created at: " + result_path)
    elif os.path.isfile(output_path):
        click.echo("File created at: " + output_path)
    else:
        error("Could not create file.")
def validate(config, path, input_type, output_dir, record_size):
    """
    Validate the graph parsed from `path`, writing one append-mode log file
    per error type into `output_dir` and echoing a per-type error summary.
    """
    os.makedirs(output_dir, exist_ok=True)
    validator = Validator(record_size)
    transformer = get_transformer(get_type(path))()
    transformer.parse(path)
    validator.validate(transformer.graph)

    for error_type, failures in validator.error_dict.items():
        filename = error_type.replace(' ', '_') + '.log'
        with click.open_file(os.path.join(output_dir, filename), 'a+') as f:
            # Timestamp header separates runs within the same log file.
            f.write('--- {} ---\n'.format(datetime.now()))
            for failure in failures:
                if len(failure) == 2:
                    # Node failure: (node, message)
                    n, message = failure
                    if message is not None:
                        f.write('node({}):\t{}\n'.format(n, message))
                    else:
                        f.write('node({})\n'.format(n))
                elif len(failure) == 3:
                    # Edge failure: (subject, object, message)
                    u, v, message = failure
                    if message is not None:
                        f.write('edge({}, {}):\t{}\n'.format(u, v, message))
                    else:
                        f.write('edge({}, {})\n'.format(u, v))

    if not validator.error_dict:
        click.echo('No errors found')
    else:
        for error_type, failures in validator.error_dict.items():
            click.echo('{} - {}'.format(error_type, len(failures)))
def merge(inputs, output):
    """
    Loads a series of knowledge graphs and merges cliques using `same_as`
    edges as well as `same_as` node properties. The resulting graph will not
    have any `same_as` edges, and the remaining clique leader nodes will have
    all equivalent identifiers in their `same_as` property.
    """
    output_transformer = get_transformer(get_type(output))()

    # Build a transformer per input up front, failing fast on any
    # unrecognized file type before parsing begins.
    transformers = []
    for path in inputs:
        constructor = get_transformer(get_type(path))
        if constructor is None:
            raise Exception('No transformer for {}'.format(path))
        transformers.append(constructor())

    # Parse every input into one shared graph: the first transformer's graph
    # is handed to each subsequent transformer before it parses.
    shared_graph = None
    for transformer, path in zip(transformers, inputs):
        if shared_graph is None:
            shared_graph = transformer.graph
        else:
            transformer.graph = shared_graph
        transformer.parse(path)

    output_transformer.graph = clique_merge(shared_graph)
    output_transformer.save(output)
def validate(config, path, output, output_dir):
    """
    Validate the graph parsed from `path`; echo a message when clean,
    otherwise append the errors (stamped with a single timestamp) to the
    output file and, when given, to per-type files under output_dir.
    """
    transformer = get_transformer(get_type(path))()
    transformer.parse(path)
    validator = Validator()
    validator.validate(transformer.graph)
    timestamp = datetime.now()
    if not validator.errors:
        click.echo('No errors found')
    else:
        append_errors_to_file(output, validator.errors, timestamp)
        if output_dir is not None:
            append_errors_to_files(output_dir, validator.errors, timestamp)
def neo4j_download(config, page_size, stop_after, subject_label, object_label, edge_type, address, username, password, output, output_type):
    """
    Download nodes and edges from a Neo4j database into a local graph file.

    Edges matching the optional subject/object labels and edge type are
    fetched page by page with skip/limit cypher queries and added to the
    output transformer's graph, which is then saved to `output`.

    Parameters
    ----------
    config:
        CLI configuration (unused in this function, passed through by click)
    page_size: int
        Number of edges fetched per query; falls back to 1000 when falsy
    stop_after: int or None
        Stop once the graph holds more than this many edges
    subject_label, object_label, edge_type: str or None
        Optional Neo4j label/type filters for the match clause
    address, username, password:
        Neo4j HTTP connection details
    output: str
        Path to the output file
    output_type: str
        Output format hint (unused here; format is inferred from `output`)
    """
    if not is_writable(output):
        try:
            # Probe writability by creating/truncating the output file.
            with open(output, 'w+'):
                pass
        except OSError:
            # Narrowed from a bare `except:` — only I/O failures mean
            # "cannot write"; anything else should propagate.
            error(f'Cannot write to {output}')
    output_transformer = get_transformer(get_type(output))()
    G = output_transformer.graph
    driver = http_gdb(address, username=username, password=password)

    # Backtick-quote each provided filter; omit the clause otherwise.
    subject_label = ':`{}`'.format(subject_label) if isinstance(subject_label, str) else ''
    object_label = ':`{}`'.format(object_label) if isinstance(object_label, str) else ''
    edge_type = ':`{}`'.format(edge_type) if isinstance(edge_type, str) else ''

    match = 'match (n{})-[e{}]->(m{})'.format(subject_label, edge_type, object_label)
    results = driver.query('{} return count(*)'.format(match))
    click.echo('Using cypher query: {} return n, e, m'.format(match))

    size = 0
    for a, in results:
        size = a
        break
    if size == 0:
        click.echo('No data available')
        return

    # Honor the caller-supplied page size; the original unconditionally
    # overwrote it with 1000.
    if not page_size:
        page_size = 1_000
    skip_flag = False
    done = False
    with click.progressbar(list(range(0, size, page_size)), label='Downloading {} edges'.format(size)) as bar:
        for offset in bar:
            if done:
                # stop_after threshold reached on a previous page; the
                # original `break` only exited the inner record loop and
                # kept downloading pages.
                break
            q = '{} return n, e, m skip {} limit {}'.format(match, offset, page_size)
            for n, e, m in driver.query(q):
                subject_attr = n['data']
                object_attr = m['data']
                edge_attr = e['data']
                # Records without ids cannot be keyed into the graph.
                if 'id' not in subject_attr or 'id' not in object_attr:
                    if not skip_flag:
                        click.echo('Skipping records that have no id attribute')
                        skip_flag = True
                    continue
                s = subject_attr['id']
                o = object_attr['id']
                # Fall back to Neo4j metadata for missing KGX properties.
                if 'edge_label' not in edge_attr:
                    edge_attr['edge_label'] = e['metadata']['type']
                if 'category' not in subject_attr:
                    subject_attr['category'] = n['metadata']['labels']
                if 'category' not in object_attr:
                    object_attr['category'] = m['metadata']['labels']
                if s not in G:
                    G.add_node(s, **subject_attr)
                if o not in G:
                    G.add_node(o, **object_attr)
                G.add_edge(s, o, key=edge_attr['edge_label'], **edge_attr)
                if stop_after is not None and G.number_of_edges() > stop_after:
                    done = True
                    break
    output_transformer.save(output)