def transform_wrapper(
    inputs: List[str],
    input_format: str,
    input_compression: str,
    output: str,
    output_format: str,
    output_compression: str,
    stream: bool,
    node_filters: Tuple,
    edge_filters: Tuple,
    transform_config: str,
    source: List,
    processes: int,
):
    """
    Transform a Knowledge Graph from one serialization form to another.
    \f

    Parameters
    ----------
    inputs: List[str]
        A list of files that contain nodes/edges
    input_format: str
        The input format
    input_compression: str
        The input compression type
    output: str
        The output file
    output_format: str
        The output format
    output_compression: str
        The output compression type
    stream: bool
        Whether or not to stream
    node_filters: Tuple[str, str]
        Node filters
    edge_filters: Tuple[str, str]
        Edge filters
    transform_config: str
        Transform config YAML
    source: List
        A list of source(s) to load from the YAML
    processes: int
        Number of processes to use

    """
    transform(
        inputs,
        input_format,
        input_compression,
        output,
        output_format,
        output_compression,
        stream,
        node_filters,
        edge_filters,
        transform_config,
        source,
        processes=processes,
    )
def json2tsv(input, output) -> None:
    """
    Converts a JSON file into 'nodes' and 'edges' TSV.

    :param input: Input file (JSON file).
    :param output: Output file name desired.
    :return: None.
    """
    if input:
        if output is None:
            output = "data/nodes_and_edges/"
        transform(
            inputs=[input],
            input_format="obojson",
            output=output,
            output_format="tsv",
        )
    else:
        input_folder = "data/input/"
        output_folder = "data/nodes_and_edges/"
        for subdir, dirs, files in os.walk(input_folder):
            for file in files:
                fn, ext = os.path.splitext(file)
                if ext == ".json":
                    transform(
                        inputs=[os.path.join(subdir, file)],
                        input_format="obojson",
                        output=os.path.join(output_folder, fn),
                        output_format="tsv",
                    )
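# A minimal usage sketch of json2tsv above (paths are hypothetical). KGX's
# TSV output writes a <output>_nodes.tsv / <output>_edges.tsv pair.
json2tsv("data/input/envo.json", "data/nodes_and_edges/envo")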
def test_transform_knowledge_source_rewrite_with_prefix():
    """
    Transform graph from TSV to JSON, rewriting aggregator knowledge
    sources with a prefix.
    """
    inputs = [
        os.path.join(RESOURCE_DIR, "graph_nodes.tsv"),
        os.path.join(RESOURCE_DIR, "graph_edges.tsv"),
    ]
    output = os.path.join(TARGET_DIR, "graph.json")
    knowledge_sources = [
        ("aggregator_knowledge_source", "string,string database,new"),
        ("aggregator_knowledge_source", "go,gene ontology,latest"),
    ]
    transform(
        inputs=inputs,
        input_format="tsv",
        input_compression=None,
        output=output,
        output_format="json",
        output_compression=None,
        knowledge_sources=knowledge_sources,
    )
    assert os.path.exists(output)
    data = json.load(open(output, "r"))
    assert "nodes" in data
    assert "edges" in data
    assert len(data["nodes"]) == 512
    assert len(data["edges"]) == 531
    for e in data["edges"]:
        if e["subject"] == "HGNC:10848" and e["object"] == "HGNC:20738":
            assert "aggregator_knowledge_source" in e
            assert "infores:new-string-database" in e["aggregator_knowledge_source"]
        if e["subject"] == "HGNC:10848" and e["object"] == "GO:0005576":
            assert "aggregator_knowledge_source" in e
            assert "infores:latest-gene-ontology" in e["aggregator_knowledge_source"]
def test_transform_knowledge_source_suppression():
    """
    Transform graph from TSV to JSON, suppressing the knowledge source
    fields on edges.
    """
    inputs = [
        os.path.join(RESOURCE_DIR, "graph_nodes.tsv"),
        os.path.join(RESOURCE_DIR, "graph_edges.tsv"),
    ]
    output = os.path.join(TARGET_DIR, "graph.json")
    knowledge_sources = [
        ("aggregator_knowledge_source", "False"),
        ("knowledge_source", "False"),
    ]
    transform(
        inputs=inputs,
        input_format="tsv",
        input_compression=None,
        output=output,
        output_format="json",
        output_compression=None,
        knowledge_sources=knowledge_sources,
    )
    assert os.path.exists(output)
    data = json.load(open(output, "r"))
    assert "nodes" in data
    assert "edges" in data
    assert len(data["nodes"]) == 512
    assert len(data["edges"]) == 531
    for e in data["edges"]:
        if e["subject"] == "HGNC:10848" and e["object"] == "HGNC:20738":
            assert "aggregator_knowledge_source" not in e
            assert "knowledge_source" not in e
            break
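# A hedged sketch of the knowledge_sources conventions the two tests above
# exercise: a bare "False" suppresses the field, while a
# "<match>,<new name>,<prefix>" value rewrites matching source names into
# infores CURIEs (e.g. "string,string database,new" becomes
# "infores:new-string-database"). File names here are hypothetical.
transform(
    inputs=["graph_nodes.tsv", "graph_edges.tsv"],
    input_format="tsv",
    output="graph_rewritten.json",
    output_format="json",
    knowledge_sources=[
        ("aggregator_knowledge_source", "string,string database,new"),
        ("knowledge_source", "False"),  # drop knowledge_source entirely
    ],
)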
def test_transform2():
    """
    Transform from a test transform YAML.
    """
    transform_config = os.path.join(RESOURCE_DIR, "test-transform.yaml")
    transform(inputs=None, transform_config=transform_config)
    assert os.path.exists(os.path.join(RESOURCE_DIR, "graph_nodes.tsv"))
    assert os.path.exists(os.path.join(RESOURCE_DIR, "graph_edges.tsv"))
def parse(self, name: str, data_file: str, source: str) -> None:
    """Processes the data_file.

    Args:
        name: Name of the ontology
        data_file: Data file to parse
        source: Source name

    Returns:
        None.
    """
    print(f"Parsing {data_file}")
    transform(
        inputs=[data_file],
        input_format='obojson',
        output=os.path.join(self.output_dir, name),
        output_format='tsv',
    )
def test_transform_error():
    """
    Transform should raise a ValueError when a transform config is given
    alongside explicit inputs.
    """
    inputs = [
        os.path.join(RESOURCE_DIR, "graph_nodes.tsv"),
        os.path.join(RESOURCE_DIR, "graph_edges.tsv"),
    ]
    output = os.path.join(TARGET_DIR, "graph.json")
    knowledge_sources = [
        ("aggregator_knowledge_source", "True"),
    ]
    try:
        transform(
            transform_config="out.txt",
            inputs=inputs,
            input_format="tsv",
            input_compression=None,
            output=output,
            output_format="json",
            output_compression=None,
            knowledge_sources=knowledge_sources,
        )
        assert False, "expected transform() to raise a ValueError"
    except ValueError:
        pass
def create_termlist(path: str, ont: str) -> None:
    """
    Create termlist.tsv files from ontology JSON files for NLP.

    TODO: Replace this code once runNER is installed and remove
    'kg_microbe/utils/biohub_converter.py'
    """
    ont_int = ont + '.json'
    json_input = os.path.join(path, ont_int)
    tsv_output = os.path.join(path, ont)
    transform(
        inputs=[json_input],
        input_format='obojson',
        output=tsv_output,
        output_format='tsv',
    )
    ont_nodes = os.path.join(path, ont + '_nodes.tsv')
    ont_terms = os.path.abspath(
        os.path.join(os.path.dirname(json_input), '..', 'nlp/terms/', ont + '_termlist.tsv')
    )
    bc.parse(ont_nodes, ont_terms)
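# A minimal usage sketch of create_termlist (hypothetical path): reads
# data/raw/chebi.json, writes chebi_nodes.tsv / chebi_edges.tsv, and emits
# chebi_termlist.tsv under the sibling nlp/terms/ directory.
create_termlist(path="data/raw", ont="chebi")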
def parse(self, name: str, data_file: str, source: str) -> None:
    """Processes the data_file.

    Args:
        name: Name of the ontology
        data_file: Data file to parse
        source: Source name

    Returns:
        None.
    """
    print(f"Parsing {data_file}")

    # Detect gzip compression from the file extension
    compression: Optional[str]
    if data_file.endswith('.gz'):
        compression = 'gz'
    else:
        compression = None

    transform(
        inputs=[data_file],
        input_format='obojson',
        input_compression=compression,
        output=os.path.join(self.output_dir, name),
        output_format='tsv',
    )
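# A minimal usage sketch (hypothetical instance and paths): gzipped OBO JSON
# inputs are recognized by their .gz extension and decompressed on the fly.
# `parser` is assumed to be an instance of the class defining parse() above.
parser.parse(name="envo", data_file="data/raw/envo.json.gz", source="ENVO")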
def test_transform1():
    """
    Transform graph from TSV to JSON.
    """
    inputs = [
        os.path.join(RESOURCE_DIR, 'graph_nodes.tsv'),
        os.path.join(RESOURCE_DIR, 'graph_edges.tsv'),
    ]
    output = os.path.join(TARGET_DIR, 'graph.json')
    transform(
        inputs=inputs,
        input_format='tsv',
        input_compression=None,
        output=output,
        output_format='json',
        output_compression=None,
    )
    assert os.path.exists(output)
    data = json.load(open(output, 'r'))
    assert 'nodes' in data
    assert 'edges' in data
    assert len(data['nodes']) == 512
    assert len(data['edges']) == 532
def transform_wrapper(
    inputs: List[str],
    input_format: str,
    input_compression: str,
    output: str,
    output_format: str,
    output_compression: str,
    stream: bool,
    node_filters: Optional[List[Tuple[str, str]]],
    edge_filters: Optional[List[Tuple[str, str]]],
    transform_config: str,
    source: Optional[List],
    knowledge_sources: Optional[List[Tuple[str, str]]],
    processes: int,
    infores_catalog: Optional[str] = None,
):
    """
    Transform a Knowledge Graph from one serialization form to another.
    \f

    Parameters
    ----------
    inputs: List[str]
        A list of files that contain nodes/edges
    input_format: str
        The input format
    input_compression: str
        The input compression type
    output: str
        The output file
    output_format: str
        The output format
    output_compression: str
        The output compression type
    stream: bool
        Whether or not to stream
    node_filters: Optional[List[Tuple[str, str]]]
        Node input filters
    edge_filters: Optional[List[Tuple[str, str]]]
        Edge input filters
    transform_config: str
        Transform config YAML
    source: Optional[List]
        A list of source(s) to load from the YAML
    knowledge_sources: Optional[List[Tuple[str, str]]]
        A list of named knowledge sources with (string, boolean or tuple rewrite) specification
    processes: int
        Number of processes to use
    infores_catalog: Optional[str]
        Optional dump of a TSV file of InfoRes CURIE to Knowledge Source mappings

    """
    try:
        transform(
            inputs,
            input_format=input_format,
            input_compression=input_compression,
            output=output,
            output_format=output_format,
            output_compression=output_compression,
            stream=stream,
            node_filters=node_filters,
            edge_filters=edge_filters,
            transform_config=transform_config,
            source=source,
            knowledge_sources=knowledge_sources,
            processes=processes,
            infores_catalog=infores_catalog,
        )
        exit(0)
    except Exception as te:
        get_logger().error(f"kgx.transform error: {str(te)}")
        exit(1)
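# A minimal sketch of calling KGX's transform() directly, bypassing the CLI
# wrapper above (file names are hypothetical):
from kgx.cli.cli_utils import transform

transform(
    inputs=["graph_nodes.tsv", "graph_edges.tsv"],
    input_format="tsv",
    output="graph.json",
    output_format="json",
)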
def run(self, data_file: Optional[str] = None):
    """Method is called and performs needed transformations to process the
    trait data (NCBI/GTDB). Additional information on this data can be found
    in the comment at the top of this script."""

    if data_file is None:
        data_file = self.source_name + ".csv"

    input_file = os.path.join(self.input_base_dir, data_file)

    # make directory in data/transformed
    os.makedirs(self.output_dir, exist_ok=True)

    """
    Implement ROBOT
    """
    # Convert OWL to JSON for ChEBI Ontology
    convert_to_json(self.input_base_dir, 'CHEBI')

    """
    Get information from the EnvironmentTransform
    """
    environment_file = os.path.join(self.input_base_dir, 'environments.csv')
    env_df = pd.read_csv(environment_file, sep=',', low_memory=False,
                         usecols=['Type', 'ENVO_terms', 'ENVO_ids'])
    unique_env_df = env_df.drop_duplicates()

    """
    Create termlist.tsv files from ontology JSON files for NLP
    TODO: Replace this code once runNER is installed and remove
    'project_name/utils/biohub_converter.py'
    """
    ont = 'chebi'
    ont_int = ont + '.json'
    json_input = os.path.join(self.input_base_dir, ont_int)
    tsv_output = os.path.join(self.input_base_dir, ont)
    transform(inputs=[json_input], input_format='obojson',
              output=tsv_output, output_format='tsv')
    ont_nodes = os.path.join(self.input_base_dir, ont + '_nodes.tsv')
    ont_terms = os.path.abspath(os.path.join(os.path.dirname(json_input), '..',
                                             'nlp/terms/', ont + '_termlist.tsv'))
    bc.parse(ont_nodes, ont_terms)

    """
    NLP: Get 'chem_node_type' and 'org_to_chem_edge_label'
    """
    if self.nlp:
        # Prep for NLP. Make sure the first column is the ID
        cols_for_nlp = ['tax_id', 'carbon_substrates']
        input_file_name = prep_nlp_input(input_file, cols_for_nlp)
        # Set up the settings.ini file for OGER and run
        create_settings_file(self.nlp_dir, 'CHEBI')
        oger_output = run_oger(self.nlp_dir, input_file_name, n_workers=5)
        #oger_output = process_oger_output(self.nlp_dir, input_file_name)

    # transform data, something like:
    with open(input_file, 'r') as f, \
            open(self.output_node_file, 'w') as node, \
            open(self.output_edge_file, 'w') as edge, \
            open(self.subset_terms_file, 'w') as terms_file:

        # write headers (change default node/edge headers if necessary)
        node.write("\t".join(self.node_header) + "\n")
        edge.write("\t".join(self.edge_header) + "\n")

        header_items = parse_header(f.readline(), sep=',')
        seen_node: dict = defaultdict(int)
        seen_edge: dict = defaultdict(int)

        # Nodes
        org_node_type = "biolink:OrganismTaxon"       # [org_name]
        chem_node_type = "biolink:ChemicalSubstance"  # [carbon_substrate]
        shape_node_type = "biolink:AbstractEntity"    # [cell_shape]
        #metabolism_node_type = "biolink:ActivityAndBehavior"  # [metabolism]
        curie = 'NEED_CURIE'

        # Prefixes
        org_prefix = "NCBITaxon:"
        chem_prefix = "Carbon:"
        shape_prefix = "Shape:"
        #activity_prefix = "Metab:"
        source_prefix = "Env:"

        # Edges
        org_to_shape_edge_label = "biolink:has_phenotype"   # [org_name -> cell_shape, metabolism]
        org_to_shape_edge_relation = "RO:0002200"           # [org_name -> has phenotype -> cell_shape, metabolism]
        org_to_chem_edge_label = "biolink:interacts_with"   # [org_name -> carbon_substrate]
        org_to_chem_edge_relation = "RO:0002438"            # [org_name -> 'trophically interacts with' -> carbon_substrate]
        org_to_source_edge_label = "biolink:location_of"    # [org -> isolation_source]
        org_to_source_edge_relation = "RO:0001015"          # [org -> location_of -> source]

        # transform
        for line in f:
            """
            This dataset is a csv and also has commas present within a
            column of data. Hence a regex solution.
            """
            # transform line into nodes and edges
            # node.write(this_node1)
            # node.write(this_node2)
            # edge.write(this_edge)
            line = re.sub(r'(?!(([^"]*"){2})*[^"]*$),', '|', line)  # alanine, glucose -> alanine| glucose
            items_dict = parse_line(line, header_items, sep=',')

            org_name = items_dict['org_name']
            tax_id = items_dict['tax_id']
            metabolism = items_dict['metabolism']
            carbon_substrates = set([x.strip() for x in items_dict['carbon_substrates'].split('|')])
            cell_shape = items_dict['cell_shape']
            isolation_source = set([x.strip() for x in items_dict['isolation_source'].split('|')])

            # Write Node ['id', 'entity', 'category']
            # Write organism node
            org_id = org_prefix + str(tax_id)
            if not org_id.endswith(':na') and org_id not in seen_node:
                write_node_edge_item(fh=node,
                                     header=self.node_header,
                                     data=[org_id, org_name, org_node_type, org_id])
                seen_node[org_id] += 1
                if org_id.startswith('NCBITaxon:'):
                    terms_file.write(org_id + "\n")

            # Write chemical node
            for chem_name in carbon_substrates:
                chem_curie = curie
                #chem_node_type = chem_name

                # Get relevant NLP results
                if chem_name != 'NA':
                    relevant_tax = oger_output.loc[oger_output['TaxId'] == int(tax_id)]
                    relevant_chem = relevant_tax.loc[relevant_tax['TokenizedTerm'] == chem_name]
                    if len(relevant_chem) == 1:
                        chem_curie = relevant_chem.iloc[0]['CURIE']
                        chem_node_type = relevant_chem.iloc[0]['Biolink']

                if chem_curie == curie:
                    chem_id = chem_prefix + chem_name.lower().replace(' ', '_')
                else:
                    chem_id = chem_curie

                if not chem_id.endswith(':na') and chem_id not in seen_node:
                    write_node_edge_item(fh=node,
                                         header=self.node_header,
                                         data=[chem_id, chem_name, chem_node_type, chem_curie])
                    seen_node[chem_id] += 1

            # Write shape node
            shape_id = shape_prefix + cell_shape.lower()
            if not shape_id.endswith(':na') and shape_id not in seen_node:
                write_node_edge_item(fh=node,
                                     header=self.node_header,
                                     data=[shape_id, cell_shape, shape_node_type, curie])
                seen_node[shape_id] += 1

            # Write source node
            for source_name in isolation_source:
                # Collapse the entity:
                #   A_B_C_D => [A, B, C, D]
                #   D is the entity of interest
                source_name_split = source_name.split('_')
                source_name_collapsed = source_name_split[-1]
                env_curie = curie
                env_term = source_name_collapsed
                source_node_type = ""  # [isolation_source] left blank intentionally

                # Get information from the environments.csv (unique_env_df)
                relevant_env_df = unique_env_df.loc[unique_env_df['Type'] == source_name]
                if len(relevant_env_df) == 1:
                    '''
                    If multiple ENVOs exist, take the last one since that
                    would be the curie of interest after collapsing the entity.
                    TODO(Maybe): If CURIE is 'nan', it could be sourced from
                    OGER o/p (ENVO backend) of environments.csv
                    '''
                    env_curie = str(relevant_env_df.iloc[0]['ENVO_ids']).split(',')[-1].strip()
                    env_term = str(relevant_env_df.iloc[0]['ENVO_terms']).split(',')[-1].strip()
                    if env_term == 'nan':
                        env_curie = curie
                        env_term = source_name_collapsed

                #source_id = source_prefix + source_name.lower()
                if env_curie == curie:
                    source_id = source_prefix + source_name_collapsed.lower()
                else:
                    source_id = env_curie

                if source_id.startswith('CHEBI:'):
                    source_node_type = chem_node_type

                if not source_id.endswith(':na') and source_id not in seen_node:
                    write_node_edge_item(fh=node,
                                         header=self.node_header,
                                         data=[source_id, env_term, source_node_type, env_curie])
                    seen_node[source_id] += 1

            # Write Edge
            # org-chem edge
            if not chem_id.endswith(':na') and org_id + chem_id not in seen_edge:
                write_node_edge_item(fh=edge,
                                     header=self.edge_header,
                                     data=[org_id, org_to_chem_edge_label, chem_id, org_to_chem_edge_relation])
                seen_edge[org_id + chem_id] += 1

            # org-shape edge
            if not shape_id.endswith(':na') and org_id + shape_id not in seen_edge:
                write_node_edge_item(fh=edge,
                                     header=self.edge_header,
                                     data=[org_id, org_to_shape_edge_label, shape_id, org_to_shape_edge_relation])
                seen_edge[org_id + shape_id] += 1

            # org-source edge
            if not source_id.endswith(':na') and org_id + source_id not in seen_edge:
                write_node_edge_item(fh=edge,
                                     header=self.edge_header,
                                     data=[org_id, org_to_source_edge_label, source_id, org_to_source_edge_relation])
                seen_edge[org_id + source_id] += 1

    # Files write ends

    # Extract the 'cellular organisms' tree from NCBITaxon and convert to JSON
    '''
    NCBITaxon_131567 = cellular organisms
    (Source = http://www.ontobee.org/ontology/NCBITaxon?iri=http://purl.obolibrary.org/obo/NCBITaxon_131567)
    '''
    subset_ontology_needed = 'NCBITaxon'
    extract_convert_to_json(self.input_base_dir, subset_ontology_needed, self.subset_terms_file)
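# A small, self-contained demo of the quoted-comma regex used in run() above:
# commas inside double-quoted CSV fields are rewritten to '|' so the line can
# then be split on the remaining (column-separating) commas.
import re

line = 'Bacillus,"alanine, glucose",rod'
print(re.sub(r'(?!(([^"]*"){2})*[^"]*$),', '|', line))
# -> Bacillus,"alanine| glucose",rod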