Example 1
def transform_wrapper(
    inputs: List[str],
    input_format: str,
    input_compression: str,
    output: str,
    output_format: str,
    output_compression: str,
    stream: bool,
    node_filters: Tuple,
    edge_filters: Tuple,
    transform_config: str,
    source: List,
    processes: int,
):
    """
    Transform a Knowledge Graph from one serialization form to another.
    \f

    Parameters
    ----------
    inputs: List[str]
        A list of files that contains nodes/edges
    input_format: str
        The input format
    input_compression: str
        The input compression type
    output: str
        The output file
    output_format: str
        The output format
    output_compression: str
        The output compression type
    stream: bool
        Whether or not to stream
    node_filters: Tuple[str, str]
        Node filters
    edge_filters: Tuple[str, str]
        Edge filters
    transform_config: str
        Transform config YAML
    source: List
        A list of source(s) to load from the YAML
    processes: int
        Number of processes to use

    """
    transform(
        inputs,
        input_format,
        input_compression,
        output,
        output_format,
        output_compression,
        stream,
        node_filters,
        edge_filters,
        transform_config,
        source,
        processes=processes,
    )
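
The `\f` in the docstring is a Click convention that truncates the `--help` output at that point, which suggests this wrapper backs a CLI command. A minimal sketch of how it might be wired up, assuming Click; the command and option names here are illustrative, not taken from the source:

import click

# Hypothetical CLI wiring for transform_wrapper; option names are assumptions.
@click.command(name="transform")
@click.argument("inputs", nargs=-1, type=click.Path(exists=True))
@click.option("--input-format", help="The input format")
@click.option("--output", required=True, type=click.Path())
@click.option("--output-format", help="The output format")
@click.option("--processes", type=int, default=1, help="Number of processes")
def transform_command(inputs, input_format, output, output_format, processes):
    transform_wrapper(
        inputs=list(inputs),
        input_format=input_format,
        input_compression=None,
        output=output,
        output_format=output_format,
        output_compression=None,
        stream=False,
        node_filters=(),
        edge_filters=(),
        transform_config=None,
        source=[],
        processes=processes,
    )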
Example 2
def json2tsv(input, output) -> None:
    """
    Converts a JSON file into 'nodes' and 'edges' TSVs.

    :param input: Input file (JSON file).
    :param output: Output file name desired.
    :return: None.
    """
    if input:
        if output is None:
            output = "data/nodes_and_edges/"

        transform(
            inputs=[input],
            input_format="obojson",
            output=output,
            output_format="tsv",
        )
    else:
        input_folder = "data/input/"
        output_folder = "data/nodes_and_edges/"

        for subdir, dirs, files in os.walk(input_folder):
            for file in files:
                fn, ext = os.path.splitext(file)

                if ext == ".json":
                    transform(
                        inputs=[os.path.join(subdir, file)],
                        input_format="obojson",
                        output=os.path.join(output_folder, fn),
                        output_format="tsv",
                    )
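
A short usage sketch (the paths are hypothetical); passing a falsy input triggers the bulk directory sweep:

# Convert one ontology JSON dump into <output>_nodes.tsv / <output>_edges.tsv:
json2tsv("data/input/envo.json", "data/nodes_and_edges/envo")

# Or convert every .json file under data/input/ into data/nodes_and_edges/:
json2tsv(None, None)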
Example 3
def test_transform_knowledge_source_rewrite_with_prefix():
    """
    Transform graph from TSV to JSON, rewriting aggregator knowledge sources with a prefix.
    """
    inputs = [
        os.path.join(RESOURCE_DIR, "graph_nodes.tsv"),
        os.path.join(RESOURCE_DIR, "graph_edges.tsv"),
    ]
    output = os.path.join(TARGET_DIR, "graph.json")
    knowledge_sources = [
        ("aggregator_knowledge_source", "string,string database,new"),
        ("aggregator_knowledge_source", "go,gene ontology,latest"),
    ]
    transform(
        inputs=inputs,
        input_format="tsv",
        input_compression=None,
        output=output,
        output_format="json",
        output_compression=None,
        knowledge_sources=knowledge_sources,
    )
    assert os.path.exists(output)
    data = json.load(open(output, "r"))
    assert "nodes" in data
    assert "edges" in data
    assert len(data["nodes"]) == 512
    assert len(data["edges"]) == 531
    for e in data["edges"]:
        if e["subject"] == "HGNC:10848" and e["object"] == "HGNC:20738":
            assert "aggregator_knowledge_source" in e
            assert "infores:new-string-database" in e["aggregator_knowledge_source"]
        if e["subject"] == "HGNC:10848" and e["object"] == "GO:0005576":
            assert "aggregator_knowledge_source" in e
            assert "infores:latest-gene-ontology" in e["aggregator_knowledge_source"]
Example 4
def test_transform_knowledge_source_suppression():
    """
    Transform graph from TSV to JSON, suppressing knowledge source fields.
    """
    inputs = [
        os.path.join(RESOURCE_DIR, "graph_nodes.tsv"),
        os.path.join(RESOURCE_DIR, "graph_edges.tsv"),
    ]
    output = os.path.join(TARGET_DIR, "graph.json")
    knowledge_sources = [
        ("aggregator_knowledge_source", "False"),
        ("knowledge_source", "False"),
    ]
    transform(
        inputs=inputs,
        input_format="tsv",
        input_compression=None,
        output=output,
        output_format="json",
        output_compression=None,
        knowledge_sources=knowledge_sources,
    )
    assert os.path.exists(output)
    data = json.load(open(output, "r"))
    assert "nodes" in data
    assert "edges" in data
    assert len(data["nodes"]) == 512
    assert len(data["edges"]) == 531
    for e in data["edges"]:
        if e["subject"] == "HGNC:10848" and e["object"] == "HGNC:20738":
            assert "aggregator_knowledge_source" not in e
            assert "knowledge_source" not in e
            break
Example 5
def test_transform2():
    """
    Transform from a test transform YAML.
    """
    transform_config = os.path.join(RESOURCE_DIR, 'test-transform.yaml')
    transform(None, transform_config=transform_config)
    assert os.path.exists(os.path.join(RESOURCE_DIR, 'graph_nodes.tsv'))
    assert os.path.exists(os.path.join(RESOURCE_DIR, 'graph_edges.tsv'))
Example 6
def test_transform2():
    """
    Transform from a test transform YAML.
    """
    transform_config = os.path.join(RESOURCE_DIR, "test-transform.yaml")
    transform(inputs=None, transform_config=transform_config)
    assert os.path.exists(os.path.join(RESOURCE_DIR, "graph_nodes.tsv"))
    assert os.path.exists(os.path.join(RESOURCE_DIR, "graph_edges.tsv"))
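
The two tests above reference test-transform.yaml only by name. As a loosely hedged illustration, a KGX transform config typically names sources with input and output blocks; the schema sketched below is an assumption (consult the KGX documentation for the authoritative structure), aligned with the graph_nodes.tsv / graph_edges.tsv files the tests assert on:

# Hypothetical contents of test-transform.yaml; the schema here is an
# assumption, not taken from the source. An output filename base of 'graph'
# would yield the graph_nodes.tsv / graph_edges.tsv files asserted above.
TEST_TRANSFORM_YAML = """\
configuration:
  output_directory: tests/resources
transform:
  source:
    test-graph:
      input:
        format: obojson
        filename:
          - tests/resources/graph.json
      output:
        format: tsv
        filename: graph
"""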
Example 7
def parse(self, name: str, data_file: str, source: str) -> None:
    """Processes the data_file.

    Args:
        name: Name of the ontology
        data_file: data file to parse
        source: Source name

    Returns:
        None.
    """
    print(f"Parsing {data_file}")

    transform(
        inputs=[data_file],
        input_format='obojson',
        output=os.path.join(self.output_dir, name),
        output_format='tsv',
    )
Example 8
def test_transform_error():
    """
    Transform should raise a ValueError when given both inputs and a transform_config.
    """
    inputs = [
        os.path.join(RESOURCE_DIR, "graph_nodes.tsv"),
        os.path.join(RESOURCE_DIR, "graph_edges.tsv"),
    ]
    output = os.path.join(TARGET_DIR, "graph.json")
    knowledge_sources = [
        ("aggregator_knowledge_source", "True"),
    ]
    try:
        transform(
            transform_config="out.txt",
            inputs=inputs,
            input_format="tsv",
            input_compression=None,
            output=output,
            output_format="json",
            output_compression=None,
            knowledge_sources=knowledge_sources,
        )
        assert False, "expected transform to raise a ValueError"
    except ValueError:
        pass
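
The try/except above works, but pytest offers a tighter idiom that fails the test automatically when no exception is raised; a sketch using the same fixtures as the tests above:

import os
import pytest

def test_transform_error_pytest_style():
    # Same scenario: supplying both inputs and transform_config should raise.
    # pytest.raises fails the test if no ValueError is raised inside the block.
    inputs = [
        os.path.join(RESOURCE_DIR, "graph_nodes.tsv"),
        os.path.join(RESOURCE_DIR, "graph_edges.tsv"),
    ]
    with pytest.raises(ValueError):
        transform(
            transform_config="out.txt",
            inputs=inputs,
            input_format="tsv",
            output=os.path.join(TARGET_DIR, "graph.json"),
            output_format="json",
        )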
Example 9
def create_termlist(path: str, ont: str) -> None:
    """
    Create termlist.tsv files from ontology JSON files for NLP

    TODO: Replace this code once runNER is installed and remove 'kg_microbe/utils/biohub_converter.py'
    """
    ont_int = ont + '.json'

    json_input = os.path.join(path, ont_int)
    tsv_output = os.path.join(path, ont)

    transform(inputs=[json_input], input_format='obojson', output=tsv_output, output_format='tsv')

    ont_nodes = os.path.join(path, ont + '_nodes.tsv')
    ont_terms = os.path.abspath(os.path.join(os.path.dirname(json_input), '..', 'nlp/terms/', ont + '_termlist.tsv'))
    bc.parse(ont_nodes, ont_terms)
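
A hedged usage sketch (the path and ontology name are hypothetical, and bc is assumed to be the imported biohub_converter module):

# Expects data/input/envo.json; writes envo_nodes.tsv / envo_edges.tsv next to
# it and an envo_termlist.tsv under ../nlp/terms/ relative to the JSON input.
create_termlist("data/input/", "envo")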
Example 10
    def parse(self, name: str, data_file: str, source: str) -> None:
        """Processes the data_file.

        Args:
            name: Name of the ontology
            data_file: data file to parse
            source: Source name

        Returns:
            None.
        """
        print(f"Parsing {data_file}")
        compression: Optional[str]
        if data_file.endswith('.gz'):
            compression = 'gz'
        else:
            compression = None

        transform(inputs=[data_file],
                  input_format='obojson',
                  input_compression=compression,
                  output=os.path.join(self.output_dir, name),
                  output_format='tsv')
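
A usage sketch (the instance and file names are hypothetical); the .gz check matters because transform needs input_compression='gz' to read a gzipped obojson file:

# Hypothetical call; a .gz suffix routes through input_compression='gz'.
loader.parse(name="go", data_file="data/raw/go.json.gz", source="Gene Ontology")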
Example 11
def test_transform1():
    """
    Transform graph from TSV to JSON.
    """
    inputs = [
        os.path.join(RESOURCE_DIR, 'graph_nodes.tsv'),
        os.path.join(RESOURCE_DIR, 'graph_edges.tsv'),
    ]
    output = os.path.join(TARGET_DIR, 'graph.json')
    transform(
        inputs=inputs,
        input_format='tsv',
        input_compression=None,
        output=output,
        output_format='json',
        output_compression=None,
    )
    assert os.path.exists(output)
    data = json.load(open(output, 'r'))
    assert 'nodes' in data
    assert 'edges' in data
    assert len(data['nodes']) == 512
    assert len(data['edges']) == 532
Example 12
def transform_wrapper(
    inputs: List[str],
    input_format: str,
    input_compression: str,
    output: str,
    output_format: str,
    output_compression: str,
    stream: bool,
    node_filters: Optional[List[Tuple[str, str]]],
    edge_filters: Optional[List[Tuple[str, str]]],
    transform_config: str,
    source: Optional[List],
    knowledge_sources: Optional[List[Tuple[str, str]]],
    processes: int,
    infores_catalog: Optional[str] = None,
):
    """
    Transform a Knowledge Graph from one serialization form to another.
    \f

    Parameters
    ----------
    inputs: List[str]
        A list of files that contains nodes/edges
    input_format: str
        The input format
    input_compression: str
        The input compression type
    output: str
        The output file
    output_format: str
        The output format
    output_compression: str
        The output compression type
    stream: bool
        Whether or not to stream
    node_filters: Optional[List[Tuple[str, str]]]
        Node input filters
    edge_filters: Optional[List[Tuple[str, str]]]
        Edge input filters
    transform_config: str
        Transform config YAML
    source: Optional[List]
        A list of source(s) to load from the YAML
    knowledge_sources: Optional[List[Tuple[str, str]]]
        A list of named knowledge sources with (string, boolean or tuple rewrite) specification
    processes: int
        Number of processes to use
    infores_catalog: Optional[str]
        Optional dump of a TSV file of InfoRes CURIE to Knowledge Source mappings

    """
    try:
        transform(
            inputs,
            input_format=input_format,
            input_compression=input_compression,
            output=output,
            output_format=output_format,
            output_compression=output_compression,
            stream=stream,
            node_filters=node_filters,
            edge_filters=edge_filters,
            transform_config=transform_config,
            source=source,
            knowledge_sources=knowledge_sources,
            processes=processes,
            infores_catalog=infores_catalog,
        )
        exit(0)
    except Exception as te:
        get_logger().error(f"kgx.transform error: {str(te)}")
        exit(1)
Example 13
    def run(self, data_file: Optional[str] = None):
        """Called to perform the transformations needed to process the
        trait data (NCBI/GTDB); additional information on this data can be found
        in the comment at the top of this script."""
        
        if data_file is None:
            data_file = self.source_name + ".csv"
        
        input_file = os.path.join(
            self.input_base_dir, data_file)

        # make directory in data/transformed
        os.makedirs(self.output_dir, exist_ok=True)

        """
        Implement ROBOT 
        """
        # Convert OWL to JSON for the ChEBI ontology
        convert_to_json(self.input_base_dir, 'CHEBI')


        """
        Get information from the EnvironmentTransform
        """
        environment_file = os.path.join(self.input_base_dir, 'environments.csv')
        env_df = pd.read_csv(environment_file, sep=',', low_memory=False, usecols=['Type', 'ENVO_terms', 'ENVO_ids'])
        unique_env_df = env_df.drop_duplicates()



        """
        Create termlist.tsv files from ontology JSON files for NLP
        TODO: Replace this code once runNER is installed and remove 'project_name/utils/biohub_converter.py'
        """
        ont = 'chebi'
        ont_int = ont + '.json'

        json_input = os.path.join(self.input_base_dir, ont_int)
        tsv_output = os.path.join(self.input_base_dir, ont)

        transform(inputs=[json_input], input_format='obojson', output=tsv_output, output_format='tsv')

        ont_nodes = os.path.join(self.input_base_dir, ont + '_nodes.tsv')
        ont_terms = os.path.abspath(os.path.join(os.path.dirname(json_input), '..', 'nlp/terms/', ont + '_termlist.tsv'))
        bc.parse(ont_nodes, ont_terms)


        """
        NLP: Get 'chem_node_type' and 'org_to_chem_edge_label'
        """
        if self.nlp:
            # Prep for NLP. Make sure the first column is the ID
            cols_for_nlp = ['tax_id', 'carbon_substrates']
            input_file_name = prep_nlp_input(input_file, cols_for_nlp)
            # Set-up the settings.ini file for OGER and run
            create_settings_file(self.nlp_dir, 'CHEBI')
            oger_output = run_oger(self.nlp_dir, input_file_name, n_workers=5)
            #oger_output = process_oger_output(self.nlp_dir, input_file_name)

        # transform data, something like:
        with open(input_file, 'r') as f, \
                open(self.output_node_file, 'w') as node, \
                open(self.output_edge_file, 'w') as edge, \
                open(self.subset_terms_file, 'w') as terms_file:

            # write headers (change default node/edge headers if necessary)
            node.write("\t".join(self.node_header) + "\n")
            edge.write("\t".join(self.edge_header) + "\n")
            
            header_items = parse_header(f.readline(), sep=',')
            
            seen_node: dict = defaultdict(int)
            seen_edge: dict = defaultdict(int)


            # Nodes
            org_node_type = "biolink:OrganismTaxon" # [org_name]
            chem_node_type = "biolink:ChemicalSubstance" # [carbon_substrate]
            shape_node_type = "biolink:AbstractEntity" # [cell_shape]
            #metabolism_node_type = "biolink:ActivityAndBehavior" # [metabolism]
            curie = 'NEED_CURIE'
            
            #Prefixes
            org_prefix = "NCBITaxon:"
            chem_prefix = "Carbon:"
            shape_prefix = "Shape:"
            #activity_prefix = "Metab:"
            source_prefix = "Env:"

            # Edges
            org_to_shape_edge_label = "biolink:has_phenotype" #  [org_name -> cell_shape, metabolism]
            org_to_shape_edge_relation = "RO:0002200" #  [org_name -> has phenotype -> cell_shape, metabolism]
            org_to_chem_edge_label = "biolink:interacts_with" # [org_name -> carbon_substrate]
            org_to_chem_edge_relation = "RO:0002438" # [org_name -> 'trophically interacts with' -> carbon_substrate]
            org_to_source_edge_label = "biolink:location_of" # [org -> isolation_source]
            org_to_source_edge_relation = "RO:0001015" #[org -> location_of -> source]

            
            
            # transform
            for line in f:
                """
                This dataset is a csv and also has commas 
                present within a column of data. 
                Hence a regex solution
                """
                # transform line into nodes and edges
                # node.write(this_node1)
                # node.write(this_node2)
                # edge.write(this_edge)
                

                line = re.sub(r'(?!(([^"]*"){2})*[^"]*$),', '|', line) # alanine, glucose -> alanine| glucose
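                # The negative lookahead only matches a comma when the rest of
                # the line contains an odd number of double quotes, i.e. the
                # comma sits inside an open quoted field; column-separating
                # commas (even number of quotes remaining) are left alone.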
                items_dict = parse_line(line, header_items, sep=',')

                org_name = items_dict['org_name']
                tax_id = items_dict['tax_id']
                metabolism = items_dict['metabolism']
                carbon_substrates = set([x.strip() for x in items_dict['carbon_substrates'].split('|')])
                cell_shape = items_dict['cell_shape']
                isolation_source = set([x.strip() for x in items_dict['isolation_source'].split('|')])
                

            # Write Node ['id', 'entity', 'category']
                # Write organism node 
                org_id = org_prefix + str(tax_id)
                if not org_id.endswith(':na') and org_id not in seen_node:
                    write_node_edge_item(fh=node,
                                         header=self.node_header,
                                         data=[org_id,
                                               org_name,
                                               org_node_type,
                                               org_id])
                    seen_node[org_id] += 1
                    if org_id.startswith('NCBITaxon:'):
                        terms_file.write(org_id + "\n")

                # Write chemical node
                for chem_name in carbon_substrates:
                    chem_curie = curie
                    #chem_node_type = chem_name

                    # Get relevant NLP results
                    if chem_name != 'NA':
                        relevant_tax = oger_output.loc[oger_output['TaxId'] == int(tax_id)]
                        relevant_chem = relevant_tax.loc[relevant_tax['TokenizedTerm'] == chem_name]
                        if len(relevant_chem) == 1:
                            chem_curie = relevant_chem.iloc[0]['CURIE']
                            chem_node_type = relevant_chem.iloc[0]['Biolink']
                        

                    if chem_curie == curie:
                        chem_id = chem_prefix + chem_name.lower().replace(' ','_')
                    else:
                        chem_id = chem_curie

                    
                    if not chem_id.endswith(':na') and chem_id not in seen_node:
                        write_node_edge_item(fh=node,
                                            header=self.node_header,
                                            data=[chem_id,
                                                chem_name,
                                                chem_node_type,
                                                chem_curie])
                        seen_node[chem_id] += 1

                # Write shape node
                shape_id = shape_prefix + cell_shape.lower()
                if not shape_id.endswith(':na') and shape_id not in seen_node:
                    write_node_edge_item(fh=node,
                                         header=self.node_header,
                                         data=[shape_id,
                                               cell_shape,
                                               shape_node_type,
                                               curie])
                    seen_node[shape_id] += 1

                # Write source node
                for source_name in isolation_source:
                    #   Collapse the entity
                    #   A_B_C_D => [A, B, C, D]
                    #   D is the entity of interest
                    source_name_split = source_name.split('_')
                    source_name_collapsed = source_name_split[-1]
                    env_curie = curie
                    env_term = source_name_collapsed
                    source_node_type = "" # [isolation_source] left blank intentionally

                    # Get information from the environments.csv (unique_env_df)
                    relevant_env_df = unique_env_df.loc[unique_env_df['Type'] == source_name]

                    if len(relevant_env_df) == 1:
                        '''
                        If multiple ENVOs exist, take the last one since that would be the curie of interest
                        after collapsing the entity.
                        TODO(Maybe): If CURIE is 'nan', it could be sourced from OGER o/p (ENVO backend)
                        of environments.csv
                        '''
                        env_curie = str(relevant_env_df.iloc[0]['ENVO_ids']).split(',')[-1].strip()
                        env_term = str(relevant_env_df.iloc[0]['ENVO_terms']).split(',')[-1].strip()
                        if env_term == 'nan':
                            env_curie = curie
                            env_term = source_name_collapsed

                    #source_id = source_prefix + source_name.lower()
                    if env_curie == curie:
                        source_id = source_prefix + source_name_collapsed.lower()
                    else:
                        source_id = env_curie
                        if source_id.startswith('CHEBI:'):
                            source_node_type = chem_node_type

                    if not source_id.endswith(':na') and source_id not in seen_node:
                        write_node_edge_item(fh=node,
                                            header=self.node_header,
                                            data=[source_id,
                                                env_term,
                                                source_node_type,
                                                env_curie])
                        seen_node[source_id] += 1

                


            # Write Edge
                # org-chem edge
                if not chem_id.endswith(':na') and org_id+chem_id not in seen_edge:
                    write_node_edge_item(fh=edge,
                                            header=self.edge_header,
                                            data=[org_id,
                                                org_to_chem_edge_label,
                                                chem_id,
                                                org_to_chem_edge_relation])
                    seen_edge[org_id+chem_id] += 1

                # org-shape edge
                if not shape_id.endswith(':na') and org_id+shape_id not in seen_edge:
                    write_node_edge_item(fh=edge,
                                            header=self.edge_header,
                                            data=[org_id,
                                                org_to_shape_edge_label,
                                                shape_id,
                                                org_to_shape_edge_relation])
                    seen_edge[org_id+shape_id] += 1
                
                # org-source edge
                if not source_id.endswith(':na') and org_id+source_id not in seen_edge:
                    write_node_edge_item(fh=edge,
                                            header=self.edge_header,
                                            data=[org_id,
                                                org_to_source_edge_label,
                                                source_id,
                                                org_to_source_edge_relation])
                    seen_edge[org_id+source_id] += 1
        # File writing ends here

        # Extract the 'cellular organisms' tree from NCBITaxon and convert to JSON
        '''
        NCBITaxon_131567 = cellular organisms 
        (Source = http://www.ontobee.org/ontology/NCBITaxon?iri=http://purl.obolibrary.org/obo/NCBITaxon_131567)
        '''
        subset_ontology_needed = 'NCBITaxon'
        extract_convert_to_json(self.input_base_dir, subset_ontology_needed, self.subset_terms_file)