Ejemplo n.º 1
0
    def make_pharmgkb_edge(self, fh: TextIO, line_data: dict) -> None:
        """Write one drug-gene edge row built from a parsed relationship record.

        :param fh: file handle the edge row is written to
        :param line_data: parsed record; the keys ``Entity1_type``,
            ``Entity2_type``, ``Entity1_id``, ``Entity2_id`` and ``Evidence``
            are read
        :raises PharmGKBInvalidEdge: if the two entity types of the record,
            taken as a set, differ from ``set(self.edge_of_interest)``
        :return: None
        """
        wanted_types = set(self.edge_of_interest)
        found_types = {line_data['Entity1_type'], line_data['Entity2_type']}
        if wanted_types != found_types:
            raise PharmGKBInvalidEdge(
                "Trying to make edge that's not an edge of interest")

        # Whichever entity is not typed 'Gene' is treated as the drug.
        if line_data['Entity1_type'] == 'Gene':
            gene_key, drug_key = 'Entity1_id', 'Entity2_id'
        else:
            gene_key, drug_key = 'Entity2_id', 'Entity1_id'
        raw_gene_id = line_data[gene_key]
        raw_drug_id = line_data[drug_key]

        mapped_gene_id = self.get_uniprot_id(this_id=raw_gene_id)
        evidence = line_data['Evidence']
        drug_node_id = self.make_preferred_drug_id(raw_drug_id,
                                                   self.drug_id_map)

        edge_row = [
            drug_node_id,
            self.drug_gene_edge_label,
            mapped_gene_id,
            self.drug_gene_edge_relation,
            self.source_name,
            'biolink:Association',
            evidence,
        ]
        write_node_edge_item(fh=fh, header=self.edge_header, data=edge_row)
Ejemplo n.º 2
0
 def make_pharmgkb_chemical_node(self, fh: TextIO, chem_id: str, name: str,
                                 biolink_type: str) -> None:
     """Write one chemical node row to ``fh``.

     :param fh: file handle the node row is written to
     :param chem_id: chemical id; it is passed through
         ``self.make_preferred_drug_id`` together with ``self.drug_id_map``
         and the result becomes the first field of the row
     :param name: chemical name, written as the second field
     :param biolink_type: biolink type for the chemical, written as the
         third field
     :return: None
     """
     node_row = [
         self.make_preferred_drug_id(chem_id, self.drug_id_map),
         name,
         biolink_type,
         self.source_name,
     ]
     write_node_edge_item(fh=fh, header=self.node_header, data=node_row)
Ejemplo n.º 3
0
 def make_pharmgkb_gene_node(self, fh: TextIO, this_id: str, name: str,
                             biolink_type: str) -> None:
     """Write one gene node row to ``fh``.

     :param fh: file handle the node row is written to
     :param this_id: gene id; it is passed through ``self.get_uniprot_id``
         and the result becomes the first field of the row
     :param name: gene name, written as the second field
     :param biolink_type: biolink type for the gene, written as the third
         field
     :return: None
     """
     node_row = [
         self.get_uniprot_id(this_id=this_id),
         name,
         biolink_type,
         self.source_name,
     ]
     write_node_edge_item(fh=fh, header=self.node_header, data=node_row)
Ejemplo n.º 4
0
    def run(self, data_file: Optional[str] = None):
        """Transform an IntAct zip archive of viral/human PPIs into TSV files.

        The archive is unpacked into a temporary directory. Every ``*.xml``
        file found inside the first entry of that directory is parsed with
        ``self.parse_xml_to_nodes_edges`` and its nodes and edges are written
        to ``nodes.tsv`` and ``edges.tsv`` under ``self.output_dir``. Files
        that do not match ``*.xml`` are logged and skipped.

        :param data_file: path of the zip archive to ingest; when falsy,
            ``intact_coronavirus.zip`` under ``self.input_base_dir`` is used
        :return: None
        """
        zip_file = data_file or os.path.join(self.input_base_dir,
                                             'intact_coronavirus.zip')

        # for tsv output:
        output_node_file = os.path.join(self.output_dir, 'nodes.tsv')
        output_edge_file = os.path.join(self.output_dir, 'edges.tsv')

        # make sure the output directory exists
        os.makedirs(self.output_dir, exist_ok=True)

        # TemporaryDirectory removes the unpacked archive again once all of
        # its files have been parsed (mkdtemp left it behind).
        with open(output_node_file, 'w') as node, \
                open(output_edge_file, 'w') as edge, \
                tempfile.TemporaryDirectory() as xml_tempdir:

            # write the header row of both files
            node.write('\t'.join(self.node_header) + '\n')
            edge.write('\t'.join(self.edge_header) + '\n')

            unzip_to_tempdir(zip_file, xml_tempdir)

            # the XML files are looked up inside the first extracted entry
            extracted_base_dir_list = os.listdir(xml_tempdir)
            file_path = os.path.join(xml_tempdir, extracted_base_dir_list[0])
            for file_name in os.listdir(file_path):
                if not fnmatch.fnmatch(file_name, '*.xml'):
                    logging.warning("Skipping non-xml file %s", file_name)
                    # actually skip it: previously the file was still parsed
                    continue

                nodes_edges = self.parse_xml_to_nodes_edges(
                    os.path.join(file_path, file_name))

                # write out nodes
                for this_node in nodes_edges['nodes']:
                    write_node_edge_item(fh=node,
                                         header=self.node_header,
                                         data=this_node)
                # write out edges
                for this_edge in nodes_edges['edges']:
                    write_node_edge_item(fh=edge,
                                         header=self.edge_header,
                                         data=this_edge)
Ejemplo n.º 5
0
    def run(self, data_file: Optional[str] = None):
        """Transform the Zhou host protein table into node and edge TSV files.

        Tables are read from pages 5-7 of ``41421_2020_153_MOESM1_ESM.pdf``
        under ``self.input_base_dir``. For every table row a host gene node, a
        coronavirus node and a ``biolink:interacts_with`` edge from the gene
        to the virus are written to ``nodes.tsv`` / ``edges.tsv`` under
        ``self.output_dir``.

        :param data_file: accepted for interface compatibility; this
            implementation never reads it and always uses the PDF named above
        :raises Exception: if a row's ``Coronavirus`` value has no entry in
            the taxon lookup table defined in this method
        :return: None
        """

        input_file = os.path.join(self.input_base_dir, '41421_2020_153_MOESM1_ESM.pdf')

        pubmed_curie_prefix = 'PMID:'
        gene_curie_prefix = 'NCBIGene:'
        publication_node_type = 'biolink:Publication'
        gene_node_type = 'biolink:Gene'
        virus_node_type = 'biolink:OrganismalEntity'

        # list of RO interactions:
        # https://raw.githubusercontent.com/oborel/obo-relations/master/subsets/ro-interaction.owl
        host_gene_vgene_edge_label = 'biolink:interacts_with'
        host_gene_vgene_relation = 'RO:0002437'

        # coronavirus name as it appears in the table -> NCBI taxon id
        ncbitaxon_curie_prefix = 'NCBITaxon:'
        corona_info = {
            'IBV': {'taxon_id': 11120},
            'MHV': {'taxon_id': 502104},
            'HCoV-NL63': {'taxon_id': 277944},
            'HCoV-229E': {'taxon_id': 11137},
            'SARS': {'taxon_id': 227859},
            'MERS': {'taxon_id': 1335626},
        }

        # for tsv output:
        output_node_file = os.path.join(self.output_dir, 'nodes.tsv')
        output_edge_file = os.path.join(self.output_dir, 'edges.tsv')

        # make sure the output directory exists
        os.makedirs(self.output_dir, exist_ok=True)

        fig_3_table_unformatted = io.read_pdf(input_file,
                                              output_format='json',
                                              pages=[5, 6, 7],
                                              multiple_tables=True)

        fig_3_table = multi_page_table_to_list(fig_3_table_unformatted)

        with open(output_node_file, 'w') as node, open(output_edge_file, 'w') as edge:

            # write the header row of both files
            node.write('\t'.join(self.node_header) + '\n')
            edge.write('\t'.join(self.edge_header) + '\n')

            for row in fig_3_table:

                if row['Coronavirus'] not in corona_info:
                    # format the message; previously the name was passed as a
                    # second exception argument and "{}" was never filled in
                    raise Exception(
                        "Can't find info for coronavirus {}".format(
                            row['Coronavirus']))

                this_corona_info = corona_info[row['Coronavirus']]
                corona_curie = ncbitaxon_curie_prefix + str(this_corona_info['taxon_id'])
                gene_curie = gene_curie_prefix + row['Host Gene ID']

                # WRITE NODES
                # host gene
                write_node_edge_item(fh=node, header=self.node_header,
                                     data=[gene_curie,
                                           row['Host Protein'],
                                           gene_node_type,
                                           self.source_name])

                # virus
                write_node_edge_item(fh=node, header=self.node_header,
                                     data=[corona_curie,
                                           row['Coronavirus'],
                                           virus_node_type,
                                           self.source_name])

                # WRITE EDGES
                # host gene -> virus, with the PubMed id as the last field
                write_node_edge_item(fh=edge, header=self.edge_header,
                                     data=[
                                         gene_curie,
                                         host_gene_vgene_edge_label,
                                         corona_curie,
                                         host_gene_vgene_relation,
                                         self.source_name,
                                         'biolink:Association',
                                         pubmed_curie_prefix + row['PubMed ID']
                                     ])

        return None
Ejemplo n.º 6
0
    def run(self, data_file: str = None) -> None:
        """Transform STRING protein-protein interactions into TSV files.

        For each interaction row a node is written for both proteins (once
        per protein). When a protein has an entry in ``self.protein_gene_map``
        a node for its gene and a ``biolink:has_gene_product`` edge from the
        gene to that protein are written as well (once per gene). Finally the
        protein-protein edge is written. Proteins are referenced as
        ``ENSEMBL:<id>`` in the node file and in every edge, so that edge
        endpoints match node ids.

        Args:
            data_file: gzip-compressed STRING links file to parse; when falsy,
                ``9606.protein.links.full.v11.0.txt.gz`` under
                ``self.input_base_dir`` is used.

        Returns:
            None.

        """
        if not data_file:
            data_file = os.path.join(self.input_base_dir,
                                     "9606.protein.links.full.v11.0.txt.gz")
        os.makedirs(self.output_dir, exist_ok=True)
        protein_node_type = "biolink:Protein"
        edge_label = "biolink:interacts_with"
        self.node_header = compress_json.local_load("node_header.json")
        edge_core_header = compress_json.local_load("edge_core_header.json")
        edge_additional_headers = compress_json.local_load(
            "edge_additional_headers.json")

        self.edge_header = edge_core_header + edge_additional_headers
        relation = 'RO:0002434'
        seen_proteins: Set = set()
        seen_genes: Set = set()

        # Blank cells appended to a gene -> protein edge so that the row is
        # as long as a protein-protein edge row (one extra cell in addition
        # to the additional headers).
        extra_header = [""] * (len(edge_additional_headers) + 1)

        with open(self.output_node_file, 'w') as node, \
                open(self.output_edge_file, 'w') as edge, \
                gzip.open(data_file, 'rt') as interactions:

            node.write("\t".join(self.node_header) + "\n")
            edge.write("\t".join(self.edge_header) + "\n")

            header_items = parse_header(interactions.readline())
            for line in interactions:
                items_dict = parse_stringdb_interactions(line, header_items)
                protein_curies = []
                for protein_name in ('protein1', 'protein2'):
                    protein = get_item_by_priority(items_dict, [protein_name])
                    # drop everything up to and including the first '.'
                    protein = '.'.join(protein.split('.')[1:])
                    protein_curie = f"ENSEMBL:{protein}"
                    protein_curies.append(protein_curie)

                    if protein in self.protein_gene_map:
                        gene = self.protein_gene_map[protein]
                        if gene not in seen_genes:
                            seen_genes.add(gene)
                            ensemble_gene = f"ENSEMBL:{gene}"
                            ncbi_gene_id = self.ensembl2ncbi_map[gene]
                            gene_informations = self.gene_info_map[
                                ncbi_gene_id]
                            write_node_edge_item(
                                fh=node,
                                header=self.node_header,
                                data=[
                                    ensemble_gene, gene_informations['symbol'],
                                    'biolink:Gene',
                                    gene_informations['description'],
                                    f"NCBIGene:{ncbi_gene_id}"
                                ])
                            # the object carries the same prefixed id as the
                            # protein node (it used to be the bare id)
                            write_node_edge_item(
                                fh=edge,
                                header=self.edge_header,
                                data=[
                                    ensemble_gene,
                                    "biolink:has_gene_product",
                                    protein_curie,
                                    "RO:0002205",
                                    "NCBI",
                                ] + extra_header)

                    # write node data for every protein, also for those
                    # without a gene mapping: the interaction edge below
                    # refers to them either way
                    if protein not in seen_proteins:
                        seen_proteins.add(protein)
                        write_node_edge_item(fh=node,
                                             header=self.node_header,
                                             data=[
                                                 protein_curie, "",
                                                 protein_node_type, "", ""
                                             ])

                # write edge data, using the same ids as the protein nodes
                write_node_edge_item(
                    fh=edge,
                    header=self.edge_header,
                    data=[
                        protein_curies[0], edge_label, protein_curies[1],
                        relation, "STRING", items_dict['combined_score']
                    ] + [
                        items_dict.get(header, "")
                        for header in edge_additional_headers
                    ])
Ejemplo n.º 7
0
    def run(self, data_file: Optional[str] = None) -> None:
        """Transform STRING protein-protein interactions into TSV files.

        Per interaction row this writes, each at most once, a node for every
        protein (with a UniProtKB xref when the UniProt id mapping has one),
        a node for the gene a protein maps to in ``self.protein_gene_map`` and
        a ``biolink:has_gene_product`` edge from that gene to the protein.
        The protein-protein edge itself is written for every row.

        Args:
            data_file: gzip-compressed STRING links file to parse; when falsy,
                ``9606.protein.links.full.v11.0.txt.gz`` under
                ``self.input_base_dir`` is used.

        Returns:
            None.

        """
        data_file = data_file or os.path.join(
            self.input_base_dir, "9606.protein.links.full.v11.0.txt.gz")
        os.makedirs(self.output_dir, exist_ok=True)

        protein_node_type = "biolink:Protein"
        edge_label = "biolink:interacts_with"
        relation = 'RO:0002434'

        self.node_header = compress_json.local_load("node_header.json")
        core_columns = compress_json.local_load("edge_core_header.json")
        score_columns = compress_json.local_load(
            "edge_additional_headers.json")
        self.edge_header = core_columns + score_columns

        proteins_written: Set = set()
        genes_written: Set = set()

        # Blank cells appended to every gene -> protein edge row: one per
        # additional header plus one more.
        self.extra_header = [""] * (len(score_columns) + 1)

        # lookup used to find a UniProt id for a protein id
        string_to_uniprot_id_map = uniprot_make_name_to_id_mapping(
            os.path.join(self.input_base_dir, UNIPROT_ID_MAPPING))

        with open(self.output_node_file, 'w') as node, \
                open(self.output_edge_file, 'w') as edge, \
                gzip.open(data_file, 'rt') as interactions:

            node.write("\t".join(self.node_header) + "\n")
            edge.write("\t".join(self.edge_header) + "\n")

            column_names = parse_header(interactions.readline())
            for line in interactions:
                items_dict = parse_stringdb_interactions(line, column_names)
                pair = []
                for column in ('protein1', 'protein2'):
                    string_id = get_item_by_priority(items_dict, [column])
                    # keep what follows the first '.' of the identifier
                    protein = string_id.partition('.')[2]
                    pair.append(protein)

                    if protein in self.protein_gene_map:
                        gene = self.protein_gene_map[protein]
                        if gene not in genes_written:
                            genes_written.add(gene)
                            gene_curie = f"ENSEMBL:{gene}"
                            gene_info = self.gene_info_map[
                                self.ensembl2ncbi_map[gene]]

                            gene_node_row = [
                                gene_curie,
                                gene_info['symbol'],
                                'biolink:Gene',
                                gene_info['description'],
                                f"NCBIGene:{self.ensembl2ncbi_map[gene]}",
                                self.source_name,
                            ]
                            write_node_edge_item(fh=node,
                                                 header=self.node_header,
                                                 data=gene_node_row)

                            gene_edge_row = [
                                gene_curie,
                                "biolink:has_gene_product",
                                f"ENSEMBL:{protein}",
                                "RO:0002205",
                                "NCBI",
                                "",
                            ]
                            gene_edge_row.extend(self.extra_header)
                            write_node_edge_item(fh=edge,
                                                 header=self.edge_header,
                                                 data=gene_edge_row)

                    # each protein node is written only once
                    if protein in proteins_written:
                        continue
                    proteins_written.add(protein)

                    # xref cell: the collapsed UniProtKB curie when the
                    # mapping knows this protein, otherwise empty
                    if protein in string_to_uniprot_id_map:
                        xref = collapse_uniprot_curie(
                            f"UniProtKB:{string_to_uniprot_id_map[protein]}")
                    else:
                        xref = ''

                    protein_node_row = [
                        f"ENSEMBL:{protein}",
                        "",
                        protein_node_type,
                        "",
                        xref,
                        self.source_name,
                    ]
                    write_node_edge_item(fh=node,
                                         header=self.node_header,
                                         data=protein_node_row)

                # protein-protein edge, followed by one cell per additional
                # header (empty when the row lacks that column)
                ppi_row = [
                    f"ENSEMBL:{pair[0]}",
                    edge_label,
                    f"ENSEMBL:{pair[1]}",
                    relation,
                    "STRING",
                    "biolink:Association",
                    items_dict['combined_score'],
                ]
                for column in score_columns:
                    ppi_row.append(items_dict.get(column, ""))
                write_node_edge_item(fh=edge,
                                     header=self.edge_header,
                                     data=ppi_row)
Ejemplo n.º 8
0
    def run(self, data_file: str = None) -> None:
        """Transform STRING protein-protein interactions into TSV files.

        For each interaction row this writes, each at most once, a node for
        the gene that a protein maps to in ``self.protein_gene_map`` together
        with a ``biolink:has_gene_product`` edge from the gene to that
        protein, and a node for each of the two proteins. The protein-protein
        edge is written for every row. Proteins are referenced as
        ``ENSEMBL:<id>`` in the node file and in every edge, so that edge
        endpoints match node ids.

        Args:
            data_file: gzip-compressed STRING links file to parse; when falsy,
                ``9606.protein.links.full.v11.0.txt.gz`` under
                ``self.input_base_dir`` is used.

        Returns:
            None.

        """
        if not data_file:
            data_file = os.path.join(self.input_base_dir,
                                     "9606.protein.links.full.v11.0.txt.gz")
        os.makedirs(self.output_dir, exist_ok=True)
        protein_node_type = "biolink:Protein"
        edge_label = "biolink:interacts_with"
        self.node_header = ['id', 'name', 'category', 'description', 'alias']
        edge_core_header = [
            'subject', 'edge_label', 'object', 'relation', 'provided_by',
            'combined_score'
        ]
        edge_additional_headers = [
            'neighborhood', 'neighborhood_transferred', 'fusion',
            'cooccurence', 'homology', 'coexpression',
            'coexpression_transferred', 'experiments',
            'experiments_transferred', 'database', 'database_transferred',
            'textmining', 'textmining_transferred'
        ]
        self.edge_header = edge_core_header + edge_additional_headers
        relation = 'RO:0002434'

        # Gene and protein ids already written. A set rather than a list:
        # membership is tested several times per input row, and the list
        # version also grew by two entries on every row.
        seen: set = set()

        # empty cells for the additional columns of a gene -> protein edge
        blank_additional = [""] * len(edge_additional_headers)

        with open(self.output_node_file, 'w') as node, \
                open(self.output_edge_file, 'w') as edge, \
                gzip.open(data_file, 'rt') as interactions:

            node.write("\t".join(self.node_header) + "\n")
            edge.write("\t".join(self.edge_header) + "\n")

            header_items = parse_header(interactions.readline())
            for line in interactions:
                items_dict = parse_stringdb_interactions(line, header_items)

                proteins = []
                for column in ('protein1', 'protein2'):
                    raw_id = get_item_by_priority(items_dict, [column])
                    # drop everything up to and including the first '.'
                    proteins.append('.'.join(raw_id.split('.')[1:]))

                # gene nodes and gene -> protein edges, once per gene
                for protein in proteins:
                    if protein not in self.protein_gene_map:
                        continue
                    gene = self.protein_gene_map[protein]
                    if not gene or gene in seen:
                        continue
                    ncbi_gene_id = self.ensembl2ncbi_map[gene]
                    gene_info = self.gene_info_map[ncbi_gene_id]
                    write_node_edge_item(
                        fh=node,
                        header=self.node_header,
                        data=[
                            f"ENSEMBL:{gene}", gene_info['symbol'],
                            'biolink:Gene', gene_info['description'],
                            f"NCBIGene:{ncbi_gene_id}"
                        ])
                    # the object uses the same prefixed id as the protein
                    # node (it used to be the bare id)
                    write_node_edge_item(
                        fh=edge,
                        header=self.edge_header,
                        data=[
                            f"ENSEMBL:{gene}", "biolink:has_gene_product",
                            f"ENSEMBL:{protein}", "RO:0002205", "NCBI", ""
                        ] + blank_additional)
                    seen.add(gene)

                # protein nodes, once per protein; adding to ``seen`` right
                # away also prevents a duplicate node when a row lists the
                # same protein on both sides
                for protein in proteins:
                    if protein in seen:
                        continue
                    seen.add(protein)
                    write_node_edge_item(fh=node,
                                         header=self.node_header,
                                         data=[
                                             f"ENSEMBL:{protein}", "",
                                             protein_node_type, "", ""
                                         ])

                # protein-protein edge, followed by one cell per additional
                # header (empty when the row lacks that column)
                edge_data = [
                    f"ENSEMBL:{proteins[0]}", edge_label,
                    f"ENSEMBL:{proteins[1]}", relation, "STRING",
                    items_dict['combined_score']
                ]
                edge_data.extend(
                    items_dict.get(x, "") for x in edge_additional_headers)

                write_node_edge_item(fh=edge,
                                     header=self.edge_header,
                                     data=edge_data)
Ejemplo n.º 9
0
    def run(self,
            data_file: Optional[str] = None,
            species: str = "Homo sapiens") -> None:
        """Transform Drug Central drug-target interactions into TSV files.

        Rows of ``drug.target.interaction.tsv.gz`` whose ``ORGANISM`` equals
        ``species`` and that carry an ``ACCESSION`` entry are turned into a
        drug node, one gene node per ``|``-separated accession and one
        drug -> gene edge per accession. Each gene node carries two flags that
        state whether its accession is a key of the TCLIN / TCHEM tables
        unpacked from ``tcrd.zip``.

        :param data_file: accepted for interface compatibility; this
            implementation never reads it and always uses the files named
            above under ``self.input_base_dir``
        :param species: only rows whose ``ORGANISM`` column equals this value
            are ingested
        :return: None
        """

        interactions_file = os.path.join(self.input_base_dir,
                                         "drug.target.interaction.tsv.gz")
        tclin_chem_zip_file = os.path.join(self.input_base_dir, "tcrd.zip")
        os.makedirs(self.output_dir, exist_ok=True)
        drug_node_type = "biolink:Drug"
        gene_curie_prefix = "UniProtKB:"
        drug_curie_prefix = "DrugCentral:"
        gene_node_type = "biolink:Gene"
        drug_gene_edge_label = "biolink:interacts_with"
        drug_gene_edge_relation = "RO:0002436"  # molecularly interacts with
        self.edge_header = [
            'subject', 'edge_label', 'object', 'relation', 'provided_by',
            'comment'
        ]

        # Unzip tcrd.zip and load the tclin / tchem tables. The temporary
        # directory is removed once both tables are loaded (mkdtemp left it
        # behind). NOTE(review): assumes tsv_to_dict reads its file eagerly -
        # confirm.
        with tempfile.TemporaryDirectory() as tempdir:
            (tclin_file,
             tchem_file) = unzip_and_get_tclin_tchem(tclin_chem_zip_file,
                                                     tempdir)

            tclin_dict: dict = tsv_to_dict(tclin_file, 'uniprot')
            tchem_dict: dict = tsv_to_dict(tchem_file, 'uniprot')

        with open(self.output_node_file, 'w') as node, \
                open(self.output_edge_file, 'w') as edge, \
                gzip.open(interactions_file, 'rt') as interactions:

            node.write("\t".join(self.node_header) + "\n")
            edge.write("\t".join(self.edge_header) + "\n")

            header_items = parse_header(interactions.readline())

            for line in interactions:
                items_dict = parse_drug_central_line(line, header_items)

                if 'ORGANISM' not in items_dict or items_dict[
                        'ORGANISM'] != species:
                    continue

                # get gene ID
                try:
                    gene_id_string = get_item_by_priority(
                        items_dict, ['ACCESSION'])
                    gene_ids = gene_id_string.split('|')
                except ItemInDictNotFound:
                    # lines with no ACCESSION entry only contain drug info, no target
                    # info - not ingesting these
                    continue

                # get drug ID
                drug_id = drug_curie_prefix + get_item_by_priority(
                    items_dict, ['STRUCT_ID'])

                # WRITE NODES
                # drug: id, name, category, then the two TCLIN / TCHEM flag
                # cells, which are always False for a drug
                write_node_edge_item(fh=node,
                                     header=self.node_header,
                                     data=[
                                         drug_id, items_dict['DRUG_NAME'],
                                         drug_node_type,
                                         str(False),
                                         str(False)
                                     ])

                for accession in gene_ids:
                    gene_id = gene_curie_prefix + accession
                    # look up the accession of THIS gene; the flags used to be
                    # taken from the first accession of the row for all of them
                    is_tclin = accession in tclin_dict
                    is_tchem = accession in tchem_dict

                    write_node_edge_item(fh=node,
                                         header=self.node_header,
                                         data=[
                                             gene_id, items_dict['GENE'],
                                             gene_node_type,
                                             str(is_tclin),
                                             str(is_tchem)
                                         ])

                    # WRITE EDGES
                    # ['subject', 'edge_label', 'object', 'relation', 'provided_by',
                    # 'comment']
                    write_node_edge_item(fh=edge,
                                         header=self.edge_header,
                                         data=[
                                             drug_id, drug_gene_edge_label,
                                             gene_id, drug_gene_edge_relation,
                                             self.source_name,
                                             items_dict['ACT_COMMENT']
                                         ])

        return None
Ejemplo n.º 10
0
    def run(self,
            data_file: Optional[str] = None,
            species: str = "Homo sapiens") -> None:
        """Transform Drug Central drug-target interactions into TSV files.

        Rows whose ``ORGANISM`` equals ``species`` are turned into a drug node
        and one protein node per entry returned by
        ``items_dict_to_protein_data_dict`` (each node written at most once),
        plus one drug -> protein edge per entry.

        :param data_file: name of the gzip-compressed interactions file,
            resolved relative to ``self.input_base_dir``; defaults to
            ``drug.target.interaction.tsv.gz``
        :param species: only rows whose ``ORGANISM`` column equals this value
            are ingested
        :return: None
        """

        if data_file is None:
            data_file = "drug.target.interaction.tsv.gz"
        interactions_file = os.path.join(self.input_base_dir, data_file)
        os.makedirs(self.output_dir, exist_ok=True)
        drug_node_type = "biolink:Drug"
        uniprot_curie_prefix = "UniProtKB:"
        drug_curie_prefix = "DrugCentral:"
        protein_node_type = "biolink:Protein"
        drug_protein_edge_label = "biolink:molecularly_interacts_with"
        drug_protein_edge_relation = "RO:0002436"  # molecularly interacts with
        self.edge_header = [
            'subject', 'edge_label', 'object', 'relation', 'provided_by',
            'comment', 'type'
        ]

        with open(self.output_node_file, 'w') as node, \
                open(self.output_edge_file, 'w') as edge, \
                gzip.open(interactions_file, 'rt') as interactions:

            node.write("\t".join(self.node_header) + "\n")
            edge.write("\t".join(self.edge_header) + "\n")

            header_items = parse_header(interactions.readline())

            # ids of the nodes already written; only membership is needed
            seen_proteins: set = set()
            seen_drugs: set = set()

            for line in interactions:
                items_dict = parse_drug_central_line(line, header_items)

                if 'ORGANISM' not in items_dict or items_dict[
                        'ORGANISM'] != species:
                    continue

                # get protein ID
                try:
                    protein_dict = items_dict_to_protein_data_dict(items_dict)

                except ItemInDictNotFound:
                    # lines with no ACCESSION entry only contain drug info, no target
                    # info - not ingesting these
                    continue
                except ValueError:
                    logging.error("Value error while parsing line")
                    continue

                # get drug ID
                drug_id = drug_curie_prefix + get_item_by_priority(
                    items_dict, ['STRUCT_ID'])

                # Write drug node
                if drug_id not in seen_drugs:
                    write_node_edge_item(
                        fh=node,
                        header=self.node_header,
                        data=[
                            drug_id,
                            items_dict['DRUG_NAME'],
                            drug_node_type,
                            '',  # TDL (not applicable for drugs)
                            self.source_name
                        ])
                    seen_drugs.add(drug_id)

                # only the values are needed: (uniprot id, name, TDL)
                for uniprot_id, name, tdl in protein_dict.values():
                    protein_id = uniprot_curie_prefix + uniprot_id

                    if protein_id not in seen_proteins:
                        write_node_edge_item(fh=node,
                                             header=self.node_header,
                                             data=[
                                                 protein_id, name,
                                                 protein_node_type, tdl,
                                                 self.source_name
                                             ])
                        seen_proteins.add(protein_id)

                    # WRITE EDGES
                    write_node_edge_item(
                        fh=edge,
                        header=self.edge_header,
                        data=[
                            drug_id, drug_protein_edge_label, protein_id,
                            drug_protein_edge_relation, self.source_name,
                            items_dict['ACT_COMMENT'], 'biolink:Association'
                        ])

        return None
Ejemplo n.º 11
0
    def run(self) -> None:
        """Transform Drug Central drug-target interactions into TSV files.

        Every row of ``drug.target.interaction.tsv.gz`` (under
        ``self.input_base_dir``) that has an ``ACCESSION`` entry yields a drug
        node, a gene node and a drug -> gene edge. Rows without one are
        logged and skipped.
        """
        source_path = os.path.join(self.input_base_dir,
                                   "drug.target.interaction.tsv.gz")
        os.makedirs(self.output_dir, exist_ok=True)

        drug_category = "biolink:Drug"
        gene_category = "biolink:Gene"
        edge_label = "biolink:interacts_with"
        edge_relation = "RO:0002436"  # molecularly interacts with
        self.edge_header = [
            'subject', 'edge_label', 'object', 'relation', 'comment'
        ]

        with open(self.output_node_file, 'w') as node, \
                open(self.output_edge_file, 'w') as edge, \
                gzip.open(source_path, 'rt') as interactions:

            node.write("\t".join(self.node_header) + "\n")
            edge.write("\t".join(self.edge_header) + "\n")

            column_names = parse_header(interactions.readline())

            for line in interactions:
                record = parse_drug_central_line(line, column_names)

                # gene id: rows without an ACCESSION entry only contain drug
                # info, no target info - these are not ingested
                try:
                    gene_id = get_item_by_priority(record, ['ACCESSION'])
                except ItemInDictNotFound:
                    message = (
                        "No gene information for this line:\n{}\nskipping".
                        format(line))
                    logging.info(message)
                    continue

                # drug id: first available of these columns, in this order
                drug_id_columns = [
                    'ACT_SOURCE_URL', 'MOA_SOURCE_URL', 'DRUG_NAME'
                ]
                drug_id = get_item_by_priority(record, drug_id_columns)

                # nodes - ['id', 'name', 'category']
                drug_row = [drug_id, record['DRUG_NAME'], drug_category]
                write_node_edge_item(fh=node,
                                     header=self.node_header,
                                     data=drug_row)

                gene_row = [gene_id, record['GENE'], gene_category]
                write_node_edge_item(fh=node,
                                     header=self.node_header,
                                     data=gene_row)

                # edge - ['subject', 'edge_label', 'object', 'relation',
                # 'comment']
                edge_row = [
                    drug_id,
                    edge_label,
                    gene_id,
                    edge_relation,
                    record['ACT_COMMENT'],
                ]
                write_node_edge_item(fh=edge,
                                     header=self.edge_header,
                                     data=edge_row)

        return None