Example #1
0
    def parse(self, limit=None):
        """
        Override Source.parse()

        For every taxon returned by _get_file_paths, load its gzipped
        protein-links file, build a protein -> gene identifier map and pass
        both to _process_protein_links.

        The map comes from the taxon's id-map file when one is listed in
        self.id_map_files, otherwise from Ensembl. For taxon 9606 any
        protein missing from the map is additionally filled in from Ensembl.

        Args:
            :param limit (int, optional) limit the number of rows processed
        Returns:
            :return None
        """
        if limit is not None:
            logger.info("Only parsing first %d rows", limit)

        protein_paths = self._get_file_paths(self.tax_ids, 'protein_links')

        for taxon in protein_paths:
            ensembl = Ensembl(self.graph_type, self.are_bnodes_skized)
            string_file_path = '/'.join((
                self.rawdir, protein_paths[taxon]['file']))

            # Context manager guarantees the handle is released even when
            # pandas fails to parse the file.
            with gzip.open(string_file_path, 'rb') as fh:
                dataframe = pd.read_csv(fh, sep=r'\s+')
            p2gene_map = {}

            if taxon in self.id_map_files:
                map_file = '/'.join((
                    self.rawdir, self.id_map_files[taxon]['file']))

                with open(map_file, 'r') as mfile_handle:
                    if taxon == 9606:
                        # This file is read as "<gene>\t<protein>", with the
                        # '9606.' prefix stripped from the protein ID.
                        for line in mfile_handle:
                            gene, prot = line.rstrip("\n").split("\t")
                            p2gene_map[prot.replace('9606.', '')] \
                                = "NCBIGene:{}".format(gene)
                    else:
                        # Other taxa are read as "<protein>\t<gene>" and the
                        # gene value is stored as-is.
                        for line in mfile_handle:
                            prot, gene = line.rstrip("\n").split("\t")
                            p2gene_map[prot] = gene
            else:
                logger.info("Fetching ensembl proteins for taxon %s", taxon)
                p2gene_map = ensembl.fetch_protein_gene_map(taxon)
                for key in p2gene_map:
                    p2gene_map[key] = "ENSEMBL:{}".format(p2gene_map[key])
            if taxon == 9606:
                # Fill in proteins the map above does not cover.
                temp_map = ensembl.fetch_protein_gene_map(taxon)
                for key in temp_map:
                    if key not in p2gene_map:
                        p2gene_map[key] = "ENSEMBL:{}".format(temp_map[key])

            logger.info(
                "Finished fetching ENSP ID mappings, fetched %d proteins",
                len(p2gene_map))

            logger.info(
                "Fetching protein protein interactions for taxon %s", taxon)

            self._process_protein_links(dataframe, p2gene_map, taxon, limit)
Example #2
0
    def parse(self, limit=None):
        """
        Override Source.parse()

        For every taxon returned by _get_file_paths, load its gzipped
        protein-links file, build a protein -> list-of-gene-CURIEs map and
        pass both to _process_protein_links.

        When the taxon is listed in self.id_map_files, the map is read from
        that gzipped file: a header line followed by tab-separated rows of
        NCBI taxid, entrez, STRING, where the entrez column may hold several
        '|'-separated IDs. Otherwise the map is fetched from Ensembl.

        Args:
            :param limit (int, optional) limit the number of rows processed
        Returns:
            :return None
        Raises:
            SystemExit: if an id-map file is empty or has an unexpected header
        """
        if limit is not None:
            LOG.info("Only parsing first %d rows", limit)

        protein_paths = self._get_file_paths(self.tax_ids, 'protein_links')
        col = ['NCBI taxid', 'entrez', 'STRING']
        # Column positions never change, so look them up once.
        entrez_idx = col.index('entrez')
        string_idx = col.index('STRING')
        for taxon in protein_paths:
            ensembl = Ensembl(self.graph_type, self.are_bnodes_skized)
            string_file_path = '/'.join((
                self.rawdir, protein_paths[taxon]['file']))

            with gzip.open(string_file_path, 'rb') as reader:
                dataframe = pd.read_csv(reader, sep=r'\s+')
            p2gene_map = dict()

            if taxon in self.id_map_files:
                LOG.info("Using string provided id_map files")
                map_file = '/'.join((self.rawdir, self.id_map_files[taxon]['file']))
                # format() rather than '+' so an integer taxon does not raise
                # TypeError.
                taxon_prefix = '{}.'.format(taxon)

                with gzip.open(map_file, 'rt') as reader:
                    # Default of '' turns an empty file into the header error
                    # below instead of a bare StopIteration.
                    header = next(reader, '').strip()
                    if header != '# NCBI taxid / entrez / STRING':
                        LOG.error(
                            'Expected Headers:\t%s\nRecived Headers:\t%s\n',
                            col, header)
                        # Same effect as exit(-1), without depending on the
                        # builtin injected by the site module.
                        raise SystemExit(-1)

                    for line in reader:
                        row = line.rstrip('\n').split('\t')
                        gene = row[entrez_idx].strip()
                        prot = row[string_idx].strip()

                        p2gene_map[prot.replace(taxon_prefix, '')] = [
                            "NCBIGene:" + entrez_id
                            for entrez_id in gene.split('|')]
            else:
                LOG.info("Fetching ensembl proteins for taxon %s", taxon)
                p2gene_map = ensembl.fetch_protein_gene_map(taxon)
                # Prefix every gene in place; each map value is a list here.
                for key in p2gene_map:
                    for idx, gene in enumerate(p2gene_map[key]):
                        p2gene_map[key][idx] = "ENSEMBL:{}".format(gene)

            LOG.info(
                "Finished fetching ENSP ID mappings, fetched %i proteins",
                len(p2gene_map))

            LOG.info(
                "Fetching protein protein interactions for taxon %s", taxon)

            self._process_protein_links(dataframe, p2gene_map, taxon, limit)
Example #3
0
class EnsemblTestCase(SourceTestCase):
    """Test case for the Ensembl source, built on SourceTestCase."""

    def setUp(self):
        """Create a test-only Ensembl source restricted to its gene test IDs."""
        self.source = ensembl = Ensembl('rdf_graph', True)
        ensembl.test_ids = ensembl.all_test_ids['gene']
        ensembl.settestonly(True)
        self._setDirToSource()

    def tearDown(self):
        """Drop the reference to the source created in setUp."""
        self.source = None
Example #4
0
class EnsemblTestCase(SourceTestCase):
    """Test case for the Ensembl source, built on SourceTestCase."""

    def setUp(self):
        """Create a test-only Ensembl source using gene test IDs from the conf."""
        self.source = Ensembl()
        gene_test_ids = self._get_conf()['test_ids']['gene']
        self.source.test_ids = gene_test_ids
        self.source.settestonly(True)
        self._setDirToSource()

    def tearDown(self):
        """Drop the reference to the source created in setUp."""
        self.source = None
Example #5
0
    def setUp(self):
        """Prepare link rows, their column names and the 9606 protein map."""
        self.test_util = TestUtils()

        # Both fixture rows share the same eight score columns.
        scores = [0, 0, 0, 0, 300, 0, 150, 800]

        # Test set with two proteins from same species
        self.test_set_1 = [
            ['9606.ENSP00000000233', '9606.ENSP00000003084'] + scores,
        ]

        # Test set with deprecated protein id
        self.test_set_2 = [
            ['9606.ENSP00000000233', '9606.ENSP00000006101'] + scores,
        ]

        self.columns = [
            'protein1',
            'protein2',
            'neighborhood',
            'fusion',
            'cooccurence',
            'coexpression',
            'experimental',
            'database',
            'textmining',
            'combined_score',
        ]

        self.protein_list = Ensembl(
            'rdf_graph', True).fetch_protein_gene_map('9606')
Example #6
0
    def testFakeDataSet1(self):
        """Process test_set_1 and compare the graph with one expected triple."""
        string_db = StringDB('rdf_graph', True)
        string_db.graph = RDFGraph(True)
        self.assertEqual(len(string_db.graph), 0)

        prot_map = Ensembl('rdf_graph', True).fetch_protein_gene_map(9606)
        # Turn each gene ID into an ENSEMBL CURIE, updating the map in place.
        for protein_id in prot_map.keys():
            gene_id = prot_map[protein_id]
            prot_map[protein_id] = "ENSEMBL:{}".format(gene_id)

        protein_count = len(prot_map.keys())
        print("Finished fetching ENSP IDs, "
              "fetched {} proteins".format(protein_count))

        links = pd.DataFrame(data=self.test_set_1, columns=self.columns)
        string_db._process_protein_links(links, prot_map, 9606)

        expected_triples = """
            ENSEMBL:ENSG00000001626 RO:0002434 ENSEMBL:ENSG00000004059 .
        """
        graphs_match = self.test_util.test_graph_equality(
            expected_triples, string_db.graph)
        self.assertTrue(graphs_match)
Example #7
0
    def testFakeDataSet1(self):
        """Process test_set_1 and query the graph for the expected subject."""
        string_db = StringDB('rdf_graph', True)
        string_db.graph.bind_all_namespaces()

        prot_map = Ensembl('rdf_graph', True).fetch_protein_gene_map(9606)
        # Turn each gene ID into an ENSEMBL CURIE, updating the map in place.
        for protein_id in prot_map.keys():
            gene_id = prot_map[protein_id]
            prot_map[protein_id] = "ENSEMBL:{}".format(gene_id)

        protein_count = len(prot_map.keys())
        print("Finished fetching ENSP IDs, "
              "fetched {} proteins".format(protein_count))

        links = pd.DataFrame(data=self.test_set_1, columns=self.columns)
        string_db._process_protein_links(links, prot_map, 9606)

        sparql_query = """
                      SELECT ?prot
                      WHERE {
                          ?prot RO:0002434 ENSEMBL:ENSG00000004059 .
                      }
                      """
        results = list(string_db.graph.query(sparql_query))

        # The query should return a single one-column row.
        subject = URIRef(string_db.graph._getNode("ENSEMBL:ENSG00000001626"))
        expected = [(subject,)]
        self.assertEqual(results, expected)
Example #8
0
 def setUp(self):
     """Create a test-only Ensembl source restricted to its gene test IDs."""
     self.source = ensembl = Ensembl('rdf_graph', True)
     ensembl.test_ids = ensembl.all_test_ids['gene']
     ensembl.settestonly(True)
     self._setDirToSource()
Example #9
0
 def setUp(self):
     """Create a test-only Ensembl source using gene test IDs from the conf."""
     self.source = Ensembl()
     gene_test_ids = self._get_conf()['test_ids']['gene']
     self.source.test_ids = gene_test_ids
     self.source.settestonly(True)
     self._setDirToSource()
Example #10
0
def _dump_locations(raw_dir, taxon, version='v10.5'):
    """Return the local path and download URL of a taxon's STRING links dump."""
    file_name = '{}.protein.links.detailed.{}.txt.gz'.format(taxon, version)
    string_base = "http://string-db.org/download/" \
                  "protein.links.detailed.{}".format(version)
    return raw_dir / file_name, '{}/{}'.format(string_base, file_name)


def _read_table(gz_path):
    """Load a gzipped, whitespace-delimited table into a DataFrame."""
    # gzip.open only accepts Path objects from Python 3.6, hence str().
    with gzip.open(str(gz_path), 'rb') as handle:
        return pd.read_csv(handle, sep=r'\s+')


def _proteins_in_dump(dump_file, taxon):
    """Return the dump's distinct protein IDs with the '<taxon>.' prefix removed."""
    links = _read_table(dump_file)
    proteins = pd.unique(links[['protein1', 'protein2']].values.ravel())
    logger.info("Processing %d proteins", len(proteins))
    prefix = '{}.'.format(str(taxon))
    return [protein.replace(prefix, '') for protein in proteins]


def _write_current_gene(out_file, prot, p2gene_map):
    """Write prot's gene from the current Ensembl map; return False if absent."""
    try:
        ens_gene = p2gene_map[prot]
    except KeyError:
        return False
    out_file.write("{}\t{}\n".format(prot, "ENSEMBL:{}".format(ens_gene)))
    return True


def _map_mouse(config, cursor, raw_dir, out_path, use_cache, unmapped_file):
    """Map mouse proteins, resolving ones Ensembl lacks through MouseMine."""
    mouse_conf = config['taxa_specific']['mouse']
    taxon = mouse_conf['tax_id']
    dump_file, url = _dump_locations(raw_dir, taxon)

    with (out_path / mouse_conf['output_file']).open('w') as mouse_file:
        if not use_cache:
            download_file(url, dump_file)

        p2gene_map = Ensembl("rdf_graph", True).fetch_protein_gene_map(taxon)

        for prot in _proteins_in_dump(dump_file, taxon):
            if _write_current_gene(mouse_file, prot, p2gene_map):
                continue

            ens_gene = get_deprecated_protein_gene_rel(
                cursor, prot, mouse_conf['ensembl'], config)
            intermine_resp = query_mousemine(mouse_conf['intermine'], ens_gene)
            if intermine_resp.is_successful:
                mouse_file.write(
                    "{}\t{}\n".format(prot, intermine_resp.gene_id))
            else:
                unmapped_file.write(
                    "{}\t{}\t{}\n".format(prot, ens_gene, taxon))


def _map_fly(config, cursor, raw_dir, out_path, use_cache, unmapped_file):
    """Map fly proteins, resolving ones Ensembl lacks through an xref lookup."""
    fly_conf = config['taxa_specific']['fly']
    taxon = fly_conf['tax_id']
    dump_file, url = _dump_locations(raw_dir, taxon)

    with (out_path / fly_conf['output_file']).open('w') as fly_file:
        if not use_cache:
            download_file(url, dump_file)

        p2gene_map = Ensembl("rdf_graph", True).fetch_protein_gene_map(taxon)

        for prot in _proteins_in_dump(dump_file, taxon):
            if _write_current_gene(fly_file, prot, p2gene_map):
                continue

            ens_gene = get_xref_protein_gene_rel(
                cursor, prot, fly_conf['ensembl'], config, taxon)
            if ens_gene is not None:
                fly_file.write(
                    "{}\t{}\n".format(prot, "ENSEMBL:{}".format(ens_gene)))
            else:
                unmapped_file.write("{}\t{}\t{}\n".format(prot, '', taxon))


def _map_worm(config, cursor, raw_dir, out_path, use_cache, unmapped_file):
    """Map worm proteins, resolving ones Ensembl lacks through UniProt accessions."""
    # cursor is unused here; it is accepted so all four _map_* helpers share
    # one signature.
    worm_conf = config['taxa_specific']['worm']
    taxon = worm_conf['tax_id']
    dump_file, url = _dump_locations(raw_dir, taxon)
    uniprot_file = raw_dir / worm_conf['uniprot_file']

    with (out_path / worm_conf['output_file']).open('w') as worm_file:
        if not use_cache:
            download_file(url, dump_file)
            download_file(worm_conf['uniprot_mappings'], uniprot_file)

        ensembl = Ensembl("rdf_graph", True)
        p2gene_map = ensembl.fetch_protein_gene_map(taxon)
        uni2gene_map = ensembl.fetch_uniprot_gene_map(taxon)

        # Keep only the accession, i.e. the part before the first '|'.
        uniprot_table = _read_table(uniprot_file)
        string_uniprot_map = {
            string_id: uniprot_names.split('|')[0]
            for string_id, uniprot_names in zip(
                uniprot_table['string_id'],
                uniprot_table['uniprot_ac|uniprot_id'])
        }

        for prot in _proteins_in_dump(dump_file, taxon):
            if _write_current_gene(worm_file, prot, p2gene_map):
                continue

            try:
                ens_gene = uni2gene_map[string_uniprot_map[prot]]
            except KeyError:
                unmapped_file.write("{}\t{}\t{}\n".format(prot, '', taxon))
            else:
                worm_file.write(
                    "{}\t{}\n".format(prot, "ENSEMBL:{}".format(ens_gene)))


def _map_zebrafish(config, cursor, raw_dir, out_path, use_cache, unmapped_file):
    """Map zebrafish proteins, resolving ones Ensembl lacks through FishMine."""
    fish_conf = config['taxa_specific']['zebrafish']
    taxon = fish_conf['tax_id']
    dump_file, url = _dump_locations(raw_dir, taxon)

    with (out_path / fish_conf['output_file']).open('w') as zfin_file:
        if not use_cache:
            download_file(url, dump_file)

        p2gene_map = Ensembl("rdf_graph", True).fetch_protein_gene_map(taxon)

        for prot in _proteins_in_dump(dump_file, taxon):
            if _write_current_gene(zfin_file, prot, p2gene_map):
                continue

            # First try FishMine with the protein ID itself.
            intermine_resp = query_fishmine(fish_conf['intermine'], prot)
            if intermine_resp.is_successful:
                zfin_file.write(
                    "{}\t{}\n".format(prot, intermine_resp.gene_id))
                continue

            # Then look the gene up in the Ensembl database and retry
            # FishMine with it.
            ens_gene = get_deprecated_protein_gene_rel(
                cursor, prot, fish_conf['ensembl'], config)
            intermine_resp = query_fishmine(fish_conf['intermine'], ens_gene)
            if intermine_resp.is_successful:
                zfin_file.write(
                    "{}\t{}\n".format(prot, intermine_resp.gene_id))
                continue

            # Last attempt: the same gene, queried as a "Pseudogene".
            intermine_resp = query_fishmine(
                fish_conf['intermine'], ens_gene, "Pseudogene")
            if intermine_resp.is_successful:
                zfin_file.write(
                    "{}\t{}\n".format(prot, intermine_resp.gene_id))
            else:
                unmapped_file.write(
                    "{}\t{}\t{}\n".format(prot, ens_gene, taxon))


def main():
    """
    Build protein -> gene ID map files for mouse, fly, worm and zebrafish.

    Reads the --config file, downloads the STRING v10.5 detailed protein
    links dump for each taxon into <out>/out (skipped with --use_cache),
    and writes one tab-separated map file per taxon under <out>. Proteins
    that cannot be resolved are written to <out>/unmapped_ids.tsv.

    Every taxon first tries the current Ensembl protein -> gene map, then
    falls back as follows:

    Zebrafish:
        1. Map ENSP to ZFIN Ids using Intermine
        2. Map deprecated ENSP IDs to ensembl genes
           by querying the ensembl database then use
           intermine to resolve to gene IDs
    Mouse: Map deprecated ENSP IDs to ensembl genes
           by querying the ensembl database then use
           intermine to resolve to MGI IDs
    Fly: ENSP IDs appear as xrefs on translation IDs
    Worm: Use UniProt Mapping file provided by String
    """
    parser = argparse.ArgumentParser(usage=__doc__)
    parser.add_argument('--config', '-c', required=True, help='JSON configuration file')
    parser.add_argument('--out', '-o', required=False, help='output directory', default="./")
    parser.add_argument('--use_cache', '-cached', action="store_true",
                        required=False, help='use cached files', default=False)
    args = parser.parse_args()

    # Hardcoded dir for raw files
    out_path = Path(args.out)
    raw_dir = out_path / "out"
    raw_dir.mkdir(parents=True, exist_ok=True)

    # safe_load: bare yaml.load() without a Loader can construct arbitrary
    # Python objects and is a TypeError on PyYAML >= 6. A JSON config parses
    # with the safe loader.
    with open(args.config, 'r') as config_file:
        config = yaml.safe_load(config_file)

    # Hardcoded unmapped file
    with (out_path / "unmapped_ids.tsv").open("w") as unmapped_file:
        # Connect to ensembl
        connection = connect_to_database(
            host=config['database']['host'],
            username=config['database']['username'],
            port=config['database']['port'])
        try:
            cursor = connection.cursor()
            for build_map in (_map_mouse, _map_fly, _map_worm, _map_zebrafish):
                build_map(config, cursor, raw_dir, out_path,
                          args.use_cache, unmapped_file)
        finally:
            # Close the connection even if one of the taxa fails part-way.
            connection.close()

    logger.info("ID Map Finished")