Example #1
0
    def _process_orthologs(self, raw, limit=None):
        """
        Map the genes listed in a tab-separated file to KEGG orthology classes.

        Every row of ``raw`` must hold exactly two columns: a gene id and an
        orthology class id.  Both values are stripped of surrounding
        whitespace and prefixed with ``KEGG:`` before they are used.

        Triples created:
        <gene_id> is a class
        <orthology_class_id> is a class

        In addition the orthology class id is handed to
        ``OrthologyAssoc(...).add_gene_family_to_graph`` for the gene.

        :param raw: path of the two-column, tab-delimited input file
            (read as iso-8859-1)
        :param limit: when not None and not in test mode, stop after the
            row whose 1-based number exceeds this value
        :return: None

        """

        LOG.info("Processing orthologs")
        graph = self.testgraph if self.test_mode else self.graph
        model = Model(graph)

        with open(raw, 'r', encoding="iso-8859-1") as tsvfile:
            reader = csv.reader(tsvfile, delimiter='\t', quotechar='\"')
            for row_number, row in enumerate(reader, start=1):
                # unpacking enforces the two-column layout
                raw_gene, raw_class = row
                orthology_class_id = 'KEGG:' + raw_class.strip()
                gene_id = 'KEGG:' + raw_gene.strip()

                # add the KO id as a gene-family grouping class
                family_assoc = OrthologyAssoc(graph, self.name, gene_id, None)
                family_assoc.add_gene_family_to_graph(orthology_class_id)

                # add gene and orthology class to graph;
                # assume labels will be taken care of elsewhere
                model.addClassToGraph(gene_id, None)
                model.addClassToGraph(orthology_class_id, None)

                # the row limit is only honoured outside of test mode
                if self.test_mode or limit is None:
                    continue
                if row_number > limit:
                    break

        LOG.info("Done with orthologs")
Example #2
0
    def _get_relations(self, limit):
        """
        Process each of the configured orthoxml files and extract the
        induced orthology and paralogy associations based on the
        xml group nodes.

        The specs for orthoxml can be found here: http://orthoxml.org

        We currently extract triples for orthologous relations,
        paralogous relations and in_taxon relations to NCBITaxonId
        attributes, e.g.

        Triples:
        <protein1_id> RO:orthologous <protein2_id>
        <assoc_id> :hasSubject <protein1_id>
        <assoc_id> :hasObject <protein2_id>
        <assoc_id> :hasPredicate <RO:orthologous>
        <assoc_id> dc:evidence ECO:phylogenetic_evidence

        :param limit: limit the number of induced pairwise relations taken
            from each file; ignored in test mode
        :return: None

        """
        logger.info("getting ortholog and paralog relations")

        graph = self.testgraph if self.testMode else self.graph
        model = Model(graph)

        for k in self.files:
            f = os.path.join(self.rawdir, self.files[k]['file'])
            # reset per file, so ``limit`` applies to every file separately
            matchcounter = 0
            logger.info("Parsing %s", f)

            time_start = time.time()
            xml = lxml.etree.parse(f)
            parser = OrthoXMLParser(xml)
            logger.info(
                "loaded {} into memory. Took {}sec. Starting to extract relations..."
                .format(f,
                        time.time() - time_start))

            time0, last_cnt = time.time(), 0
            for cnts, (protein_nr_a, protein_nr_b, rel_type) in enumerate(
                    parser.extract_pairwise_relations()):
                protein_a = parser.gene_mapping[protein_nr_a]
                protein_b = parser.gene_mapping[protein_nr_b]

                protein_id_a = protein_a.get('protId')
                protein_id_b = protein_b.get('protId')

                # progress report: checked on every 100th relation and
                # emitted only when 30 seconds passed since the last report.
                # NOTE(review): relies on add_protein_to_graph exposing
                # cache_info() (e.g. an lru_cache wrapper) - confirm.
                if cnts % 100 == 0 and time.time() - time0 > 30:
                    logger.info(
                        "processed {0:d} rels in {1:.1f}sec: "
                        "{2:.3f}/sec; overall {3:d} in {4:.1f}sec "
                        "({5:.3f}/sec); cache ratio: {6.hits}/{6.misses}".
                        format(cnts - last_cnt,
                               time.time() - time0,
                               (cnts - last_cnt) / (time.time() - time0), cnts,
                               time.time() - time_start,
                               cnts / (time.time() - time_start),
                               self.add_protein_to_graph.cache_info()))
                    time0, last_cnt = time.time(), cnts

                # in test mode keep only pairs that involve a test id
                if self.testMode and not (protein_id_a in self.test_ids
                                          or protein_id_b in self.test_ids):
                    continue

                # counted before the taxon filter, so it also includes
                # relations that the filter below rejects
                matchcounter += 1
                taxon_a = self.extract_taxon_info(protein_a)
                taxon_b = self.extract_taxon_info(protein_b)

                # keep the pair only if both proteins belong to selected taxa
                if (self.tax_ids is not None
                        and ((int(re.sub(r'NCBITaxon:', '', taxon_a.rstrip()))
                              not in self.tax_ids) or
                             (int(re.sub(r'NCBITaxon:', '', taxon_b.rstrip()))
                              not in self.tax_ids))):
                    continue

                protein_id_a = self.clean_protein_id(protein_id_a)
                protein_id_b = self.clean_protein_id(protein_id_b)
                # add genes to graph if needed;
                # assume labels will be taken care of elsewhere
                self.add_protein_to_graph(protein_id_a, taxon_a, model)
                self.add_protein_to_graph(protein_id_b, taxon_b, model)

                # translate the relation code through the global term table
                rel = self.globaltt[rel_type]
                evidence_id = self.globaltt[
                    'phylogenetic evidence']  # 'ECO:0000080'
                # add the association and relevant nodes to graph
                assoc = OrthologyAssoc(graph, self.name, protein_id_a,
                                       protein_id_b, rel)
                assoc.add_evidence(evidence_id)
                assoc.add_association_to_graph()

                if not self.testMode and limit is not None and matchcounter > limit:
                    logger.warning(
                        "reached limit of relations to extract. Stopping early..."
                    )
                    break

            logger.info("finished processing %s", f)
Example #3
0
    def add_orthologs_by_gene_group(self, graph, gene_ids):
        """
        Add orthology associations for ``gene_ids`` from the gene_group file.

        This will get orthologies between human and other vertebrate genomes
        based on the gene_group annotation pipeline from NCBI.
        More information can be learned here:
        http://www.ncbi.nlm.nih.gov/news/03-13-2014-gene-provides-orthologs-regions/
        The method for associations is described in
        [PMCID:3882889](http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3882889/)
        == [PMID:24063302](http://www.ncbi.nlm.nih.gov/pubmed/24063302/).
        Because these are only between human and vertebrate genomes,
        they will certainly miss out on very distant orthologies,
        and should not be considered complete.

        We do not run this within the NCBI parser itself;
        rather it is a convenience function for other parsers to call.

        Only rows whose relationship column equals 'Ortholog' are used.
        A requested gene is looked up as the "other" gene (5th column) of a
        row; it is then associated with every member of each group it
        belongs to, which includes the group lead and the gene itself.

        :param graph: graph that receives the classes, taxa and associations
        :param gene_ids: Gene ids ('NCBIGene:' prefixed) to fetch the
            orthology for
        :return: None

        """

        logger.info("getting gene groups")
        gene_group_path = '/'.join(
            (self.rawdir, self.files['gene_group']['file']))
        found_counter = 0

        # Many of the orthologous groups are keyed by the human gene, so a
        # two-way index is built up front:
        #   group lead id => member gene ids
        #   member gene id => group lead ids
        # This is the fastest approach, though not memory-efficient.
        geno = Genotype(graph)
        model = Model(graph)
        members_by_group = {}
        groups_by_gene = {}
        taxon_by_gene = {}

        with gzip.open(gene_group_path, 'rb') as gzfile:
            reader = csv.reader(
                io.TextIOWrapper(gzfile, newline=""),
                delimiter='\t', quotechar='\"')

            for row in reader:
                # comment lines start with an octothorp
                if re.match(r'\#', ''.join(row)):
                    continue
                tax_a, gene_a, rel, tax_b, gene_b = row

                if rel != 'Ortholog':
                    continue

                members = members_by_group.setdefault(gene_a, set())
                members.add(gene_b)
                groups_by_gene.setdefault(gene_b, set()).add(gene_a)

                taxon_by_gene[gene_a] = tax_a
                taxon_by_gene[gene_b] = tax_b

                # the group lead is a member of its own group as well
                members.add(gene_a)

        logger.debug("Finished hashing gene groups")
        logger.debug("Making orthology associations")
        for gid in gene_ids:
            gene_num = re.sub(r'NCBIGene:', '', gid)
            group_nums = groups_by_gene.get(gene_num)
            if group_nums is None:
                continue
            for group_num in group_nums:
                orthologs = members_by_group.get(group_num)
                if orthologs is None:
                    continue
                for ortholog in orthologs:
                    oid = 'NCBIGene:' + str(ortholog)
                    model.addClassToGraph(
                        oid, None, Genotype.genoparts['gene'])
                    otaxid = 'NCBITaxon:' + str(taxon_by_gene[ortholog])
                    geno.addTaxon(otaxid, oid)
                    assoc = OrthologyAssoc(graph, self.name, gid, oid)
                    assoc.add_source('PMID:24063302')
                    assoc.add_association_to_graph()
                    # todo get gene label for orthologs -
                    # this could get expensive
                    found_counter += 1

        logger.info("Made %d orthology relationships for %d genes",
                    found_counter, len(gene_ids))
Example #4
0
    def _get_relations(self, limit):
        """
        Process each of the configured orthoxml files and extract the
        induced orthology and paralogy associations based on the
        xml group nodes.

        The specs for orthoxml can be found here: http://orthoxml.org

        We currently extract triples for orthologous relations,
        paralogous relations and in_taxon relations to NCBITaxonId
        attributes, e.g.

        Triples:
        <protein1_id> RO:orthologous <protein2_id>
        <assoc_id> :hasSubject <protein1_id>
        <assoc_id> :hasObject <protein2_id>
        <assoc_id> :hasPredicate <RO:orthologous>
        <assoc_id> dc:evidence ECO:phylogenetic_evidence

        :param limit: limit the number of induced pairwise relations taken
            from each file; ignored in test mode
        :return: None

        """
        logger.info("getting ortholog and paralog relations")

        g = self.testgraph if self.testMode else self.graph
        model = Model(g)

        for k in self.files:
            f = os.path.join(self.rawdir, self.files[k]['file'])
            # reset per file, so ``limit`` applies to every file separately
            matchcounter = 0
            logger.info("Parsing %s", f)

            time_start = time.time()
            xml = lxml.etree.parse(f)
            parser = OrthoXMLParser(xml)
            logger.info(
                "loaded {} into memory. Took {}sec to load. "
                "Starting to extract relations..."
                .format(f, time.time() - time_start))

            time0, last_cnt = time.time(), 0
            for cnts, (protein_nr_a, protein_nr_b, rel_type) in enumerate(
                    parser.extract_pairwise_relations()):
                protein_a = parser.gene_mapping[protein_nr_a]
                protein_b = parser.gene_mapping[protein_nr_b]

                protein_id_a = protein_a.get('protId')
                protein_id_b = protein_b.get('protId')

                # progress report: checked on every 100th relation and
                # emitted only when 30 seconds passed since the last report.
                # NOTE(review): relies on add_protein_to_graph exposing
                # cache_info() (e.g. an lru_cache wrapper) - confirm.
                if cnts % 100 == 0 and time.time() - time0 > 30:
                    logger.info(
                        "processed {0:d} rels in {1:.1f}sec: {2:.3f}/sec; "
                        "overall {3:d} in "
                        "{4:.1f}sec ({5:.3f}/sec); "
                        "cache ratio: {6.hits}/{6.misses}"
                        .format(cnts - last_cnt,
                                time.time() - time0,
                                (cnts - last_cnt) / (time.time() - time0),
                                cnts,
                                time.time() - time_start,
                                cnts / (time.time() - time_start),
                                self.add_protein_to_graph.cache_info()))
                    time0, last_cnt = time.time(), cnts

                # in test mode keep only pairs that involve a test id
                if self.testMode and not (protein_id_a in self.test_ids
                                          or protein_id_b in self.test_ids):
                    continue

                # counted before the taxon filter, so it also includes
                # relations that the filter below rejects
                matchcounter += 1
                taxon_a = self.extract_taxon_info(protein_a)
                taxon_b = self.extract_taxon_info(protein_b)

                # keep the pair only if both proteins belong to selected taxa
                if (self.tax_ids is not None and (
                        (int(re.sub(r'NCBITaxon:', '', taxon_a.rstrip()))
                         not in self.tax_ids) or
                        (int(re.sub(r'NCBITaxon:', '', taxon_b.rstrip()))
                         not in self.tax_ids))):
                    continue

                protein_id_a = self.clean_protein_id(protein_id_a)
                protein_id_b = self.clean_protein_id(protein_id_b)
                # add genes to graph if needed;
                # assume labels will be taken care of elsewhere
                self.add_protein_to_graph(protein_id_a, taxon_a, model)
                self.add_protein_to_graph(protein_id_b, taxon_b, model)

                rel = self._map_orthology_code_to_RO[rel_type]
                evidence_id = 'ECO:0000080'  # phylogenetic evidence
                # add the association and relevant nodes to graph
                assoc = OrthologyAssoc(
                    g, self.name, protein_id_a, protein_id_b, rel)
                assoc.add_evidence(evidence_id)
                assoc.add_association_to_graph()

                if not self.testMode \
                        and limit is not None and matchcounter > limit:
                    logger.warning(
                        "reached limit of relations to extract. Stopping early...")
                    break

            logger.info("finished processing %s", f)
Example #5
0
    def add_orthologs_by_gene_group(self, graph, gene_ids):
        """
        Add orthology associations for ``gene_ids`` from the gene_group file.

        This will get orthologies between human and other vertebrate genomes
        based on the gene_group annotation pipeline from NCBI.
        More information can be learned here:
        http://www.ncbi.nlm.nih.gov/news/03-13-2014-gene-provides-orthologs-regions/
        The method for associations is described in
        [PMCID:3882889](http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3882889/)
        == [PMID:24063302](http://www.ncbi.nlm.nih.gov/pubmed/24063302/).
        Because these are only between human and vertebrate genomes,
        they will certainly miss out on very distant orthologies,
        and should not be considered complete.

        We do not run this within the NCBI parser itself;
        rather it is a convenience function for other parsers to call.

        Column positions are taken from ``self.files['gene_group']['columns']``.
        The first line of the file is treated as the header and checked with
        ``self.check_fileheader``; a failed check is logged and parsing
        continues with the configured columns.

        :param graph: graph that receives the classes, taxa and associations
        :param gene_ids: Gene ids ('NCBIGene:' prefixed) to fetch the
            orthology for
        :return: None
        :raises ValueError: if one of the required column names is missing
            from the configured column list

        """
        src_key = 'gene_group'
        LOG.info("getting gene groups")
        src_file = '/'.join((self.rawdir, self.files[src_key]['file']))
        found_counter = 0
        # because many of the orthologous groups are grouped by human gene,
        # we need to do this by generating two-way hash

        # group_id => orthologs
        # ortholog id => group
        # this will be the fastest approach, though not memory-efficient.
        geno = Genotype(graph)
        model = Model(graph)
        group_to_orthology = {}
        gene_to_group = {}
        gene_to_taxon = {}
        col = self.files[src_key]['columns']

        # resolve the column positions once instead of once per row
        tax_a_idx = col.index('tax_id')
        gene_a_idx = col.index('GeneID')
        rel_idx = col.index('relationship')
        tax_b_idx = col.index('Other_tax_id')
        gene_b_idx = col.index('Other_GeneID')

        with gzip.open(src_file, 'rb') as tsv:
            header = tsv.readline().decode().strip().split('\t')
            header[0] = header[0][1:]  # strip octothorp
            if not self.check_fileheader(col, header):
                # best effort: keep going, but do not hide the mismatch
                LOG.warning(
                    "Header of %s does not match the expected columns; "
                    "parsing with the configured column order", src_file)
            for line in tsv:
                row = line.decode().strip().split('\t')
                tax_a = row[tax_a_idx]
                gene_a = row[gene_a_idx].strip()
                rel = row[rel_idx]
                tax_b = row[tax_b_idx]
                gene_b = row[gene_b_idx].strip()

                if rel != 'Ortholog':
                    continue

                if gene_a not in group_to_orthology:
                    group_to_orthology[gene_a] = set()
                group_to_orthology[gene_a].add(gene_b)

                if gene_b not in gene_to_group:
                    gene_to_group[gene_b] = set()
                gene_to_group[gene_b].add(gene_a)

                gene_to_taxon[gene_a] = tax_a
                gene_to_taxon[gene_b] = tax_b

                # also add the group lead as a member of the group
                group_to_orthology[gene_a].add(gene_a)

            # end loop through gene_group file
        LOG.debug("Finished hashing gene groups")
        LOG.debug("Making orthology associations")
        for gid in gene_ids:
            gene_num = re.sub(r'NCBIGene:', '', gid)
            group_nums = gene_to_group.get(gene_num)
            if group_nums is not None:
                for group_num in group_nums:
                    orthologs = group_to_orthology.get(group_num)
                    if orthologs is not None:
                        for orth in orthologs:
                            oid = 'NCBIGene:' + str(orth)
                            model.addClassToGraph(oid, None,
                                                  self.globaltt['gene'])
                            otaxid = 'NCBITaxon:' + str(gene_to_taxon[orth])
                            geno.addTaxon(otaxid, oid)
                            assoc = OrthologyAssoc(graph, self.name, gid, oid)
                            assoc.add_source('PMID:24063302')
                            assoc.add_association_to_graph()
                            # todo get gene label for orthologs -
                            # this could get expensive
                            found_counter += 1

            # finish loop through annotated genes
        LOG.info("Made %d orthology relationships for %d genes", found_counter,
                 len(gene_ids))
Example #6
0
    def _get_orthologs(self, limit):
        """
        This will process each of the specified pairwise orthology files,
        creating orthology associations based on the specified orthology code.
        This currently assumes that each of the orthology files is identically
        formatted. Relationships are made between genes here.

        There is also a nominal amount of identifier re-formatting:
        MGI:MGI --> MGI
        Ensembl --> ENSEMBL

        We skip any genes where we don't know how to map the gene identifiers.
        For example, Gene:Huwe1 for RAT is not an identifier, so we skip any
        mappings to this identifier.  Often, there are two entries for the
        same gene (based on equivalent Uniprot id), and so we are not actually
        losing any information.

        When ``self.tax_ids`` is not None, a row is skipped only if neither
        of its two taxa is listed in ``self.tax_ids``.

        Genes are also added to a grouping class defined with a PANTHER id.

        Triples:
        <gene1_id> RO:orthologous <gene2_id>
        <assoc_id> :hasSubject <gene1_id>
        <assoc_id> :hasObject <gene2_id>
        <assoc_id> :hasPredicate <RO:orthologous>
        <assoc_id> dc:evidence ECO:phylogenetic_evidence

        <panther_id> a DATA:gene_family
        <panther_id> RO:has_member <gene1_id>
        <panther_id> RO:has_member <gene2_id>

        :param limit: per-file cap; parsing of a file stops once the number
            of rows passing the taxon filter exceeds it (in any mode), or,
            outside test mode, once the number of non-comment lines read
            exceeds it
        :return: None

        """
        logger.info("getting orthologs")

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)
        # accumulated across all files; reported after each file
        unprocessed_gene_ids = set()

        for k in self.files:
            f = '/'.join((self.rawdir, self.files[k]['file']))
            matchcounter = 0
            line_counter = 0

            # the context manager closes the archive even if parsing fails
            with tarfile.open(f, 'r:gz') as mytar:
                # assume that the first entry is the item
                fname = mytar.getmembers()[0]
                logger.info("Parsing %s", fname.name)
                with mytar.extractfile(fname) as csvfile:
                    for line in csvfile:
                        # the archive member yields bytes; decode once
                        line = line.decode()
                        # skip comment lines
                        if re.match(r'^#', line):
                            logger.info("Skipping header line")
                            continue
                        line_counter += 1

                        # a little feedback to the user since there's so many
                        if line_counter % 1000000 == 0:
                            logger.info(
                                "Processed %d lines from %s",
                                line_counter, fname.name)

                        line = line.strip()

                        # parse each row. ancestor_taxon is unused
                        # HUMAN|Ensembl=ENSG00000184730|UniProtKB=Q0VD83
                        #   	MOUSE|MGI=MGI=2176230|UniProtKB=Q8VBT6
                        #       	LDO	Euarchontoglires	PTHR15964
                        (a, b, orthology_class, _ancestor_taxon,
                         panther_id) = line.split('\t')
                        (species_a, gene_a, protein_a) = a.split('|')
                        (species_b, gene_b, protein_b) = b.split('|')

                        # skip the entries that don't have homolog
                        # relationships with the test ids
                        if self.testMode and not (
                                re.sub(r'UniProtKB=', '',
                                       protein_a) in self.test_ids or
                                re.sub(r'UniProtKB=', '', protein_b)
                                in self.test_ids):
                            continue

                        # map the taxon abbreviations to ncbi taxon ids
                        taxon_a = self._map_taxon_abbr_to_id(species_a)
                        taxon_b = self._map_taxon_abbr_to_id(species_b)

                        # taxon filter: the AND below drops a row only when
                        # neither gene is in self.tax_ids; switching it to OR
                        # would require both genes to be in the list
                        if (
                                self.tax_ids is not None and
                                (int(re.sub(r'NCBITaxon:', '',
                                            taxon_a.rstrip()))
                                 not in self.tax_ids) and
                                (int(re.sub(r'NCBITaxon:', '',
                                            taxon_b.rstrip()))
                                 not in self.tax_ids)):
                            continue

                        # this cap is applied in test mode as well
                        matchcounter += 1
                        if limit is not None and matchcounter > limit:
                            break

                        # fix the gene identifiers
                        gene_a = re.sub(r'=', ':', gene_a)
                        gene_b = re.sub(r'=', ':', gene_b)

                        clean_gene = self._clean_up_gene_id(gene_a, species_a)
                        if clean_gene is None:
                            unprocessed_gene_ids.add(gene_a)
                        gene_a = clean_gene
                        clean_gene = self._clean_up_gene_id(gene_b, species_b)
                        if clean_gene is None:
                            unprocessed_gene_ids.add(gene_b)
                        gene_b = clean_gene

                        # a special case here; mostly some rat genes
                        # they use symbols instead of identifiers.  will skip
                        if gene_a is None or gene_b is None:
                            continue

                        rel = self._map_orthology_code_to_RO(orthology_class)

                        evidence_id = 'ECO:0000080'  # phylogenetic evidence

                        # add the association and relevant nodes to graph
                        assoc = OrthologyAssoc(
                            g, self.name, gene_a, gene_b, rel)
                        assoc.add_evidence(evidence_id)

                        # add genes to graph;
                        # assume labels will be taken care of elsewhere
                        model.addClassToGraph(gene_a, None)
                        model.addClassToGraph(gene_b, None)

                        # might as well add the taxon info for completeness
                        g.addTriple(
                            gene_a, model.object_properties['in_taxon'],
                            taxon_a)
                        g.addTriple(
                            gene_b, model.object_properties['in_taxon'],
                            taxon_b)

                        assoc.add_association_to_graph()

                        # note this is incomplete...
                        # it won't construct the full family hierarchy,
                        # just the top-grouping
                        assoc.add_gene_family_to_graph(
                            ':'.join(('PANTHER', panther_id)))

                        if not self.testMode \
                                and limit is not None and line_counter > limit:
                            break

            logger.info("finished processing %s", f)
            logger.warning(
                "The following gene ids were unable to be processed: %s",
                str(unprocessed_gene_ids))
Example #7
0
    def add_orthologs_by_gene_group(self, graph, gene_ids):
        """
        Add orthology associations for ``gene_ids`` from the gene_group file.

        This will get orthologies between human and other vertebrate genomes
        based on the gene_group annotation pipeline from NCBI.
        More information can be learned here:
        http://www.ncbi.nlm.nih.gov/news/03-13-2014-gene-provides-orthologs-regions/
        The method for associations is described in
        [PMCID:3882889](http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3882889/)
        == [PMID:24063302](http://www.ncbi.nlm.nih.gov/pubmed/24063302/).
        Because these are only between human and vertebrate genomes,
        they will certainly miss out on very distant orthologies,
        and should not be considered complete.

        We do not run this within the NCBI parser itself;
        rather it is a convenience function for other parsers to call.

        Only rows whose relationship column equals 'Ortholog' are used.
        A requested gene is looked up as the "other" gene (5th column) of a
        row; it is then associated with every member of each group it
        belongs to, which includes the group lead and the gene itself.

        :param graph: graph that receives the classes, taxa and associations
        :param gene_ids: Gene ids ('NCBIGene:' prefixed) to fetch the
            orthology for
        :return: None

        """

        logger.info("getting gene groups")
        source_path = '/'.join((self.rawdir, self.files['gene_group']['file']))
        found_counter = 0

        # The groups are keyed by their lead (mostly human) gene, hence a
        # two-way lookup is prepared before any association is made:
        #   lead gene id => ids of all genes in its group
        #   gene id => lead gene ids of the groups it is part of
        # Fast, though not memory-efficient.
        geno = Genotype(graph)
        model = Model(graph)
        group_members = {}
        gene_groups = {}
        gene_taxa = {}

        with gzip.open(source_path, 'rb') as handle:
            text_stream = io.TextIOWrapper(handle, newline="")
            rows = csv.reader(text_stream, delimiter='\t', quotechar='\"')

            for row in rows:
                # skip comment lines
                if re.match(r'\#', ''.join(row)):
                    continue
                tax_a, gene_a, rel, tax_b, gene_b = row

                if rel != 'Ortholog':
                    continue

                current_group = group_members.setdefault(gene_a, set())
                current_group.add(gene_b)
                gene_groups.setdefault(gene_b, set()).add(gene_a)

                gene_taxa[gene_a] = tax_a
                gene_taxa[gene_b] = tax_b

                # also add the group lead as a member of the group
                current_group.add(gene_a)

        logger.debug("Finished hashing gene groups")
        logger.debug("Making orthology associations")
        for gid in gene_ids:
            gene_num = re.sub(r'NCBIGene:', '', gid)
            lead_ids = gene_groups.get(gene_num)
            if lead_ids is None:
                continue
            for lead_id in lead_ids:
                members = group_members.get(lead_id)
                if members is None:
                    continue
                for member in members:
                    oid = 'NCBIGene:' + str(member)
                    model.addClassToGraph(
                        oid, None, Genotype.genoparts['gene'])
                    otaxid = 'NCBITaxon:' + str(gene_taxa[member])
                    geno.addTaxon(otaxid, oid)
                    assoc = OrthologyAssoc(graph, self.name, gid, oid)
                    assoc.add_source('PMID:24063302')
                    assoc.add_association_to_graph()
                    # todo get gene label for orthologs -
                    # this could get expensive
                    found_counter += 1

        logger.info(
            "Made %d orthology relationships for %d genes",
            found_counter, len(gene_ids))
Example #8
0
    def _get_orthologs(self, src_key, limit):
        """
        Process one pairwise orthology file, creating orthology associations
        based on the orthology code found on each row.
        Relationships are made between genes here.

        The archive member named ``src_key`` is read from the gzipped tar file
        ``self.files[src_key]['file']`` (located under ``self.rawdir``), and
        the column layout is taken from ``self.files[src_key]['columns']``.
        Rows are tab separated; the 'Gene' and 'Ortholog' columns each hold
        ``SPECIES|gene|protein``.

        There is also a nominal amount of identifier re-formatting:
        MGI:MGI --> MGI
        Ensembl --> ENSEMBL

        We skip any genes where we don't know how to map the gene identifiers
        (i.e. ``self._clean_up_gene_id`` returns None).
        For example, Gene:Huwe1 for RAT is not an identifier, so we skip any
        mappings to this identifier.  Often, there are two entries for the
        same gene (based on equivalent Uniprot id), and so we are not actually
        losing any information.

        When ``self.tax_ids`` is set, a row is kept if AT LEAST ONE gene of
        the pair belongs to a taxon in ``self.tax_ids``; rows where neither
        does are dropped.

        Genes are also added to a grouping class defined with a PANTHER id.

        Triples:
        <gene1_id> RO:orthologous <gene2_id>
        <assoc_id> :hasSubject <gene1_id>
        <assoc_id> :hasObject <gene2_id>
        <assoc_id> :hasPredicate <RO:orthologous>
        <assoc_id> dcterms:evidence ECO:phylogenetic_evidence

        <panther_id> rdf:type DATA:gene_family
        <panther_id> RO:has_member <gene1_id>
        <panther_id> RO:has_member <gene2_id>

        :param src_key: key into self.files; also the name of the member
            extracted from the tar archive
        :param limit: maximum number of rows passing the taxon filter to
            process, or None for no limit
        :return: None

        """
        LOG.info("reading orthologs")

        graph = self.testgraph if self.test_mode else self.graph
        model = Model(graph)

        # a set: the same unmappable identifier may occur on many rows
        unprocessed_gene_ids = set()

        src_file = '/'.join((self.rawdir, self.files[src_key]['file']))
        matchcounter = 0
        col = self.files[src_key]['columns']

        # the column layout is constant for the file; resolve positions once
        # instead of searching the column list for every row
        gene_idx = col.index('Gene')
        ortholog_idx = col.index('Ortholog')
        type_idx = col.index('Type of ortholog')
        panther_idx = col.index('Panther Ortholog ID')

        # context managers make sure the archive and the extracted member
        # are closed even when a row fails to parse
        with tarfile.open(src_file, 'r:gz') as reader:
            LOG.info("Parsing %s", src_key)

            with reader.extractfile(src_key) as csvfile:
                # there are no comments or headers
                for line in csvfile:
                    # parse each row. ancestor_taxons is unused
                    # HUMAN|Ensembl=ENSG00000184730|UniProtKB=Q0VD83
                    #   	MOUSE|MGI=MGI=2176230|UniProtKB=Q8VBT6
                    #       	LDO	Euarchontoglires	PTHR15964
                    row = line.decode().split('\t')
                    thing1 = row[gene_idx].strip()
                    thing2 = row[ortholog_idx].strip()
                    orthology_type = row[type_idx].strip()
                    panther_id = row[panther_idx].strip()

                    (species_a, gene_a, protein_a) = thing1.split('|')
                    (species_b, gene_b, protein_b) = thing2.split('|')

                    # for testing skip entries without homolog relationships
                    # to test ids
                    # NOTE(review): 'UniProtKB=' is 10 characters long, so
                    # [9:] keeps the '=' (e.g. '=Q0VD83'); confirm this
                    # matches the format of the entries in self.test_ids
                    if self.test_mode and not (
                            protein_a[9:] in self.test_ids or
                            protein_b[9:] in self.test_ids):
                        continue

                    # map the species abbreviations to ncbi taxon id numbers
                    taxon_a = self.resolve(species_a).split(':')[1].strip()
                    taxon_b = self.resolve(species_b).split(':')[1].strip()

                    # some-filter: drop the row only when NEITHER gene is in
                    # a taxon of interest
                    if self.tax_ids is not None and (
                            taxon_a not in self.tax_ids) and (
                            taxon_b not in self.tax_ids):
                        continue

                    # the limit counts rows that survive the taxon filter
                    matchcounter += 1
                    if limit is not None and matchcounter > limit:
                        break

                    # fix the gene identifiers
                    gene_a = gene_a.replace('=', ':')
                    gene_b = gene_b.replace('=', ':')

                    clean_gene = self._clean_up_gene_id(gene_a, species_a)
                    if clean_gene is None:
                        unprocessed_gene_ids.add(gene_a)
                        continue
                    gene_a = clean_gene

                    clean_gene = self._clean_up_gene_id(gene_b, species_b)
                    if clean_gene is None:
                        unprocessed_gene_ids.add(gene_b)
                        continue
                    gene_b = clean_gene

                    rel = self.resolve(orthology_type)
                    evidence_id = self.globaltt['phylogenetic evidence']

                    # add the association and relevant nodes to graph
                    assoc = OrthologyAssoc(
                        graph, self.name, gene_a, gene_b, rel)
                    assoc.add_evidence(evidence_id)

                    # add genes to graph;
                    # assume labels will be taken care of elsewhere
                    model.addType(gene_a, self.globaltt['gene'])
                    model.addType(gene_b, self.globaltt['gene'])

                    # might as well add the taxon info for completeness
                    graph.addTriple(
                        gene_a, self.globaltt['in taxon'],
                        'NCBITaxon:' + taxon_a
                    )
                    graph.addTriple(
                        gene_b, self.globaltt['in taxon'],
                        'NCBITaxon:' + taxon_b
                    )

                    assoc.add_association_to_graph(
                        blv.terms['GeneToGeneHomologyAssociation']
                    )

                    # note this is incomplete...
                    # it won't construct the full family hierarchy,
                    # just the top-grouping
                    assoc.add_gene_family_to_graph('PANTHER:' + panther_id)

        LOG.info("finished processing %s", src_file)
        # only warn when something was actually skipped
        if unprocessed_gene_ids:
            LOG.warning(
                "The following gene ids were unable to be processed: %s",
                str(unprocessed_gene_ids))
Example #9
0
    def _get_orthologs(self, limit):
        """
        Process each of the configured pairwise orthology files, creating
        orthology associations based on the orthology code on each row.
        This currently assumes that each of the orthology files is identically
        formatted.  Relationships are made between genes here.

        For every entry in ``self.files`` the gzipped tar archive is opened
        and its FIRST member is parsed as a tab-separated file with five
        columns; lines starting with '#' are skipped.

        There is also a nominal amount of identifier re-formatting:
        MGI:MGI --> MGI
        Ensembl --> ENSEMBL

        We skip any genes where we don't know how to map the gene identifiers
        (i.e. ``self._clean_up_gene_id`` returns None).  For example,
        Gene:Huwe1 for RAT is not an identifier, so we skip any mappings to
        this identifier.  Often, there are two entries for the same gene
        (based on equivalent Uniprot id), and so we are not actually losing
        any information.

        When ``self.tax_ids`` is set, a row is kept if AT LEAST ONE gene of
        the pair has a numeric NCBI taxon id in ``self.tax_ids``; rows where
        neither does are dropped.

        Genes are also added to a grouping class defined with a PANTHER id.

        Triples:
        <gene1_id> RO:orthologous <gene2_id>
        <assoc_id> :hasSubject <gene1_id>
        <assoc_id> :hasObject <gene2_id>
        <assoc_id> :hasPredicate <RO:orthologous>
        <assoc_id> dc:evidence ECO:phylogenetic_evidence

        <panther_id> a DATA:gene_family
        <panther_id> RO:has_member <gene1_id>
        <panther_id> RO:has_member <gene2_id>

        :param limit: per-file cap, or None for no cap.  Reading of a file
            stops once more than ``limit`` rows passed the taxon filter, or
            (outside test mode) once more than ``limit`` non-comment lines
            were read.
        :return: None
        """
        logger.info("getting orthologs")

        g = self.testgraph if self.testMode else self.graph
        gu = GraphUtils(curie_map.get())

        unprocessed_gene_ids = set()

        for k in self.files:
            f = '/'.join((self.rawdir, self.files[k]['file']))
            matchcounter = 0
            line_counter = 0

            # context managers make sure the archive and the extracted member
            # are closed even when a row fails to parse
            with tarfile.open(f, 'r:gz') as mytar:
                # assume that the first entry is the item
                fname = mytar.getmembers()[0]
                logger.info("Parsing %s", fname.name)

                with mytar.extractfile(fname) as csvfile:
                    for raw_line in csvfile:
                        # decode once and reuse for both the comment test
                        # and the field split
                        text = raw_line.decode()

                        # skip comment lines
                        if text.startswith('#'):
                            logger.info("Skipping header line")
                            continue
                        line_counter += 1

                        # a little feedback to the user since there's so many
                        if line_counter % 1000000 == 0:
                            logger.info(
                                "Processed %d lines from %s",
                                line_counter, fname.name)

                        # parse each row
                        # HUMAN|Ensembl=ENSG00000184730|UniProtKB=Q0VD83	MOUSE|MGI=MGI=2176230|UniProtKB=Q8VBT6	LDO	Euarchontoglires	PTHR15964
                        (a, b, orthology_class, _ancestor_taxon, panther_id) = \
                            text.strip().split('\t')
                        (species_a, gene_a, protein_a) = a.split('|')
                        (species_b, gene_b, protein_b) = b.split('|')

                        # skip the entries that don't have homolog
                        # relationships with the test ids
                        if self.testMode and not (
                                protein_a.replace('UniProtKB=', '') in self.test_ids or
                                protein_b.replace('UniProtKB=', '') in self.test_ids):
                            continue

                        # map the taxon abbreviations to ncbi taxon ids
                        taxon_a = self._map_taxon_abbr_to_id(species_a)
                        taxon_b = self._map_taxon_abbr_to_id(species_b)

                        # some-filter: drop the row only when NEITHER gene is
                        # in a taxon of interest.  The int() conversions stay
                        # inside the condition so they are only evaluated
                        # when self.tax_ids is set.
                        if (self.tax_ids is not None and
                                int(taxon_a.rstrip().replace('NCBITaxon:', '')) not in self.tax_ids and
                                int(taxon_b.rstrip().replace('NCBITaxon:', '')) not in self.tax_ids):
                            continue

                        matchcounter += 1
                        if limit is not None and matchcounter > limit:
                            break

                        # fix the gene identifiers
                        gene_a = gene_a.replace('=', ':')
                        gene_b = gene_b.replace('=', ':')

                        # clean both ids before skipping, so that both
                        # unmappable ids of a row are recorded
                        clean_gene = self._clean_up_gene_id(gene_a, species_a)
                        if clean_gene is None:
                            unprocessed_gene_ids.add(gene_a)
                        gene_a = clean_gene
                        clean_gene = self._clean_up_gene_id(gene_b, species_b)
                        if clean_gene is None:
                            unprocessed_gene_ids.add(gene_b)
                        gene_b = clean_gene

                        # a special case here; mostly some rat genes they use
                        # symbols instead of identifiers.  will skip
                        if gene_a is None or gene_b is None:
                            continue

                        rel = self._map_orthology_code_to_RO(orthology_class)

                        evidence_id = 'ECO:0000080'  # phylogenetic evidence

                        # add the association and relevant nodes to graph
                        assoc = OrthologyAssoc(self.name, gene_a, gene_b, rel)
                        assoc.add_evidence(evidence_id)

                        # add genes to graph;
                        # assume labels will be taken care of elsewhere
                        gu.addClassToGraph(g, gene_a, None)
                        gu.addClassToGraph(g, gene_b, None)

                        assoc.add_association_to_graph(g)

                        # note this is incomplete... it won't construct the
                        # full family hierarchy, just the top-grouping
                        assoc.add_gene_family_to_graph(
                            g, ':'.join(('PANTHER', panther_id)))

                        if (not self.testMode and limit is not None
                                and line_counter > limit):
                            break

            logger.info("finished processing %s", f)
            # the set accumulates across files; only warn when it is non-empty
            if unprocessed_gene_ids:
                logger.warning(
                    "The following gene ids were unable to be processed: %s",
                    str(unprocessed_gene_ids))

        gu.loadProperties(g, OrthologyAssoc.object_properties, gu.OBJPROP)
        gu.loadProperties(g, OrthologyAssoc.annotation_properties, gu.ANNOTPROP)
Example #10
0
    def add_orthologs_by_gene_group(self, graph, gene_ids):
        """
        This will get orthologies between human and other vertebrate genomes
        based on the gene_group annotation pipeline from NCBI.
        More information can be learned here:
        http://www.ncbi.nlm.nih.gov/news/03-13-2014-gene-provides-orthologs-regions/
        The method for associations is described in
        [PMCID:3882889](http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3882889/)
        == [PMID:24063302](http://www.ncbi.nlm.nih.gov/pubmed/24063302/).
        Because these are only between human and vertebrate genomes,
        they will certainly miss out on very distant orthologies,
        and should not be considered complete.

        We do not run this within the NCBI parser itself;
        rather it is a convenience function for others parsers to call.

        The gzipped 'gene_group' file is read once and hashed in memory
        (only rows whose relationship is 'Ortholog' are used); then, for each
        requested gene, every member of every group the gene belongs to is
        added to the graph with its taxon and an orthology association
        sourced to PMID:24063302.

        :param graph: graph the genes, taxa and associations are added to
        :param gene_ids:  Gene ids to fetch the orthology
            (a leading 'NCBIGene:' prefix is removed for the lookup)
        :return: None

        """
        src_key = 'gene_group'
        LOG.info("getting gene groups")
        src_file = '/'.join((self.rawdir, self.files[src_key]['file']))
        found_counter = 0
        # because many of the orthologous groups are grouped by human gene,
        # we need to do this by generating two-way hash

        # group_id => orthologs
        # ortholog id => group
        # this will be the fastest approach, though not memory-efficient.
        geno = Genotype(graph)
        model = Model(graph)
        group_to_orthology = {}
        gene_to_group = {}
        gene_to_taxon = {}
        col = self.files[src_key]['columns']

        # the column layout is constant for the file; resolve positions once
        # instead of searching the column list for every row
        tax_a_idx = col.index('tax_id')
        gene_a_idx = col.index('GeneID')
        rel_idx = col.index('relationship')
        tax_b_idx = col.index('Other_tax_id')
        gene_b_idx = col.index('Other_GeneID')

        with gzip.open(src_file, 'rb') as tsv:
            header = tsv.readline().decode().strip().split('\t')
            header[0] = header[0][1:]  # strip octothorp
            # a header mismatch is not treated as fatal here:
            # the result of the check is deliberately ignored
            self.check_fileheader(col, header)

            for line in tsv:
                row = line.decode().strip().split('\t')
                tax_a = row[tax_a_idx]
                gene_a = row[gene_a_idx]
                rel = row[rel_idx]
                tax_b = row[tax_b_idx]
                gene_b = row[gene_b_idx]

                if rel != 'Ortholog':
                    continue

                # gene_a acts as the group id;
                # the group lead is also recorded as a member of its group
                members = group_to_orthology.setdefault(gene_a, set())
                members.add(gene_b)
                members.add(gene_a)

                # NOTE(review): only Other_GeneID values become keys of
                # gene_to_group, so a gene that appears solely in the GeneID
                # column will not find any group below; confirm intended
                gene_to_group.setdefault(gene_b, set()).add(gene_a)

                gene_to_taxon[gene_a] = tax_a
                gene_to_taxon[gene_b] = tax_b

            # end loop through gene_group file
        LOG.debug("Finished hashing gene groups")
        LOG.debug("Making orthology associations")
        for gid in gene_ids:
            gene_num = gid.replace('NCBIGene:', '')
            # genes without any group simply contribute nothing
            for group_num in gene_to_group.get(gene_num, ()):
                # NOTE(review): the group contains the queried gene itself,
                # so an association from gid to itself is produced as well;
                # confirm intended
                for orth in group_to_orthology.get(group_num, ()):
                    oid = 'NCBIGene:' + str(orth)
                    model.addClassToGraph(oid, None, self.globaltt['gene'])
                    otaxid = 'NCBITaxon:' + str(gene_to_taxon[orth])
                    geno.addTaxon(otaxid, oid)
                    assoc = OrthologyAssoc(graph, self.name, gid, oid)
                    assoc.add_source('PMID:24063302')
                    assoc.add_association_to_graph()
                    # todo get gene label for orthologs -
                    # this could get expensive
                    found_counter += 1

            # finish loop through annotated genes
        LOG.info(
            "Made %d orthology relationships for %d genes",
            found_counter, len(gene_ids))