def _process_orthologs(self, raw, limit=None):
    """
    Attach the genes of one species to their KEGG orthology classes.

    Each record of the tab-delimited input must unpack into exactly two
    fields, a gene id followed by an orthology class id.  Both values are
    stripped of surrounding whitespace and given a ``KEGG:`` prefix.

    Triples created:
    <gene_id> is a class
    <orthology_class_id> is a class
    <assoc_id> has subject <gene_id>
    <assoc_id> has object <orthology_class_id>

    :param raw: path of the file to read (opened as iso-8859-1 text)
    :param limit: outside test mode, reading stops right after the first
        record whose 1-based position is greater than this value;
        ``None`` means read everything
    :return: None
    """
    LOG.info("Processing orthologs")
    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)
    # The cap is ignored in test mode, so decide once whether it applies.
    capped = not self.test_mode and limit is not None

    with open(raw, 'r', encoding="iso-8859-1") as tsvfile:
        records = csv.reader(tsvfile, delimiter='\t', quotechar='\"')
        for seen, record in enumerate(records, start=1):
            raw_gene, raw_class = record
            ko_curie = 'KEGG:' + raw_class.strip()
            gene_curie = 'KEGG:' + raw_gene.strip()

            # The KO id acts as a gene-family grouping class; it references
            # a group of orthologs and is not 1:1 with the genes.
            family_assoc = OrthologyAssoc(graph, self.name, gene_curie, None)
            family_assoc.add_gene_family_to_graph(ko_curie)

            # Register both ends as classes; no label is supplied here.
            model.addClassToGraph(gene_curie, None)
            model.addClassToGraph(ko_curie, None)

            if capped and seen > limit:
                break

    LOG.info("Done with orthologs")
    return
def _get_relations(self, limit):
    """
    Convert the pairwise relations of every configured OrthoXML file into
    orthology / paralogy associations.

    The specs for orthoxml can be found here: http://orthoxml.org

    We currently extract triples for orthologous relations,
    paralogous relations and in_taxon relations to NCBITaxonId attributes,
    e.g.

    Triples:
    <protein1_id> RO:othologous <protein2_id>
    <assoc_id> :hasSubject <protein1_id>
    <assoc_id> :hasObject <protein2_id>
    <assoc_id> :hasPredicate <RO:orthologous>
    <assoc_id> dc:evidence ECO:phylogenetic_evidence

    :param limit: limit the number of induced pairwise relations per file
        (ignored in test mode); ``None`` means no limit
    :return: None
    """
    logger.info("getting ortholog and paralog relations")
    graph = self.testgraph if self.testMode else self.graph
    model = Model(graph)

    for k in self.files.keys():
        f = os.path.join(self.rawdir, self.files[k]['file'])
        matchcounter = 0
        logger.info("Parsing %s", f)
        time_start = time.time()
        xml = lxml.etree.parse(f)
        parser = OrthoXMLParser(xml)
        logger.info(
            "loaded {} into memory. Took {}sec. Starting to extract relations..."
            .format(f, time.time() - time_start))

        time0, last_cnt = time.time(), 0
        for cnts, (protein_nr_a, protein_nr_b, rel_type) in enumerate(
                parser.extract_pairwise_relations()):
            protein_a = parser.gene_mapping[protein_nr_a]
            protein_b = parser.gene_mapping[protein_nr_b]
            protein_id_a = protein_a.get('protId')
            protein_id_b = protein_b.get('protId')

            # Progress report: only considered every 100th relation and
            # emitted when more than 30 seconds passed since the last one.
            if cnts % 100 == 0:
                # One clock sample keeps all figures of a report consistent.
                now = time.time()
                if now - time0 > 30:
                    logger.info(
                        "processed {0:d} rels in {1:.1f}sec: "
                        "{2:.3f}/sec; overall {3:d} in {4:.1f}sec "
                        "({5:.3f}/sec); cache ratio: {6.hits}/{6.misses}".format(
                            cnts - last_cnt, now - time0,
                            (cnts - last_cnt) / (now - time0),
                            cnts, now - time_start,
                            cnts / (now - time_start),
                            self.add_protein_to_graph.cache_info()))
                    time0, last_cnt = now, cnts

            # In test mode keep only pairs that touch a test id.
            if self.testMode and not (
                    protein_id_a in self.test_ids or
                    protein_id_b in self.test_ids):
                continue

            # NOTE: the counter is bumped before the taxon filter, so pairs
            # rejected below still count towards ``limit``.
            matchcounter += 1

            taxon_a = self.extract_taxon_info(protein_a)
            taxon_b = self.extract_taxon_info(protein_b)
            # check if both protein belong to taxa that are selected
            if (self.tax_ids is not None and (
                    (int(re.sub(r'NCBITaxon:', '', taxon_a.rstrip()))
                     not in self.tax_ids) or
                    (int(re.sub(r'NCBITaxon:', '', taxon_b.rstrip()))
                     not in self.tax_ids))):
                continue

            protein_id_a = self.clean_protein_id(protein_id_a)
            protein_id_b = self.clean_protein_id(protein_id_b)
            # add genes to graph if needed;
            # assume labels will be taken care of elsewhere
            self.add_protein_to_graph(protein_id_a, taxon_a, model)
            self.add_protein_to_graph(protein_id_b, taxon_b, model)

            # Relation and evidence terms are both looked up in globaltt.
            rel = self.globaltt[rel_type]
            evidence_id = self.globaltt['phylogenetic evidence']  # 'ECO:0000080'

            # add the association and relevant nodes to graph
            assoc = OrthologyAssoc(
                graph, self.name, protein_id_a, protein_id_b, rel)
            assoc.add_evidence(evidence_id)
            assoc.add_association_to_graph()

            if not self.testMode and limit is not None \
                    and matchcounter > limit:
                logger.warning(
                    "reached limit of relations to extract. Stopping early...")
                break

        logger.info("finished processing %s", f)

    return
def add_orthologs_by_gene_group(self, graph, gene_ids):
    """
    Add orthology associations derived from NCBI's gene_group file.

    This will get orthologies between human and other vertebrate genomes
    based on the gene_group annotation pipeline from NCBI.
    More information can be learned here:
    http://www.ncbi.nlm.nih.gov/news/03-13-2014-gene-provides-orthologs-regions/
    The method for associations is described in
    [PMCID:3882889](http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3882889/)
    == [PMID:24063302](http://www.ncbi.nlm.nih.gov/pubmed/24063302/).
    Because these are only between human and vertebrate genomes,
    they will certainly miss out on very distant orthologies,
    and should not be considered complete.

    We do not run this within the NCBI parser itself;
    rather it is a convenience function for other parsers to call.

    :param graph: graph that receives the classes, taxa and associations
    :param gene_ids: Gene ids to fetch the orthology for; the
        ``NCBIGene:`` prefix is removed before the lookup
    :return: None
    """
    logger.info("getting gene groups")
    f = '/'.join((self.rawdir, self.files['gene_group']['file']))
    found_counter = 0

    # because many of the orthologous groups are grouped by human gene,
    # we need to do this by generating two-way hash
    #   group_id => orthologs
    #   ortholog id => group
    # this will be the fastest approach, though not memory-efficient.
    geno = Genotype(graph)
    model = Model(graph)
    group_to_orthology = {}
    gene_to_group = {}
    gene_to_taxon = {}

    with gzip.open(f, 'rb') as csvfile:
        filereader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""),
            delimiter='\t', quotechar='\"')
        for row in filereader:
            # skip comment lines (row text starting with '#')
            if ''.join(row).startswith('#'):
                continue
            (tax_a, gene_a, rel, tax_b, gene_b) = row

            if rel != 'Ortholog':
                continue

            # gene_a is the group lead; it is a member of its own group
            members = group_to_orthology.setdefault(gene_a, set())
            members.add(gene_b)
            members.add(gene_a)

            gene_to_group.setdefault(gene_b, set()).add(gene_a)

            gene_to_taxon[gene_a] = tax_a
            gene_to_taxon[gene_b] = tax_b

    logger.debug("Finished hashing gene groups")
    logger.debug("Making orthology associations")

    for gid in gene_ids:
        gene_num = re.sub(r'NCBIGene:', '', gid)
        # Only ids seen in the second gene column have a group entry;
        # anything else simply yields no associations.
        for group_num in gene_to_group.get(gene_num, ()):
            for orth in group_to_orthology.get(group_num, ()):
                oid = 'NCBIGene:' + str(orth)
                model.addClassToGraph(oid, None, Genotype.genoparts['gene'])
                otaxid = 'NCBITaxon:' + str(gene_to_taxon[orth])
                geno.addTaxon(otaxid, oid)
                assoc = OrthologyAssoc(graph, self.name, gid, oid)
                assoc.add_source('PMID:24063302')
                assoc.add_association_to_graph()
                # todo get gene label for orthologs -
                # this could get expensive
                found_counter += 1

    logger.info(
        "Made %d orthology relationships for %d genes",
        found_counter, len(gene_ids))
    return
def _get_relations(self, limit):
    """
    Convert the pairwise relations of every configured OrthoXML file into
    orthology / paralogy associations.

    The specs for orthoxml can be found here: http://orthoxml.org

    We currently extract triples for orthologous relations,
    paralogous relations and in_taxon relations to NCBITaxonId attributes,
    e.g.

    Triples:
    <protein1_id> RO:othologous <protein2_id>
    <assoc_id> :hasSubject <protein1_id>
    <assoc_id> :hasObject <protein2_id>
    <assoc_id> :hasPredicate <RO:orthologous>
    <assoc_id> dc:evidence ECO:phylogenetic_evidence

    :param limit: limit the number of induced pairwise relations per file
        (ignored in test mode); ``None`` means no limit
    :return: None
    """
    logger.info("getting ortholog and paralog relations")
    g = self.testgraph if self.testMode else self.graph
    model = Model(g)

    for k in self.files.keys():
        f = os.path.join(self.rawdir, self.files[k]['file'])
        matchcounter = 0
        logger.info("Parsing %s", f)
        time_start = time.time()
        xml = lxml.etree.parse(f)
        parser = OrthoXMLParser(xml)
        logger.info(
            "loaded {} into memory. Took {}sec to load. "
            "Starting to extract relations..."
            .format(f, time.time() - time_start))

        time0, last_cnt = time.time(), 0
        for cnts, (protein_nr_a, protein_nr_b, rel_type) in enumerate(
                parser.extract_pairwise_relations()):
            protein_a = parser.gene_mapping[protein_nr_a]
            protein_b = parser.gene_mapping[protein_nr_b]
            protein_id_a = protein_a.get('protId')
            protein_id_b = protein_b.get('protId')

            # Progress report: only considered every 100th relation and
            # emitted when more than 30 seconds passed since the last one.
            if cnts % 100 == 0:
                # One clock sample keeps all figures of a report consistent.
                now = time.time()
                if now - time0 > 30:
                    logger.info(
                        "processed {0:d} rels in {1:.1f}sec: {2:.3f}/sec; "
                        "overall {3:d} in "
                        "{4:.1f}sec ({5:.3f}/sec); "
                        "cache ratio: {6.hits}/{6.misses}".format(
                            cnts - last_cnt, now - time0,
                            (cnts - last_cnt) / (now - time0),
                            cnts, now - time_start,
                            cnts / (now - time_start),
                            self.add_protein_to_graph.cache_info()))
                    time0, last_cnt = now, cnts

            # In test mode keep only pairs that touch a test id.
            if self.testMode and not (
                    protein_id_a in self.test_ids or
                    protein_id_b in self.test_ids):
                continue

            # NOTE: the counter is bumped before the taxon filter, so pairs
            # rejected below still count towards ``limit``.
            matchcounter += 1

            taxon_a = self.extract_taxon_info(protein_a)
            taxon_b = self.extract_taxon_info(protein_b)
            # check if both protein belong to taxa that are selected
            if (self.tax_ids is not None and (
                    (int(re.sub(r'NCBITaxon:', '', taxon_a.rstrip()))
                     not in self.tax_ids) or
                    (int(re.sub(r'NCBITaxon:', '', taxon_b.rstrip()))
                     not in self.tax_ids))):
                continue

            protein_id_a = self.clean_protein_id(protein_id_a)
            protein_id_b = self.clean_protein_id(protein_id_b)
            # add genes to graph if needed;
            # assume labels will be taken care of elsewhere
            self.add_protein_to_graph(protein_id_a, taxon_a, model)
            self.add_protein_to_graph(protein_id_b, taxon_b, model)

            # _map_orthology_code_to_RO is subscripted here, i.e. used as
            # a mapping from the parser's relation type to a relation id.
            rel = self._map_orthology_code_to_RO[rel_type]
            evidence_id = 'ECO:0000080'  # phylogenetic evidence

            # add the association and relevant nodes to graph
            assoc = OrthologyAssoc(
                g, self.name, protein_id_a, protein_id_b, rel)
            assoc.add_evidence(evidence_id)
            assoc.add_association_to_graph()

            if not self.testMode \
                    and limit is not None and matchcounter > limit:
                logger.warning(
                    "reached limit of relations to extract. Stopping early...")
                break

        logger.info("finished processing %s", f)

    return
def add_orthologs_by_gene_group(self, graph, gene_ids):
    """
    Add orthology associations derived from NCBI's gene_group file.

    This will get orthologies between human and other vertebrate genomes
    based on the gene_group annotation pipeline from NCBI.
    More information can be learned here:
    http://www.ncbi.nlm.nih.gov/news/03-13-2014-gene-provides-orthologs-regions/
    The method for associations is described in
    [PMCID:3882889](http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3882889/)
    == [PMID:24063302](http://www.ncbi.nlm.nih.gov/pubmed/24063302/).
    Because these are only between human and vertebrate genomes,
    they will certainly miss out on very distant orthologies,
    and should not be considered complete.

    We do not run this within the NCBI parser itself;
    rather it is a convenience function for other parsers to call.

    :param graph: graph that receives the classes, taxa and associations
    :param gene_ids: Gene ids to fetch the orthology for; the
        ``NCBIGene:`` prefix is removed before the lookup
    :return: None
    """
    src_key = 'gene_group'
    LOG.info("getting gene groups")
    src_file = '/'.join((self.rawdir, self.files[src_key]['file']))
    found_counter = 0

    # because many of the orthologous groups are grouped by human gene,
    # we need to do this by generating two-way hash
    #   group_id => orthologs
    #   ortholog id => group
    # this will be the fastest approach, though not memory-efficient.
    geno = Genotype(graph)
    model = Model(graph)
    group_to_orthology = {}
    gene_to_group = {}
    gene_to_taxon = {}
    col = self.files[src_key]['columns']

    with gzip.open(src_file, 'rb') as tsv:
        header = tsv.readline().decode().strip().split('\t')
        header[0] = header[0][1:]  # strip octothorp
        if not self.check_fileheader(col, header):
            # Best effort: a header mismatch is reported, not fatal.
            LOG.warning(
                "Header of %s differs from the configured columns; "
                "parsing with the configured columns anyway", src_file)

        # Resolve column positions once instead of once per row.
        tax_a_idx = col.index('tax_id')
        gene_a_idx = col.index('GeneID')
        rel_idx = col.index('relationship')
        tax_b_idx = col.index('Other_tax_id')
        gene_b_idx = col.index('Other_GeneID')

        for line in tsv:
            row = line.decode().strip().split('\t')
            tax_a = row[tax_a_idx]
            gene_a = row[gene_a_idx].strip()
            rel = row[rel_idx]
            tax_b = row[tax_b_idx]
            gene_b = row[gene_b_idx].strip()

            if rel != 'Ortholog':
                continue

            # gene_a is the group lead; it is a member of its own group
            members = group_to_orthology.setdefault(gene_a, set())
            members.add(gene_b)
            members.add(gene_a)

            gene_to_group.setdefault(gene_b, set()).add(gene_a)

            gene_to_taxon[gene_a] = tax_a
            gene_to_taxon[gene_b] = tax_b

    LOG.debug("Finished hashing gene groups")
    LOG.debug("Making orthology associations")

    for gid in gene_ids:
        gene_num = re.sub(r'NCBIGene:', '', gid)
        # Only ids seen in the 'Other_GeneID' column have a group entry;
        # anything else simply yields no associations.
        for group_num in gene_to_group.get(gene_num, ()):
            for orth in group_to_orthology.get(group_num, ()):
                oid = 'NCBIGene:' + str(orth)
                model.addClassToGraph(oid, None, self.globaltt['gene'])
                otaxid = 'NCBITaxon:' + str(gene_to_taxon[orth])
                geno.addTaxon(otaxid, oid)
                assoc = OrthologyAssoc(graph, self.name, gid, oid)
                assoc.add_source('PMID:24063302')
                assoc.add_association_to_graph()
                # todo get gene label for orthologs -
                # this could get expensive
                found_counter += 1

    LOG.info(
        "Made %d orthology relationships for %d genes",
        found_counter, len(gene_ids))
def _get_orthologs(self, limit):
    """
    This will process each of the specified pairwise orthology files,
    creating orthology associations based on the specified orthology code.
    This currently assumes that each of the orthology files is
    identically formatted. Relationships are made between genes here.

    There is also a nominal amount of identifier re-formatting:
    MGI:MGI --> MGI
    Ensembl --> ENSEMBL

    We skip any genes where we don't know how to map the gene identifiers.
    For example, Gene:Huwe1 for RAT is not an identifier, so we skip any
    mappings to this identifier. Often, there are two entries for the
    same gene (based on equivalent Uniprot id), and so we are not actually
    losing any information.

    When ``self.tax_ids`` is set, a pair is dropped only if NEITHER gene
    belongs to one of those taxa (an "OR" / some-filter).

    Genes are also added to a grouping class defined with a PANTHER id.

    Triples:
    <gene1_id> RO:othologous <gene2_id>
    <assoc_id> :hasSubject <gene1_id>
    <assoc_id> :hasObject <gene2_id>
    <assoc_id> :hasPredicate <RO:orthologous>
    <assoc_id> dc:evidence ECO:phylogenetic_evidence

    <panther_id> a DATA:gene_family
    <panther_id> RO:has_member <gene1_id>
    <panther_id> RO:has_member <gene2_id>

    :param limit: per file, stop once more than this many pairs passed the
        taxon filter (or, outside test mode, more than this many lines were
        read); ``None`` means no limit
    :return: None
    """
    logger.info("getting orthologs")

    g = self.testgraph if self.testMode else self.graph
    model = Model(g)
    unprocessed_gene_ids = set()

    for k in self.files.keys():
        f = '/'.join((self.rawdir, self.files[k]['file']))
        matchcounter = 0
        line_counter = 0

        # The context manager closes the archive handle when done.
        with tarfile.open(f, 'r:gz') as mytar:
            # assume that the first entry is the item
            fname = mytar.getmembers()[0]
            logger.info("Parsing %s", fname.name)

            with mytar.extractfile(fname) as csvfile:
                for line in csvfile:
                    line = line.decode()
                    # skip comment lines
                    if line.startswith('#'):
                        logger.info("Skipping header line")
                        continue
                    line_counter += 1

                    # a little feedback to the user since there's so many
                    if line_counter % 1000000 == 0:
                        logger.info(
                            "Processed %d lines from %s",
                            line_counter, fname.name)

                    line = line.strip()

                    # parse each row. ancestor_taxon is unused
                    # HUMAN|Ensembl=ENSG00000184730|UniProtKB=Q0VD83
                    # MOUSE|MGI=MGI=2176230|UniProtKB=Q8VBT6
                    # LDO Euarchontoglires PTHR15964
                    (a, b, orthology_class, ancestor_taxon,
                     panther_id) = line.split('\t')
                    (species_a, gene_a, protein_a) = a.split('|')
                    (species_b, gene_b, protein_b) = b.split('|')

                    # skip the entries that don't have homolog
                    # relationships with the test ids
                    if self.testMode and not (
                            re.sub(r'UniProtKB=', '', protein_a)
                            in self.test_ids or
                            re.sub(r'UniProtKB=', '', protein_b)
                            in self.test_ids):
                        continue

                    # map the taxon abbreviations to ncbi taxon ids
                    taxon_a = self._map_taxon_abbr_to_id(species_a)
                    taxon_b = self._map_taxon_abbr_to_id(species_b)

                    # Keep the pair when gene1 OR gene2 is in the taxid
                    # list; it is dropped only when both are outside.
                    if (self.tax_ids is not None
                            and int(re.sub(
                                r'NCBITaxon:', '', taxon_a.rstrip()))
                            not in self.tax_ids
                            and int(re.sub(
                                r'NCBITaxon:', '', taxon_b.rstrip()))
                            not in self.tax_ids):
                        continue
                    matchcounter += 1
                    if limit is not None and matchcounter > limit:
                        break

                    # fix the gene identifiers
                    gene_a = re.sub(r'=', ':', gene_a)
                    gene_b = re.sub(r'=', ':', gene_b)

                    # Both ids are cleaned before deciding to skip, so an
                    # unmappable id on either side gets recorded.
                    clean_gene = self._clean_up_gene_id(gene_a, species_a)
                    if clean_gene is None:
                        unprocessed_gene_ids.add(gene_a)
                    gene_a = clean_gene
                    clean_gene = self._clean_up_gene_id(gene_b, species_b)
                    if clean_gene is None:
                        unprocessed_gene_ids.add(gene_b)
                    gene_b = clean_gene

                    # a special case here; mostly some rat genes
                    # they use symbols instead of identifiers. will skip
                    if gene_a is None or gene_b is None:
                        continue

                    rel = self._map_orthology_code_to_RO(orthology_class)

                    evidence_id = 'ECO:0000080'  # phylogenetic evidence

                    # add the association and relevant nodes to graph
                    assoc = OrthologyAssoc(g, self.name, gene_a, gene_b, rel)
                    assoc.add_evidence(evidence_id)

                    # add genes to graph;
                    # assume labels will be taken care of elsewhere
                    model.addClassToGraph(gene_a, None)
                    model.addClassToGraph(gene_b, None)

                    # might as well add the taxon info for completeness
                    g.addTriple(
                        gene_a, model.object_properties['in_taxon'], taxon_a)
                    g.addTriple(
                        gene_b, model.object_properties['in_taxon'], taxon_b)

                    assoc.add_association_to_graph()

                    # note this is incomplete...
                    # it won't construct the full family hierarchy,
                    # just the top-grouping
                    assoc.add_gene_family_to_graph(
                        ':'.join(('PANTHER', panther_id)))

                    if not self.testMode \
                            and limit is not None and line_counter > limit:
                        break

        # make report on unprocessed_gene_ids
        logger.info("finished processing %s", f)
        logger.warning(
            "The following gene ids were unable to be processed: %s",
            str(unprocessed_gene_ids))

    return
def add_orthologs_by_gene_group(self, graph, gene_ids):
    """
    Add orthology associations derived from NCBI's gene_group file.

    This will get orthologies between human and other vertebrate genomes
    based on the gene_group annotation pipeline from NCBI.
    More information can be learned here:
    http://www.ncbi.nlm.nih.gov/news/03-13-2014-gene-provides-orthologs-regions/
    The method for associations is described in
    [PMCID:3882889](http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3882889/)
    == [PMID:24063302](http://www.ncbi.nlm.nih.gov/pubmed/24063302/).
    Because these are only between human and vertebrate genomes,
    they will certainly miss out on very distant orthologies,
    and should not be considered complete.

    We do not run this within the NCBI parser itself;
    rather it is a convenience function for other parsers to call.

    :param graph: graph that receives the classes, taxa and associations
    :param gene_ids: Gene ids to fetch the orthology for; the
        ``NCBIGene:`` prefix is removed before the lookup
    :return: None
    """
    logger.info("getting gene groups")
    f = '/'.join((self.rawdir, self.files['gene_group']['file']))
    found_counter = 0

    # because many of the orthologous groups are grouped by human gene,
    # we need to do this by generating two-way hash
    #   group_id => orthologs
    #   ortholog id => group
    # this will be the fastest approach, though not memory-efficient.
    geno = Genotype(graph)
    model = Model(graph)
    group_to_orthology = {}
    gene_to_group = {}
    gene_to_taxon = {}

    with gzip.open(f, 'rb') as csvfile:
        filereader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""),
            delimiter='\t', quotechar='\"')
        for row in filereader:
            # skip comment lines (row text starting with '#')
            if ''.join(row).startswith('#'):
                continue
            (tax_a, gene_a, rel, tax_b, gene_b) = row

            if rel != 'Ortholog':
                continue

            # the group lead (gene_a) is also a member of its group
            group = group_to_orthology.setdefault(gene_a, set())
            group.update((gene_b, gene_a))

            gene_to_group.setdefault(gene_b, set()).add(gene_a)

            gene_to_taxon[gene_a] = tax_a
            gene_to_taxon[gene_b] = tax_b

    logger.debug("Finished hashing gene groups")
    logger.debug("Making orthology associations")

    for gid in gene_ids:
        gene_num = re.sub(r'NCBIGene:', '', gid)
        # Ids without a group entry (never seen in the second gene
        # column) produce no associations.
        for group_num in gene_to_group.get(gene_num, ()):
            for orth in group_to_orthology.get(group_num, ()):
                oid = 'NCBIGene:' + str(orth)
                model.addClassToGraph(oid, None, Genotype.genoparts['gene'])
                otaxid = 'NCBITaxon:' + str(gene_to_taxon[orth])
                geno.addTaxon(otaxid, oid)
                assoc = OrthologyAssoc(graph, self.name, gid, oid)
                assoc.add_source('PMID:24063302')
                assoc.add_association_to_graph()
                # todo get gene label for orthologs -
                # this could get expensive
                found_counter += 1

    logger.info(
        "Made %d orthology relationships for %d genes",
        found_counter, len(gene_ids))
    return
def _get_orthologs(self, src_key, limit):
    """
    This will process the pairwise orthology file registered under
    ``src_key``, creating orthology associations based on the specified
    orthology code. Relationships are made between genes here.

    There is also a nominal amount of identifier re-formatting:
    MGI:MGI --> MGI
    Ensembl --> ENSEMBL

    We skip any genes where we don't know how to map the gene identifiers.
    For example, Gene:Huwe1 for RAT is not an identifier, so we skip any
    mappings to this identifier. Often, there are two entries for the
    same gene (based on equivalent Uniprot id), and so we are not actually
    losing any information.

    When ``self.tax_ids`` is set, a pair is dropped only if NEITHER gene's
    taxon is found in it.

    Genes are also added to a grouping class defined with a PANTHER id.

    Triples:
    <gene1_id> RO:othologous <gene2_id>
    <assoc_id> :hasSubject <gene1_id>
    <assoc_id> :hasObject <gene2_id>
    <assoc_id> :hasPredicate <RO:orthologous>
    <assoc_id> dcterms:evidence ECO:phylogenetic_evidence

    <panther_id> rdf:type DATA:gene_family
    <panther_id> RO:has_member <gene1_id>
    <panther_id> RO:has_member <gene2_id>

    :param src_key: key into ``self.files``; also used as the name of the
        member extracted from the tar archive
    :param limit: stop once more than this many pairs passed the taxon
        filter; ``None`` means no limit
    :return: None
    """
    LOG.info("reading orthologs")

    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)
    # A set avoids piling up duplicates of the same unmappable id.
    unprocessed_gene_ids = set()

    src_file = '/'.join((self.rawdir, self.files[src_key]['file']))
    matchcounter = 0
    col = self.files[src_key]['columns']

    # Resolve column positions once instead of once per row.
    gene_idx = col.index('Gene')
    ortholog_idx = col.index('Ortholog')
    type_idx = col.index('Type of ortholog')
    panther_idx = col.index('Panther Ortholog ID')
    # the 'Common ancestor for the orthologs' column is unused

    # Protein fields look like 'UniProtKB=Q0VD83'; drop the whole prefix,
    # including '=', before comparing against the test ids.
    prefix_len = len('UniProtKB=')

    # The context manager closes the archive handle when done.
    with tarfile.open(src_file, 'r:gz') as reader:
        LOG.info("Parsing %s", src_key)

        with reader.extractfile(src_key) as csvfile:
            # there are no comments or headers
            for line in csvfile:
                # parse each row.
                # HUMAN|Ensembl=ENSG00000184730|UniProtKB=Q0VD83
                # MOUSE|MGI=MGI=2176230|UniProtKB=Q8VBT6
                # LDO Euarchontoglires PTHR15964
                row = line.decode().split('\t')
                thing1 = row[gene_idx].strip()
                thing2 = row[ortholog_idx].strip()
                orthology_type = row[type_idx].strip()
                panther_id = row[panther_idx].strip()

                (species_a, gene_a, protein_a) = thing1.split('|')
                (species_b, gene_b, protein_b) = thing2.split('|')

                # for testing skip entries without homolog relationships
                # to test ids
                if self.test_mode and not (
                        protein_a[prefix_len:] in self.test_ids or
                        protein_b[prefix_len:] in self.test_ids):
                    continue

                # map the species abbreviations to ncbi taxon id numbers
                taxon_a = self.resolve(species_a).split(':')[1].strip()
                taxon_b = self.resolve(species_b).split(':')[1].strip()

                # Keep the pair when gene1 OR gene2 is in the taxid list;
                # it is dropped only when both are outside.
                if self.tax_ids is not None and (
                        taxon_a not in self.tax_ids) and (
                            taxon_b not in self.tax_ids):
                    continue
                matchcounter += 1
                if limit is not None and matchcounter > limit:
                    break

                # fix the gene identifiers
                gene_a = re.sub(r'=', ':', gene_a)
                gene_b = re.sub(r'=', ':', gene_b)

                clean_gene = self._clean_up_gene_id(gene_a, species_a)
                if clean_gene is None:
                    unprocessed_gene_ids.add(gene_a)
                    continue
                gene_a = clean_gene

                clean_gene = self._clean_up_gene_id(gene_b, species_b)
                if clean_gene is None:
                    unprocessed_gene_ids.add(gene_b)
                    continue
                gene_b = clean_gene

                rel = self.resolve(orthology_type)

                evidence_id = self.globaltt['phylogenetic evidence']

                # add the association and relevant nodes to graph
                assoc = OrthologyAssoc(
                    graph, self.name, gene_a, gene_b, rel)
                assoc.add_evidence(evidence_id)

                # add genes to graph;
                # assume labels will be taken care of elsewhere
                model.addType(gene_a, self.globaltt['gene'])
                model.addType(gene_b, self.globaltt['gene'])

                # might as well add the taxon info for completeness
                graph.addTriple(
                    gene_a, self.globaltt['in taxon'],
                    'NCBITaxon:' + taxon_a)
                graph.addTriple(
                    gene_b, self.globaltt['in taxon'],
                    'NCBITaxon:' + taxon_b)

                assoc.add_association_to_graph(
                    blv.terms['GeneToGeneHomologyAssociation'])

                # note this is incomplete...
                # it won't construct the full family hierarchy,
                # just the top-grouping
                assoc.add_gene_family_to_graph('PANTHER:' + panther_id)

    LOG.info("finished processing %s", src_file)
    LOG.warning(
        "The following gene ids were unable to be processed: %s",
        str(unprocessed_gene_ids))
def _get_orthologs(self, limit):
    """
    This will process each of the specified pairwise orthology files,
    creating orthology associations based on the specified orthology code.
    This currently assumes that each of the orthology files is
    identically formatted. Relationships are made between genes here.

    There is also a nominal amount of identifier re-formatting:
    MGI:MGI --> MGI
    Ensembl --> ENSEMBL

    We skip any genes where we don't know how to map the gene identifiers.
    For example, Gene:Huwe1 for RAT is not an identifier, so we skip any
    mappings to this identifier. Often, there are two entries for the
    same gene (based on equivalent Uniprot id), and so we are not actually
    losing any information.

    When ``self.tax_ids`` is set, a pair is dropped only if NEITHER gene
    belongs to one of those taxa (an "OR" / some-filter).

    Genes are also added to a grouping class defined with a PANTHER id.

    Triples:
    <gene1_id> RO:othologous <gene2_id>
    <assoc_id> :hasSubject <gene1_id>
    <assoc_id> :hasObject <gene2_id>
    <assoc_id> :hasPredicate <RO:orthologous>
    <assoc_id> dc:evidence ECO:phylogenetic_evidence

    <panther_id> a DATA:gene_family
    <panther_id> RO:has_member <gene1_id>
    <panther_id> RO:has_member <gene2_id>

    :param limit: per file, stop once more than this many pairs passed the
        taxon filter (or, outside test mode, more than this many lines were
        read); ``None`` means no limit
    :return: None
    """
    logger.info("getting orthologs")

    g = self.testgraph if self.testMode else self.graph
    gu = GraphUtils(curie_map.get())

    unprocessed_gene_ids = set()

    for k in self.files.keys():
        f = '/'.join((self.rawdir, self.files[k]['file']))
        matchcounter = 0
        line_counter = 0

        # The context manager closes the archive handle when done.
        with tarfile.open(f, 'r:gz') as mytar:
            # assume that the first entry is the item
            fname = mytar.getmembers()[0]
            logger.info("Parsing %s", fname.name)

            with mytar.extractfile(fname) as csvfile:
                for line in csvfile:
                    line = line.decode()
                    # skip comment lines
                    if line.startswith('#'):
                        logger.info("Skipping header line")
                        continue
                    line_counter += 1

                    # a little feedback to the user since there's so many
                    if line_counter % 1000000 == 0:
                        logger.info(
                            "Processed %d lines from %s",
                            line_counter, fname.name)

                    line = line.strip()

                    # parse each row; ancestor_taxon is unused
                    # HUMAN|Ensembl=ENSG00000184730|UniProtKB=Q0VD83
                    # MOUSE|MGI=MGI=2176230|UniProtKB=Q8VBT6
                    # LDO Euarchontoglires PTHR15964
                    (a, b, orthology_class, ancestor_taxon,
                     panther_id) = line.split('\t')
                    (species_a, gene_a, protein_a) = a.split('|')
                    (species_b, gene_b, protein_b) = b.split('|')

                    # skip the entries that don't have homolog
                    # relationships with the test ids
                    if self.testMode and not (
                            re.sub('UniProtKB=', '', protein_a)
                            in self.test_ids or
                            re.sub('UniProtKB=', '', protein_b)
                            in self.test_ids):
                        continue

                    # map the taxon abbreviations to ncbi taxon ids
                    taxon_a = self._map_taxon_abbr_to_id(species_a)
                    taxon_b = self._map_taxon_abbr_to_id(species_b)

                    # Keep the pair when gene1 OR gene2 is in the taxid
                    # list; it is dropped only when both are outside.
                    if (self.tax_ids is not None
                            and int(re.sub(
                                'NCBITaxon:', '', taxon_a.rstrip()))
                            not in self.tax_ids
                            and int(re.sub(
                                'NCBITaxon:', '', taxon_b.rstrip()))
                            not in self.tax_ids):
                        continue
                    matchcounter += 1
                    if limit is not None and matchcounter > limit:
                        break

                    # fix the gene identifiers
                    gene_a = re.sub('=', ':', gene_a)
                    gene_b = re.sub('=', ':', gene_b)

                    # Both ids are cleaned before deciding to skip, so an
                    # unmappable id on either side gets recorded.
                    clean_gene = self._clean_up_gene_id(gene_a, species_a)
                    if clean_gene is None:
                        unprocessed_gene_ids.add(gene_a)
                    gene_a = clean_gene
                    clean_gene = self._clean_up_gene_id(gene_b, species_b)
                    if clean_gene is None:
                        unprocessed_gene_ids.add(gene_b)
                    gene_b = clean_gene

                    # a special case here; mostly some rat genes
                    # they use symbols instead of identifiers. will skip
                    if gene_a is None or gene_b is None:
                        continue

                    rel = self._map_orthology_code_to_RO(orthology_class)

                    evidence_id = 'ECO:0000080'  # phylogenetic evidence

                    # add the association and relevant nodes to graph
                    assoc = OrthologyAssoc(self.name, gene_a, gene_b, rel)
                    assoc.add_evidence(evidence_id)

                    # add genes to graph;
                    # assume labels will be taken care of elsewhere
                    gu.addClassToGraph(g, gene_a, None)
                    gu.addClassToGraph(g, gene_b, None)

                    assoc.add_association_to_graph(g)

                    # note this is incomplete... it won't construct the
                    # full family hierarchy, just the top-grouping
                    assoc.add_gene_family_to_graph(
                        g, ':'.join(('PANTHER', panther_id)))

                    if not self.testMode \
                            and limit is not None and line_counter > limit:
                        break

        logger.info("finished processing %s", f)
        # Logger.warn is a deprecated alias; warning() is the supported name.
        logger.warning(
            "The following gene ids were unable to be processed: %s",
            str(unprocessed_gene_ids))

    gu.loadProperties(g, OrthologyAssoc.object_properties, gu.OBJPROP)
    gu.loadProperties(g, OrthologyAssoc.annotation_properties, gu.ANNOTPROP)

    return
def add_orthologs_by_gene_group(self, graph, gene_ids):
    """
    Add orthology associations derived from NCBI's gene_group file.

    This will get orthologies between human and other vertebrate genomes
    based on the gene_group annotation pipeline from NCBI.
    More information can be learned here:
    http://www.ncbi.nlm.nih.gov/news/03-13-2014-gene-provides-orthologs-regions/
    The method for associations is described in
    [PMCID:3882889](http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3882889/)
    == [PMID:24063302](http://www.ncbi.nlm.nih.gov/pubmed/24063302/).
    Because these are only between human and vertebrate genomes,
    they will certainly miss out on very distant orthologies,
    and should not be considered complete.

    We do not run this within the NCBI parser itself;
    rather it is a convenience function for other parsers to call.

    :param graph: graph that receives the classes, taxa and associations
    :param gene_ids: Gene ids to fetch the orthology for; the
        ``NCBIGene:`` prefix is removed before the lookup
    :return: None
    """
    src_key = 'gene_group'
    LOG.info("getting gene groups")
    src_file = '/'.join((self.rawdir, self.files[src_key]['file']))
    found_counter = 0

    # because many of the orthologous groups are grouped by human gene,
    # we need to do this by generating two-way hash
    #   group_id => orthologs
    #   ortholog id => group
    # this will be the fastest approach, though not memory-efficient.
    geno = Genotype(graph)
    model = Model(graph)
    group_to_orthology = {}
    gene_to_group = {}
    gene_to_taxon = {}
    col = self.files[src_key]['columns']

    with gzip.open(src_file, 'rb') as tsv:
        header = tsv.readline().decode().strip().split('\t')
        header[0] = header[0][1:]  # strip octothorp
        if not self.check_fileheader(col, header):
            # Best effort: a header mismatch is reported, not fatal.
            LOG.warning(
                "Header of %s differs from the configured columns; "
                "parsing with the configured columns anyway", src_file)

        # Resolve column positions once instead of once per row.
        tax_a_idx = col.index('tax_id')
        gene_a_idx = col.index('GeneID')
        rel_idx = col.index('relationship')
        tax_b_idx = col.index('Other_tax_id')
        gene_b_idx = col.index('Other_GeneID')

        for line in tsv:
            row = line.decode().strip().split('\t')
            tax_a = row[tax_a_idx]
            gene_a = row[gene_a_idx]
            rel = row[rel_idx]
            tax_b = row[tax_b_idx]
            gene_b = row[gene_b_idx]

            if rel != 'Ortholog':
                continue

            # the group lead (gene_a) is also a member of its group
            group = group_to_orthology.setdefault(gene_a, set())
            group.update((gene_b, gene_a))

            gene_to_group.setdefault(gene_b, set()).add(gene_a)

            gene_to_taxon[gene_a] = tax_a
            gene_to_taxon[gene_b] = tax_b

    LOG.debug("Finished hashing gene groups")
    LOG.debug("Making orthology associations")

    for gid in gene_ids:
        gene_num = re.sub(r'NCBIGene:', '', gid)
        # Ids without a group entry (never seen in the 'Other_GeneID'
        # column) produce no associations.
        for group_num in gene_to_group.get(gene_num, ()):
            for orth in group_to_orthology.get(group_num, ()):
                oid = 'NCBIGene:' + str(orth)
                model.addClassToGraph(oid, None, self.globaltt['gene'])
                otaxid = 'NCBITaxon:' + str(gene_to_taxon[orth])
                geno.addTaxon(otaxid, oid)
                assoc = OrthologyAssoc(graph, self.name, gid, oid)
                assoc.add_source('PMID:24063302')
                assoc.add_association_to_graph()
                # todo get gene label for orthologs -
                # this could get expensive
                found_counter += 1

    LOG.info(
        "Made %d orthology relationships for %d genes",
        found_counter, len(gene_ids))