class ZFINTestCase(SourceTestCase):

    def setUp(self):
        self.source = ZFIN('rdf_graph', True)
        self.source.settestonly(True)
        self._setDirToSource()
        return

    def tearDown(self):
        self.source = None
        return

    @unittest.skip(
        'Will eventually write test to check if phenotype sextuples ' +
        'are mapped to ZP ids')
    def test_allZPAvailable(self):
        """
        This test will identify if there are any missing ZP terms
        in the mapping file
        :return:
        """
        # TODO add this test to check if all phenotype sextuples
        # are mapped to ZP ids
        return
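# A rough sketch of what the skipped test above might eventually look like.
# This is an assumption for illustration, not the project's implementation:
# the file path and the first-six-columns layout are hypothetical, and only
# _map_sextuple_to_phenotype() is taken from the source shown below.
def test_allZPAvailable_sketch(self):
    import csv  # local import keeps this sketch self-contained
    unmapped = 0
    with open("./tests/resources/zfin/g2p-sample.txt", 'r') as tsv:
        for row in csv.reader(tsv, delimiter='\t'):
            (superterm1_id, subterm1_id, quality_id,
             superterm2_id, subterm2_id, modifier) = row[:6]
            if self.source._map_sextuple_to_phenotype(
                    superterm1_id, subterm1_id, quality_id,
                    superterm2_id, subterm2_id, modifier) is None:
                unmapped += 1
    self.assertEqual(unmapped, 0, "%i sextuples lack a ZP id" % unmapped)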
def parse(self, limit=None):
    zfin_parser = ZFIN(self.graph_type, self.are_bnodes_skized)
    model = Model(self.graph)
    zp_file = '/'.join((self.rawdir, self.files['zpmap']['file']))
    g2p_file = '/'.join((self.rawdir, self.files['g2p_clean']['file']))
    zfin_parser.zp_map = zfin_parser._load_zp_mappings(zp_file)

    with open(g2p_file, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            (internal_id, symbol, gene_id, subterm1_id, subterm1_label,
             pc_rel_id, pc_rel_label, superterm1_id, superterm1_label,
             quality_id, quality_name, modifier, subterm2_id,
             subterm2_label, pc_rel2_id, pc_rel2_label, superterm2_id,
             superterm2_label, fish_id, fish_label, start_stage,
             end_stage, environment, pub_id, figure_id,
             unknown_field) = row

            zp_id = zfin_parser._map_sextuple_to_phenotype(
                superterm1_id, subterm1_id, quality_id,
                superterm2_id, subterm2_id, modifier)

            gene_curie = "ZFIN:{0}".format(gene_id)
            model.makeLeader(gene_curie)
            pub_curie = "ZFIN:{0}".format(pub_id)
            if zp_id:
                assoc = G2PAssoc(self.graph, self.name, gene_curie, zp_id)
                if pub_id:
                    reference = Reference(
                        self.graph, pub_curie,
                        Reference.ref_types['document'])
                    reference.addRefToGraph()
                    assoc.add_source(pub_curie)
                assoc.add_evidence('ECO:0000059')
                assoc.add_association_to_graph()
def __init__(self, graph_type, are_bnodes_skolemized,
             data_release_version=None, tax_ids=None):
    super().__init__(
        graph_type=graph_type,
        are_bnodes_skized=are_bnodes_skolemized,
        data_release_version=data_release_version,
        name='go',
        ingest_title='Gene Ontology',
        ingest_url='http://www.geneontology.org',
        ingest_logo='source-geneontology.png',
        license_url=None,
        data_rights='http://geneontology.org/page/use-and-license'
        # file_handle=None
    )

    self.test_ids = []
    # note: dipper-etl defaults tax_ids to '9606'
    # note: sorting tax_ids for stable digest
    if tax_ids is not None and set(tax_ids).difference(['9606']):
        LOG.info('Have %s given as taxon to ingest', str(tax_ids))
        self.tax_ids = sorted([str(x) for x in tax_ids])
        nottax = set(tax_ids) - set(self.files.keys())
        if nottax:
            LOG.error("Can't process taxon number(s):\t%s", str(nottax))
            self.tax_ids = list(set(self.tax_ids) - nottax)
    else:
        self.tax_ids = sorted(['9606', '10090', '7955'])

    LOG.info("Filtering to the following taxa: %s", self.tax_ids)

    # moving this from process_gaf() to avoid repeating this for each
    # file to be processed.
    if '7955' in self.tax_ids:
        self.zfin = ZFIN(self.graph_type, self.are_bnodes_skized)
    if '6239' in self.tax_ids:
        self.wbase = WormBase(self.graph_type, self.are_bnodes_skized)

    if 'gene' not in self.all_test_ids:
        LOG.warning("not configured with gene test ids.")
    else:
        self.test_ids = self.all_test_ids['gene']

    # build the id map for mapping uniprot ids to genes ... ONCE
    self.uniprot_entrez_id_map = self.get_uniprot_entrez_id_map()

    # gaf evidence code mapping is built in parse(), after the file is fetched.
    self.gaf_eco = {}
def process_gaf(self, file, limit, id_map=None, eco_map=None):
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)
    geno = Genotype(graph)
    LOG.info("Processing Gene Associations from %s", file)
    line_counter = 0
    uniprot_hit = 0
    uniprot_miss = 0
    if '7955' in self.tax_ids:
        zfin = ZFIN(self.graph_type, self.are_bnodes_skized)
    if '6239' in self.tax_ids:
        wbase = WormBase(self.graph_type, self.are_bnodes_skized)

    with gzip.open(file, 'rb') as csvfile:
        filereader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""), delimiter='\t',
            quotechar='\"')
        for row in filereader:
            line_counter += 1
            # comments start with exclamation
            if re.match(r'!', ''.join(row)):
                continue
            if len(row) > 17 or len(row) < 15:
                LOG.warning(
                    "Wrong number of columns %i, expected 15 or 17\n%s",
                    len(row), row)
                continue
            if 17 > len(row) >= 15:
                row += [""] * (17 - len(row))

            (dbase, gene_num, gene_symbol, qualifier, go_id, ref, eco_symbol,
             with_or_from, aspect, gene_name, gene_synonym, object_type,
             taxon, date, assigned_by, annotation_extension,
             gene_product_form_id) = row

            # test for required fields
            if (dbase == '' or gene_num == '' or gene_symbol == '' or
                    go_id == '' or ref == '' or eco_symbol == '' or
                    aspect == '' or object_type == '' or taxon == '' or
                    date == '' or assigned_by == ''):
                LOG.error(
                    "Missing required part of annotation on row %d:\n%s",
                    line_counter, '\t'.join(row))
                continue

            # deal with qualifier NOT, contributes_to, colocalizes_with
            if re.search(r'NOT', qualifier):
                continue

            if dbase in self.localtt:
                dbase = self.localtt[dbase]
            uniprotid = None
            gene_id = None
            if dbase == 'UniProtKB':
                if id_map is not None and gene_num in id_map:
                    gene_id = id_map[gene_num]
                    uniprotid = ':'.join((dbase, gene_num))
                    (dbase, gene_num) = gene_id.split(':')
                    uniprot_hit += 1
                else:
                    # LOG.warning(
                    #     "UniProt id %s is without a 1:1 mapping to entrez/ensembl",
                    #     gene_num)
                    uniprot_miss += 1
                    continue
            else:
                gene_num = gene_num.split(':')[-1]  # last
                gene_id = ':'.join((dbase, gene_num))

            if self.test_mode and not (
                    re.match(r'NCBIGene', gene_id) and
                    int(gene_num) in self.test_ids):
                continue

            model.addClassToGraph(gene_id, gene_symbol)
            if gene_name != '':
                model.addDescription(gene_id, gene_name)
            if gene_synonym != '':
                for syn in re.split(r'\|', gene_synonym):
                    model.addSynonym(gene_id, syn.strip())
            if re.search(r'\|', taxon):
                # TODO add annotations with >1 taxon
                LOG.info(
                    ">1 taxon (%s) on line %d. skipping", taxon, line_counter)
            else:
                tax_id = re.sub(r'taxon:', 'NCBITaxon:', taxon)
                geno.addTaxon(tax_id, gene_id)

            assoc = Assoc(graph, self.name)
            assoc.set_subject(gene_id)
            assoc.set_object(go_id)

            try:
                eco_id = eco_map[eco_symbol]
                assoc.add_evidence(eco_id)
            except KeyError:
                LOG.error("Evidence code (%s) not mapped", eco_symbol)

            refs = re.split(r'\|', ref)
            for ref in refs:
                ref = ref.strip()
                if ref != '':
                    prefix = ref.split(':')[0]  # sidestep 'MGI:MGI:'
                    if prefix in self.localtt:
                        prefix = self.localtt[prefix]
                    ref = ':'.join((prefix, ref.split(':')[-1]))
                    refg = Reference(graph, ref)
                    if prefix == 'PMID':
                        ref_type = self.globaltt['journal article']
                        refg.setType(ref_type)
                    refg.addRefToGraph()
                    assoc.add_source(ref)

            # TODO add the source of the annotations from assigned by?
            rel = self.resolve(aspect, mandatory=False)
            if rel is not None and aspect == rel:
                if aspect == 'F' and re.search(r'contributes_to', qualifier):
                    assoc.set_relationship(self.globaltt['contributes to'])
                else:
                    LOG.error(
                        "Aspect: %s with qualifier: %s is not recognized",
                        aspect, qualifier)
            elif rel is not None:
                assoc.set_relationship(rel)
                assoc.add_association_to_graph()
            else:
                LOG.warning("No predicate for association \n%s\n", str(assoc))

            if uniprotid is not None:
                assoc.set_description('Mapped from ' + uniprotid)

            # object_type should be one of:
            # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
            # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
            # If the precise product type is unknown,
            # gene_product should be used

            ###################################################################
            # Derive G2P Associations from IMP annotations
            # in version 2.1 Pipe will indicate 'OR'
            # and Comma will indicate 'AND'.
            # in version 2.0, multiple values are separated by pipes
            # where the pipe has been used to mean 'AND'
            if eco_symbol == 'IMP' and with_or_from != '':
                withitems = re.split(r'\|', with_or_from)
                phenotypeid = go_id + 'PHENOTYPE'
                # create phenotype associations
                for i in withitems:
                    if i == '' or re.match(
                            r'(UniProtKB|WBPhenotype|InterPro|HGNC)', i):
                        LOG.warning(
                            "Don't know what having a uniprot id " +
                            "in the 'with' column means of %s", uniprotid)
                        continue
                    i = re.sub(r'MGI\:MGI\:', 'MGI:', i)
                    i = re.sub(r'WB:', 'WormBase:', i)

                    # for worms and fish, they might give a RNAi or MORPH
                    # in these cases make a reagent-targeted gene
                    if re.search('MRPHLNO|CRISPR|TALEN', i):
                        targeted_gene_id = zfin.make_targeted_gene_id(gene_id, i)
                        geno.addReagentTargetedGene(i, gene_id, targeted_gene_id)
                        # TODO PYLINT why is this needed?
                        # Redefinition of assoc type from
                        # dipper.models.assoc.Association.Assoc to
                        # dipper.models.assoc.G2PAssoc.G2PAssoc
                        assoc = G2PAssoc(
                            graph, self.name, targeted_gene_id, phenotypeid)
                    elif re.search(r'WBRNAi', i):
                        targeted_gene_id = wbase.make_reagent_targeted_gene_id(
                            gene_id, i)
                        geno.addReagentTargetedGene(i, gene_id, targeted_gene_id)
                        assoc = G2PAssoc(
                            graph, self.name, targeted_gene_id, phenotypeid)
                    else:
                        assoc = G2PAssoc(graph, self.name, i, phenotypeid)

                    for ref in refs:
                        ref = ref.strip()
                        if ref != '':
                            prefix = ref.split(':')[0]
                            if prefix in self.localtt:
                                prefix = self.localtt[prefix]
                            ref = ':'.join((prefix, ref.split(':')[-1]))
                            assoc.add_source(ref)

                    # experimental phenotypic evidence
                    assoc.add_evidence(
                        self.globaltt['experimental phenotypic evidence'])
                    assoc.add_association_to_graph()
                    # TODO should the G2PAssoc be
                    # the evidence for the GO assoc?

            if not self.test_mode and limit is not None and line_counter > limit:
                break

    uniprot_tot = (uniprot_hit + uniprot_miss)
    uniprot_per = 0.0
    if uniprot_tot != 0:
        uniprot_per = 100.0 * uniprot_hit / uniprot_tot
    LOG.info(
        "Uniprot: %.2f%% of %i benefited from the 1/4 day id mapping download",
        uniprot_per, uniprot_tot)
    return
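# The eco_map passed into process_gaf() above is keyed by a GO evidence code
# (e.g. 'IMP') and yields an ECO class id. Below is a minimal sketch of how
# such a map could be built from the gaf-eco-mapping.txt resource listed in
# map_files further down. The three-column tab-separated layout
# (<evidence code> <GO_REF or 'Default'> <ECO id>) is an assumption about that
# file, and build_eco_map_sketch() is not dipper's own get_eco_map().
def build_eco_map_sketch(path):
    eco_map = {}
    with open(path, 'r') as mapfile:
        for line in mapfile:
            line = line.strip()
            if line == '' or line.startswith('#'):
                continue  # skip blank lines and comment lines
            (code, go_ref, eco_id) = line.split('\t')[:3]
            if go_ref == 'Default':
                eco_map[code] = eco_id  # e.g. eco_map['IMP'] -> an ECO curie
            else:
                eco_map[(code, go_ref)] = eco_id  # reference-specific override
    return eco_map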
class GeneOntology(Source):
    """
    This is the parser for the
    [Gene Ontology Annotations](http://www.geneontology.org),
    from which we process gene-process/function/subcellular
    location associations.

    We generate the GO graph to include the following information:
    * genes
    * gene-process
    * gene-function
    * gene-location

    We process only a subset of the organisms:

    Status: IN PROGRESS / INCOMPLETE

    """

    gaf_columns = [  # GAF2.1 files contain the following columns:
        'DB', 'DB_Object_ID', 'DB_Object_Symbol', 'Qualifier', 'GO_ID',
        'DB:Reference', 'Evidence Code', 'With (or) From', 'Aspect',
        'DB_Object_Name', 'DB_Object_Synonym', 'DB_Object_Type',
        'Taxon and Interacting taxon', 'Date', 'Assigned_By',
        'Annotation_Extension', 'Gene_Product_Form_ID']

    files = {
        '9615': {  # Canis lupus familiaris
            'file': 'goa_dog.gaf.gz',
            'url': GOGA + '/goa_dog.gaf.gz',
            'columns': gaf_columns},
        '7227': {  # Drosophila melanogaster
            'file': 'fb.gaf.gz',
            'url': GOGA + '/fb.gaf.gz',
            'columns': gaf_columns},
        '7955': {  # Danio rerio
            'file': 'zfin.gaf.gz',
            'url': GOGA + '/zfin.gaf.gz',
            'columns': gaf_columns},
        '10090': {  # Mus musculus
            'file': 'mgi.gaf.gz',
            'url': GOGA + '/mgi.gaf.gz',
            'columns': gaf_columns},
        '10116': {  # Rattus norvegicus
            'file': 'rgd.gaf.gz',
            'url': GOGA + '/rgd.gaf.gz',
            'columns': gaf_columns},
        '6239': {  # Caenorhabditis elegans
            'file': 'wb.gaf.gz',
            'url': GOGA + '/wb.gaf.gz',
            'columns': gaf_columns},
        '9823': {  # Sus scrofa
            'file': 'goa_pig.gaf.gz',
            'url': GOGA + '/goa_pig.gaf.gz',
            'columns': gaf_columns},
        '9031': {  # Gallus gallus
            'file': 'goa_chicken.gaf.gz',
            'url': GOGA + '/goa_chicken.gaf.gz',
            'columns': gaf_columns},
        '9606': {  # Homo sapiens
            'file': 'goa_human.gaf.gz',
            'url': GOGA + '/goa_human.gaf.gz',
            'columns': gaf_columns},
        '9913': {  # Bos taurus
            'file': 'goa_cow.gaf.gz',
            'url': GOGA + '/goa_cow.gaf.gz',
            'columns': gaf_columns},
        '559292': {  # Saccharomyces cerevisiae 4932
            'file': 'sgd.gaf.gz',
            'url': GOGA + '/sgd.gaf.gz',
            'columns': gaf_columns},
        '4896': {  # Schizosaccharomyces pombe (yeast)
            'file': 'pombase.gaf.gz',
            'url': GOGA + '/pombase.gaf.gz',
            'columns': gaf_columns},
        '5782': {  # Dictyostelium (slime mold genus)
            'file': 'dictibase.gaf.gz',
            'url': GOGA + '/dictybase.gaf.gz',
            'columns': gaf_columns},
        '5052': {  # Aspergillus (fungi) http://www.aspergillusgenome.org/
            'file': 'aspgd.gaf.gz',
            'url': GOGA + '/aspgd.gaf.gz',
            'columns': gaf_columns},

        # consider this after most others - should this be part of GO?
        # 'multispecies': {
        #     'file': 'gene_association.goa_uniprot.gz',
        #     'url': FTPEBI + 'GO/goa/UNIPROT/gene_association.goa_uniprot.gz'},

        'go-references': {
            'file': 'GO.references',
            # Quoth the header of this file: "This file is DEPRECATED.
            # Please see go-refs.json relative to this location"
            # (http://current.geneontology.org/metadata/go-refs.json)
            'url': 'http://www.geneontology.org/doc/GO.references'
        },
        'id-map': {
            # 5GB mapping file takes 6 hours to DL ... maps UniProt to Ensembl
            'file': 'idmapping_selected.tab.gz',
            'url': FTPEBI + UPCRKB + 'idmapping/idmapping_selected.tab.gz',
            # ftp://ftp.uniprot.org
            # /pub/databases/uniprot/current_release/knowledgebase/idmapping/README
            'columns': [
                'UniProtKB-AC',
                'UniProtKB-ID',
                'GeneID (EntrezGene)',
                'RefSeq',
                'GI',
                'PDB',
                'GO',
                'UniRef100',
                'UniRef90',
                'UniRef50',
                'UniParc',
                'PIR',
                'NCBI-taxon',
                'MIM',
                'UniGene',
                'PubMed',
                'EMBL',
                'EMBL-CDS',
                'Ensembl',
                'Ensembl_TRS',
                'Ensembl_PRO',
                'Additional PubMed']
        }
    }

    # consider moving the go-ref and id-map above to here in map_files
    map_files = {
        'eco_map': 'http://purl.obolibrary.org/obo/eco/gaf-eco-mapping.txt',
    }

    def __init__(self, graph_type, are_bnodes_skolemized,
                 data_release_version=None, tax_ids=None):
        super().__init__(
            graph_type=graph_type,
            are_bnodes_skized=are_bnodes_skolemized,
            data_release_version=data_release_version,
            name='go',
            ingest_title='Gene Ontology',
            ingest_url='http://www.geneontology.org',
            ingest_logo='source-geneontology.png',
            license_url=None,
            data_rights='http://geneontology.org/page/use-and-license'
            # file_handle=None
        )

        self.test_ids = []
        # note: dipper-etl defaults tax_ids to '9606'
        # note: sorting tax_ids for stable digest
        if tax_ids is not None and set(tax_ids).difference(['9606']):
            LOG.info('Have %s given as taxon to ingest', str(tax_ids))
            self.tax_ids = sorted([str(x) for x in tax_ids])
            nottax = set(tax_ids) - set(self.files.keys())
            if nottax:
                LOG.error("Can't process taxon number(s):\t%s", str(nottax))
                self.tax_ids = list(set(self.tax_ids) - nottax)
        else:
            self.tax_ids = sorted(['9606', '10090', '7955'])

        LOG.info("Filtering to the following taxa: %s", self.tax_ids)

        # moving this from process_gaf() to avoid repeating this for each
        # file to be processed.
        if '7955' in self.tax_ids:
            self.zfin = ZFIN(self.graph_type, self.are_bnodes_skized)
        if '6239' in self.tax_ids:
            self.wbase = WormBase(self.graph_type, self.are_bnodes_skized)

        if 'gene' not in self.all_test_ids:
            LOG.warning("not configured with gene test ids.")
        else:
            self.test_ids = self.all_test_ids['gene']

        # build the id map for mapping uniprot ids to genes ... ONCE
        self.uniprot_entrez_id_map = self.get_uniprot_entrez_id_map()

        self.eco_map = self.get_eco_map(self.map_files['eco_map'])

    def fetch(self, is_dl_forced=False):
        self.get_files(is_dl_forced)

    def parse(self, limit=None):
        if limit is not None:
            LOG.info("Only parsing first %s rows of each file", limit)
        LOG.info("Parsing files...")

        if self.test_only:
            self.test_mode = True

        for txid_num in list(set(self.files).intersection(self.tax_ids)):
            gaffile = '/'.join((self.rawdir, self.files[txid_num]['file']))
            self.process_gaf(
                gaffile, limit, self.uniprot_entrez_id_map, self.eco_map)

        LOG.info("Finished parsing.")

    def process_gaf(self, gaffile, limit, id_map=None, eco_map=None):
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        geno = Genotype(graph)
        LOG.info("Processing Gene Associations from %s", gaffile)
        uniprot_hit = 0
        uniprot_miss = 0
        col = self.gaf_columns

        with gzip.open(gaffile, 'rb') as csvfile:
            reader = csv.reader(
                io.TextIOWrapper(csvfile, newline=""), delimiter='\t',
                quotechar='\"')
            for row in reader:
                # comments start with exclamation
                if row[0][0] == '!':
                    continue
                if len(row) != len(col):
                    LOG.error(
                        "Wrong number of columns %i, expected %i, got:\n\t%s",
                        len(row), len(col), row)
                    exit(1)

                dbase = row[col.index('DB')].strip()
                gene_num = row[col.index('DB_Object_ID')].strip()
                gene_symbol = row[col.index('DB_Object_Symbol')].strip()
                qualifier = row[col.index('Qualifier')]
                go_id = row[col.index('GO_ID')].strip()
                ref = row[col.index('DB:Reference')].strip()
                eco_symbol = row[col.index('Evidence Code')].strip()
                with_or_from = row[col.index('With (or) From')]
                aspect = row[col.index('Aspect')].strip()
                gene_name = row[col.index('DB_Object_Name')]
                gene_synonym = row[col.index('DB_Object_Synonym')]
                # object_type = row[col.index('DB_Object_Type')].strip()
                taxon = row[col.index('Taxon and Interacting taxon')].strip()
                # date = row[col.index('Date')].strip()
                # assigned_by = row[col.index('Assigned_By')].strip()
                # annotation_extension = row[col.index('Annotation_Extension')]
                # gene_product_form_id = row[col.index('Gene_Product_Form_ID')]

                # test for required fields
                # (GAF2.1 required columns: DB, DB_Object_ID, DB_Object_Symbol,
                #  GO_ID, DB:Reference, Evidence Code, Aspect, DB_Object_Type,
                #  Taxon, Date, Assigned_By)
                if '' in row[:3] + [row[4], row[5], row[6], row[8]] + row[11:15]:
                    LOG.error(
                        "Missing required part of annotation on row %i:\n%s",
                        reader.line_num, str(row[:-4]))
                    continue

                # (Don't) deal with qualifier NOT, contributes_to, colocalizes_with
                if re.search(r'NOT', qualifier):
                    continue

                if dbase in self.localtt:
                    dbase = self.localtt[dbase]
                uniprotid = None
                gene_id = None
                if dbase == 'UniProtKB':
                    if id_map is not None and gene_num in id_map:
                        gene_id = id_map[gene_num]
                        uniprotid = ':'.join((dbase, gene_num))
                        (dbase, gene_num) = gene_id.split(':')
                        uniprot_hit += 1
                    else:
                        # LOG.warning(
                        #     "UniProt id %s is without a 1:1 mapping to entrez/ensembl",
                        #     gene_num)
                        uniprot_miss += 1
                        continue
                else:
                    gene_num = gene_num.split(':')[-1]  # last
                    gene_id = ':'.join((dbase, gene_num))

                if self.test_mode and gene_id[:9] != 'NCBIGene:' and \
                        gene_num not in self.test_ids:
                    continue

                model.addClassToGraph(gene_id, gene_symbol)
                if gene_name != '':
                    model.addDescription(gene_id, gene_name)
                if gene_synonym != '':
                    for syn in re.split(r'\|', gene_synonym):
                        syn = syn.strip()
                        if syn[:10] == 'UniProtKB:':
                            model.addTriple(
                                gene_id, self.globaltt['has gene product'], syn)
                        elif re.fullmatch(graph.curie_regexp, syn) is not None:
                            LOG.warning(
                                'possible curie "%s" as a literal synonym for %s',
                                syn, gene_id)
                            model.addSynonym(gene_id, syn)
                        else:
                            model.addSynonym(gene_id, syn)

                for txid in taxon.split('|'):
                    tax_curie = re.sub(r'taxon:', 'NCBITaxon:', txid)
                    geno.addTaxon(tax_curie, gene_id)

                assoc = Assoc(graph, self.name)
                assoc.set_subject(gene_id)
                assoc.set_object(go_id)

                try:
                    eco_id = eco_map[eco_symbol]
                    assoc.add_evidence(eco_id)
                except KeyError:
                    LOG.error("Evidence code (%s) not mapped", eco_symbol)

                refs = re.split(r'\|', ref)
                for ref in refs:
                    ref = ref.strip()
                    if ref != '':
                        prefix = ref.split(':')[0]  # sidestep 'MGI:MGI:'
                        if prefix in self.localtt:
                            prefix = self.localtt[prefix]
                        ref = ':'.join((prefix, ref.split(':')[-1]))
                        refg = Reference(graph, ref)
                        if prefix == 'PMID':
                            ref_type = self.globaltt['journal article']
                            refg.setType(ref_type)
                        refg.addRefToGraph()
                        assoc.add_source(ref)

                # TODO add the source of the annotations from assigned by?
                rel = self.resolve(aspect, mandatory=False)
                if rel is not None and aspect == rel:
                    if aspect == 'F' and re.search(r'contributes_to', qualifier):
                        assoc.set_relationship(self.globaltt['contributes to'])
                    else:
                        LOG.error(
                            "Aspect: %s with qualifier: %s is not recognized",
                            aspect, qualifier)
                elif rel is not None:
                    assoc.set_relationship(rel)
                    assoc.add_association_to_graph()
                else:
                    LOG.warning("No predicate for association \n%s\n", str(assoc))

                if uniprotid is not None:
                    assoc.set_description('Mapped from ' + uniprotid)

                # object_type should be one of:
                # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
                # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
                # If the precise product type is unknown,
                # gene_product should be used

                ###############################################################
                # Derive G2P Associations from IMP annotations
                # in version 2.1 Pipe will indicate 'OR'
                # and Comma will indicate 'AND'.
                # in version 2.0, multiple values are separated by pipes
                # where the pipe has been used to mean 'AND'
                if eco_symbol == 'IMP' and with_or_from != '':
                    withitems = with_or_from.split('|')
                    phenotypeid = go_id + 'PHENOTYPE'
                    # create phenotype associations
                    for itm in withitems:
                        if itm == '' or re.match(
                                r'(UniProtKB|WBPhenotype|InterPro|HGNC)', itm):
                            LOG.warning(
                                "Skipping %s from or with %s", uniprotid, itm)
                            continue
                        itm = re.sub(r'MGI\:MGI\:', 'MGI:', itm)
                        itm = re.sub(r'WB:', 'WormBase:', itm)

                        # for worms and fish, they might give a RNAi or MORPH
                        # in these cases make a reagent-targeted gene
                        if re.search('MRPHLNO|CRISPR|TALEN', itm):
                            targeted_gene_id = self.zfin.make_targeted_gene_id(
                                gene_id, itm)
                            geno.addReagentTargetedGene(
                                itm, gene_id, targeted_gene_id)
                            # TODO PYLINT why is this needed?
                            # Redefinition of assoc type from
                            # dipper.models.assoc.Association.Assoc to
                            # dipper.models.assoc.G2PAssoc.G2PAssoc
                            assoc = G2PAssoc(
                                graph, self.name, targeted_gene_id, phenotypeid)
                        elif re.search(r'WBRNAi', itm):
                            targeted_gene_id = \
                                self.wbase.make_reagent_targeted_gene_id(
                                    gene_id, itm)
                            geno.addReagentTargetedGene(
                                itm, gene_id, targeted_gene_id)
                            assoc = G2PAssoc(
                                graph, self.name, targeted_gene_id, phenotypeid)
                        else:
                            assoc = G2PAssoc(graph, self.name, itm, phenotypeid)

                        for ref in refs:
                            ref = ref.strip()
                            if ref != '':
                                prefix = ref.split(':')[0]
                                if prefix in self.localtt:
                                    prefix = self.localtt[prefix]
                                ref = ':'.join((prefix, ref.split(':')[-1]))
                                assoc.add_source(ref)

                        # experimental phenotypic evidence
                        assoc.add_evidence(
                            self.globaltt['experimental phenotypic evidence'])
                        assoc.add_association_to_graph()
                        # TODO should the G2PAssoc be the evidence for the GO assoc?
                if not self.test_mode and limit is not None and \
                        reader.line_num > limit:
                    break

        uniprot_tot = (uniprot_hit + uniprot_miss)
        uniprot_per = 0.0
        if uniprot_tot != 0:
            uniprot_per = 100.0 * uniprot_hit / uniprot_tot
        LOG.info(
            "Uniprot: %.2f%% of %i benefited from the 1/4 day id mapping download",
            uniprot_per, uniprot_tot)

    def get_uniprot_entrez_id_map(self):
        src_key = 'id-map'
        taxon_digest = GraphUtils.digest_id(str(self.tax_ids))
        id_map = {}
        smallfile = '/'.join((self.rawdir, 'id_map_' + taxon_digest + '.yaml'))
        bigfile = '/'.join((self.rawdir, self.files[src_key]['file']))

        # if processed smallfile exists and is newer than bigfile then use it instead
        if os.path.isfile(smallfile) and \
                os.path.getctime(smallfile) > os.path.getctime(bigfile):
            LOG.info("Using the cheap mapping file %s", smallfile)
            with open(smallfile, 'r') as yamlreader:
                id_map = yaml.safe_load(yamlreader)
        else:
            LOG.info(
                "Expensive Mapping from Uniprot IDs to Entrez/ENSEMBL gene ids for %s",
                self.tax_ids)
            self.fetch_from_url(self.files[src_key]['url'], bigfile)
            col = self.files[src_key]['columns']
            unmapped_uniprot = 0
            with gzip.open(bigfile, 'rb') as csvfile:
                csv.field_size_limit(sys.maxsize)
                reader = csv.reader(  # warning this file is over 10GB unzipped
                    io.TextIOWrapper(csvfile, newline=""), delimiter='\t',
                    quotechar='\"')
                for row in reader:
                    uniprotkb_ac = row[col.index('UniProtKB-AC')].strip()
                    # uniprotkb_id = row[col.index('UniProtKB-ID')]
                    geneid = row[col.index('GeneID (EntrezGene)')].strip()
                    # refseq = row[col.index('RefSeq')]
                    # gi = row[col.index('GI')]
                    # pdb = row[col.index('PDB')]
                    # go = row[col.index('GO')]
                    # uniref100 = row[col.index('UniRef100')]
                    # uniref90 = row[col.index('UniRef90')]
                    # uniref50 = row[col.index('UniRef50')]
                    # uniparc = row[col.index('UniParc')]
                    # pir = row[col.index('PIR')]
                    ncbitaxon = row[col.index('NCBI-taxon')].strip()
                    # mim = row[col.index('MIM')]
                    # unigene = row[col.index('UniGene')]
                    # pubmed = row[col.index('PubMed')]
                    # embl = row[col.index('EMBL')]
                    # embl_cds = row[col.index('EMBL-CDS')]
                    ensembl = row[col.index('Ensembl')].strip()
                    # ensembl_trs = row[col.index('Ensembl_TRS')]
                    # ensembl_pro = row[col.index('Ensembl_PRO')]
                    # other_pubmed = row[col.index('Additional PubMed')]

                    if ncbitaxon not in self.tax_ids:
                        continue
                    # neither empty nor a list
                    if geneid != '' and ';' not in geneid:
                        id_map[uniprotkb_ac] = 'NCBIGene:' + geneid
                    elif ensembl != '' and ';' not in ensembl:
                        id_map[uniprotkb_ac] = 'ENSEMBL:' + ensembl
                    else:
                        unmapped_uniprot += 1

            LOG.info("Writing id_map out as %s", smallfile)
            with open(smallfile, 'w') as yamlwriter:
                yaml.dump(id_map, yamlwriter)
            LOG.warning(
                'Did not find 1:1 gene IDs for %i uniprots', unmapped_uniprot)

        LOG.info(
            "Acquired %i 1:1 uniprot to [entrez|ensembl] mappings",
            len(id_map.keys()))

        return id_map

    def getTestSuite(self):
        import unittest
        from tests.test_geneontology import GeneOntologyTestCase
        test_suite = unittest.TestLoader().loadTestsFromTestCase(GeneOntologyTestCase)

        return test_suite
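# Usage sketch (an assumption for illustration; dipper ingests are normally
# driven by the dipper-etl command line rather than instantiated directly):
if __name__ == '__main__':
    go = GeneOntology('rdf_graph', True, tax_ids=['9606', '7955'])
    go.fetch()            # download the GAF files plus the ECO and UniProt id maps
    go.parse(limit=1000)  # parse only the first 1000 rows of each file for a quick check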
def setUp(self):
    self.source = ZFIN()
    self.source.settestonly(True)
    self.source.setnobnodes(True)
    self._setDirToSource()
    return
class ZFINTestCase(SourceTestCase):

    def setUp(self):
        self.source = ZFIN('rdf_graph', True)
        self.source.settestonly(True)
        self._setDirToSource()

    def tearDown(self):
        self.source = None

    def test_mapping_of_phenotypes_to_zp_ids(self):
        """
        test that code correctly uses zp_map to map phenotypes to zp ids
        :return:
        """
        mapping_file = "./tests/resources/zfin/zp-mapping-test-map.txt"
        pheno_file = "./tests/resources/zfin/zp-mapping-test-phenotype.txt"

        self.source.zp_map = self.source._load_zp_mappings(mapping_file)

        pheno_dat = open(pheno_file).read().split('\t')
        (fish_num, fish_name, start_stage_id, start_stage_name,
         end_stage_id, end_stage_name, subterm1_id, subterm1_name,
         postcomp1_rel_id, postcomp1_rel_name, superterm1_id,
         superterm1_name, quality_id, quality_name, modifier,
         subterm2_id, subterm2_name, postcomp2_rel_id, postcomp2_rel_name,
         superterm2_id, superterm2_name, pub_id, env_id) = pheno_dat

        self.assertEqual(
            self.source._map_octuple_to_phenotype(
                subterm1_id, postcomp1_rel_id, superterm1_id, quality_id,
                subterm2_id, postcomp2_rel_id, superterm2_id, "abnormal"),
            'ZP:0022140')

    def test_load_zp_mappings(self):
        """
        test correct loading of zp mappings file and construction of zp_map
        """
        if self.source is not None:
            try:
                zp_map = self.source._load_zp_mappings(
                    "./tests/resources/zfin/zp-mapping-test.txt")
                self.assertIsInstance(
                    zp_map, dict, "_load_zp_mappings() didn't return dict!")
                self.assertTrue(
                    len(zp_map) == 1,
                    "_load_zp_mappings() didn't return exactly one thing!")
                self.assertDictEqual(
                    zp_map,
                    {
                        'MONARCH:b308a8f1c67793a56d16': {
                            'post_composed_relationship_id_1': 'BFO:0000050',
                            'post_composed_relationship_id_2': 'BFO:0000050',
                            'quality_id': 'PATO:0001453',
                            'subterm1_id': 'ZFA:0009114',
                            'subterm2_id': 'GO:0005927',
                            'superterm1_id': 'ZFA:0001056',
                            'superterm2_id': 'ZFA:0001056',
                            'zp_id': 'ZP:0002959',
                            'modifier': 'PATO:0000460'
                        }
                    },
                    "_load_zp_mappings() didn't return what I expected!")
            except Exception as t_except:
                LOGGER.error(t_except)

    def test_make_zpkey(self):
        """
        test that _make_zpkey returns correct id
        """
        if self.source is not None:
            try:
                dummy_args = list(map(str, list(range(1, 9))))  # 1 - 8 as strings
                expected_key = self.source.make_id("_".join(dummy_args))
                self.assertEqual(
                    self.source._make_zpkey(*dummy_args), expected_key)
                self.assertEqual(
                    self.source._make_zpkey(*(['0'] * 8)),
                    self.source._make_zpkey(*([''] * 8)),
                    "_make_zpkey() doesn't seem to be replacing empty " +
                    "strings with zeros before making key, " +
                    "this might cause zp_map lookup issues")
            except Exception as t_except:
                LOGGER.error(t_except)

    def test_genotype_labels(self):
        """
        test that genotype label is set correctly after parse()
        """
        if self.source is not None:
            test_resource_dir = "../../tests/resources/zfin/"
            self.source.files['fish_components']['file'] = \
                test_resource_dir + "genotype-label-test-fish_components_fish.txt"
            self.source.files['backgrounds']['file'] = \
                test_resource_dir + "genotype-label-test-genotype_backgrounds.txt"
            self.source.files['geno']['file'] = \
                test_resource_dir + "genotype-label-test-genotype_features.txt"
            self.source.parse()
            this_iri = URIRef("http://zfin.org/ZDB-GENO-070228-3")
            expect_genotype_label = "shha<sup>tbx392/tbx392</sup> (AB)"
            self.assertEqual(
                str(self.source.testgraph.label(this_iri, None)),
                expect_genotype_label)
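# To run this test case directly (the module path below is an assumption;
# adjust it to wherever this file lives, e.g. tests/test_zfin.py):
#
#   python -m unittest tests.test_zfin.ZFINTestCase -v
#
if __name__ == '__main__':
    import unittest
    unittest.main()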
def process_gaf(self, file, limit, id_map=None):
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    model = Model(g)
    geno = Genotype(g)
    logger.info("Processing Gene Associations from %s", file)
    line_counter = 0

    if 7955 in self.tax_ids:
        zfin = ZFIN(self.graph_type, self.are_bnodes_skized)
    elif 6239 in self.tax_ids:
        wbase = WormBase(self.graph_type, self.are_bnodes_skized)

    with gzip.open(file, 'rb') as csvfile:
        filereader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""), delimiter='\t',
            quotechar='\"')
        for row in filereader:
            line_counter += 1
            # comments start with exclamation
            if re.match(r'!', ''.join(row)):
                continue
            (db, gene_num, gene_symbol, qualifier, go_id, ref, eco_symbol,
             with_or_from, aspect, gene_name, gene_synonym, object_type,
             taxon, date, assigned_by, annotation_extension,
             gene_product_form_id) = row

            # test for required fields
            if (db == '' or gene_num == '' or gene_symbol == '' or
                    go_id == '' or ref == '' or eco_symbol == '' or
                    aspect == '' or object_type == '' or taxon == '' or
                    date == '' or assigned_by == ''):
                logger.error(
                    "Missing required part of annotation " +
                    "on row %d:\n" + '\t'.join(row), line_counter)
                continue

            # deal with qualifier NOT, contributes_to, colocalizes_with
            if re.search(r'NOT', qualifier):
                continue

            db = self.clean_db_prefix(db)
            uniprotid = None
            gene_id = None
            if db == 'UniProtKB':
                mapped_ids = id_map.get(gene_num)
                if id_map is not None and mapped_ids is not None:
                    if len(mapped_ids) == 1:
                        gene_id = mapped_ids[0]
                        uniprotid = ':'.join((db, gene_num))
                        gene_num = re.sub(r'\w+\:', '', gene_id)
                    elif len(mapped_ids) > 1:
                        # logger.warning(
                        #     "Skipping gene id mapped for >1 gene %s -> %s",
                        #     gene_num, str(mapped_ids))
                        continue
                else:
                    continue
            elif db == 'MGI':
                gene_num = re.sub(r'MGI:', '', gene_num)
                gene_id = ':'.join((db, gene_num))
                gene_id = re.sub(r'MGI\:MGI\:', 'MGI:', gene_id)
            else:
                gene_id = ':'.join((db, gene_num))

            if self.testMode \
                    and not(
                        re.match(r'NCBIGene', gene_id) and
                        int(gene_num) in self.test_ids):
                continue

            model.addClassToGraph(gene_id, gene_symbol)
            if gene_name != '':
                model.addDescription(gene_id, gene_name)
            if gene_synonym != '':
                for s in re.split(r'\|', gene_synonym):
                    model.addSynonym(gene_id, s.strip())
            if re.search(r'\|', taxon):
                # TODO add annotations with >1 taxon
                logger.info(
                    ">1 taxon (%s) on line %d. skipping", taxon, line_counter)
            else:
                tax_id = re.sub(r'taxon:', 'NCBITaxon:', taxon)
                geno.addTaxon(tax_id, gene_id)

            assoc = Assoc(g, self.name)
            assoc.set_subject(gene_id)
            assoc.set_object(go_id)

            eco_id = self.map_go_evidence_code_to_eco(eco_symbol)
            if eco_id is not None:
                assoc.add_evidence(eco_id)

            refs = re.split(r'\|', ref)
            for r in refs:
                r = r.strip()
                if r != '':
                    prefix = re.split(r':', r)[0]
                    r = re.sub(prefix, self.clean_db_prefix(prefix), r)
                    r = re.sub(r'MGI\:MGI\:', 'MGI:', r)
                    ref = Reference(g, r)
                    if re.match(r'PMID', r):
                        ref_type = Reference.ref_types['journal_article']
                        ref.setType(ref_type)
                    ref.addRefToGraph()
                    assoc.add_source(r)

            # TODO add the source of the annotations from assigned by?
            aspect_rel_map = {
                'P': model.object_properties['involved_in'],  # involved in
                'F': model.object_properties['enables'],      # enables
                'C': model.object_properties['part_of']       # part of
            }

            if aspect not in aspect_rel_map:
                logger.error("Aspect not recognized: %s", aspect)

            rel = aspect_rel_map.get(aspect)
            if aspect == 'F' and re.search(r'contributes_to', qualifier):
                rel = model.object_properties['contributes_to']
            assoc.set_relationship(rel)
            if uniprotid is not None:
                assoc.set_description('Mapped from ' + uniprotid)
            # object_type should be one of:
            # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
            # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
            # If the precise product type is unknown,
            # gene_product should be used

            assoc.add_association_to_graph()

            # Derive G2P Associations from IMP annotations
            # in version 2.1 Pipe will indicate 'OR'
            # and Comma will indicate 'AND'.
            # in version 2.0, multiple values are separated by pipes
            # where the pipe has been used to mean 'AND'
            if eco_symbol == 'IMP' and with_or_from != '':
                withitems = re.split(r'\|', with_or_from)
                phenotypeid = go_id + 'PHENOTYPE'
                # create phenotype associations
                for i in withitems:
                    if i == '' or \
                            re.match(
                                r'(UniProtKB|WBPhenotype|InterPro|HGNC)', i):
                        logger.warning(
                            "Don't know what having a uniprot id " +
                            "in the 'with' column means of %s", uniprotid)
                        continue
                    i = re.sub(r'MGI\:MGI\:', 'MGI:', i)
                    i = re.sub(r'WB:', 'WormBase:', i)
                    # for worms and fish, they might give a RNAi or MORPH
                    # in these cases make a reagent-targeted gene
                    if re.search('MRPHLNO|CRISPR|TALEN', i):
                        targeted_gene_id = zfin.make_targeted_gene_id(
                            gene_id, i)
                        geno.addReagentTargetedGene(i, gene_id, targeted_gene_id)
                        # TODO PYLINT why is this:
                        # Redefinition of assoc type from
                        # dipper.models.assoc.Association.Assoc to
                        # dipper.models.assoc.G2PAssoc.G2PAssoc
                        assoc = G2PAssoc(
                            g, self.name, targeted_gene_id, phenotypeid)
                    elif re.search(r'WBRNAi', i):
                        targeted_gene_id = \
                            wbase.make_reagent_targeted_gene_id(
                                gene_id, i)
                        geno.addReagentTargetedGene(
                            i, gene_id, targeted_gene_id)
                        assoc = G2PAssoc(
                            g, self.name, targeted_gene_id, phenotypeid)
                    else:
                        assoc = G2PAssoc(g, self.name, i, phenotypeid)

                    for r in refs:
                        r = r.strip()
                        if r != '':
                            prefix = re.split(r':', r)[0]
                            r = re.sub(
                                prefix, self.clean_db_prefix(prefix), r)
                            r = re.sub(r'MGI\:MGI\:', 'MGI:', r)
                            assoc.add_source(r)

                    # experimental phenotypic evidence
                    assoc.add_evidence("ECO:0000059")
                    assoc.add_association_to_graph()
                    # TODO should the G2PAssoc be
                    # the evidence for the GO assoc?

            if not self.testMode and \
                    limit is not None and line_counter > limit:
                break

    return
def parse(self, limit=None):
    zfin_parser = ZFIN(self.graph_type, self.are_bnodes_skized)
    model = Model(self.graph)

    src_key = 'zpmap'  # keep same-as zfin.files[key]
    zfin_parser.zp_map = zfin_parser._load_zp_mappings(src_key)

    src_key = 'g2p_clean'
    raw = '/'.join((self.rawdir, self.files[src_key]['file']))
    LOG.info("Processing clean Geno to Pheno from file: %s", raw)
    col = self.files[src_key]['columns']
    collen = len(col)

    with open(raw, 'r', encoding="utf8") as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in reader:
            if len(row) != collen:
                LOG.warning('Row: %i has unexpected format', reader.line_num)
            # internal_id = row[col.index('ID')]
            # symbol = row[col.index('Gene Symbol')]
            gene_id = row[col.index('Gene ID')]
            subterm1_id = row[col.index(
                'Affected Structure or Process 1 subterm ID')]
            # subterm1_label = row[col.index(
            #     'Affected Structure or Process 1 subterm Name')]
            pc_rel_id = row[col.index('Post-composed Relationship ID')].strip()
            # pc_rel_label = row[col.index('Post-composed Relationship Name')]
            superterm1_id = row[col.index(
                'Affected Structure or Process 1 superterm ID')].strip()
            # superterm1_label = row[col.index(
            #     'Affected Structure or Process 1 superterm Name')]
            quality_id = row[col.index('Phenotype Keyword ID')].strip()
            # quality_name = row[col.index('Phenotype Keyword Name')]
            modifier = row[col.index('Phenotype Tag')].strip()
            subterm2_id = row[col.index(
                'Affected Structure or Process 2 subterm ID')].strip()
            # subterm2_label = row[col.index(
            #     'Affected Structure or Process 2 subterm name')]
            pc_rel2_id = row[col.index('Post-composed Relationship (rel) ID')]
            # pc_rel2_label = row[col.index(
            #     'Post-composed Relationship (rel) Name')]
            superterm2_id = row[col.index(
                'Affected Structure or Process 2 superterm ID')].strip()
            # superterm2_label = row[col.index(
            #     'Affected Structure or Process 2 superterm name')]
            # fish_id = row[col.index('Fish ID')]
            # fish_label = row[col.index('Fish Display Name')]
            start_stage = row[col.index('Start Stage ID')]
            # end_stage = row[col.index('End Stage ID')]
            # environment = row[col.index('Fish Environment ID')]
            pub_id = row[col.index('Publication ID')].strip()
            # figure_id = row[col.index('Figure ID')]

            if modifier != 'abnormal':
                LOG.warning(
                    "skipping phenotype with modifier %s != abnormal ", modifier)
                continue

            zp_id = zfin_parser._map_octuple_to_phenotype(
                subterm1_id, pc_rel_id, superterm1_id, quality_id,
                subterm2_id, pc_rel2_id, superterm2_id, modifier)

            gene_curie = "ZFIN:{0}".format(gene_id)
            model.makeLeader(gene_curie)
            pub_curie = "ZFIN:{0}".format(pub_id)
            if zp_id:
                assoc = G2PAssoc(self.graph, self.name, gene_curie, zp_id)
                if pub_id:
                    reference = Reference(
                        self.graph, pub_curie, self.globaltt['document'])
                    reference.addRefToGraph()
                    assoc.add_source(pub_curie)
                assoc.add_evidence(
                    self.globaltt['experimental phenotypic evidence'])
                assoc.add_association_to_graph()
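# For reference, a sketch of the lookup that _map_octuple_to_phenotype()
# performs against zp_map. This is an assumption pieced together from the
# test case above (zp_map entries carry a 'zp_id' and keys come from
# _make_zpkey()), not the ZFIN source itself; the argument order passed to
# _make_zpkey() is likewise assumed.
def map_octuple_sketch(zfin_parser, subterm1_id, pc_rel_id, superterm1_id,
                       quality_id, subterm2_id, pc_rel2_id, superterm2_id,
                       modifier):
    key = zfin_parser._make_zpkey(
        subterm1_id, pc_rel_id, superterm1_id, quality_id,
        subterm2_id, pc_rel2_id, superterm2_id, modifier)
    mapping = zfin_parser.zp_map.get(key)
    return mapping['zp_id'] if mapping else None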