def __init__(self, graph_type, are_bnodes_skolemized):
    """
    Set up the DECIPHER ingest.

    Passes the ingest name, title and the source/licence/data-rights URLs
    to the parent constructor, selects the disease test identifiers and
    builds the Genotype and Model helpers on the instance graph.

    :param graph_type: forwarded to the parent constructor and stored
    :param are_bnodes_skolemized: forwarded to the parent constructor
        and stored
    """
    super().__init__(
        graph_type,
        are_bnodes_skolemized,
        'decipher',
        ingest_title='Development Disorder Genotype Phenotype Database',
        ingest_url='https://decipher.sanger.ac.uk/',
        license_url='https://decipher.sanger.ac.uk/legal',
        data_rights='https://decipher.sanger.ac.uk/datasharing',
        # file_handle=None
    )

    # fall back to an empty list when no disease test ids are configured
    if 'disease' in self.all_test_ids:
        self.test_ids = self.all_test_ids['disease']
    else:
        LOG.warning("not configured with disease test ids.")
        self.test_ids = []

    # NOTE(review): self-assignment carried over unchanged; it looks
    # redundant unless `graph` is resolved specially by the parent class
    self.graph = self.graph
    self.geno = Genotype(self.graph)
    self.model = Model(self.graph)

    self.graph_type = graph_type
    self.are_bnodes_skolemized = are_bnodes_skolemized
def _add_snp_gene_relation(
        self, snp_id, snp_gene_nums, upstream_gene_num, downstream_gene_num):
    """
    Relate a SNP to the genes it affects and to its flanking genes.

    Writes to the test graph in test mode, otherwise to the main graph.

    :param snp_id: identifier of the variant
    :param snp_gene_nums: comma separated NCBI gene numbers the variant
        affects; may be empty and may carry a leading comma
    :param upstream_gene_num: NCBI gene number of the gene upstream of
        the variant, or '' when not available
    :param downstream_gene_num: NCBI gene number of the gene downstream
        of the variant, or '' when not available
    :return: None
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    geno = Genotype(g)

    # add the feature as a sequence alteration affecting various genes
    # note that intronic variations don't necessarily list
    # the genes such as for rs10448080  FIXME
    if snp_gene_nums != '':
        for s in snp_gene_nums.split(','):
            s = s.strip()
            # still have to test for this,
            # because sometimes there's a leading comma
            if s != '':
                geno.addAffectedLocus(snp_id, 'NCBIGene:' + s)

    # add the up and downstream genes if they are available.
    # Each guard tests the very value it is about to use; the original
    # tested the opposite one and could emit a triple whose object was
    # the bare prefix 'NCBIGene:'.
    if downstream_gene_num != '':
        # the variant sits upstream of its downstream neighbour
        g.addTriple(
            snp_id,
            Feature.object_properties['upstream_of_sequence_of'],
            'NCBIGene:' + downstream_gene_num)
    if upstream_gene_num != '':
        # the variant sits downstream of its upstream neighbour
        g.addTriple(
            snp_id,
            Feature.object_properties['downstream_of_sequence_of'],
            'NCBIGene:' + upstream_gene_num)
def _add_snp_gene_relation(
        self, snp_id, snp_gene_nums, upstream_gene_num, downstream_gene_num):
    """
    Relate a SNP to the genes it affects and to its flanking genes.

    Writes to the test graph in test mode, otherwise to the main graph.

    :param snp_id: identifier of the variant
    :param snp_gene_nums: comma separated Ensembl gene identifiers the
        variant affects; may be empty and may carry a leading comma
    :param upstream_gene_num: Ensembl identifier of the gene upstream of
        the variant, or '' when not available
    :param downstream_gene_num: Ensembl identifier of the gene downstream
        of the variant, or '' when not available
    :return: None
    """
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    geno = Genotype(graph)

    # add the feature as a sequence alteration affecting various genes
    # note that intronic variations don't necessarily list
    # the genes such as for rs10448080  FIXME
    if snp_gene_nums != '':
        for geneid in snp_gene_nums.split(','):
            geneid = geneid.strip()
            # still have to test for this,
            # because sometimes there's a leading comma
            if geneid != '':
                geno.addAffectedLocus(snp_id, 'ENSEMBL:' + geneid)

    # add the up and downstream genes if they are available.
    # Each guard tests the very value it is about to use; the original
    # tested the opposite one and could emit a triple whose object was
    # the bare prefix 'ENSEMBL:'.
    if downstream_gene_num != '':
        # the variant sits upstream of its downstream neighbour
        graph.addTriple(
            snp_id,
            self.globaltt['is upstream of sequence of'],
            'ENSEMBL:' + downstream_gene_num)
    if upstream_gene_num != '':
        # the variant sits downstream of its upstream neighbour
        graph.addTriple(
            snp_id,
            self.globaltt['is downstream of sequence of'],
            'ENSEMBL:' + upstream_gene_num)
def __init__(self, graph_type, are_bnodes_skolemized):
    """
    Set up the CTD ingest.

    Creates the dataset record, reads the gene and disease test
    identifiers from the configuration (empty lists when absent) and
    builds the Genotype and Pathway helpers on the instance graph.

    :param graph_type: forwarded to the parent constructor
    :param are_bnodes_skolemized: forwarded to the parent constructor
    """
    super().__init__(graph_type, are_bnodes_skolemized, 'ctd')

    self.dataset = Dataset(
        'ctd',
        'CTD',
        'http://ctdbase.org',
        None,
        'http://ctdbase.org/about/legal.jsp')

    # gene test identifiers
    if 'test_ids' in config.get_config() \
            and 'gene' in config.get_config()['test_ids']:
        self.test_geneids = config.get_config()['test_ids']['gene']
    else:
        logger.warning("not configured with gene test ids.")
        self.test_geneids = []

    # disease test identifiers
    if 'test_ids' in config.get_config() \
            and 'disease' in config.get_config()['test_ids']:
        self.test_diseaseids = config.get_config()['test_ids']['disease']
    else:
        logger.warning("not configured with disease test ids.")
        self.test_diseaseids = []

    self.g = self.graph
    self.geno = Genotype(self.graph)
    self.pathway = Pathway(self.graph)
def _add_snp_gene_relation(
        self, snp_id, snp_gene_nums, upstream_gene_num, downstream_gene_num):
    """
    Relate a SNP to the genes it affects and to its flanking genes.

    Writes to the test graph in test mode, otherwise to the main graph.

    :param snp_id: identifier of the variant
    :param snp_gene_nums: comma separated NCBI gene numbers the variant
        affects; may be empty and may carry a leading comma
    :param upstream_gene_num: NCBI gene number of the gene upstream of
        the variant, or '' when not available
    :param downstream_gene_num: NCBI gene number of the gene downstream
        of the variant, or '' when not available
    :return: None
    """
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    geno = Genotype(graph)

    # add the feature as a sequence alteration affecting various genes
    # note that intronic variations don't necessarily list
    # the genes such as for rs10448080  FIXME
    if snp_gene_nums != '':
        for geneid in snp_gene_nums.split(','):
            geneid = geneid.strip()
            # still have to test for this,
            # because sometimes there's a leading comma
            if geneid != '':
                geno.addAffectedLocus(snp_id, 'NCBIGene:' + geneid)

    # add the up and downstream genes if they are available.
    # Each guard tests the very value it is about to use; the original
    # tested the opposite one and could emit a triple whose object was
    # the bare prefix 'NCBIGene:'.
    if downstream_gene_num != '':
        # the variant sits upstream of its downstream neighbour
        graph.addTriple(
            snp_id,
            self.globaltt['is upstream of sequence of'],
            'NCBIGene:' + downstream_gene_num)
    if upstream_gene_num != '':
        # the variant sits downstream of its upstream neighbour
        graph.addTriple(
            snp_id,
            self.globaltt['is downstream of sequence of'],
            'NCBIGene:' + upstream_gene_num)
def process_disease_association(self, limit):
    """
    Turn the tab separated 'disease_assoc' file into worm disease models.

    Lines whose joined content starts with '!' are treated as header
    lines.  For every remaining row that is not a NOT annotation an
    unspecified variant of the gene is created, an experimental model
    carrying that variant is built and the model is associated with the
    disease, with the reference as source and ECO:0000501 as evidence
    for IEA rows.

    :param limit: accepted for interface compatibility; not used here
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['disease_assoc']['file']))
    g = self.testgraph if self.testMode else self.graph

    gu = GraphUtils(curie_map.get())
    logger.info("Processing disease models")
    geno = Genotype(g, self.nobnodes)
    line_counter = 0
    worm_taxon = 'NCBITaxon:6239'

    with open(raw, 'r') as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            if ''.join(row).startswith('!'):  # header
                continue
            line_counter += 1

            # exactly 17 columns are expected; anything else raises
            (_db, gene_num, gene_symbol, is_not, disease_id, ref,
             eco_symbol, _with_or_from, _aspect, _gene_name,
             _gene_synonym, _gene_class, _taxon, _date, _assigned_by,
             _blank, _blank2) = row

            outside_test_set = \
                self.testMode and gene_num not in self.test_ids['gene']
            # TODO add NOT phenotypes
            if outside_test_set or is_not == 'NOT':
                continue

            # WB WBGene00000001 aap-1  DOID:2583 PMID:19029536 IEA
            # ENSEMBL:ENSG00000145675|OMIM:615214 D Y110A7A.10 gene
            # taxon:6239 20150612 WB
            gene_id = 'WormBase:' + gene_num

            # make a variant of the gene
            variant_id = '_' + '-'.join((gene_num, 'unspecified'))
            if self.nobnodes:
                variant_id = ':' + variant_id
            variant_label = 'some variant of ' + gene_symbol
            geno.addAlleleOfGene(variant_id, gene_id)

            animal_id = geno.make_experimental_model_with_genotype(
                g, variant_id, variant_label, worm_taxon, 'worm')

            assoc = G2PAssoc(
                self.name, animal_id, disease_id,
                gu.object_properties['model_of'])

            source = ref.replace('WB_REF:', 'WormBase:')
            if source != '':
                assoc.add_source(source)

            if eco_symbol == 'IEA':
                assoc.add_evidence('ECO:0000501')  # IEA is this now

            assoc.add_association_to_graph(g)
def _build_gene_disease_model(
        self, gene_id, relation_id, disease_id, variant_label,
        consequence_predicate=None, consequence_id=None,
        allelic_requirement=None, pmids=None):
    """
    Build a gene (or gene-variant) to disease association.

    When both a consequence predicate and a consequence id are supplied
    a blank-node variant of the gene is created and used as the subject
    of the association; otherwise the gene itself is the subject and an
    allelic requirement, if given, is attached to the association.

    :param gene_id: identifier of the gene
    :param relation_id: predicate linking subject and disease
    :param disease_id: identifier of the disease
    :param variant_label: label of the variant, also used to mint its id
    :param consequence_predicate: optional predicate for the consequence
    :param consequence_id: optional consequence term
    :param allelic_requirement: optional allelic requirement term
    :param pmids: optional publication identifiers used as the source
    :return: None
    """
    model = Model(self.graph)
    geno = Genotype(self.graph)
    sources = pmids if pmids is not None else []

    variant_bnode = self.make_id(variant_label, "_")
    is_variant = \
        consequence_predicate is not None and consequence_id is not None

    if is_variant:
        subject_id = variant_bnode
        model.addTriple(
            variant_bnode, consequence_predicate, consequence_id)
        # Hack to add labels to terms that
        # don't exist in an ontology
        if consequence_id.startswith(':'):
            model.addLabel(
                consequence_id,
                consequence_id.strip(':').replace('_', ' '))

        # Typically we would type the variant using the
        # molecular consequence, but these are not specific
        # enough for us to make mappings (see translation table)
        model.addIndividualToGraph(
            variant_bnode, variant_label, self.globaltt['variant_locus'])
        geno.addAffectedLocus(variant_bnode, gene_id)
        model.addBlankNodeAnnotation(variant_bnode)
    else:
        subject_id = gene_id

    assoc = G2PAssoc(
        self.graph, self.name, subject_id, disease_id, relation_id)
    assoc.source = sources
    assoc.add_association_to_graph()

    # the allelic requirement is only recorded on plain gene associations
    if is_variant or allelic_requirement is None:
        return
    model.addTriple(
        assoc.assoc_id,
        self.globaltt['has_allelic_requirement'],
        allelic_requirement)
    if allelic_requirement.startswith(':'):
        model.addLabel(
            allelic_requirement,
            allelic_requirement.strip(':').replace('_', ' '))
def _get_process_allelic_variants(self, entry, g):
    """
    Add the allelic variants of one OMIM entry to the graph.

    Live variants become alleles of the entry's gene, are linked to the
    publications cited in their text, to their dbSNP equivalents and
    ClinVar cross references, and get a page on omim.org.  Variants whose
    status contains 'moved' (covers 'moved' and 'removed') are
    deprecated.  Any other status is logged as an error.

    :param entry: entry record as a dict, or None (nothing is done);
        reads 'mimNumber' and, when present, 'allelicVariantList'
    :param g: graph the triples are written to
    :return: None
    """
    gu = GraphUtils(curie_map.get())
    geno = Genotype(g)

    if entry is None:
        return

    # entry-specific publication mentions, keyed by allelic variant id
    publist = {}
    entry_num = entry['mimNumber']

    # process the ref list just to get the pmids
    ref_to_pmid = self._get_pubs(entry, g)

    if 'allelicVariantList' not in entry:
        return

    for al in entry['allelicVariantList']:
        variant = al['allelicVariant']
        al_num = variant['number']
        al_id = 'OMIM:' + str(entry_num) + '.' + str(al_num).zfill(4)
        al_label = None
        al_description = None

        if variant['status'] == 'live':
            publist[al_id] = set()
            if 'mutations' in variant:
                al_label = variant['mutations']
            if 'text' in variant:
                al_description = variant['text']
                # reference numbers are cited in the text as '{<num>:'
                publist[al_id] = set(
                    re.findall(r'\{(\d+):', al_description))

            geno.addAllele(
                al_id, al_label, geno.genoparts['variant_locus'],
                al_description)
            geno.addAlleleOfGene(
                al_id, 'OMIM:' + str(entry_num),
                geno.object_properties['is_sequence_variant_instance_of'])

            # look up the pubmed id in the list of references
            for r in publist[al_id]:
                pmid = ref_to_pmid[int(r)]
                gu.addTriple(
                    g, pmid, gu.object_properties['is_about'], al_id)

            if 'dbSnps' in variant:
                for dnum in variant['dbSnps'].split(','):
                    did = 'dbSNP:' + dnum.strip()
                    gu.addIndividualToGraph(g, did, None)
                    gu.addEquivalentClass(g, al_id, did)

            if 'clinvarAccessions' in variant:
                # clinvarAccessions are triple-semicolon delimited,
                # each like RCV000020059;;1
                for chunk in variant['clinvarAccessions'].split(';;;'):
                    rcv = re.match(r'(RCV\d+);;', chunk)
                    if rcv is None:
                        # previously an AttributeError on .group()
                        logger.warning(
                            "Unparseable ClinVar accession '%s' for %s",
                            chunk, al_id)
                        continue
                    gu.addXref(g, al_id, 'ClinVar:' + rcv.group(1))

            gu.addPage(
                g, al_id,
                "http://omim.org/entry/" + str(entry_num) + "#" +
                str(al_num).zfill(4))

        elif re.search('moved', variant['status']):
            # for both 'moved' and 'removed'
            moved_ids = None
            if 'movedTo' in variant:
                moved_ids = ['OMIM:' + variant['movedTo']]
            gu.addDeprecatedIndividual(g, al_id, moved_ids)
        else:
            logger.error(
                'Uncaught alleleic variant status %s', variant['status'])
    # end loop allelicVariantList
def parse(self, limit=None):
    """
    Parse the trait mappings and the per-organism QTL files.

    For each organism the genome is added; when a '<organism>_bp' file is
    configured its build is taken from the file name and the genomic
    locations are processed; when a '<organism>_cm' file is configured
    the genetic locations are processed.

    :param limit: passed through to the individual file processors
    :return: None
    """
    if limit is not None:
        logger.info("Only parsing first %s rows fo each file", str(limit))

    logger.info("Parsing files...")

    if self.testOnly:
        self.testMode = True
        g = self.testgraph
    else:
        g = self.graph

    trait_map_path = '/'.join(
        (self.rawdir, self.files['trait_mappings']['file']))
    self._process_trait_mappings(trait_map_path, limit)

    geno = Genotype(g)
    # organisms = ['chicken']
    organisms = (
        'chicken', 'pig', 'horse', 'rainbow_trout', 'sheep', 'cattle')

    for common_name in organisms:
        tax_id = self._get_tax_by_common_name(common_name)
        geno.addGenome(tax_id, common_name)
        build_id = None
        build = None

        # genomic (base pair) locations
        bp_key = common_name + '_bp'
        if bp_key in self.files:
            fname = self.files[bp_key]['file']
            build_match = re.search(r'QTL_([\w\.]+)\.gff.txt.gz', fname)
            if build_match is not None:
                build = build_match.group(1)
                build_id = self._map_build_by_abbrev(build)
                logger.info("Build = %s", build_id)
                geno.addReferenceGenome(build_id, build, tax_id)
            else:
                logger.error("Can't match a gff build")

            if build_id is not None:
                self._process_QTLs_genomic_location(
                    '/'.join((self.rawdir, fname)),
                    tax_id, build_id, build, limit)

        # genetic (centimorgan) locations
        cm_key = common_name + '_cm'
        if cm_key in self.files:
            fname = self.files[cm_key]['file']
            self._process_QTLs_genetic_location(
                '/'.join((self.rawdir, fname)),
                tax_id, common_name, limit)

    logger.info("Finished parsing")

    self.load_bindings()

    logger.info("Found %d nodes", len(self.graph))
def _process_genes(self, taxid, limit=None):
    """
    Add the Ensembl genes of one taxon to the graph.

    Each row of the tab separated file becomes a gene class that is made
    equivalent to its NCBI gene (and, for human, its HGNC id) and linked
    to the taxon.  Processing stops with an error as soon as a row has
    too few columns.

    :param taxid: NCBI taxon number as a string; also the key into
        self.files
    :param limit: outside test mode, stop once more than this many rows
        were processed
    :return: None
    """
    gu = GraphUtils(curie_map.get())

    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph

    geno = Genotype(g)

    raw = '/'.join((self.rawdir, self.files[taxid]['file']))
    line_counter = 0

    # in the case of human genes, we also get the hgnc id,
    # and is the last col
    is_human = taxid == '9606'
    # five columns are unpacked below, six are read for human; the
    # original guard of "< 4" let short rows through to crash
    min_columns = 6 if is_human else 5

    logger.info("Processing Ensembl genes for tax %s", taxid)
    with open(raw, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t')
        for row in filereader:
            if len(row) < min_columns:
                logger.error("Data error for file %s", raw)
                return
            (ensembl_gene_id, external_gene_name, description,
             gene_biotype, entrezgene) = row[0:5]

            hgnc_id = row[5] if is_human else None

            if self.testMode and entrezgene != '' \
                    and int(entrezgene) not in self.gene_ids:
                continue

            line_counter += 1
            gene_id = 'ENSEMBL:' + ensembl_gene_id
            if description == '':
                description = None
            # NOTE(review): the looked-up type is discarded right away,
            # as in the original; kept so behaviour does not change
            gene_type_id = self._get_gene_type(gene_biotype)
            gene_type_id = None
            gu.addClassToGraph(
                g, gene_id, external_gene_name, gene_type_id, description)

            if entrezgene != '':
                gu.addEquivalentClass(g, gene_id, 'NCBIGene:' + entrezgene)
            if hgnc_id is not None and hgnc_id != '':
                gu.addEquivalentClass(g, gene_id, hgnc_id)
            geno.addTaxon('NCBITaxon:' + taxid, gene_id)

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

    gu.loadProperties(g, Feature.object_properties, gu.OBJPROP)
    gu.loadProperties(g, Feature.data_properties, gu.DATAPROP)
    gu.loadProperties(g, Genotype.object_properties, gu.OBJPROP)
    gu.loadAllProperties(g)
def process_disease_association(self, limit):
    """
    Turn the tab separated 'disease_assoc' file into worm disease models.

    Lines whose joined content starts with '!' are treated as header
    lines.  For every remaining row that is not a NOT annotation a
    blank-node variant of the gene is created, an experimental model
    carrying it is built and the model is associated with the disease,
    with the reference as source and ECO:0000501 as evidence for IEA
    rows.

    :param limit: accepted for interface compatibility; not used here
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['disease_assoc']['file']))
    g = self.testgraph if self.testMode else self.graph

    model = Model(g)
    logger.info("Processing disease models")
    geno = Genotype(g)
    line_counter = 0
    worm_taxon = 'NCBITaxon:6239'

    with open(raw, 'r') as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            if ''.join(row).startswith('!'):  # header
                continue
            line_counter += 1

            # exactly 17 columns are expected; anything else raises
            (_db, gene_num, gene_symbol, is_not, disease_id, ref,
             eco_symbol, _with_or_from, _aspect, _gene_name,
             _gene_synonym, _gene_class, _taxon, _date, _assigned_by,
             _blank, _blank2) = row

            outside_test_set = \
                self.testMode and gene_num not in self.test_ids['gene']
            # TODO add NOT phenotypes
            if outside_test_set or is_not == 'NOT':
                continue

            # WB WBGene00000001 aap-1  DOID:2583 PMID:19029536 IEA
            # ENSEMBL:ENSG00000145675|OMIM:615214 D Y110A7A.10 gene
            # taxon:6239 20150612 WB
            gene_id = 'WormBase:' + gene_num

            # make a variant of the gene
            variant_id = '_:' + '-'.join((gene_num, 'unspecified'))
            variant_label = 'some variant of ' + gene_symbol
            geno.addAffectedLocus(variant_id, gene_id)
            model.addBlankNodeAnnotation(variant_id)

            animal_id = geno.make_experimental_model_with_genotype(
                variant_id, variant_label, worm_taxon, 'worm')

            assoc = G2PAssoc(
                g, self.name, animal_id, disease_id,
                model.object_properties['model_of'])

            source = ref.replace('WB_REF:', 'WormBase:')
            if source != '':
                assoc.add_source(source)

            if eco_symbol == 'IEA':
                assoc.add_evidence('ECO:0000501')  # IEA is this now

            assoc.add_association_to_graph()
def setUp(self):
    """
    Create a fresh graph, Genotype helper and curie utility, and resolve
    the category predicate and category URIs the tests compare against.
    """
    self.graph = RDFGraph()
    self.curie_map = curie_map.get()
    self.genotype = Genotype(self.graph)
    self.cutil = CurieUtil(self.curie_map)

    to_uri = self.cutil.get_uri
    self.test_cat_pred = to_uri(blv.terms['category'])
    self.test_cat_genotype_category = to_uri(blv.terms['Genotype'])
    self.test_cat_background_category = to_uri(
        blv.terms['PopulationOfIndividualOrganisms'])
def _make_pheno_assoc(
        self, g, gene_id, gene_symbol, disorder_num, disorder_label,
        phene_key):
    """
    Associate a gene (or an anonymous variant of it) with an OMIM disorder.

    The relation defaults to 'has_phenotype' and is changed according to
    the first character of the disorder label: '[' is a marker for,
    '{' and '?' contribute to.  For OMIM gene ids an anonymous variant
    locus of the gene is created and used as the subject.

    :param g: graph the triples are written to
    :param gene_id: gene curie
    :param gene_symbol: gene symbol used in the variant label
    :param disorder_num: OMIM number of the disorder, as a string
    :param disorder_label: disorder label, possibly with a leading
        '[', '{' or '?'
    :param phene_key: phene mapping code, mapped to an evidence term
    :return: None
    """
    geno = Genotype(g)
    model = Model(g)

    disorder_id = ':'.join(('OMIM', disorder_num))

    # default relation, overridden by the label's leading marker
    rel_id = model.object_properties['has_phenotype']
    rel_label = 'causes'
    label_markers = (
        (r'\[', 'is_marker_for', 'is a marker for'),
        (r'\{', 'contributes_to', 'contributes to'),
        # this is a questionable mapping!  skip?
        (r'\?', 'contributes_to', 'contributes to'),
    )
    for marker, property_key, relation_label in label_markers:
        if re.match(marker, disorder_label):
            rel_id = model.object_properties[property_key]
            rel_label = relation_label
            break

    evidence = self._map_phene_mapping_code_to_eco(phene_key)

    # we actually want the association between the gene and the disease
    # to be via an alternate locus not the "wildtype" gene itself.
    # so we make an anonymous alternate locus,
    # and put that in the association.
    # but we only need to do that in the cases when it's not an NCBIGene
    # (as that is a sequence feature itself)
    if not re.match(r'OMIM:', gene_id):
        # assume it's already been added
        alt_locus = gene_id
    else:
        alt_locus = \
            '_:' + re.sub(r':', '', gene_id) + '-' + disorder_num + 'VL'
        symbol = gene_symbol.strip()
        if symbol:
            alt_label = ' '.join(
                ('some variant of', symbol,
                 'that', rel_label, disorder_label))
        else:
            alt_label = None

        model.addIndividualToGraph(
            alt_locus, alt_label, Genotype.genoparts['variant_locus'])
        geno.addAffectedLocus(alt_locus, gene_id)
        model.addBlankNodeAnnotation(alt_locus)

    assoc = G2PAssoc(g, self.name, alt_locus, disorder_id, rel_id)
    assoc.add_evidence(evidence)
    assoc.add_association_to_graph()
def process_disease_association(self, limit):
    """
    Turn the tab separated 'disease_assoc' file into worm disease models.

    Lines whose joined content starts with '!' are treated as header
    lines.  For every remaining row that is not a NOT annotation a
    blank-node variant of the gene is created, an experimental model
    carrying it is built and the model is associated with the disease,
    with the reference as source and the resolved evidence symbol as
    evidence.

    :param limit: accepted for interface compatibility; not used here
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['disease_assoc']['file']))
    graph = self.testgraph if self.test_mode else self.graph

    model = Model(graph)
    LOG.info("Processing disease models")
    geno = Genotype(graph)
    line_counter = 0
    worm_taxon = self.globaltt['Caenorhabditis elegans']

    with open(raw, 'r') as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            if ''.join(row).startswith('!'):  # header
                continue
            line_counter += 1

            # exactly 17 columns are expected; anything else raises
            (_db, gene_num, gene_symbol, is_not, disease_id, ref,
             eco_symbol, _with_or_from, _aspect, _gene_name,
             _gene_synonym, _gene_class, _taxon, _date, _assigned_by,
             _blank, _blank2) = row

            outside_test_set = \
                self.test_mode and gene_num not in self.test_ids['gene']
            # TODO add NOT phenotypes
            if outside_test_set or is_not == 'NOT':
                continue

            # WB WBGene00000001 aap-1  DOID:2583 PMID:19029536 IEA
            # ENSEMBL:ENSG00000145675|OMIM:615214 D Y110A7A.10 gene
            # taxon:6239 20150612 WB
            gene_id = 'WormBase:' + gene_num

            # make a variant of the gene
            variant_id = '_:' + '-'.join((gene_num, 'unspecified'))
            variant_label = 'some variant of ' + gene_symbol
            geno.addAffectedLocus(variant_id, gene_id)
            model.addBlankNodeAnnotation(variant_id)

            animal_id = geno.make_experimental_model_with_genotype(
                variant_id, variant_label, worm_taxon, 'worm')

            assoc = G2PAssoc(
                graph, self.name, animal_id, disease_id,
                self.globaltt['is model of'])

            source = ref.replace('WB_REF:', 'WormBase:')
            if source != '':
                assoc.add_source(source)

            assoc.add_evidence(self.resolve(eco_symbol))
            assoc.add_association_to_graph()
def _make_pheno_assoc(
        self, g, gene_id, gene_symbol, disorder_num, disorder_label,
        phene_key):
    """
    Associate a gene (or an anonymous variant of it) with an OMIM disorder.

    The relation defaults to 'has_phenotype' and is changed according to
    the first character of the disorder label: '[' is a marker for,
    '{' and '?' contribute to.  For OMIM gene ids an anonymous variant
    locus of the gene is created and used as the subject.

    :param g: graph the triples are written to
    :param gene_id: gene curie
    :param gene_symbol: gene symbol used in the variant label
    :param disorder_num: OMIM number of the disorder, as a string
    :param disorder_label: disorder label, possibly with a leading
        '[', '{' or '?'
    :param phene_key: phene mapping code, mapped to an evidence term
    :return: None
    """
    geno = Genotype(g)
    model = Model(g)

    disorder_id = ':'.join(('OMIM', disorder_num))

    # default relation, overridden by the label's leading marker
    rel_id = model.object_properties['has_phenotype']
    rel_label = 'causes'
    label_markers = (
        (r'\[', 'is_marker_for', 'is a marker for'),
        (r'\{', 'contributes_to', 'contributes to'),
        # this is a questionable mapping!  skip?
        (r'\?', 'contributes_to', 'contributes to'),
    )
    for marker, property_key, relation_label in label_markers:
        if re.match(marker, disorder_label):
            rel_id = model.object_properties[property_key]
            rel_label = relation_label
            break

    evidence = self._map_phene_mapping_code_to_eco(phene_key)

    # we actually want the association between the gene and the disease
    # to be via an alternate locus not the "wildtype" gene itself.
    # so we make an anonymous alternate locus,
    # and put that in the association.
    # but we only need to do that in the cases when it's not an NCBIGene
    # (as that is a sequence feature itself)
    if not re.match(r'OMIM:', gene_id):
        # assume it's already been added
        alt_locus = gene_id
    else:
        alt_locus = \
            '_:' + re.sub(r':', '', gene_id) + '-' + disorder_num + 'VL'
        symbol = gene_symbol.strip()
        if symbol:
            alt_label = ' '.join(
                ('some variant of', symbol,
                 'that', rel_label, disorder_label))
        else:
            alt_label = None

        model.addIndividualToGraph(
            alt_locus, alt_label, Genotype.genoparts['variant_locus'])
        geno.addAffectedLocus(alt_locus, gene_id)
        model.addBlankNodeAnnotation(alt_locus)

    assoc = G2PAssoc(g, self.name, alt_locus, disorder_id, rel_id)
    assoc.add_evidence(evidence)
    assoc.add_association_to_graph()
def _process_all(self, limit):
    """
    This takes the list of omim identifiers from the omimTitles file,
    excludes those designated as obsolete and iteratively queries the
    omim api in batches of 20 for the json-formatted data.

    This will create OMIM classes, with the label & definition.
    If an entry is "removed", it is added as a deprecated class.
    If an entry is "moved", it is deprecated and consider annotations
    are added.

    Additionally, we extract:
    *phenotypicSeries ids as superclasses
    *equivalent ids for Orphanet and UMLS

    If set to testMode, it will write only those items in the test_ids
    to the testgraph.

    :param limit: passed through to process_entries
    :return: None
    """
    # every known omim number that has not been replaced
    omimids = list(self.omim_type.keys() - self.omim_replaced.keys())

    LOG.info(
        'Have %i omim numbers to fetch records from their API',
        len(omimids))
    LOG.info('Have %i omim types ', len(self.omim_type))

    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    geno = Genotype(graph)
    model = Model(graph)

    # the label doubles as the translation table key; it had been
    # mangled to 'H**o sapiens', which is not a taxon name
    tax_label = 'Homo sapiens'
    tax_id = self.globaltt[tax_label]

    # add genome and taxon
    geno.addGenome(tax_id, tax_label)
    model.addClassToGraph(tax_id, tax_label)

    includes = {'all'}

    self.process_entries(
        omimids, self._transform_entry, includes, graph, limit)

    # since we are not fetching obsolete records any more
    # add them all in here
    for omim_id, replacements in self.omim_replaced.items():
        model.addDeprecatedClass(
            'OMIM:' + omim_id, ['OMIM:' + o for o in replacements])
def _process_gene_row(self, row):
    """
    Register one gene row: remember its curie and label, add it as a
    class when it has a gene type, and link it to its taxon.

    :param row: mapping with 'gene_id', 'symbol', 'gb_species_id' and
        'gene_type'
    :return: None
    """
    model = Model(self.graph)
    geno = Genotype(self.graph)

    raw_gene_id = row['gene_id']
    # in test mode only the configured test genes are kept
    if self.test_mode and raw_gene_id not in self.test_ids['gene']:
        return

    gene_id = 'NCBIGene:' + str(raw_gene_id)
    self.id_hash['gene'][raw_gene_id] = gene_id

    gene_label = row['symbol']
    self.label_hash[gene_id] = gene_label

    tax_id = 'NCBITaxon:' + str(row['gb_species_id'])

    # NOTE(review): the class is only added for typed genes; the
    # flattened source is read with addClassToGraph nested here, since
    # gene_type_id would otherwise be unbound for untyped rows
    if row['gene_type'] is not None:
        gene_type_id = self.resolve(row['gene_type'])
        model.addClassToGraph(gene_id, gene_label, gene_type_id)

    geno.addTaxon(tax_id, gene_id)
def _process_phene_gene_row(self, row):
    """
    Associate an anonymous variant of a gene with a phene.

    Rows whose phene or gene was never registered in self.id_hash are
    skipped (a missing phene is logged).  In test mode only rows whose
    OMIA id and gene id are both in the test sets are kept.

    :param row: mapping with 'gene_id' and 'phene_id'
    :return: None
    """
    geno = Genotype(self.g)
    model = Model(self.g)
    gene_id = self.id_hash['gene'].get(row['gene_id'])
    phene_id = self.id_hash['phene'].get(row['phene_id'])

    # occasionally some phenes are missing!  (ex: 406)
    # This check used to sit behind a combined condition that had
    # already returned for a missing phene, so the warning never fired.
    if phene_id is None:
        logger.warning("Phene id %s is missing", str(row['phene_id']))
        return
    if gene_id is None:
        return

    omia_id = self._get_omia_id_from_phene_id(phene_id)
    if self.testMode and not (
            omia_id in self.test_ids['disease'] and
            row['gene_id'] in self.test_ids['gene']):
        return

    gene_label = self.label_hash[gene_id]
    # some variant of gene_id has phenotype d
    vl = '_:' + re.sub(r'NCBIGene:', '', str(gene_id)) + 'VL'
    geno.addAllele(vl, 'some variant of ' + gene_label)
    geno.addAlleleOfGene(vl, gene_id)
    geno.addAffectedLocus(vl, gene_id)
    model.addBlankNodeAnnotation(vl)
    assoc = G2PAssoc(self.g, self.name, vl, phene_id)
    assoc.add_association_to_graph()

    # add the gene id to the set of annotated genes
    # for later lookup by orthology
    self.annotated_genes.add(gene_id)
def _process_phene_gene_row(self, row):
    """
    Associate an anonymous variant of a gene with a phene.

    Rows whose phene or gene was never registered in self.id_hash are
    skipped (a missing phene is logged).  In test mode only rows whose
    OMIA id and gene id are both in the test sets are kept.

    :param row: mapping with 'gene_id' and 'phene_id'
    :return: None
    """
    geno = Genotype(self.graph)
    model = Model(self.graph)
    gene_id = self.id_hash['gene'].get(row['gene_id'])
    phene_id = self.id_hash['phene'].get(row['phene_id'])

    # occasionally some phenes are missing!  (ex: 406)
    # This check used to sit behind a combined condition that had
    # already returned for a missing phene, so the warning never fired.
    if phene_id is None:
        LOG.warning("Phene id %s is missing", str(row['phene_id']))
        return
    if gene_id is None:
        return

    omia_id = self._get_omia_id_from_phene_id(phene_id)
    if self.test_mode and not (
            omia_id in self.test_ids['disease'] and
            row['gene_id'] in self.test_ids['gene']):
        return

    gene_label = self.label_hash[gene_id]
    # some variant of gene_id has phenotype d
    var = self.make_id(gene_id.split(':')[-1] + 'VL', '_')
    geno.addAllele(var, 'some variant of ' + gene_label)
    geno.addAlleleOfGene(var, gene_id)
    geno.addAffectedLocus(var, gene_id)
    model.addBlankNodeAnnotation(var)
    assoc = G2PAssoc(self.graph, self.name, var, phene_id)
    assoc.add_association_to_graph()

    # add the gene id to the set of annotated genes
    # for later lookup by orthology
    self.annotated_genes.add(gene_id)
def process_gene_ids(self, limit):
    """
    Add the genes listed in the gzipped, comma separated 'gene_ids' file.

    Every row becomes a gene class linked to its taxon; 'Dead' genes are
    deprecated and a non-empty synonym is added.  The symbol falls back
    to the synonym, and to no label at all when both are empty.

    :param limit: outside test mode, stop once more than this many rows
        were read
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['gene_ids']['file']))
    g = self.testgraph if self.testMode else self.graph

    model = Model(g)
    logger.info("Processing: %s", self.files['gene_ids']['file'])
    geno = Genotype(g)

    with gzip.open(raw, 'rb') as csvfile:
        filereader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""),
            delimiter=',', quotechar='\"')
        for line_counter, row in enumerate(filereader, 1):
            # exactly six columns are expected; anything else raises
            (taxon_num, gene_num, gene_symbol,
             gene_synonym, live, _gene_type) = row
            # 6239,WBGene00000001,aap-1,Y110A7A.10,Live,protein_coding_gene

            if self.testMode and gene_num not in self.test_ids['gene']:
                continue

            taxon_id = 'NCBITaxon:' + taxon_num
            gene_id = 'WormBase:' + gene_num

            # symbol, else synonym, else no label
            label = gene_symbol or gene_synonym or None

            model.addClassToGraph(
                gene_id, label, Genotype.genoparts['gene'])
            if live == 'Dead':
                model.addDeprecatedClass(gene_id)
            geno.addTaxon(taxon_id, gene_id)
            if gene_synonym:
                model.addSynonym(gene_id, gene_synonym)

            limit_reached = limit is not None and line_counter > limit
            if limit_reached and not self.testMode:
                break
def parse(self, limit=None):
    """
    Override Source.parse()

    Parses the chemical-disease associations file and, when it exists
    on disk, the batch-query publications file.

    Args:
        :param limit (int, optional) limit the number of rows processed
    Returns:
        :return None
    """
    if limit is not None:
        LOG.info("Only parsing first %d rows", limit)

    LOG.info("Parsing files...")

    if self.test_only:
        self.test_mode = True

    # fresh helpers bound to the current graph
    self.geno = Genotype(self.graph)
    self.pathway = Pathway(self.graph)

    self._parse_ctd_file(limit, 'chemical_disease_associations')

    # the 'gene_pathway' and 'gene_disease' files are currently not parsed
    # self._parse_ctd_file(limit, 'gene_pathway')
    # self._parse_ctd_file(limit, 'gene_disease')

    src_key = 'publications'
    file_path = '/'.join((self.rawdir, self.api_fetch[src_key]['file']))
    if not os.path.exists(file_path):
        LOG.error('Batch Query file "%s" does not exist', file_path)
    else:
        self._parse_curated_chem_disease(file_path, limit)

    LOG.info("Done parsing files.")
def __init__(self):
    """
    Set up the CTD ingest.

    Creates the dataset record, reads the gene and disease test
    identifiers from the configuration (empty lists when absent) and
    builds the GraphUtils and Genotype helpers.
    """
    Source.__init__(self, 'ctd')

    self.dataset = Dataset(
        'ctd',
        'CTD',
        'http://ctdbase.org',
        None,
        'http://ctdbase.org/about/legal.jsp')

    # gene test identifiers
    if 'test_ids' in config.get_config() \
            and 'gene' in config.get_config()['test_ids']:
        self.test_geneids = config.get_config()['test_ids']['gene']
    else:
        logger.warning("not configured with gene test ids.")
        self.test_geneids = []

    # disease test identifiers
    if 'test_ids' in config.get_config() \
            and 'disease' in config.get_config()['test_ids']:
        self.test_diseaseids = config.get_config()['test_ids']['disease']
    else:
        logger.warning("not configured with disease test ids.")
        self.test_diseaseids = []

    self.gu = GraphUtils(curie_map.get())
    self.g = self.graph
    self.geno = Genotype(self.g)
def __init__(self):
    """
    Set up the MPD ingest: namespaces, thresholds, the dataset record
    with its citation, the lookup tables filled during parsing and the
    Genotype / GraphUtils helpers.
    """
    Source.__init__(self, 'mpd')

    # @N, not sure if this step is required
    self.namespaces.update(curie_map.get())

    self.stdevthreshold = 2
    self.nobnodes = True  # FIXME

    # update the dataset object with details about this resource
    # @N: Note that there is no license as far as I can tell
    self.dataset = Dataset(
        'mpd', 'MPD', 'http://phenome.jax.org', None, None)
    # TODO add a citation for mpd dataset as a whole
    self.dataset.set_citation('PMID:15619963')

    # lookup tables, all empty until the files are parsed
    self.assayhash = {}
    self.idlabel_hash = {}
    # to store the mean/zscore of each measure by strain+sex
    self.score_means_by_measure = {}
    # to store the mean value for each measure by strain+sex
    self.strain_scores_by_measure = {}

    self.geno = Genotype(self.graph)
    self.gu = GraphUtils(curie_map.get())
def __init__(self, graph_type, are_bnodes_skolemized):
    """
    Set up the CTD ingest.

    Passes the ingest name, title, URL and data-rights link to the
    parent constructor, selects the gene and disease test identifiers
    (empty lists when absent) and builds the Genotype and Pathway
    helpers on the instance graph.

    :param graph_type: forwarded to the parent constructor
    :param are_bnodes_skolemized: forwarded to the parent constructor
    """
    super().__init__(
        graph_type,
        are_bnodes_skolemized,
        'ctd',
        ingest_title='Comparative Toxicogenomics Database',
        ingest_url='http://ctdbase.org',
        license_url=None,
        data_rights='http://ctdbase.org/about/legal.jsp'
        # file_handle=None
    )

    # gene test identifiers
    if 'gene' in self.all_test_ids:
        self.test_geneids = self.all_test_ids['gene']
    else:
        LOG.warning("not configured with gene test ids.")
        self.test_geneids = []

    # disease test identifiers
    if 'disease' in self.all_test_ids:
        self.test_diseaseids = self.all_test_ids['disease']
    else:
        LOG.warning("not configured with disease test ids.")
        self.test_diseaseids = []

    self.geno = Genotype(self.graph)
    self.pathway = Pathway(self.graph)
def process_gene_ids(self, limit):
    """
    Add the genes listed in the gzipped, comma separated 'gene_ids' file.

    Every row becomes a gene class linked to its taxon; 'Dead' genes are
    deprecated and a non-empty synonym is added.  The symbol falls back
    to the synonym, and to no label at all when both are empty.

    Rows whose column count differs from the configured one are logged.
    They are still processed when they are long enough to hold every
    column that is read, and skipped otherwise (such rows used to raise
    IndexError).

    :param limit: stop once the reader's line number exceeds this value;
        None means no limit
    :return: None
    """
    src_key = 'gene_ids'
    raw = '/'.join((self.rawdir, self.files[src_key]['file']))
    graph = self.graph
    model = Model(graph)
    geno = Genotype(graph)
    col = self.files[src_key]['columns']
    LOG.info("Processing: %s", self.files[src_key]['file'])

    # resolve the column positions once instead of once per row
    taxon_idx = col.index('taxon_num')
    gene_idx = col.index('gene_num')
    symbol_idx = col.index('gene_symbol')
    synonym_idx = col.index('gene_synonym')
    live_idx = col.index('live')
    # gene_type = row[col.index('gene_type')]
    min_len = 1 + max(
        taxon_idx, gene_idx, symbol_idx, synonym_idx, live_idx)

    with gzip.open(raw, 'rb') as csvfile:
        reader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""),
            delimiter=',', quotechar='\"')
        # no header row to check
        collen = len(col)
        for row in reader:
            if len(row) != collen:
                LOG.error(
                    'In %s line %i expected %i colums but got %s.',
                    self.files[src_key]['file'],
                    reader.line_num, collen, row)
                if len(row) < min_len:
                    # too short to index safely
                    continue

            taxon_num = row[taxon_idx]
            gene_num = row[gene_idx]
            gene_symbol = row[symbol_idx]
            gene_synonym = row[synonym_idx]
            live = row[live_idx]
            # 6239,WBGene00000001,aap-1,Y110A7A.10,Live,protein_coding_gene

            taxon_curie = 'NCBITaxon:' + taxon_num
            gene_curie = 'WormBase:' + gene_num

            if gene_symbol == '':
                # these are not the same in my book tec.
                gene_symbol = gene_synonym
            if gene_symbol == '':
                gene_symbol = None

            model.addClassToGraph(
                gene_curie, gene_symbol, self.globaltt['gene'])
            if live == 'Dead':
                model.addDeprecatedClass(
                    gene_curie, old_id_category=blv.terms['Gene'])
            geno.addTaxon(taxon_curie, gene_curie)
            if gene_synonym is not None and gene_synonym != '':
                model.addSynonym(gene_curie, gene_synonym)

            if limit is not None and reader.line_num > limit:
                break
def _add_variant_gene_relationship(self, variant_id, hgnc_symbol):
    """
    Declare a variant to be an allele of the gene with the given symbol.

    The gene id is looked up in self.gene_map; when the symbol is not
    mapped an id is minted from the variant id and the symbol, and a
    warning is logged.

    :param variant_id: identifier of the variant
    :param hgnc_symbol: gene symbol, also used as the class label
    :return: None
    """
    gu = GraphUtils(curie_map.get())
    geno = Genotype(self.graph)

    if hgnc_symbol in self.gene_map:
        gene_id = self.gene_map[hgnc_symbol]
    else:
        gene_id = self.make_cgd_id(
            "{0}{1}".format(variant_id, hgnc_symbol))
        # Logger.warn is a deprecated alias; use warning() with lazy
        # arguments (the emitted text is unchanged)
        logger.warning(
            "Can't map gene symbol %s to entrez ID", hgnc_symbol)

    gu.addClassToGraph(self.graph, gene_id, hgnc_symbol)
    geno.addAlleleOfGene(variant_id, gene_id)
def _process_all(self, limit):
    """
    Fetch and transform every OMIM entry.

    The omim identifiers come from the omim.txt.Z file; the omim api is
    queried iteratively for the json-formatted data.

    This creates OMIM classes, with the label, definition, and some
    synonyms.  A "removed" entry is added as a deprecated class; a
    "moved" entry is deprecated and consider annotations are added.

    Additionally, we extract:
    *phenotypicSeries ids as superclasses
    *equivalent ids for Orphanet and UMLS

    If set to testMode, only the items in the test_ids are written,
    to the testgraph.

    :param limit: passed through to process_entries
    :return: None
    """
    # store the set of omim identifiers
    omimids = self._get_omim_ids()

    g = self.testgraph if self.testMode else self.graph
    geno = Genotype(g)
    model = Model(g)

    # tax_num = '9606'   # TODO PYLINT unused
    tax_id = 'NCBITaxon:9606'
    tax_label = 'Human'

    # add genome and taxon
    geno.addGenome(tax_id, tax_label)   # tax label can get added elsewhere
    model.addClassToGraph(tax_id, None)   # label added elsewhere

    includes = {'all'}

    self.process_entries(
        omimids, self._transform_entry, includes, g, limit)
def process_gene_ids(self, limit):
    """
    Add the genes listed in the gzipped, comma separated 'gene_ids' file.

    Every row becomes a gene class linked to its taxon; 'Dead' genes are
    deprecated and a non-empty synonym is added.  The symbol falls back
    to the synonym, and to no label at all when both are empty.

    :param limit: outside test mode, stop once more than this many rows
        were read
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['gene_ids']['file']))
    graph = self.testgraph if self.testMode else self.graph

    model = Model(graph)
    logger.info("Processing: %s", self.files['gene_ids']['file'])
    geno = Genotype(graph)

    with gzip.open(raw, 'rb') as csvfile:
        filereader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""),
            delimiter=',', quotechar='\"')
        for line_counter, row in enumerate(filereader, 1):
            # exactly six columns are expected; anything else raises
            (taxon_num, gene_num, gene_symbol,
             gene_synonym, live, _gene_type) = row
            # 6239,WBGene00000001,aap-1,Y110A7A.10,Live,protein_coding_gene

            if self.testMode and gene_num not in self.test_ids['gene']:
                continue

            taxon_id = 'NCBITaxon:' + taxon_num
            gene_id = 'WormBase:' + gene_num

            # symbol, else synonym, else no label
            label = gene_symbol or gene_synonym or None

            model.addClassToGraph(gene_id, label, self.globaltt['gene'])
            if live == 'Dead':
                model.addDeprecatedClass(gene_id)
            geno.addTaxon(taxon_id, gene_id)
            if gene_synonym:
                model.addSynonym(gene_id, gene_synonym)

            limit_reached = limit is not None and line_counter > limit
            if limit_reached and not self.testMode:
                break
def _create_genome_builds(self):
    """
    Various resources will map variations to either UCSC (hg*)
    or to NCBI assemblies. Here we create the equivalences between them.
    Data taken from:
    https://genome.ucsc.edu/FAQ/FAQreleases.html#release1

    For every species in ``self.species`` and every assembly key listed for
    it in ``self.files``, both the UCSC id and its mapped id from
    ``self.localtt`` are added as reference genomes and declared the same
    individual.  Assembly keys without a ``:`` or without an entry in the
    local translation table are logged and skipped.

    :return: None
    """
    # TODO add more species

    graph = self.graph
    geno = Genotype(graph)
    model = Model(graph)
    LOG.info("Adding equivalent assembly identifiers")
    for sp in self.species:
        tax_id = self.globaltt[sp]
        txid_num = tax_id.split(':')[1]
        for key in self.files[txid_num]['assembly']:
            ucsc_id = key
            try:
                ucsc_label = ucsc_id.split(':')[1]
            except IndexError:
                LOG.error('%s Assembly id: "%s" is problematic', sp, key)
                continue

            if key not in self.localtt:
                LOG.error(
                    '%s Assembly id: "%s" is not in local translation table',
                    sp, key)
                # without a mapping there is nothing to equate; carrying on
                # would use an unbound (or a previous iteration's) mapped_id
                continue

            mapped_id = self.localtt[key]
            mapped_label = mapped_id.split(':')[1]
            mapped_label = 'NCBI build ' + str(mapped_label)
            geno.addReferenceGenome(ucsc_id, ucsc_label, tax_id)
            geno.addReferenceGenome(mapped_id, mapped_label, tax_id)
            model.addSameIndividual(ucsc_id, mapped_id)

    return
class GenotypeTestCase(unittest.TestCase):
    """Checks that Genotype writes the expected triples into its graph."""

    def setUp(self):
        """Build a fresh RDF graph, the curie map and a Genotype on the graph."""
        graph = RDFGraph()
        self.graph = graph
        self.curie_map = curie_map.get()
        self.genotype = Genotype(graph)

    def tearDown(self):
        """Drop the Genotype created in setUp."""
        self.genotype = None

    def test_addGenotype(self):
        """Adding a genotype must produce an rdfs:label triple for its id."""
        from rdflib.namespace import RDFS, URIRef
        from rdflib import Literal
        from dipper.utils.CurieUtil import CurieUtil

        curie_util = CurieUtil(self.curie_map)
        genotype_curie = 'MGI:5515892'
        genotype_label = 'Pmp22<Tr-2J>/Pmp22<+> [C57BL/6J-Pmp22<Tr-2J>/GrsrJ]'

        self.genotype.addGenotype(genotype_curie, genotype_label)

        expected_triple = (
            URIRef(curie_util.get_uri(genotype_curie)),
            RDFS['label'],
            Literal(genotype_label))
        self.assertTrue(expected_triple in self.genotype.graph)
def __init__(self, graph_type, are_bnodes_skolemized):
    """
    Set up the 'decipher' source: dataset metadata, test ids and helpers.

    The disease test ids are read from the configuration when present;
    otherwise a warning is logged and the test id list is left empty.

    :param graph_type: forwarded to the parent constructor
    :param are_bnodes_skolemized: forwarded to the parent constructor
    """
    super().__init__(graph_type, are_bnodes_skolemized, 'decipher')

    self.dataset = Dataset(
        'decipher',
        'Development Disorder Genotype – Phenotype Database',
        'https://decipher.sanger.ac.uk/',
        None,
        'https://decipher.sanger.ac.uk/legal')

    if 'test_ids' in config.get_config() \
            and 'disease' in config.get_config()['test_ids']:
        self.test_ids = config.get_config()['test_ids']['disease']
    else:
        logger.warning("not configured with disease test ids.")
        self.test_ids = []

    self.g = self.graph
    self.geno = Genotype(self.g)
    self.model = Model(self.g)

    return
def parse(self, limit=None):
    """
    Process the DDG2P annotations into the graph.

    When ``self.test_only`` is set, test mode is switched on and the test
    graph replaces ``self.graph`` before processing; otherwise the current
    graph is used as it is.

    :param limit: row limit forwarded to ``_process_ddg2p_annotations``
    :return: None
    """
    if limit is not None:
        LOG.info("Only parsing first %s rows", limit)

    LOG.info("Parsing files...")

    # the former else-branch only re-assigned self.graph to itself (a no-op)
    if self.test_only:
        self.test_mode = True
        self.graph = self.testgraph

    self.geno = Genotype(self.graph)

    # rare disease-phenotype associations
    self._process_ddg2p_annotations(limit)

    LOG.info("Finished parsing.")

    return
class GenotypeTestCase(unittest.TestCase):
    """Checks that Genotype writes the expected triples into its graph."""

    def setUp(self):
        """Build a fresh graph, the curie map and a Genotype on that graph."""
        graph = Graph()
        self.graph = graph
        self.curie_map = curie_map.get()
        self.genotype = Genotype(graph)

    def tearDown(self):
        """Drop the Genotype created in setUp."""
        self.genotype = None

    def test_addGenotype(self):
        """Adding a genotype must produce an rdfs:label triple for its id."""
        from rdflib.namespace import RDFS, URIRef
        from rdflib import Literal
        from dipper.utils.CurieUtil import CurieUtil

        curie_util = CurieUtil(self.curie_map)
        genotype_curie = 'MGI:5515892'
        genotype_label = 'Pmp22<Tr-2J>/Pmp22<+> [C57BL/6J-Pmp22<Tr-2J>/GrsrJ]'

        self.genotype.addGenotype(genotype_curie, genotype_label)

        expected_triple = (
            URIRef(curie_util.get_uri(genotype_curie)),
            RDFS['label'],
            Literal(genotype_label))
        self.assertTrue(expected_triple in self.genotype.graph)
def parse(self, limit=None):
    """
    Override Source.parse()
    Parse the CTD files into the main graph or, in test mode, the test graph.

    The chemical-disease, gene-pathway and gene-disease files are parsed in
    that order, followed by the curated chemical-disease data.

    :param limit: (int, optional) limit the number of rows processed
    :return: None
    """
    if limit is not None:
        logger.info("Only parsing first %d rows", limit)

    logger.info("Parsing files...")

    if self.testOnly:
        self.testMode = True

    self.g = self.testgraph if self.testMode else self.graph
    self.geno = Genotype(self.g)
    self.pathway = Pathway(self.g)

    for file_key in (
            'chemical_disease_interactions', 'gene_pathway', 'gene_disease'):
        self._parse_ctd_file(limit, self.files[file_key]['file'])
    self._parse_curated_chem_disease(limit)

    logger.info("Done parsing files.")

    return
def parse(self, limit=None):
    """
    Override Source.parse()
    Parse the CTD files into the main graph or, in test mode, the test graph.

    The chemical-disease, gene-pathway and gene-disease files are parsed in
    that order, followed by the curated chemical-disease data.  Afterwards
    the property declarations are loaded into the graph and the namespace
    bindings are loaded.

    :param limit: (int, optional) limit the number of rows processed
    :return: None
    """
    if limit is not None:
        logger.info("Only parsing first %d rows", limit)

    logger.info("Parsing files...")

    if self.testOnly:
        self.testMode = True

    self.g = self.testgraph if self.testMode else self.graph
    self.geno = Genotype(self.g)
    self.path = Pathway(self.g, self.nobnodes)

    for file_key in (
            'chemical_disease_interactions', 'gene_pathway', 'gene_disease'):
        self._parse_ctd_file(limit, self.files[file_key]['file'])
    self._parse_curated_chem_disease(limit)

    # declare the properties used by the associations and pathways
    graph_utils = self.gu
    graph_utils.loadAllProperties(self.g)
    graph_utils.loadProperties(
        self.g, G2PAssoc.object_properties, self.gu.OBJPROP)
    graph_utils.loadProperties(
        self.g, G2PAssoc.datatype_properties, self.gu.DATAPROP)
    graph_utils.loadProperties(
        self.g, G2PAssoc.annotation_properties, self.gu.ANNOTPROP)
    graph_utils.loadProperties(
        self.g, Pathway.object_properties, self.gu.OBJPROP)

    self.load_bindings()
    logger.info("Done parsing files.")

    return
def parse(self, limit=None):
    """
    MPD data is delivered in four separate csv files and one xml file,
    which we process iteratively and write out as one large graph.

    In test-only mode the test graph receives the property declarations and
    ``self.geno`` is rebuilt on the test graph.

    :param limit: row limit forwarded to each of the processing steps
    :return: None
    """
    if limit is not None:
        logger.info("Only parsing first %s rows fo each file", str(limit))

    logger.info("Parsing files...")

    if self.testOnly:
        self.testMode = True
        g = self.testgraph
        self.geno = Genotype(self.testgraph)
    else:
        g = self.graph

    self._process_straininfo(limit)
    # the following will provide us the hash-lookups
    # These must be processed in a specific order

    # mapping between assays and ontology terms
    self._process_ontology_mappings_file(limit)
    # this is the metadata about the measurements
    self._process_measurements_file(limit)
    # get all the measurements per strain
    self._process_strainmeans_file(limit)

    # The following will use the hash populated above
    # to lookup the ids when filling in the graph
    self._fill_provenance_graph(limit)

    logger.info("Finished parsing.")

    self.load_bindings()
    gu = GraphUtils(curie_map.get())
    gu.loadAllProperties(g)
    gu.loadProperties(g, G2PAssoc.object_properties, GraphUtils.OBJPROP)
    # datatype properties must be declared as DATAPROP (was OBJPROP)
    gu.loadProperties(g, G2PAssoc.datatype_properties, GraphUtils.DATAPROP)
    gu.loadProperties(
        g, G2PAssoc.annotation_properties, GraphUtils.ANNOTPROP)

    logger.info("Found %d nodes", len(self.graph))

    return
def parse(self, limit=None):
    """
    Process the trait mappings and then the QTL files of each organism.

    For every organism its genome is added; when a ``<organism>_bp`` file is
    configured, the build is read from the file name and the genomic
    locations are processed, and when a ``<organism>_cm`` file is
    configured the genetic locations are processed.

    :param limit: row limit forwarded to the processing steps
    :return: None
    """
    if limit is not None:
        logger.info("Only parsing first %s rows fo each file", str(limit))

    logger.info("Parsing files...")

    if self.testOnly:
        self.testMode = True
        target_graph = self.testgraph
    else:
        target_graph = self.graph

    trait_map_path = '/'.join(
        (self.rawdir, self.files['trait_mappings']['file']))
    self._process_trait_mappings(trait_map_path, limit)

    geno = Genotype(target_graph)
    organisms = [
        'chicken', 'pig', 'horse', 'rainbow_trout', 'sheep', 'cattle']

    for organism in organisms:
        tax_id = self._get_tax_by_common_name(organism)
        geno.addGenome(tax_id, organism)
        build_id = None
        build = None

        # physical (base pair) locations
        bp_key = organism + '_bp'
        if bp_key in self.files:
            file = self.files[bp_key]['file']
            build_match = re.search(r'QTL_([\w\.]+)\.gff.txt.gz', file)
            if build_match is not None:
                build = build_match.group(1)
                build_id = self._map_build_by_abbrev(build)
                logger.info("Build = %s", build_id)
                geno.addReferenceGenome(build_id, build, tax_id)
            else:
                logger.error("Can't match a gff build")
            if build_id is not None:
                self._process_QTLs_genomic_location(
                    '/'.join((self.rawdir, file)),
                    tax_id, build_id, build, limit)

        # genetic (centimorgan) locations
        cm_key = organism + '_cm'
        if cm_key in self.files:
            file = self.files[cm_key]['file']
            self._process_QTLs_genetic_location(
                '/'.join((self.rawdir, file)), tax_id, organism, limit)

    logger.info("Finished parsing")

    return
def parse(self, limit=None):
    """
    Scrub the input, then process it in three passes and add orthologs.

    Species are processed first (the other passes reference them), then the
    static classes, then the associations.  Finally the orthologs of the
    annotated genes are added and the molecular-genetics report is written.

    :param limit: row limit forwarded to each processing pass
    :return: None
    """
    # names of tables to iterate - probably don't need all these:
    # Article_Breed, Article_Keyword, Article_Gene, Article_Keyword,
    # Article_People, Article_Phene, Articles, Breed, Breed_Phene,
    # Genes_gb, Group_Categories, Group_MPO, Inherit_Type, Keywords,
    # Landmark, Lida_Links, OMIA_Group, OMIA_author, Omim_Xref, People,
    # Phene, Phene_Gene, Publishers, Resources, Species_gb, Synonyms

    self.scrub()

    if limit is not None:
        logger.info("Only parsing first %d rows", limit)

    logger.info("Parsing files...")

    if self.testOnly:
        self.testMode = True

    self.g = self.testgraph if self.testMode else self.graph
    self.geno = Genotype(self.g)

    # three passes through the file, in this order
    self.process_species(limit)       # referenced by the two other passes
    self.process_classes(limit)       # breeds, genes, articles, static stuff
    self.process_associations(limit)  # the association data

    # vertebrate orthology for genes that are annotated with phenotypes
    NCBIGene().add_orthologs_by_gene_group(self.g, self.annotated_genes)

    self.load_core_bindings()
    self.load_bindings()

    logger.info("Done parsing.")

    self.write_molgen_report()

    return
def __init__(self):
    """
    Set up the 'omia' source: bindings, dataset metadata, lookup tables,
    the hard-coded test ids and the Genotype helper on the main graph.
    """
    Source.__init__(self, 'omia')

    self.load_bindings()

    self.dataset = Dataset(
        'omia', 'Online Mendelian Inheritance in Animals',
        'http://omia.angis.org.au', None, None,
        'http://sydney.edu.au/disclaimer.shtml')

    # one (initially empty) lookup table per kind of record
    self.id_hash = {
        'article': {},
        'phene': {},
        'breed': {},
        'taxon': {},
        'gene': {}
    }
    self.label_hash = {}
    self.gu = GraphUtils(curie_map.get())

    # the omia to omim phene mappings
    self.omia_omim_map = {}

    # the unique genes that have phenes (for fetching orthology)
    self.annotated_genes = set()

    self.test_ids = {
        'disease': [
            'OMIA:001702', 'OMIA:001867', 'OMIA:000478', 'OMIA:000201',
            'OMIA:000810', 'OMIA:001400'],
        'gene': [
            492297, 434, 492296, 3430235, 200685834, 394659996,
            200685845, 28713538, 291822383],
        'taxon': [9691, 9685, 9606, 9615, 9913, 93934, 37029, 9627, 9825],
        # filled in while parsing the breed table,
        # for lookup by breed-associations
        'breed': []
    }

    # a map of omia ids and any molecular info,
    # used to write a report for curation
    self.stored_omia_mol_gen = {}

    self.g = self.graph
    self.geno = Genotype(self.g)

    return
def parse(self, limit=None):
    """
    Process the DDG2P annotations into the main graph or the test graph.

    When ``self.testOnly`` is set, test mode is switched on and the test
    graph becomes the target; otherwise the main graph is used.

    :param limit: row limit forwarded to ``_process_ddg2p_annotations``
    :return: None
    """
    if limit is not None:
        logger.info("Only parsing first %s rows", limit)

    logger.info("Parsing files...")

    if not self.testOnly:
        self.g = self.graph
    else:
        self.testMode = True
        self.g = self.testgraph

    self.geno = Genotype(self.g)

    # rare disease-phenotype associations
    self._process_ddg2p_annotations(limit)

    logger.info("Finished parsing.")

    return
def __init__(self):
    """
    Set up the 'decipher' source: bindings, dataset metadata, test ids
    and the graph helpers.

    The disease test ids are read from the configuration when present;
    otherwise a warning is logged and the test id list is left empty.
    """
    Source.__init__(self, 'decipher')

    self.load_bindings()

    self.dataset = Dataset(
        'decipher',
        'Development Disorder Genotype – Phenotype Database',
        'https://decipher.sanger.ac.uk/',
        None,
        'https://decipher.sanger.ac.uk/legal')

    if 'test_ids' in config.get_config() \
            and 'disease' in config.get_config()['test_ids']:
        self.test_ids = config.get_config()['test_ids']['disease']
    else:
        logger.warning("not configured with disease test ids.")
        self.test_ids = []

    self.gu = GraphUtils(curie_map.get())
    self.g = self.graph
    self.geno = Genotype(self.g)

    return
def parse(self, limit=None):
    """
    Override Source.parse()
    Parse the CTD chemical-disease association file into ``self.graph``.

    Test mode is switched on when ``self.test_only`` is set.

    :param limit: (int, optional) limit the number of rows processed
    :return: None
    """
    if limit is not None:
        LOG.info("Only parsing first %d rows", limit)

    LOG.info("Parsing files...")

    if self.test_only:
        self.test_mode = True

    target_graph = self.graph
    self.geno = Genotype(target_graph)
    self.pathway = Pathway(self.graph)

    self._parse_ctd_file(limit, 'chemical_disease_associations')
def _add_gene_anatomy_association(self, gene_id, anatomy_curie, rank):
    """
    Add a gene 'expressed in' anatomy association, quantified by its rank.

    The gene is added to the graph as an ENSEMBL individual of type gene;
    the rank (with any commas removed) is attached to the association as
    an ``xsd:float`` literal.

    :param gene_id: str Non curified ID
    :param anatomy_curie: str curified anatomy term
    :param rank: str rank
    :return: None
    """
    association = Assoc(self.graph, self.name)
    geno = Genotype(self.graph)
    graph_model = Model(self.graph)

    gene_curie = "ENSEMBL:{}".format(gene_id)
    # ranks may carry thousands separators
    rank = rank.replace(',', '')

    graph_model.addIndividualToGraph(
        ind_id=gene_curie, label=None, ind_type=geno.genoparts['gene'])

    association.sub = gene_curie
    association.obj = anatomy_curie
    association.rel = Assoc.object_properties['expressed_in']
    association.add_association_to_graph()
    association.add_predicate_object(
        Assoc.datatype_properties['has_quantifier'],
        float(rank), 'Literal', 'xsd:float')

    return
def parse(self, limit=None):
    """
    Override Source.parse()
    Parse the CTD files into ``self.graph``.

    The chemical-disease, gene-pathway and gene-disease files are parsed in
    that order, followed by the curated chemical-disease data.  Test mode
    is switched on when ``self.testOnly`` is set.

    :param limit: (int, optional) limit the number of rows processed
    :return: None
    """
    if limit is not None:
        logger.info("Only parsing first %d rows", limit)

    logger.info("Parsing files...")

    if self.testOnly:
        self.testMode = True

    self.geno = Genotype(self.graph)
    self.pathway = Pathway(self.graph)

    for file_key in (
            'chemical_disease_interactions', 'gene_pathway', 'gene_disease'):
        self._parse_ctd_file(limit, self.files[file_key]['file'])
    self._parse_curated_chem_disease(limit)

    logger.info("Done parsing files.")

    return
def _create_genome_builds(self):
    """
    Various resources will map variations to either UCSC (hg*)
    or to NCBI assemblies. Here we create the equivalences between them.
    Data taken from:
    https://genome.ucsc.edu/FAQ/FAQreleases.html#release1

    For every species in ``self.species`` and every assembly key listed for
    it in ``self.files``, both the UCSC id and its mapped id from
    ``self.localtt`` are added as reference genomes and declared the same
    individual.  Assembly keys without a ``:`` or without an entry in the
    local translation table are logged and skipped.

    :return: None
    """
    # TODO add more species

    graph = self.graph
    geno = Genotype(graph)
    model = Model(graph)
    logger.info("Adding equivalent assembly identifiers")
    for sp in self.species:
        tax_id = self.resolve(sp)
        txid_num = tax_id.split(':')[1]
        for key in self.files[txid_num]['assembly']:
            ucsc_id = key
            try:
                ucsc_label = ucsc_id.split(':')[1]
            except IndexError:
                logger.error('%s Assembly id: "%s" is problematic', sp, key)
                continue

            if key not in self.localtt:
                logger.error(
                    '%s Assembly id: "%s" is not in local translation table',
                    sp, key)
                # without a mapping there is nothing to equate; carrying on
                # would use an unbound (or a previous iteration's) mapped_id
                continue

            mapped_id = self.localtt[key]
            mapped_label = mapped_id.split(':')[1]
            mapped_label = 'NCBI build ' + str(mapped_label)
            geno.addReferenceGenome(ucsc_id, ucsc_label, tax_id)
            geno.addReferenceGenome(mapped_id, mapped_label, tax_id)
            model.addSameIndividual(ucsc_id, mapped_id)

    return
def _parse_genepage2gene(self, limit) -> Dict[str, List[str]]:
    """
    Read the tab-separated GenePage-to-Gene file and add its genes.

    Each row names a gene page and the ids/labels of its tropicalis,
    laevis L and laevis S genes.  The three genes are added to the graph
    as ``Xenbase:`` curies.

    :param limit: stop once the reader's line number exceeds this value
        (ignored in test mode)
    :return: gene page id -> [tropicalis, laevis L, laevis S] curies
    """
    src_key = 'genepage2gene'
    columns = self.files[src_key]['columns']
    raw = '/'.join((self.rawdir, self.files[src_key]['file']))

    geno = Genotype(self.graph)
    genepage2gene = {}

    # resolve the column positions once instead of once per field per row
    col = {
        name: columns.index(name)
        for name in (
            'gene_page_id',
            'tropicalis_id', 'tropicalis_label',
            'laevis_l_id', 'laevis_l_label',
            'laevis_s_id', 'laevis_s_label')
    }

    LOG.info("Processing GenePage to Gene file")
    with open(raw, 'r', encoding="utf8") as csvfile:
        reader = csv.reader(csvfile, delimiter='\t')
        for row in reader:
            gene_page = row[col['gene_page_id']]

            tropicalis_curie = 'Xenbase:' + row[col['tropicalis_id']]
            laevis_l_curie = 'Xenbase:' + row[col['laevis_l_id']]
            laevis_s_curie = 'Xenbase:' + row[col['laevis_s_id']]

            genepage2gene[gene_page] = [
                tropicalis_curie, laevis_l_curie, laevis_s_curie]

            geno.addGene(tropicalis_curie, row[col['tropicalis_label']])
            geno.addGene(laevis_l_curie, row[col['laevis_l_label']])
            geno.addGene(laevis_s_curie, row[col['laevis_s_label']])

            if not self.test_mode and limit is not None \
                    and reader.line_num > limit:
                break

    return genepage2gene
def _add_g2p_assoc(self, graph, strain_id, sex, assay_id, phenotypes, comment):
    """
    Create an association between a sex-specific strain id
    and each of the phenotypes.
    Here, we create a genotype from the strain,
    and a sex-specific genotype.
    Each of those genotypes are created as anonymous nodes.

    The evidence code is hardcoded to be:
        ECO:experimental_phenotypic_evidence.

    When the strain has no label in ``self.idlabel_hash``, the strain id
    is used in its place for both genotype labels.

    :param graph: the graph the genotypes and associations are added to
    :param strain_id:
    :param sex:
    :param assay_id:
    :param phenotypes: a list of phenotypes to association with the strain
    :param comment:
    :return: None
    """
    geno = Genotype(graph)
    model = Model(graph)
    eco_id = self.globaltt['experimental phenotypic evidence']
    strain_label = self.idlabel_hash.get(strain_id)
    strain_key = re.sub(r':', '', strain_id)

    # strain genotype
    genotype_id = '_' + '-'.join((strain_key, 'genotype'))
    # the label lookup may return None; concatenating it raised TypeError
    if strain_label is not None:
        genotype_label = '[' + strain_label + ']'
    else:
        genotype_label = '[' + strain_id + ']'

    sex_specific_genotype_id = '_' + '-'.join(
        (strain_key, sex, 'genotype'))
    if strain_label is not None:
        sex_specific_genotype_label = strain_label + ' (' + sex + ')'
    else:
        sex_specific_genotype_label = strain_id + '(' + sex + ')'

    genotype_type = self.globaltt['sex_qualified_genotype']
    if sex == 'm':
        genotype_type = self.globaltt['male_genotype']
    elif sex == 'f':
        genotype_type = self.globaltt['female_genotype']

    # add the genotype to strain connection
    geno.addGenotype(
        genotype_id, genotype_label, self.globaltt['genomic_background'])
    graph.addTriple(strain_id, self.globaltt['has_genotype'], genotype_id)

    geno.addGenotype(
        sex_specific_genotype_id, sex_specific_genotype_label, genotype_type)

    # add the strain as the background for the genotype
    graph.addTriple(
        sex_specific_genotype_id,
        self.globaltt['has_sex_agnostic_part'],
        genotype_id)

    # #############    BUILD THE G2P ASSOC    #############
    # TODO add more provenance info when that model is completed

    if phenotypes is not None:
        for phenotype_id in phenotypes:
            assoc = G2PAssoc(
                graph, self.name, sex_specific_genotype_id, phenotype_id)
            assoc.add_evidence(assay_id)
            assoc.add_evidence(eco_id)
            assoc.add_association_to_graph()
            assoc_id = assoc.get_association_id()
            model.addComment(assoc_id, comment)
            model._addSexSpecificity(assoc_id, self.resolve(sex))

    return
def setUp(self):
    """Build a fresh RDF graph, the curie map and a Genotype on that graph."""
    graph = RDFGraph()
    self.graph = graph
    self.curie_map = curie_map.get()
    self.genotype = Genotype(graph)
def process_gaf(self, file, limit, id_map=None, eco_map=None):
    """
    Read a gzipped, tab-separated gene association (GAF) file and add a
    gene-to-GO association for each usable row.

    Rows are skipped when they are comments (start with ``!``), have an
    unexpected number of columns, miss a required field, carry a ``NOT``
    qualifier, or name a UniProtKB id that is absent from ``id_map``.
    For ``IMP`` annotations with a non-empty "with/from" column,
    genotype-to-phenotype associations are derived as well.

    :param file: path of the gzipped GAF file
    :param limit: stop after this many lines (ignored in test mode)
    :param id_map: optional mapping of UniProtKB accession -> gene curie
    :param eco_map: mapping of evidence code symbol -> ECO id
    :return: None
    """
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph

    model = Model(graph)
    geno = Genotype(graph)
    LOG.info("Processing Gene Associations from %s", file)
    line_counter = 0
    uniprot_hit = 0
    uniprot_miss = 0

    # NOTE(review): zfin / wbase exist only for these taxa, yet they are
    # used further down whenever a matching reagent id shows up -- confirm
    # such ids cannot occur for other taxa.
    if '7955' in self.tax_ids:
        zfin = ZFIN(self.graph_type, self.are_bnodes_skized)
    if '6239' in self.tax_ids:
        wbase = WormBase(self.graph_type, self.are_bnodes_skized)

    with gzip.open(file, 'rb') as csvfile:
        filereader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""),
            delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            # comments start with exclamation
            if re.match(r'!', ''.join(row)):
                continue

            if len(row) > 17 or len(row) < 15:
                LOG.warning(
                    "Wrong number of columns %i, expected 15 or 17\n%s",
                    len(row), row)
                continue

            # pad the shorter rows up to the full 17 columns
            if 17 > len(row) >= 15:
                row += [""] * (17 - len(row))

            (dbase,
             gene_num,
             gene_symbol,
             qualifier,
             go_id,
             ref,
             eco_symbol,
             with_or_from,
             aspect,
             gene_name,
             gene_synonym,
             object_type,
             taxon,
             date,
             assigned_by,
             annotation_extension,
             gene_product_form_id) = row

            # test for required fields
            if (dbase == '' or gene_num == '' or gene_symbol == '' or
                    go_id == '' or ref == '' or eco_symbol == '' or
                    aspect == '' or object_type == '' or taxon == '' or
                    date == '' or assigned_by == ''):
                # the row goes in as an argument, not into the format
                # string, so a '%' in the data cannot break the formatting
                LOG.error(
                    "Missing required part of annotation on row %d:\n%s",
                    line_counter, '\t'.join(row))
                continue

            # deal with qualifier NOT, contributes_to, colocalizes_with
            if re.search(r'NOT', qualifier):
                continue

            if dbase in self.localtt:
                dbase = self.localtt[dbase]
            uniprotid = None
            gene_id = None
            if dbase == 'UniProtKB':
                if id_map is not None and gene_num in id_map:
                    gene_id = id_map[gene_num]
                    uniprotid = ':'.join((dbase, gene_num))
                    (dbase, gene_num) = gene_id.split(':')
                    uniprot_hit += 1
                else:
                    # no 1:1 mapping to entrez/ensembl for this accession
                    uniprot_miss += 1
                    continue
            else:
                gene_num = gene_num.split(':')[-1]  # last
                gene_id = ':'.join((dbase, gene_num))

            if self.test_mode and not (
                    re.match(r'NCBIGene', gene_id) and
                    int(gene_num) in self.test_ids):
                continue

            model.addClassToGraph(gene_id, gene_symbol)
            if gene_name != '':
                model.addDescription(gene_id, gene_name)
            if gene_synonym != '':
                for syn in re.split(r'\|', gene_synonym):
                    model.addSynonym(gene_id, syn.strip())
            if re.search(r'\|', taxon):
                # TODO add annotations with >1 taxon
                LOG.info(
                    ">1 taxon (%s) on line %d. skipping", taxon, line_counter)
            else:
                tax_id = re.sub(r'taxon:', 'NCBITaxon:', taxon)
                geno.addTaxon(tax_id, gene_id)

            assoc = Assoc(graph, self.name)
            assoc.set_subject(gene_id)
            assoc.set_object(go_id)

            try:
                eco_id = eco_map[eco_symbol]
                assoc.add_evidence(eco_id)
            except KeyError:
                LOG.error("Evidence code (%s) not mapped", eco_symbol)

            refs = re.split(r'\|', ref)
            for ref in refs:
                ref = ref.strip()
                if ref != '':
                    prefix = ref.split(':')[0]  # sidestep 'MGI:MGI:'
                    if prefix in self.localtt:
                        prefix = self.localtt[prefix]
                    ref = ':'.join((prefix, ref.split(':')[-1]))
                    refg = Reference(graph, ref)
                    if prefix == 'PMID':
                        ref_type = self.globaltt['journal article']
                        refg.setType(ref_type)
                    refg.addRefToGraph()
                    assoc.add_source(ref)

            # TODO add the source of the annotations from assigned by?

            rel = self.resolve(aspect, mandatory=False)
            if rel is not None and aspect == rel:
                if aspect == 'F' and re.search(r'contributes_to', qualifier):
                    assoc.set_relationship(self.globaltt['contributes to'])
                else:
                    LOG.error(
                        "Aspect: %s with qualifier: %s is not recognized",
                        aspect, qualifier)
            elif rel is not None:
                assoc.set_relationship(rel)
                assoc.add_association_to_graph()
            else:
                LOG.warning("No predicate for association \n%s\n", str(assoc))

            if uniprotid is not None:
                assoc.set_description('Mapped from ' + uniprotid)
            # object_type should be one of:
            # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
            # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
            # If the precise product type is unknown,
            # gene_product should be used
            ###############################################################

            # Derive G2P Associations from IMP annotations
            # in version 2.1 Pipe will indicate 'OR'
            # and Comma will indicate 'AND'.
            # in version 2.0, multiple values are separated by pipes
            # where the pipe has been used to mean 'AND'
            if eco_symbol == 'IMP' and with_or_from != '':
                withitems = re.split(r'\|', with_or_from)
                phenotypeid = go_id + 'PHENOTYPE'
                # create phenotype associations
                for i in withitems:
                    if i == '' or re.match(
                            r'(UniProtKB|WBPhenotype|InterPro|HGNC)', i):
                        LOG.warning(
                            "Don't know what having a uniprot id " +
                            "in the 'with' column means of %s", uniprotid)
                        continue
                    i = re.sub(r'MGI\:MGI\:', 'MGI:', i)
                    i = re.sub(r'WB:', 'WormBase:', i)

                    # for worms and fish, they might give a RNAi or MORPH
                    # in these cases make a reagent-targeted gene
                    if re.search('MRPHLNO|CRISPR|TALEN', i):
                        targeted_gene_id = zfin.make_targeted_gene_id(
                            gene_id, i)
                        geno.addReagentTargetedGene(
                            i, gene_id, targeted_gene_id)
                        # TODO PYLINT why is this needed?
                        # Redefinition of assoc type from
                        # dipper.models.assoc.Association.Assoc to
                        # dipper.models.assoc.G2PAssoc.G2PAssoc
                        assoc = G2PAssoc(
                            graph, self.name, targeted_gene_id, phenotypeid)
                    elif re.search(r'WBRNAi', i):
                        targeted_gene_id = \
                            wbase.make_reagent_targeted_gene_id(gene_id, i)
                        geno.addReagentTargetedGene(
                            i, gene_id, targeted_gene_id)
                        assoc = G2PAssoc(
                            graph, self.name, targeted_gene_id, phenotypeid)
                    else:
                        assoc = G2PAssoc(graph, self.name, i, phenotypeid)
                    for ref in refs:
                        ref = ref.strip()
                        if ref != '':
                            prefix = ref.split(':')[0]
                            if prefix in self.localtt:
                                prefix = self.localtt[prefix]
                            ref = ':'.join((prefix, ref.split(':')[-1]))
                            assoc.add_source(ref)
                            # experimental phenotypic evidence
                            assoc.add_evidence(self.globaltt[
                                'experimental phenotypic evidence'])
                    assoc.add_association_to_graph()
                    # TODO should the G2PAssoc be
                    # the evidence for the GO assoc?

            if not self.test_mode and limit is not None \
                    and line_counter > limit:
                break

    uniprot_tot = (uniprot_hit + uniprot_miss)
    uniprot_per = 0.0
    if uniprot_tot != 0:
        uniprot_per = 100.0 * uniprot_hit / uniprot_tot
    LOG.info(
        "Uniprot: %.2f%% of %i benefited from the 1/4 day id mapping download",
        uniprot_per, uniprot_tot)

    return
def process_rnai_phenotypes(self, limit=None):
    """
    Read the tab-separated RNAi phenotype file and add, for every RNAi
    reagent of a gene, a reagent-targeted gene with a phenotype association.

    The last column holds space-delimited ``<rnai>|<paper>`` sets, e.g.
    ``WBRNAi00025129|WBPaper00006395 WBRNAi00025631|WBPaper00006395``.
    Sets that do not consist of exactly two ``|``-separated items are
    logged and skipped.  ``self.rnai_gene_map`` records which genes each
    RNAi targets, for looking up later.

    :param limit: stop after this many rows (ignored in test mode)
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['rnai_pheno']['file']))

    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph

    logger.info("Processing RNAi phenotype associations")
    line_counter = 0
    geno = Genotype(g)
    with open(raw, 'r') as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            (gene_num, gene_alt_symbol, phenotype_label, phenotype_id,
             rnai_and_refs) = row

            if self.testMode and gene_num not in self.test_ids['gene']:
                continue

            gene_id = 'WormBase:'+gene_num

            # space delimited between RNAi sets;
            # then each RNAi should have a paper
            for s in re.split(r' ', rnai_and_refs):
                parts = re.split(r'\|', s)
                # check the shape before unpacking: unpacking first raised
                # ValueError and left the warning unreachable
                if len(parts) != 2:
                    logger.warning(
                        "There's an unexpected number of items in %s", s)
                    continue
                (rnai_num, ref_num) = parts

                if rnai_num not in self.rnai_gene_map:
                    self.rnai_gene_map[rnai_num] = set()

                # to use for looking up later
                self.rnai_gene_map[rnai_num].add(gene_num)

                rnai_id = 'WormBase:'+rnai_num
                geno.addGeneTargetingReagent(
                    rnai_id, None, geno.genoparts['RNAi_reagent'], gene_id)

                # make the "allele" of the gene
                # that is targeted by the reagent
                allele_id = self.make_reagent_targeted_gene_id(
                    gene_num, rnai_num, self.nobnodes)
                allele_label = gene_alt_symbol+'<'+rnai_num+'>'
                geno.addReagentTargetedGene(
                    rnai_id, gene_id, allele_id, allele_label)

                assoc = G2PAssoc(self.name, allele_id, phenotype_id)
                assoc.add_source('WormBase:'+ref_num)
                assoc.add_association_to_graph(g)

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

    return
def process_feature_loc(self, limit):
    """
    Read the gzipped, tab-separated feature file (nine columns per row,
    attributes as ``key=value`` pairs separated by ``;``) and add the
    location, label, synonyms and descriptions of each supported feature.

    Example rows:
    ``I WormBase gene 3747 3909 . - . ID=Gene:WBGene00023193;Name=...``
    ``... variation=WBVar00604246;public_name=gk320600;strain=VC20384;substitution=C/T``

    Comment rows (starting with ``#``), unsupported feature types,
    ``gmap``/``landmark`` ids, features without a usable id, and
    polymorphisms are skipped.

    :param limit: stop after this many supported rows (ignored in test mode)
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['feature_loc']['file']))

    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    gu = GraphUtils(curie_map.get())
    logger.info("Processing Feature location and attributes")
    line_counter = 0
    geno = Genotype(g)
    strain_to_variant_map = {}
    build_num = self.version_num
    build_id = 'WormBase:'+build_num

    # note biological_regions include balancers
    # other options here: promoter, regulatory_region, reagent
    supported_feature_types = (
        'gene', 'point_mutation', 'deletion', 'RNAi_reagent',
        'duplication', 'enhancer', 'binding_site',
        'biological_region', 'complex_substitution',
        'substitution', 'insertion', 'inverted_repeat')

    with gzip.open(raw, 'rb') as csvfile:
        filereader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""),
            delimiter='\t', quotechar='\"')
        for row in filereader:
            if re.match(r'\#', ''.join(row)):
                continue
            (chrom, db, feature_type_label, start, end, score, strand,
             phase, attributes) = row

            if feature_type_label not in supported_feature_types:
                continue
            line_counter += 1

            attribute_dict = {}
            if attributes != '':
                attribute_dict = dict(
                    item.split("=")
                    for item in re.sub(r'"', '', attributes).split(";"))

            fid = flabel = desc = None
            if 'ID' in attribute_dict:
                fid = attribute_dict.get('ID')
                if re.search(r'WB(Gene|Var|sf)', fid):
                    fid = re.sub(r'^\w+:WB', 'WormBase:WB', fid)
                elif re.match(r'(gmap|landmark)', fid):
                    continue
                else:
                    logger.info('other identifier %s', fid)
                    fid = None
            elif 'variation' in attribute_dict:
                fid = 'WormBase:'+attribute_dict.get('variation')
                flabel = attribute_dict.get('public_name')
                sub = attribute_dict.get('substitution')
                ins = attribute_dict.get('insertion')
                desc = ''
                if sub is not None:
                    desc = 'substitution='+sub
                if ins is not None:
                    desc = 'insertion='+ins

                # keep track of the strains with this variation,
                # for later processing
                strain_list = attribute_dict.get('strain')
                if strain_list is not None:
                    for s in re.split(r',', strain_list):
                        if s.strip() not in strain_to_variant_map:
                            strain_to_variant_map[s.strip()] = set()
                        strain_to_variant_map[s.strip()].add(fid)

            # TODO finish the RNAi binding location
            # (the 'Target' attribute of an RNAi_reagent row, e.g.
            # Target=WBRNAi00096030 1 4942, tells where the RNAi binds)

            name = attribute_dict.get('Name')
            polymorphism = attribute_dict.get('polymorphism')

            if fid is None:
                if name is not None and re.match(r'WBsf', name):
                    fid = 'WormBase:'+name
                    name = None
                else:
                    continue

            if self.testMode \
                    and re.sub(r'WormBase:', '', fid) \
                    not in self.test_ids['gene']+self.test_ids['allele']:
                continue

            # these really aren't that interesting
            if polymorphism is not None:
                continue

            if name is not None and not re.search(name, fid):
                if flabel is None:
                    flabel = name
                else:
                    gu.addSynonym(g, fid, name)

            if desc is not None:
                gu.addDescription(g, fid, desc)

            alias = attribute_dict.get('Alias')
            biotype = attribute_dict.get('biotype')
            note = attribute_dict.get('Note')
            other_name = attribute_dict.get('other_name')
            for n in [alias, other_name]:
                if n is not None:
                    # add the name being looped over
                    # (other_name was added here for both entries)
                    gu.addSynonym(g, fid, n)

            ftype = self.get_feature_type_by_class_and_biotype(
                feature_type_label, biotype)

            chr_id = makeChromID(chrom, build_id, 'CHR')
            geno.addChromosomeInstance(chrom, build_id, build_num)

            f = Feature(fid, flabel, ftype)
            f.addFeatureStartLocation(start, chr_id, strand)
            # the end location takes the end coordinate (start was used)
            f.addFeatureEndLocation(end, chr_id, strand)

            feature_is_class = False
            if feature_type_label == 'gene':
                feature_is_class = True

            f.addFeatureToGraph(g, True, None, feature_is_class)

            if note is not None:
                gu.addDescription(g, fid, note)

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

            # RNAi reagents look like:
            # I RNAi_primary RNAi_reagent 4184 10232 . + .
            #   Target=WBRNAi00001601 1 6049 +;laboratory=YK;...
            # TODO TF binding sites and network:
            # I TF_binding_site_region TF_binding_site 1861 2048 . + .
            #   Name=WBsf292777;tf_id=WBTranscriptionFactor000025;...

    return
def process_allele_phenotype(self, limit=None):
    """
    Add variant-to-phenotype associations from the allele phenotype file.

    This file compactly lists variant to phenotype associations,
    such that in a single row, there may be >1 variant listed per
    phenotype and paper.  This indicates that each variant is
    individually associated with the given phenotype, as listed in
    1+ papers.  (Not that the combination of variants is producing
    the phenotype.)

    Rows whose joined text starts with ``!`` are treated as header lines,
    and rows flagged ``NOT`` are skipped.  RNAi alleles are annotated via
    a reagent-targeted gene; WBVar alleles via a variant locus.

    :param limit: outside of test mode, stop once more than this many
        data rows have been read; ``None`` processes the whole file
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['allele_pheno']['file']))

    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph

    logger.info("Processing Allele phenotype associations")

    line_counter = 0
    geno = Genotype(g)
    with open(raw, 'r') as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            if re.match(r'!', ''.join(row)):  # header
                continue
            line_counter += 1
            # the unpacking doubles as a check that the row has 17 columns
            (db, gene_num, gene_symbol, is_not, phenotype_id, ref,
             eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
             gene_class, taxon, date, assigned_by, blank, blank2) = row

            if self.testMode and gene_num not in self.test_ids['gene']:
                continue

            # TODO add NOT phenotypes
            if is_not == 'NOT':
                continue

            eco_id = None
            if eco_symbol == 'IMP':
                eco_id = 'ECO:0000015'
            elif eco_symbol.strip() != '':
                logger.warning(
                    "Encountered an ECO code we don't have: %s", eco_symbol)

            # according to the GOA spec, persons are not allowed to be
            # in the reference column, therefore the variant and the
            # person are swapped between the reference and with columns.
            # we unswitch them here.
            temp_var = temp_ref = None
            if re.search(r'WBVar|WBRNAi', ref):
                temp_var = ref
            # move the paper from the with column into the ref
            if re.search(r'WBPerson', with_or_from):
                temp_ref = with_or_from
            if temp_var is not None or temp_ref is not None:
                with_or_from = temp_var
                ref = temp_ref

            # with_or_from is None when only the person was recognised;
            # blank entries (e.g. from a stray '|') carry no allele.
            allele_list = [
                allele.strip()
                for allele in re.split(r'\|', with_or_from or '')
                if allele.strip() != '']
            if not allele_list:
                logger.error(
                    "Missing alleles from phenotype assoc at line %d",
                    line_counter)
                continue

            # the reference is the same for every allele of the row,
            # so normalise its prefix once
            has_ref = ref is not None and ref != ''
            if has_ref:
                ref = re.sub(r'(WB:|WB_REF:)', 'WormBase:', ref)

            gene_id = 'WormBase:'+gene_num
            for allele in allele_list:
                allele_num = re.sub(r'WB:', '', allele)
                allele_id = 'WormBase:'+allele_num

                if re.search(r'WBRNAi', allele_id):
                    # make the reagent-targeted gene,
                    # & annotate that instead of the RNAi item directly
                    rnai_num = re.sub(r'WormBase:', '', allele_id)
                    rnai_id = allele_id
                    rtg_id = self.make_reagent_targeted_gene_id(
                        gene_num, rnai_num, self.nobnodes)
                    geno.addReagentTargetedGene(
                        rnai_id, 'WormBase:'+gene_num, rtg_id)
                    geno.addGeneTargetingReagent(
                        rnai_id, None, geno.genoparts['RNAi_reagent'],
                        gene_id)
                    allele_id = rtg_id
                elif re.search(r'WBVar', allele_id):
                    # this may become deprecated by using wormmine
                    # make the allele to gene relationship
                    # the WBVars are really sequence alterations
                    # the public name will come from elsewhere
                    geno.addSequenceAlteration(allele_id, None)
                    vl_id = '_'+'-'.join((gene_num, allele_num))
                    if self.nobnodes:
                        vl_id = ':'+vl_id
                    geno.addSequenceAlterationToVariantLocus(
                        allele_id, vl_id)
                    geno.addAlleleOfGene(vl_id, gene_id)
                else:
                    logger.warning(
                        "Some kind of allele I don't recognize: %s",
                        allele_num)
                    continue

                assoc = G2PAssoc(self.name, allele_id, phenotype_id)
                if eco_id is not None:
                    assoc.add_evidence(eco_id)

                if has_ref:
                    r = Reference(ref)
                    if re.search(r'Person', ref):
                        r.setType(r.ref_types['person'])
                        # also add
                        # inferred from background scientific knowledge
                        assoc.add_evidence('ECO:0000001')
                    r.addRefToGraph(g)
                    assoc.add_source(ref)

                assoc.add_association_to_graph(g)
            # finished looping through all alleles

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

    return
def _get_variants(self, limit):
    """
    Loop through the gzipped variant_summary file and add each variant,
    its location, gene, cross references and disease associations.

    Fixes relative to the previous revision:
      * the per-row Feature is built with the graph as its first argument,
        matching the ``Feature(g, ...)`` form and the argument-less
        ``addFeatureToGraph()`` call this method already relied on;
      * the file handle no longer shares a name with that Feature;
      * the in-build chromosome id is reset per row, so a row without a
        chromosome no longer reuses the previous row's id (or fails on an
        unbound name);
      * ``makeChromID`` is given the cytogenetic band string rather than
        the list returned by ``re.split``;
      * the "gene found in allele label" report now has a capture group
        to report;
      * the separator colon is removed together with the
        ``Human Phenotype Ontology`` prefix;
      * blank phenotype entries are skipped.

    :param limit: outside of test mode, stop once more than this many
        data rows have been read; ``None`` processes the whole file
    :return: None
    :raises ValueError: if a data row does not have the expected
        number of tab-separated columns
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph

    model = Model(g)
    geno = Genotype(g)

    # add the taxon and the genome
    tax_num = '9606'  # HARDCODE
    tax_id = 'NCBITaxon:'+tax_num
    tax_label = 'Human'
    model.addClassToGraph(tax_id, None)
    geno.addGenome(tax_id, tax_label)  # label gets added elsewhere

    # the test disease ids do not change while reading; stringify once
    test_disease_ids = set()
    if self.testMode:
        test_disease_ids = {str(i) for i in self.disease_ids}

    # not unzipping the file
    logger.info("Processing Variant records")
    line_counter = 0
    expected_numcols = 29
    myfile = '/'.join(
        (self.rawdir, self.files['variant_summary']['file']))
    with gzip.open(myfile, 'rb') as infile:
        for line in infile:
            # skip comments
            line = line.decode().strip()
            if re.match(r'^#', line):
                continue

            # Columns, in order:
            # AlleleID    integer value as stored in the AlleleID field
            #             in ClinVar (//Measure/@ID in the XML)
            # Type        character, the type of variation
            # Name        character, the preferred name for the variation
            # GeneID      integer, GeneID in NCBI's Gene database
            # GeneSymbol  character, comma-separated list of GeneIDs
            #             overlapping the variation
            # ClinicalSignificance  character, comma-separated list of
            #             values of clinical significance reported for
            #             this variation; for the mapping between these
            #             terms and the integers in the .VCF files, see
            #             http://www.ncbi.nlm.nih.gov/clinvar/docs/clinsig/
            # RS# (dbSNP) integer, rs# in dbSNP
            # nsv (dbVar) character, the NSV identifier for the region
            # RCVaccession  character, list of RCV accessions that
            #             report this variant
            # TestedInGTR character, Y/N if there is a test registered
            #             as specific to this variation in the NIH
            #             Genetic Testing Registry (GTR)
            # PhenotypeIDs  character, list of db names and identifiers
            #             for phenotype(s) reported for this variant
            # Origin      character, list of all allelic origins
            # Assembly    character, name of the assembly on which
            #             locations are based
            # Chromosome  character, chromosomal location
            # Start       integer, start, in pter->qter orientation
            # Stop        integer, end, in pter->qter orientation
            # Cytogenetic character, ISCN band
            # ReviewStatus  character, highest review status for
            #             reporting this measure; see
            #             http://www.ncbi.nlm.nih.gov/clinvar/docs/variation_report/#interpretation
            # HGVS(c.)    character, RefSeq cDNA-based HGVS expression
            # HGVS(p.)    character, RefSeq protein-based HGVS expression
            # NumberSubmitters  integer, number of submissions
            # LastEvaluated  datetime, the latest time any submitter
            #             reported clinical significance
            # Guidelines  character, ACMG only right now, for the
            #             reporting of incidental variation in a Gene
            #             (NOTE: if ACMG, not specific to the allele
            #             but to the Gene)
            # OtherIDs    character, list of other identifiers or
            #             sources of information about this variant
            # VariantID   integer, the value used to build the URL for
            #             the current default report, e.g.
            #             http://www.ncbi.nlm.nih.gov/clinvar/variation/1756/
            #
            # a crude check that there's an expected number of cols.
            # if not, error out because something changed
            # (the unpacking below raises ValueError in that case).
            cols = line.split('\t')
            num_cols = len(cols)
            if num_cols != expected_numcols:
                logger.error(
                    "Unexpected number of columns in raw file " +
                    "(%d actual vs %d expected)",
                    num_cols, expected_numcols)

            (allele_num, allele_type, allele_name, gene_num, gene_symbol,
             clinical_significance, dbsnp_num, dbvar_num, rcv_nums,
             tested_in_gtr, phenotype_ids, origin, assembly, chrom,
             start, stop, cytogenetic_loc, review_status, hgvs_c, hgvs_p,
             number_of_submitters, last_eval, guidelines, other_ids,
             variant_num, reference_allele, alternate_allele, categories,
             ChromosomeAccession) = cols

            # ###set filter=None in init if you don't want to have a filter
            # if self.filter is not None:
            #    if ((self.filter == 'taxids' and\
            #            (int(tax_num) not in self.tax_ids)) or\
            #            (self.filter == 'geneids' and\
            #             (int(gene_num) not in self.gene_ids))):
            #        continue
            # #### end filter

            line_counter += 1

            pheno_list = []
            if phenotype_ids != '-':
                # trim any leading/trailing semicolons/commas
                phenotype_ids = re.sub(r'^[;,]', '', phenotype_ids)
                phenotype_ids = re.sub(r'[;,]$', '', phenotype_ids)
                pheno_list = re.split(r'[,;]', phenotype_ids)

            if self.testMode:
                # get intersection of test disease ids
                # and these phenotype_ids
                intersect = test_disease_ids & set(pheno_list)
                if int(gene_num) not in self.gene_ids and \
                        int(variant_num) not in self.variant_ids and \
                        len(intersect) < 1:
                    continue

            # TODO may need to switch on assembly to create correct
            # assembly/build identifiers
            build_id = ':'.join(('NCBIGenome', assembly))

            # make the reference genome build
            geno.addReferenceGenome(build_id, assembly, tax_id)

            allele_type_id = self._map_type_of_allele(allele_type)

            # both are decided per row; None means "not available"
            bandinbuild_id = None
            chrinbuild_id = None
            if str(chrom) == '':
                # check cytogenic location
                if str(cytogenetic_loc).strip() != '':
                    # use cytogenic location to get the apx location
                    # oddly, they still put an assembly number even when
                    # there's no numeric location
                    if not re.search(r'-', str(cytogenetic_loc)):
                        # no '-' means a single band, so the band string
                        # itself is the chromosome-like identifier
                        band_id = makeChromID(
                            str(cytogenetic_loc), tax_num, 'CHR')
                        geno.addChromosomeInstance(
                            cytogenetic_loc, build_id, assembly, band_id)
                        bandinbuild_id = makeChromID(
                            str(cytogenetic_loc), assembly, 'MONARCH')
                    else:
                        # can't deal with ranges yet
                        pass
            else:
                # add the human chromosome class to the graph,
                # and add the build-specific version of it
                chr_id = makeChromID(str(chrom), tax_num, 'CHR')
                geno.addChromosomeClass(str(chrom), tax_id, tax_label)
                geno.addChromosomeInstance(
                    str(chrom), build_id, assembly, chr_id)
                chrinbuild_id = makeChromID(
                    str(chrom), assembly, 'MONARCH')

            seqalt_id = ':'.join(('ClinVarVariant', variant_num))
            gene_id = None

            # they use -1 to indicate unknown gene
            if str(gene_num) != '-1' and str(gene_num) != 'more than 10':
                if re.match(r'^Gene:', gene_num):
                    # NOTE(review): gene_id stays None on this branch, so
                    # such rows are reported as having no gene below --
                    # confirm whether gene_id should be set here.
                    gene_num = "NCBI" + gene_num
                else:
                    gene_id = ':'.join(('NCBIGene', str(gene_num)))

            # FIXME there are some "variants" that are actually haplotypes
            # probably will get taken care of when we switch to processing
            # the xml for example, variant_num = 38562
            # but there's no way to tell if it's a haplotype
            # in the csv data so the dbsnp or dbvar
            # should probably be primary,
            # and the variant num be the vslc,
            # with each of the dbsnps being added to it

            # TODO clinical significance needs to be mapped to
            # a list of terms
            # first, make the variant:
            feature = Feature(g, seqalt_id, allele_name, allele_type_id)

            # a position is only meaningful on a known chromosome
            if chrinbuild_id is not None:
                if start != '-' and start.strip() != '':
                    feature.addFeatureStartLocation(start, chrinbuild_id)
                if stop != '-' and stop.strip() != '':
                    feature.addFeatureEndLocation(stop, chrinbuild_id)

            feature.addFeatureToGraph()
            feature.addTaxonToFeature(tax_id)
            # make the ClinVarVariant the clique leader
            model.makeLeader(seqalt_id)

            if bandinbuild_id is not None:
                feature.addSubsequenceOfFeature(bandinbuild_id)

            # CHECK - this makes the assumption that there is
            # only one affected chromosome per variant what happens with
            # chromosomal rearrangement variants?
            # shouldn't both chromosomes be here?

            # add the hgvs as synonyms
            if hgvs_c != '-' and hgvs_c.strip() != '':
                model.addSynonym(seqalt_id, hgvs_c)
            if hgvs_p != '-' and hgvs_p.strip() != '':
                model.addSynonym(seqalt_id, hgvs_p)

            # add the dbsnp and dbvar ids as equivalent
            if dbsnp_num != '-' and int(dbsnp_num) != -1:
                dbsnp_id = 'dbSNP:rs'+str(dbsnp_num)
                model.addIndividualToGraph(dbsnp_id, None)
                model.addSameIndividual(seqalt_id, dbsnp_id)
            if dbvar_num != '-':
                dbvar_id = 'dbVar:'+dbvar_num
                model.addIndividualToGraph(dbvar_id, None)
                model.addSameIndividual(seqalt_id, dbvar_id)

            # TODO - not sure if this is right... add as xref?
            # the rcv is like the combo of the phenotype with the variant
            if rcv_nums != '-':
                for rcv_num in re.split(r';', rcv_nums):
                    rcv_id = 'ClinVar:' + rcv_num
                    model.addIndividualToGraph(rcv_id, None)
                    model.addXref(seqalt_id, rcv_id)

            if gene_id is not None:
                # add the gene
                model.addClassToGraph(gene_id, gene_symbol)
                # make a variant locus
                vl_id = '_'+gene_num+'-'+variant_num
                if self.nobnodes:
                    vl_id = ':'+vl_id
                vl_label = allele_name
                model.addIndividualToGraph(
                    vl_id, vl_label, geno.genoparts['variant_locus'])
                geno.addSequenceAlterationToVariantLocus(seqalt_id, vl_id)
                geno.addAlleleOfGene(vl_id, gene_id)
            else:
                # some basic reporting
                # the symbol inside the parentheses is captured so that
                # there is something to report
                gmatch = re.search(r'\((\w+)\)', allele_name)
                if gmatch is not None:
                    logger.info(
                        "Gene found in allele label, but no id provided: %s",
                        gmatch.group(1))
                elif re.match(r'more than 10', gene_symbol):
                    logger.info(
                        "More than 10 genes found; "
                        "need to process XML to fetch (variant=%d)",
                        int(variant_num))
                else:
                    logger.info(
                        "No gene listed for variant %d",
                        int(variant_num))

            # parse the list of "phenotypes" which are diseases.
            # add them as an association
            # ;GeneReviews:NBK1440,MedGen:C0392514,OMIM:235200,SNOMED CT:35400008;MedGen:C3280096,OMIM:614193;MedGen:CN034317,OMIM:612635;MedGen:CN169374
            # the list is both semicolon delimited and comma delimited,
            # but i don't know why! some are bad, like:
            # Orphanet:ORPHA ORPHA319705,SNOMED CT:49049000
            if phenotype_ids != '-':
                for phenotype in pheno_list:
                    if phenotype.strip() == '':
                        # doubled delimiters leave empty entries behind
                        continue
                    m = re.match(
                        r"(Orphanet:ORPHA(?:\s*ORPHA)?)", phenotype)
                    if m is not None:
                        # literal replacement of the matched prefix
                        phenotype = phenotype.strip().replace(
                            m.group(1), 'Orphanet:')
                    elif re.match(r'ORPHA:\d+', phenotype):
                        phenotype = re.sub(
                            r'^ORPHA', 'Orphanet', phenotype.strip())
                    elif re.match(r'Human Phenotype Ontology', phenotype):
                        # drop the separator too, so that the id is not
                        # left with a leading ':'
                        phenotype = re.sub(
                            r'^Human Phenotype Ontology:?', '',
                            phenotype.strip())
                    elif re.match(r'SNOMED CT:\s?', phenotype):
                        phenotype = re.sub(
                            r'SNOMED CT:\s?', 'SNOMED:', phenotype.strip())
                    elif re.match(r'^Gene:', phenotype):
                        continue

                    assoc = G2PAssoc(
                        g, self.name, seqalt_id, phenotype.strip())
                    assoc.add_association_to_graph()

            if other_ids != '-':
                # process the "other ids" ex:
                # CFTR2:F508del,HGMD:CD890142,OMIM Allelic Variant:602421.0001
                # TODO make more xrefs
                for xrefid in other_ids.split(','):
                    prefix = xrefid.split(':')[0].strip()
                    if prefix == 'OMIM Allelic Variant':
                        xrefid = 'OMIM:'+xrefid.split(':')[1]
                        model.addIndividualToGraph(xrefid, None)
                        model.addSameIndividual(seqalt_id, xrefid)
                    elif prefix == 'HGMD':
                        model.addIndividualToGraph(xrefid, None)
                        model.addSameIndividual(seqalt_id, xrefid)
                    # everything else is deliberately not added for now:
                    # a dbVar id equal to dbvar_num is already handled
                    # above, prefixes containing whitespace are not clean,
                    # and the remaining prefixes are still waiting to be
                    # added as xrefs.

            if not self.testMode and limit is not None \
                    and line_counter > limit:
                break

    logger.info("Finished parsing variants")

    return
def _process_data(self, raw, limit=None):
    """
    Read the gzipped, comma-separated file at ``raw`` and add, per row,
    the allele/variant, the colony, the (sex-qualified) genotype and its
    genotype-to-phenotype association with provenance.

    Fixes relative to the previous revision:
      * a row without a marker no longer fails when the VSLC id is built
        (``str.join`` was handed ``None``);
      * a row with an unrecognised zygosity is skipped instead of
        silently ending the whole ingest;
      * column positions are looked up once rather than for every row;
      * an unreachable ``colony_id is None`` debug ``print`` was removed.

    :param raw: path of the gzipped csv file to read
    :param limit: outside of test mode, stop once the reader has passed
        this line number; ``None`` processes the whole file
    :return: None
    :raises ValueError: if a required column name is missing from the
        configured column list
    """
    LOG.info("Processing Data from %s", raw)

    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph

    model = Model(graph)
    geno = Genotype(graph)

    # Add the taxon as a class
    taxon_id = self.globaltt['Mus musculus']
    model.addClassToGraph(taxon_id, None)

    # with open(raw, 'r', encoding="utf8") as csvfile:
    col = self.files['all']['columns']
    # position of every column that is read below, resolved once
    position = {
        name: col.index(name) for name in (
            'marker_accession_id', 'marker_symbol', 'phenotyping_center',
            'colony_id', 'sex', 'zygosity', 'allele_accession_id',
            'allele_symbol', 'strain_accession_id', 'strain_name',
            'project_fullname', 'pipeline_name', 'pipeline_stable_id',
            'procedure_stable_id', 'procedure_name',
            'parameter_stable_id', 'parameter_name', 'mp_term_id',
            'mp_term_name', 'p_value', 'percentage_change',
            'effect_size', 'statistical_method', 'resource_name')}

    with gzip.open(raw, 'rt') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='\"')
        row = next(reader)  # presumed header
        if not self.check_fileheader(col, row):
            pass

        for row in reader:
            field = {
                name: row[index].strip()
                for name, index in position.items()}

            marker_accession_id = field['marker_accession_id']
            marker_symbol = field['marker_symbol']
            phenotyping_center = field['phenotyping_center']
            colony_raw = field['colony_id']
            sex = field['sex']
            zygosity = field['zygosity']
            allele_accession_id = field['allele_accession_id']
            allele_symbol = field['allele_symbol']
            strain_accession_id = field['strain_accession_id']
            strain_name = field['strain_name']
            project_fullname = field['project_fullname']
            pipeline_name = field['pipeline_name']
            pipeline_stable_id = field['pipeline_stable_id']
            procedure_stable_id = field['procedure_stable_id']
            procedure_name = field['procedure_name']
            parameter_stable_id = field['parameter_stable_id']
            parameter_name = field['parameter_name']
            mp_term_id = field['mp_term_id']
            mp_term_name = field['mp_term_name']
            p_value = field['p_value']
            percentage_change = field['percentage_change']
            effect_size = field['effect_size']
            statistical_method = field['statistical_method']
            resource_name = field['resource_name']

            if self.test_mode and marker_accession_id not in self.gene_ids:
                continue

            # ##### cleanup some of the identifiers ######
            zygosity_id = self.resolve(zygosity)
            if zygosity_id == zygosity:
                LOG.warning(
                    "Zygosity '%s' unmapped. setting to indeterminate",
                    zygosity)
                zygosity_id = self.globaltt['indeterminate']

            # colony ids sometimes have <> in them, spaces,
            # or other non-alphanumerics and break our system;
            # replace these with underscores
            colony_id = '_:' + re.sub(r'\W+', '_', colony_raw)

            if not re.match(r'MGI', allele_accession_id):
                allele_accession_id = '_:IMPC-'+re.sub(
                    r':', '', allele_accession_id)

            if re.search(r'EUROCURATE', strain_accession_id):
                # the eurocurate links don't resolve at IMPC
                # TODO blank nodes do not maintain identifiers
                strain_accession_id = '_:' + strain_accession_id
            elif not re.match(r'MGI', strain_accession_id):
                # NOTE(review): an empty strain accession also ends up
                # here and becomes 'IMPC:' -- confirm that is wanted.
                LOG.info(
                    "Found a strange strain accession...%s",
                    strain_accession_id)
                strain_accession_id = 'IMPC:'+strain_accession_id

            ######################
            # first, add the marker and variant to the graph as with MGI,
            # the allele is the variant locus.  IF the marker is not known,
            # we will call it a sequence alteration.  otherwise,
            # we will create a BNode for the sequence alteration.
            sequence_alteration_id = variant_locus_id = None
            variant_locus_name = sequence_alteration_name = None

            # extract out what's within the <> to get the symbol
            bracketed = re.match(r'.*<(.*)>', allele_symbol)
            if bracketed is not None:
                sequence_alteration_name = bracketed.group(1)
            else:
                sequence_alteration_name = allele_symbol

            if marker_accession_id == '':
                LOG.warning("Marker unspecified on row %d", reader.line_num)
                marker_accession_id = None

            if marker_accession_id is not None:
                variant_locus_id = allele_accession_id
                variant_locus_name = allele_symbol
                variant_locus_type = self.globaltt['variant_locus']
                geno.addGene(
                    marker_accession_id, marker_symbol,
                    self.globaltt['gene'])
                geno.addAllele(
                    variant_locus_id, variant_locus_name,
                    variant_locus_type, None)
                geno.addAlleleOfGene(variant_locus_id, marker_accession_id)

                # TAG bnode
                sequence_alteration_id = '_:seqalt' + re.sub(
                    r':', '', allele_accession_id)
                geno.addSequenceAlterationToVariantLocus(
                    sequence_alteration_id, variant_locus_id)
            else:
                sequence_alteration_id = allele_accession_id

            # IMPC contains targeted mutations with either gene traps,
            # knockouts, insertion/intragenic deletions.
            # but I don't really know what the SeqAlt is here,
            # so I don't add it.
            geno.addSequenceAlteration(
                sequence_alteration_id, sequence_alteration_name)

            # #############    BUILD THE COLONY    #############
            # First, let's describe the colony that the animals come from
            # The Colony ID refers to the ES cell clone
            #   used to generate a mouse strain.
            # Terry sez: we use this clone ID to track
            #   ES cell -> mouse strain -> mouse phenotyping.
            # The same ES clone maybe used at multiple centers,
            # so we have to concatenate the two to have a unique ID.
            # some useful reading about generating mice from ES cells:
            # http://ki.mit.edu/sbc/escell/services/details

            # here, we'll make a genotype
            # that derives from an ES cell with a given allele.
            # the strain is not really attached to the colony.

            # the colony/clone is reflective of the allele,
            # with unknown zygosity
            stem_cell_class = self.globaltt['embryonic stem cell line']
            model.addIndividualToGraph(
                colony_id, colony_raw, stem_cell_class)

            # vslc of the colony has unknown zygosity
            # note that we will define the allele
            # (and its relationship to the marker, etc.) later
            # FIXME is it really necessary to create this vslc
            # when we always know it's unknown zygosity?
            vslc_colony = '_:'+re.sub(
                r':', '',
                allele_accession_id + self.globaltt['indeterminate'])
            vslc_colony_label = allele_symbol + '/<?>'
            # for ease of reading, we make the colony genotype variables.
            # in the future, it might be desired to keep the vslcs
            colony_genotype_id = vslc_colony
            colony_genotype_label = vslc_colony_label
            geno.addGenotype(colony_genotype_id, colony_genotype_label)
            geno.addParts(
                allele_accession_id, colony_genotype_id,
                self.globaltt['has_variant_part'])

            geno.addPartsToVSLC(
                vslc_colony, allele_accession_id, None,
                self.globaltt['indeterminate'],
                self.globaltt['has_variant_part'])
            graph.addTriple(
                colony_id, self.globaltt['has_genotype'],
                colony_genotype_id)

            # ##########    BUILD THE ANNOTATED GENOTYPE    ##########
            # now, we'll build the genotype of the individual that derives
            # from the colony/clone genotype that is attached to
            # phenotype = colony_id + strain + zygosity + sex
            # (and is derived from a colony)

            # this is a sex-agnostic genotype
            genotype_id = self.make_id(
                (colony_id + phenotyping_center + zygosity +
                 strain_accession_id))
            geno.addSequenceDerivesFrom(genotype_id, colony_id)

            # build the VSLC of the sex-agnostic genotype
            # based on the zygosity
            allele1_id = allele_accession_id
            allele2_id = allele2_rel = None
            allele1_label = allele_symbol
            allele2_label = '<?>'
            # Making VSLC labels from the various parts,
            # can change later if desired.
            if zygosity == 'heterozygote':
                allele2_label = re.sub(r'<.*', '<+>', allele1_label)
                allele2_id = None
            elif zygosity == 'homozygote':
                allele2_label = allele1_label
                allele2_id = allele1_id
                allele2_rel = self.globaltt['has_variant_part']
            elif zygosity == 'hemizygote':
                allele2_label = re.sub(r'<.*', '<0>', allele1_label)
                allele2_id = None
            elif zygosity == 'not_applicable':
                allele2_label = re.sub(r'<.*', '<?>', allele1_label)
                allele2_id = None
            else:
                # one unexpected value must not end the whole ingest;
                # give up on this row only
                LOG.warning("found unknown zygosity %s", zygosity)
                continue
            vslc_name = '/'.join((allele1_label, allele2_label))

            # Add the VSLC
            # the marker may be unknown (None); leave it out of the id
            vslc_id = '-'.join(
                part for part in
                (marker_accession_id, allele_accession_id, zygosity)
                if part is not None)
            vslc_id = re.sub(r':', '', vslc_id)
            vslc_id = '_:'+vslc_id
            model.addIndividualToGraph(
                vslc_id, vslc_name,
                self.globaltt['variant single locus complement'])
            geno.addPartsToVSLC(
                vslc_id, allele1_id, allele2_id, zygosity_id,
                self.globaltt['has_variant_part'], allele2_rel)

            # add vslc to genotype
            geno.addVSLCtoParent(vslc_id, genotype_id)

            # note that the vslc is also the gvc
            model.addType(
                vslc_id, self.globaltt['genomic_variation_complement'])

            # Add the genomic background
            # create the genomic background id and name
            if strain_accession_id != '':
                genomic_background_id = strain_accession_id
            else:
                genomic_background_id = None

            genotype_name = vslc_name
            if genomic_background_id is not None:
                geno.addGenotype(
                    genomic_background_id, strain_name,
                    self.globaltt['genomic_background'])

                # make a phenotyping-center-specific strain
                # to use as the background
                pheno_center_strain_label = \
                    strain_name + '-' + phenotyping_center \
                    + '-' + colony_raw
                pheno_center_strain_id = '-'.join((
                    re.sub(r':', '', genomic_background_id),
                    re.sub(r'\s', '_', phenotyping_center),
                    re.sub(r'\W+', '', colony_raw)))
                if not re.match(r'^_', pheno_center_strain_id):
                    # Tag bnode
                    pheno_center_strain_id = '_:' + pheno_center_strain_id

                geno.addGenotype(
                    pheno_center_strain_id, pheno_center_strain_label,
                    self.globaltt['genomic_background'])
                geno.addSequenceDerivesFrom(
                    pheno_center_strain_id, genomic_background_id)

                # Making genotype labels from the various parts,
                # can change later if desired.
                # since the genotype is reflective of the place
                # it got made, should put that in to disambiguate
                genotype_name = \
                    genotype_name + ' [' + pheno_center_strain_label + ']'
                geno.addGenomicBackgroundToGenotype(
                    pheno_center_strain_id, genotype_id)
                geno.addTaxon(taxon_id, pheno_center_strain_id)
            # this is redundant, but i'll keep it in for now
            geno.addSequenceDerivesFrom(genotype_id, colony_id)
            geno.addGenotype(genotype_id, genotype_name)

            # Make the sex-qualified genotype,
            # which is what the phenotype is associated with
            sex_qualified_genotype_id = \
                self.make_id((
                    colony_id + phenotyping_center + zygosity +
                    strain_accession_id + sex))
            sex_qualified_genotype_label = genotype_name + ' (' + sex + ')'

            sq_type_id = self.resolve(sex, False)
            if sq_type_id == sex:
                sq_type_id = self.globaltt['intrinsic_genotype']
                LOG.warning(
                    "Unknown sex qualifier %s, adding as intrinsic_genotype",
                    sex)

            geno.addGenotype(
                sex_qualified_genotype_id, sex_qualified_genotype_label,
                sq_type_id)
            geno.addParts(
                genotype_id, sex_qualified_genotype_id,
                self.globaltt['has_variant_part'])

            if genomic_background_id is not None \
                    and genomic_background_id != '':
                # Add the taxon to the genomic_background_id
                geno.addTaxon(taxon_id, genomic_background_id)
            else:
                # add it as the genomic background
                geno.addTaxon(taxon_id, genotype_id)

            # #############    BUILD THE G2P ASSOC    #############
            # from an old email dated July 23 2014:
            # Phenotypes associations are made to
            # imits colony_id+center+zygosity+gender

            # sometimes phenotype ids are missing.  (about 711 early 2020)
            if mp_term_id is None or mp_term_id == '':
                LOG.warning(
                    "No phenotype id specified for row %d", reader.line_num)
                continue
            # hard coded ECO code
            eco_id = self.globaltt['mutant phenotype evidence']

            # the association comes as a result of a g2p from
            # a procedure in a pipeline at a center and parameter tested
            assoc = G2PAssoc(
                graph, self.name, sex_qualified_genotype_id, mp_term_id)
            assoc.add_evidence(eco_id)
            # assoc.set_score(float(p_value))

            # TODO add evidence instance using
            # pipeline_stable_id +
            # procedure_stable_id +
            # parameter_stable_id

            assoc.add_association_to_graph()
            assoc_id = assoc.get_association_id()

            model._addSexSpecificity(assoc_id, self.resolve(sex))

            # add a free-text description; when either number cannot be
            # parsed, both are reported exactly as they were read
            try:
                effect_text = str(round(float(effect_size), 5))
                p_text = "{:.4e}".format(float(p_value))
            except ValueError:
                effect_text = str(effect_size)
                p_text = "{0}".format(p_value)
            description = ' '.join((
                mp_term_name, 'phenotype determined by',
                phenotyping_center, 'in an', procedure_name,
                'assay where', parameter_name.strip(),
                'was measured with an effect_size of',
                effect_text, '(p =', p_text, ').'))

            study_bnode = self._add_study_provenance(
                phenotyping_center, colony_raw, project_fullname,
                pipeline_name, pipeline_stable_id, procedure_stable_id,
                procedure_name, parameter_stable_id, parameter_name,
                statistical_method, resource_name)

            evidence_line_bnode = self._add_evidence(
                assoc_id, eco_id, p_value, percentage_change, effect_size,
                study_bnode)

            self._add_assertion_provenance(assoc_id, evidence_line_bnode)

            model.addDescription(evidence_line_bnode, description)

            # resource_id = resource_name
            # assoc.addSource(graph, assoc_id, resource_id)

            if not self.test_mode and limit is not None \
                    and reader.line_num > limit:
                break
def _process_diseasegene(self, limit):
    """
    Stream the disease-gene XML file and, for every ``Disorder`` element,
    add the disorder, its genes (with synonyms and equivalent external
    ids) and a variant-locus-to-disorder association per
    ``DisorderGeneAssociation``.

    Fixes relative to the previous revision:
      * ``limit`` is honoured: disorders are counted and, outside of test
        mode, processing stops once more than ``limit`` have been handled
        (the counter was never incremented and the guard only fired in
        test mode);
      * the equivalent id is decided per external reference, so an
        unhandled source no longer re-adds the previous reference's id;
      * disorders skipped in test mode are cleared as well.

    :param limit: outside of test mode, stop once more than this many
        disorders have been processed; ``None`` processes the whole file
    :return: None
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph

    line_counter = 0
    geno = Genotype(g)
    model = Model(g)

    # external sources we know how to turn into equivalent ids
    eq_prefix_by_source = {
        'Ensembl': 'ENSEMBL:',
        'HGNC': 'HGNC:',
        'OMIM': 'OMIM:',
    }

    # only needed (and only read) in test mode
    test_disease_ids = None
    if self.testMode:
        test_disease_ids = config.get_config()['test_ids']['disease']

    myfile = '/'.join((self.rawdir, self.files['disease-gene']['file']))

    # PYLINT complains iterparse deprecated,
    # but as of py 3.4 only the optional & unsupplied parse arg is.
    for _event, elem in ET.iterparse(myfile):
        if elem.tag != 'Disorder':
            continue

        # get the element name and id, ignores element name
        # id = elem.get('id') # some internal identifier
        disorder_num = elem.find('OrphaNumber').text
        disorder_id = 'Orphanet:'+str(disorder_num)

        if self.testMode and disorder_id not in test_disease_ids:
            elem.clear()  # nothing more will be read from it
            continue

        line_counter += 1
        disorder_label = elem.find('Name').text

        # make a hash of internal gene id to type for later lookup
        gene_iid_to_type = {}
        gene_list = elem.find('GeneList')
        for gene in gene_list.findall('Gene'):
            gene_iid = gene.get('id')
            gene_type = gene.find('GeneType').get('id')
            gene_iid_to_type[gene_iid] = gene_type

        # assuming that these are in the ontology
        model.addClassToGraph(disorder_id, disorder_label)

        assoc_list = elem.find('DisorderGeneAssociationList')
        for a in assoc_list.findall('DisorderGeneAssociation'):
            gene_iid = a.find('.//Gene').get('id')
            gene_name = a.find('.//Gene/Name').text
            gene_symbol = a.find('.//Gene/Symbol').text
            gene_num = a.find('./Gene/OrphaNumber').text
            gene_id = 'Orphanet:'+str(gene_num)
            gene_type_id = \
                self._map_gene_type_id(gene_iid_to_type[gene_iid])
            model.addClassToGraph(
                gene_id, gene_symbol, gene_type_id, gene_name)
            syn_list = a.find('./Gene/SynonymList')
            if int(syn_list.get('count')) > 0:
                for s in syn_list.findall('./Synonym'):
                    model.addSynonym(gene_id, s.text)

            dgtype = a.find('DisorderGeneAssociationType').get('id')
            rel_id = self._map_rel_id(dgtype)
            dg_label = \
                a.find('./DisorderGeneAssociationType/Name').text
            if rel_id is None:
                logger.warning(
                    "Cannot map association type (%s) to RO " +
                    "for association (%s | %s).  Skipping.",
                    dg_label, disorder_label, gene_symbol)
                continue

            alt_locus_id = '_:'+gene_num+'-'+disorder_num+'VL'
            alt_label = \
                ' '.join(('some variant of', gene_symbol.strip(),
                          'that is a', dg_label.lower(), disorder_label))

            model.addIndividualToGraph(alt_locus_id, alt_label,
                                       geno.genoparts['variant_locus'])
            geno.addAffectedLocus(alt_locus_id, gene_id)
            model.addBlankNodeAnnotation(alt_locus_id)

            # consider typing the gain/loss-of-function variants like:
            # http://sequenceontology.org/browser/current_svn/term/SO:0002054
            # http://sequenceontology.org/browser/current_svn/term/SO:0002053

            # use "assessed" status to issue an evidence code
            # FIXME I think that these codes are sub-optimal
            status_code = \
                a.find('DisorderGeneAssociationStatus').get('id')
            # default: imported automatically asserted information
            # used in automatic assertion
            eco_id = 'ECO:0000323'
            # Assessed
            # TODO are these internal ids stable between releases?
            if status_code == '17991':
                # imported manually asserted information
                # used in automatic assertion
                eco_id = 'ECO:0000322'
            # Non-traceable author statement ECO_0000034
            # imported information in automatic assertion ECO_0000313

            assoc = G2PAssoc(g, self.name, alt_locus_id,
                             disorder_id, rel_id)
            assoc.add_evidence(eco_id)
            assoc.add_association_to_graph()

            rlist = a.find('./Gene/ExternalReferenceList')
            for r in rlist.findall('ExternalReference'):
                prefix = eq_prefix_by_source.get(r.find('Source').text)
                if prefix is None:
                    continue  # skip the others for now
                eqid = prefix+r.find('Reference').text
                model.addClassToGraph(eqid, None)
                model.addEquivalentClass(gene_id, eqid)

        elem.clear()  # empty the element

        if not self.testMode and limit is not None \
                and line_counter > limit:
            break

    return
def _process_diseasegene(self, limit):
    """
    Parse the disease-gene XML file and add its contents to the graph.

    The file named by ``self.files["disease-gene"]["file"]`` under
    ``self.rawdir`` is streamed with ``ET.iterparse``.  For every
    ``Disorder`` element the disorder class is added, and for each
    ``DisorderGeneAssociation`` beneath it:

    * the gene class (typed via ``_map_gene_type_id``) and its synonyms,
    * a "variant locus" individual registered as an allele of the gene,
    * a ``G2PAssoc`` between that variant locus and the disorder,
      carrying an ECO evidence code chosen from the association status,
    * an equivalent-class axiom for each Ensembl/HGNC/OMIM reference.

    Associations whose type cannot be mapped by ``_map_rel_id`` are
    logged and skipped.  In test mode the test graph is written to and
    only disorders listed in the configured disease test ids are used.
    The ``G2PAssoc`` property sets are loaded into the graph afterwards.

    :param limit: in test mode, stop once more than this many disorders
        have been processed; ``None`` disables the limit
    :return: None
    """
    g = self.testgraph if self.testMode else self.graph
    geno = Genotype(g)
    gu = GraphUtils(curie_map.get())
    myfile = "/".join((self.rawdir, self.files["disease-gene"]["file"]))

    # Loop-invariant: look the test ids up once rather than per element.
    test_ids = None
    if self.testMode:
        test_ids = config.get_config()["test_ids"]["disease"]

    # CURIE prefix for each external reference source we keep;
    # any other source is skipped for now.
    xref_prefixes = {"Ensembl": "ENSEMBL:", "HGNC": "HGNC:", "OMIM": "OMIM:"}

    line_counter = 0

    for _, elem in ET.iterparse(myfile):
        if elem.tag != "Disorder":
            continue

        # the element's own 'id' attribute is some internal identifier
        # and is ignored; the OrphaNumber is used instead
        disorder_num = elem.find("OrphaNumber").text
        disorder_id = "Orphanet:" + str(disorder_num)

        if self.testMode and disorder_id not in test_ids:
            continue

        disorder_label = elem.find("Name").text

        # make a hash of internal gene id to type for later lookup
        gene_iid_to_type = {}
        gene_list = elem.find("GeneList")
        for gene in gene_list.findall("Gene"):
            gene_iid_to_type[gene.get("id")] = gene.find("GeneType").get("id")

        # assuming that these are in the ontology
        gu.addClassToGraph(g, disorder_id, disorder_label)

        assoc_list = elem.find("DisorderGeneAssociationList")
        for assoc_elem in assoc_list.findall("DisorderGeneAssociation"):
            gene_iid = assoc_elem.find(".//Gene").get("id")
            gene_name = assoc_elem.find(".//Gene/Name").text
            gene_symbol = assoc_elem.find(".//Gene/Symbol").text
            gene_num = assoc_elem.find("./Gene/OrphaNumber").text
            gene_id = "Orphanet:" + str(gene_num)
            gene_type_id = self._map_gene_type_id(gene_iid_to_type[gene_iid])
            gu.addClassToGraph(
                g, gene_id, gene_symbol, gene_type_id, gene_name)

            syn_list = assoc_elem.find("./Gene/SynonymList")
            if int(syn_list.get("count")) > 0:
                for synonym in syn_list.findall("./Synonym"):
                    gu.addSynonym(g, gene_id, synonym.text)

            dgtype = assoc_elem.find("DisorderGeneAssociationType").get("id")
            rel_id = self._map_rel_id(dgtype)
            dg_label = \
                assoc_elem.find("./DisorderGeneAssociationType/Name").text
            if rel_id is None:
                # Logger.warn is a deprecated alias of Logger.warning.
                logger.warning(
                    "Cannot map association type (%s) to RO "
                    "for association (%s | %s). Skipping.",
                    dg_label,
                    disorder_label,
                    gene_symbol,
                )
                continue

            alt_locus_id = "_" + gene_num + "-" + disorder_num + "VL"
            alt_label = " ".join(
                ("some variant of", gene_symbol.strip(), "that is a",
                 dg_label.lower(), disorder_label)
            )
            if self.nobnodes:
                alt_locus_id = ":" + alt_locus_id
            gu.addIndividualToGraph(
                g, alt_locus_id, alt_label, geno.genoparts["variant_locus"])
            geno.addAlleleOfGene(alt_locus_id, gene_id)

            # consider typing the gain/loss-of-function variants like:
            # http://sequenceontology.org/browser/current_svn/term/SO:0002054
            # http://sequenceontology.org/browser/current_svn/term/SO:0002053

            # use "assessed" status to issue an evidence code
            # FIXME I think that these codes are sub-optimal
            status_code = \
                assoc_elem.find("DisorderGeneAssociationStatus").get("id")
            # imported automatically asserted information
            # used in automatic assertion
            eco_id = "ECO:0000323"
            # 17991 == Assessed
            # TODO are these internal ids stable between releases?
            if status_code == "17991":
                # imported manually asserted information
                # used in automatic assertion
                eco_id = "ECO:0000322"
            # Non-traceable author statement ECO_0000034
            # imported information in automatic assertion ECO_0000313

            assoc = G2PAssoc(self.name, alt_locus_id, disorder_id, rel_id)
            assoc.add_evidence(eco_id)
            assoc.add_association_to_graph(g)

            rlist = assoc_elem.find("./Gene/ExternalReferenceList")
            for xref in rlist.findall("ExternalReference"):
                # The id is computed per reference so that an unrecognised
                # source never re-uses the previous reference's id.
                prefix = xref_prefixes.get(xref.find("Source").text)
                if prefix is None:
                    continue
                eqid = prefix + xref.find("Reference").text
                gu.addClassToGraph(g, eqid, None)
                gu.addEquivalentClass(g, gene_id, eqid)

        elem.clear()  # discard the element

        # Count processed disorders so the test-mode limit can take effect.
        # Break rather than return so the property declarations below are
        # still loaded into the graph.
        line_counter += 1
        if self.testMode and limit is not None and line_counter > limit:
            break

    gu.loadProperties(g, G2PAssoc.annotation_properties, G2PAssoc.ANNOTPROP)
    gu.loadProperties(g, G2PAssoc.datatype_properties, G2PAssoc.DATAPROP)
    gu.loadProperties(g, G2PAssoc.object_properties, G2PAssoc.OBJECTPROP)
    gu.loadAllProperties(g)