def _add_snp_to_graph(
        self, snp_id, snp_label, chrom_num, chrom_pos, context,
        risk_allele_frequency=None):
    """
    Add a SNP feature to the graph, with its location, taxon and any
    types that can be mapped from its context.

    :param snp_id: identifier of the SNP
    :param snp_label: label of the SNP; surrounding whitespace is stripped
    :param chrom_num: chromosome number, '' when there is none
    :param chrom_pos: chromosome position, '' when there is none
    :param context: semicolon-delimited list of context terms
    :param risk_allele_frequency: optional frequency; None, '' and 'NR'
        produce no description
    :return: None
    """
    # constants
    tax_id = 'NCBITaxon:9606'
    genome_version = 'GRCh38'

    g = self.testgraph if self.testMode else self.graph
    model = Model(g)

    has_position = chrom_num != '' and chrom_pos != ''

    location = None
    if has_position:
        location = self._make_location_curie(chrom_num, chrom_pos)
        self.id_location_map.setdefault(location, set())

    alteration = re.search(r'-(.*)$', snp_id)
    if alteration is not None \
            and re.match(r'[ATGC]', alteration.group(1)):
        # add variation to snp
        pass  # TODO

    if location is not None:
        self.id_location_map[location].add(snp_id)

    # create the chromosome
    chrom_id = makeChromID(chrom_num, genome_version, 'CHR')

    # add the feature to the graph
    snp_description = None
    if risk_allele_frequency is not None \
            and risk_allele_frequency not in ('', 'NR'):
        snp_description = ' '.join(
            (str(risk_allele_frequency), '[risk allele frequency]'))

    f = Feature(
        g, snp_id, snp_label.strip(), Feature.types['SNP'],
        snp_description)
    if has_position:
        f.addFeatureStartLocation(chrom_pos, chrom_id)
        f.addFeatureEndLocation(chrom_pos, chrom_id)
    f.addFeatureToGraph()
    f.addTaxonToFeature(tax_id)

    # TODO consider adding allele frequency as property;
    # but would need background info to do that

    # also want to add other descriptive info about
    # the variant from the context
    for c in re.split(r';', context):
        cid = self._map_variant_type(c.strip())
        if cid is None:
            continue
        model.addType(snp_id, cid)

    return
def _get_phenotypicseries_parents(entry, g):
    """
    Extract the phenotypic series parent relationship out of the entry.

    When the entry is flagged with 'phenotypicSeriesExists', series
    numbers are collected from the entry's own 'phenotypeMapList' and
    from the one nested under 'geneMap'. Each series is added as a class
    ('OMIM:' + number) and the entry is made a subclass of it.

    :param entry: mapping for a single entry; must contain 'mimNumber'
    :param g: graph handed to Model
    :return: None
    """
    model = Model(g)
    omimid = 'OMIM:' + str(entry['mimNumber'])

    # the phenotypic series mappings
    serieslist = []
    if 'phenotypicSeriesExists' in entry \
            and entry['phenotypicSeriesExists'] is True:
        if 'phenotypeMapList' in entry:
            for p in entry['phenotypeMapList']:
                # Guarded like the geneMap branch below: a phenotype map
                # without a series number used to raise KeyError here.
                if 'phenotypicSeriesNumber' in p['phenotypeMap']:
                    serieslist.append(
                        p['phenotypeMap']['phenotypicSeriesNumber'])
        if 'geneMap' in entry \
                and 'phenotypeMapList' in entry['geneMap']:
            for p in entry['geneMap']['phenotypeMapList']:
                if 'phenotypicSeriesNumber' in p['phenotypeMap']:
                    serieslist.append(
                        p['phenotypeMap']['phenotypicSeriesNumber'])

    # add this entry as a subclass of the series entry
    for ser in serieslist:
        series_id = 'OMIM:' + ser
        model.addClassToGraph(series_id, None)
        model.addSubClass(omimid, series_id)

    return
def _add_assertion_provenance(
        self, assoc_id, evidence_line_bnode, impc_map):
    """
    Add assertion level provenance, currently always IMPC.

    Creates an assertion node, records the asserting agent on it, and
    links it to both the association and the evidence line.

    :param assoc_id: identifier of the association being asserted
    :param evidence_line_bnode: node of the supporting evidence line
    :param impc_map: mapping read as impc_map['asserted_by']['IMPC']
    :return: None
    """
    graph = self.graph
    provenance_model = Provenance(graph)
    model = Model(graph)

    impc_agent = impc_map['asserted_by']['IMPC']
    assertion_key = "assertion{0}{1}".format(assoc_id, impc_agent)
    assertion_bnode = self.make_id(assertion_key, '_')

    model.addIndividualToGraph(
        assertion_bnode, None,
        provenance_model.provenance_types['assertion'])

    provenance_model.add_assertion(
        assertion_bnode, impc_agent,
        'International Mouse Phenotyping Consortium')

    # association -> assertion
    graph.addTriple(
        assoc_id,
        provenance_model.object_properties['is_asserted_in'],
        assertion_bnode)

    # assertion -> evidence line
    graph.addTriple(
        assertion_bnode,
        provenance_model.object_properties['is_assertion_supported_by'],
        evidence_line_bnode)

    return
def _parse_patient_phenotypes(self, file, limit=None):
    """
    Read tab-delimited (patient, phenotype curie, present) rows and add
    each patient with its phenotype triples to the graph.

    :param file: file handler
    :param limit: limit rows processed
    :return: None
    """
    model = Model(self.graph)
    rows_seen = 0

    for patient_id, hpo_curie, present in csv.reader(file, delimiter="\t"):
        if patient_id == 'Patient':  # skip header
            rows_seen += 1
            continue

        patient_curie = ':{0}'.format(patient_id)
        model.addPerson(patient_curie, patient_id)

        # every patient gets the generic 'disease' phenotype
        self.graph.addTriple(
            patient_curie, self.globaltt['has phenotype'],
            self.globaltt['disease'])

        # the specific phenotype is only added when marked present
        if present == 'yes':
            self.graph.addTriple(
                patient_curie, self.globaltt['has phenotype'], hpo_curie)

        rows_seen += 1
        if not self.test_mode and limit is not None \
                and rows_seen >= limit:
            break
def parse(self, limit=None):
    """
    Read the cleaned gene-to-phenotype file and add one G2P association
    per row whose entity/quality terms map to a ZP identifier.

    The ZP mapping is loaded through a ZFIN parser instance. Every gene
    is marked as a leader; rows that do not map to a ZP id produce no
    association.

    :param limit: optional maximum number of input lines to read;
        None (the default) processes the whole file
    :return: None
    """
    zfin_parser = ZFIN(self.graph_type, self.are_bnodes_skized)
    model = Model(self.graph)

    zp_file = '/'.join((self.rawdir, self.files['zpmap']['file']))
    g2p_file = '/'.join((self.rawdir, self.files['g2p_clean']['file']))
    zfin_parser.zp_map = zfin_parser._load_zp_mappings(zp_file)

    with open(g2p_file, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            # the second pc_rel2 column was previously unpacked into
            # pc_rel2_id again, silently clobbering the id
            (internal_id, symbol, gene_id,
             subterm1_id, subterm1_label,
             pc_rel_id, pc_rel_label,
             superterm1_id, superterm1_label,
             quality_id, quality_name, modifier,
             subterm2_id, subterm2_label,
             pc_rel2_id, pc_rel2_label,
             superterm2_id, superterm2_label,
             fish_id, fish_label,
             start_stage, end_stage, environment,
             pub_id, figure_id, unknown_field) = row

            zp_id = zfin_parser._map_sextuple_to_phenotype(
                superterm1_id, subterm1_id, quality_id,
                superterm2_id, subterm2_id, modifier)

            gene_curie = "ZFIN:{0}".format(gene_id)
            model.makeLeader(gene_curie)
            pub_curie = "ZFIN:{0}".format(pub_id)

            if zp_id:
                assoc = G2PAssoc(self.graph, self.name, gene_curie, zp_id)
                if pub_id:
                    reference = Reference(
                        self.graph, pub_curie,
                        Reference.ref_types['document'])
                    reference.addRefToGraph()
                    assoc.add_source(pub_curie)

                assoc.add_evidence('ECO:0000059')
                assoc.add_association_to_graph()

            # limit used to be accepted but ignored
            if limit is not None and filereader.line_num > limit:
                break
def _add_assertion_provenance(self, assoc_id, evidence_line_bnode):
    """
    Add assertion level provenance, currently always IMPC.

    Creates an assertion node, records the asserting agent on it, and
    links it to both the association and the evidence line.

    :param assoc_id: identifier of the association being asserted
    :param evidence_line_bnode: node of the supporting evidence line
    :return: None
    """
    graph = self.graph
    provenance_model = Provenance(graph)
    model = Model(graph)

    impc_agent = self.localtt['IMPC']
    assertion_key = "assertion{0}{1}".format(assoc_id, impc_agent)
    assertion_bnode = self.make_id(assertion_key, '_')

    model.addIndividualToGraph(
        assertion_bnode, None, self.globaltt['assertion'])

    provenance_model.add_assertion(
        assertion_bnode, impc_agent,
        'International Mouse Phenotyping Consortium')

    # association -> assertion
    graph.addTriple(
        assoc_id, self.globaltt['proposition_asserted_in'],
        assertion_bnode)

    # assertion -> evidence line
    graph.addTriple(
        assertion_bnode,
        self.resolve('is_assertion_supported_by_evidence'),  # "SEPIO:0000111"
        evidence_line_bnode)

    return
def _process_collection(self, collection_id, label, page):
    """
    Add the repository described by the arguments to both the main
    graph and the test graph.

    For each graph the repository ('CoriellCollection:' + collection_id)
    is added as an individual of the 'collection' type with the given
    label, and the page is attached to it through Reference.addPage.

    :param collection_id: identifier suffix of the collection
    :param label: label of the repository
    :param page: page of the repository
    :return: None
    """
    # ############# BUILD THE CELL LINE REPOSITORY #############
    for target in (self.graph, self.testgraph):
        # TODO: How to devise a label for each repository?
        model = Model(target)
        reference = Reference(target)

        repo_id = 'CoriellCollection:' + collection_id

        model.addIndividualToGraph(
            repo_id, label, self.globaltt['collection'])
        reference.addPage(repo_id, page)

    return
def _map_eom_terms(self, raw, limit=None):
    """
    Read the local tsv of morphology term to HP id mappings.

    Triples:
        <eom id> owl:equivalentClass <hp id>

    Rows whose HP column does not contain 'HP:' (after underscores are
    replaced by colons) are only logged.

    :param raw: path of the tab-separated mapping file
    :param limit: optional row count after which reading stops
    :return: None
    """
    model = Model(self.graph)

    with open(raw, 'r') as f1:
        f1.readline()  # read the header row; skip
        for line_counter, line in enumerate(f1, start=1):
            (morphology_term_id, morphology_term_label,
             hp_id, hp_label, notes) = line.split('\t')

            # Sub out the underscores for colons.
            hp_id = re.sub('_', ':', hp_id)

            if not re.match(".*HP:.*", hp_id):
                LOG.warning('No matching HP term for %s', morphology_term_label)
            else:
                # add the HP term as a class
                model.addClassToGraph(hp_id, None)
                # Add the HP ID as an equivalent class
                model.addEquivalentClass(morphology_term_id, hp_id)

            if limit is not None and line_counter > limit:
                break

    return
def process_disease_association(self, limit):
    """
    Read the disease association file and add, per data row, an
    anonymous variant of the gene, an experimental model carrying it,
    and a 'model_of' association between that model and the disease.

    Lines starting with '!' are treated as header and skipped, as are
    rows flagged 'NOT'. In test mode only genes listed in
    self.test_ids['gene'] are processed.

    :param limit: maximum number of data rows to process outside of test
        mode; None processes the whole file
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['disease_assoc']['file']))

    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    model = Model(g)
    logger.info("Processing disease models")
    geno = Genotype(g)
    line_counter = 0
    worm_taxon = 'NCBITaxon:6239'

    with open(raw, 'r') as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            if re.match(r'!', ''.join(row)):  # header
                continue
            line_counter += 1

            (db, gene_num, gene_symbol, is_not, disease_id, ref,
             eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
             gene_class, taxon, date, assigned_by, blank, blank2) = row

            if self.testMode and gene_num not in self.test_ids['gene']:
                continue

            # TODO add NOT phenotypes
            if is_not == 'NOT':
                continue

            # WB WBGene00000001 aap-1 DOID:2583 PMID:19029536 IEA
            # ENSEMBL:ENSG00000145675|OMIM:615214 D Y110A7A.10 gene
            # taxon:6239 20150612 WB
            gene_id = 'WormBase:' + gene_num

            # make a variant of the gene
            vl = '_:' + '-'.join((gene_num, 'unspecified'))
            vl_label = 'some variant of ' + gene_symbol
            geno.addAffectedLocus(vl, gene_id)
            model.addBlankNodeAnnotation(vl)
            animal_id = geno.make_experimental_model_with_genotype(
                vl, vl_label, worm_taxon, 'worm')

            assoc = G2PAssoc(
                g, self.name, animal_id,
                disease_id, model.object_properties['model_of'])
            ref = re.sub(r'WB_REF:', 'WormBase:', ref)
            if ref != '':
                assoc.add_source(ref)

            eco_id = None
            if eco_symbol == 'IEA':
                eco_id = 'ECO:0000501'  # IEA is this now
            if eco_id is not None:
                assoc.add_evidence(eco_id)

            assoc.add_association_to_graph()

            # limit was counted but never enforced; same check as the
            # sibling process_* methods
            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

    return
def _get_mappedids(self, entry, g):
    """
    Add the Orphanet and UMLS ids found in the entry's external links
    as classes, each cross-referenced from the OMIM id.

    :param entry: mapping for a single entry; must contain 'mimNumber'
    :param g: graph handed to Model
    :return: None
    """
    model = Model(g)
    omimid = 'OMIM:' + str(entry['mimNumber'])

    if 'externalLinks' not in entry:
        return
    links = entry['externalLinks']

    if 'orphanetDiseases' in links:
        # triple semi-colon delimited list of
        # double semi-colon delimited orphanet ID/disease pairs
        # 2970;;566;;Prune belly syndrome
        for item in links['orphanetDiseases'].split(';;;'):
            # note: the middle (internal) number is unused
            (orpha_num, internal_num, orpha_label) = item.split(';;')
            orpha_id = 'Orphanet:' + orpha_num.strip()
            model.addClassToGraph(orpha_id, orpha_label.strip())
            model.addXref(omimid, orpha_id)

    if 'umlsIDs' in links:
        for umls_num in links['umlsIDs'].split(','):
            umls_id = 'UMLS:' + umls_num
            model.addClassToGraph(umls_id, None)
            model.addXref(omimid, umls_id)

    return
def process_gene_desc(self, limit):
    """
    Read the gzipped gene description file and attach definitions and
    descriptions to each gene.

    The concise description becomes the definition unless it is
    'none available'. Each other description is added, tagged with its
    kind in square brackets, unless it is empty, starts with 'none', or
    repeats the concise description.

    :param limit: row count after which reading stops (ignored in
        test mode); None reads everything
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['gene_desc']['file']))

    g = self.testgraph if self.testMode else self.graph
    model = Model(g)
    logger.info("Processing Gene descriptions")
    line_counter = 0
    # geno = Genotype(g)  # TODO unused

    with gzip.open(raw, 'rb') as csvfile:
        filereader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""),
            delimiter='\t', quotechar='\"')
        for row in filereader:
            if re.match(r'\#', ''.join(row)):
                continue
            line_counter += 1
            if line_counter == 1:
                continue

            (gene_num, public_name, molecular_name, concise_description,
             provisional_description, detailed_description,
             automated_description, gene_class_description) = row

            if self.testMode and gene_num not in self.test_ids['gene']:
                continue

            gene_id = 'WormBase:' + gene_num

            if concise_description != 'none available':
                model.addDefinition(gene_id, concise_description)

            # skip a description if it's identical to the concise one
            tagged = (
                ('provisional', provisional_description),
                ('automated', automated_description),
                ('detailed', detailed_description),
                ('gene class', gene_class_description),
            )
            for kind, text in tagged:
                if text != concise_description \
                        and not re.match(r'none', text) and text != '':
                    model.addDescription(
                        gene_id, ' '.join((text, '[' + kind + ']')))

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

    return
def _add_snp_to_graph(
        self, snp_id, snp_label, chrom_num, chrom_pos, context,
        risk_allele_frequency=None):
    """
    Add a SNP feature to the graph, with its location, taxon and any
    types that can be resolved from its context.

    :param snp_id: identifier of the SNP
    :param snp_label: label of the SNP; surrounding whitespace is stripped
    :param chrom_num: chromosome number, '' when there is none
    :param chrom_pos: chromosome position, '' when there is none
    :param context: semicolon-delimited list of context terms
    :param risk_allele_frequency: optional frequency; None, '' and 'NR'
        produce no description
    :return: None
    """
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)

    if chrom_num != '' and chrom_pos != '':
        location = self._make_location_curie(chrom_num, chrom_pos)
        if location not in self.id_location_map:
            self.id_location_map[location] = set()
    else:
        location = None

    alteration = re.search(r'-(.*)$', snp_id)
    if alteration is not None and re.match(r'[ATGC]', alteration.group(1)):
        # add variation to snp
        pass  # TODO

    if location is not None:
        self.id_location_map[location].add(snp_id)

    # create the chromosome
    chrom_id = makeChromID(
        chrom_num, self.localtt['reference assembly'], 'CHR')

    # add the feature to the graph
    snp_description = None
    if risk_allele_frequency is not None \
            and risk_allele_frequency != '' \
            and risk_allele_frequency != 'NR':
        snp_description = \
            str(risk_allele_frequency) + ' [risk allele frequency]'

    feat = Feature(
        graph, snp_id, snp_label.strip(), self.globaltt['SNP'],
        snp_description)
    if chrom_num != '' and chrom_pos != '':
        feat.addFeatureStartLocation(chrom_pos, chrom_id)
        feat.addFeatureEndLocation(chrom_pos, chrom_id)
    feat.addFeatureToGraph()
    # the lookup key had been garbled to 'H**o sapiens'
    feat.addTaxonToFeature(self.globaltt['Homo sapiens'])

    # TODO consider adding allele frequency as property;
    # but would need background info to do that

    # also want to add other descriptive info about
    # the variant from the context
    for ctx in re.split(r';', context):
        ctx = ctx.strip()
        cid = self.resolve(ctx, False)
        # a term is only used as a type when resolve() changed it
        if cid != ctx:
            model.addType(snp_id, cid)

    return
def process_pub_xrefs(self, limit=None):
    """
    Read the publication xref file and, for each pmid or doi xref, add
    the reference and mark it as the same individual as the WormBase
    paper.

    Xrefs containing brackets or whitespace, 'cgc' xrefs and any other
    kind are skipped.

    :param limit: row count after which reading stops (ignored in
        test mode); None reads everything
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['pub_xrefs']['file']))

    g = self.testgraph if self.testMode else self.graph
    model = Model(g)
    logger.info("Processing publication xrefs")
    line_counter = 0

    with open(raw, 'r') as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            (wb_ref, xref) = row
            # WBPaper00000009 pmid8805<BR>
            # WBPaper00000011 doi10.1139/z78-244<BR>
            # WBPaper00000012 cgc12<BR>
            if self.testMode and wb_ref not in self.test_ids['pub']:
                continue

            ref_id = 'WormBase:' + wb_ref
            xref = re.sub(r'<BR>', '', xref).strip()

            if re.match(r'pmid', xref):
                xref_id = 'PMID:' + re.sub(r'pmid\s*', '', xref)
                reference = Reference(
                    g, xref_id, Reference.ref_types['journal_article'])
            elif re.search(r'[\(\)\<\>\[\]\s]', xref):
                continue
            elif re.match(r'doi', xref):
                xref_id = 'DOI:' + re.sub(r'doi', '', xref.strip())
                reference = Reference(g, xref_id)
            else:
                # TODO not sure what to do here with cgc xrefs;
                # those and every other kind of xref are skipped
                continue

            # only pmid and doi xrefs reach this point
            reference.addRefToGraph()
            model.addSameIndividual(ref_id, xref_id)

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

    return
class Environment():
    """
    Convenience methods for adding an experimental environment, and
    its parts, to a supplied graph.

    This is a stub.
    """

    def __init__(self, graph):
        """
        :param graph: the Graph to write to
        :raises ValueError: when graph is not a Graph
        """
        if not isinstance(graph, Graph):
            raise ValueError("{} is not a graph".format(graph))
        self.graph = graph
        self.model = Model(self.graph)
        # shortcuts to the tables carried by the graph
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map
        return

    def addEnvironment(
            self, env_id, env_label, env_type=None, env_description=None):
        """
        Add an environment individual; the type falls back to
        globaltt['environmental_system'] when none is given.
        """
        resolved_type = \
            self.globaltt['environmental_system'] \
            if env_type is None else env_type
        self.model.addIndividualToGraph(
            env_id, env_label, resolved_type, env_description)
        return

    def addEnvironmentalCondition(
            self, cond_id, cond_label, cond_type=None,
            cond_description=None):
        """
        Add a condition individual; the type falls back to
        globaltt['environmental_condition'] when none is given.
        """
        resolved_type = \
            self.globaltt['environmental_condition'] \
            if cond_type is None else cond_type
        self.model.addIndividualToGraph(
            cond_id, cond_label, resolved_type, cond_description)
        return

    def addComponentToEnvironment(self, env_id, component_id):
        """Link the component to the environment with 'has_part'."""
        has_part = self.globaltt['has_part']
        self.graph.addTriple(env_id, has_part, component_id)
        return

    def addComponentAttributes(self, component_id, entity_id,
                               value=None, unit=None):
        """
        Link the entity to the component with 'has_part'.
        value and unit are accepted but not used yet.
        """
        has_part = self.globaltt['has_part']
        self.graph.addTriple(component_id, has_part, entity_id)
        # TODO add value and units
        return
def _add_variant_trait_association(self, variant_id, mapped_trait_uri,
                                   efo_ontology, pubmed_id,
                                   description=None):
    """
    Associate a variant with each trait listed in mapped_trait_uri.

    For every trait an association ('contributes_to') sourced from the
    publication is added. When the ontology query returns a label for
    the trait and the trait curie starts with 'EFO', the trait is also
    added as a class with that label.

    :param variant_id: identifier of the variant
    :param mapped_trait_uri: comma-delimited trait URIs; blank adds nothing
    :param efo_ontology: object whose query() is run with the SPARQL text
    :param pubmed_id: publication number, appended to 'PMID:'
    :param description: optional description set on each association
    :return: None
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    model = Model(g)

    # make associations to the EFO terms; there can be >1
    if mapped_trait_uri.strip() == '':
        return

    for trait in re.split(r',', mapped_trait_uri):
        trait = trait.strip()

        trait_curie = trait.replace(
            "http://www.ebi.ac.uk/efo/EFO_", "EFO:")

        phenotype_query = (
            " SELECT ?trait WHERE {{ "
            "<{0}> rdfs:subClassOf+ "
            "<http://www.ebi.ac.uk/efo/EFO_0000651> . "
            "<{0}> rdfs:label ?trait . }} "
        ).format(trait)

        # Materialise once: the result used to be turned into a list
        # twice, which breaks if it can only be iterated a single time.
        result_rows = list(efo_ontology.query(phenotype_query))
        if result_rows and re.match(r'^EFO', trait_curie):
            model.addClassToGraph(
                trait_curie, result_rows[0][0], 'UPHENO:0001001')

        pubmed_curie = 'PMID:' + pubmed_id

        ref = Reference(
            g, pubmed_curie, Reference.ref_types['journal_article'])
        ref.addRefToGraph()

        assoc = G2PAssoc(
            g, self.name, variant_id, trait_curie,
            model.object_properties['contributes_to'])
        assoc.add_source(pubmed_curie)
        # combinatorial evidence
        # used in automatic assertion
        eco_id = 'ECO:0000213'
        assoc.add_evidence(eco_id)

        if description is not None:
            assoc.set_description(description)

        # FIXME score should get added to provenance/study
        # assoc.set_score(pvalue)

        # trait_curie is always a str here (str.replace result), so the
        # former "is not None" guard was dead and has been dropped
        assoc.add_association_to_graph()
def _make_pheno_assoc(self, g, gene_id, gene_symbol, disorder_num,
                      disorder_label, phene_key):
    """
    Add an association between a gene (or an anonymous variant locus
    of it) and a disorder.

    The relation defaults to 'has_phenotype' and is changed according to
    the first character of the disorder label ('[', '{' or '?').

    :param g: graph to write to
    :param gene_id: curie of the gene
    :param gene_symbol: symbol used to label the variant locus
    :param disorder_num: number appended to 'OMIM:' for the disorder
    :param disorder_label: label of the disorder
    :param phene_key: code passed to _map_phene_mapping_code_to_eco
    :return: None
    """
    geno = Genotype(g)
    model = Model(g)

    disorder_id = ':'.join(('OMIM', disorder_num))

    # default relation, overridden by the label's leading character
    rel_id = model.object_properties['has_phenotype']
    rel_label = 'causes'
    overrides = (
        (r'\[', 'is_marker_for', 'is a marker for'),
        (r'\{', 'contributes_to', 'contributes to'),
        # this is a questionable mapping!  skip?
        (r'\?', 'contributes_to', 'contributes to'),
    )
    for pattern, prop_key, label in overrides:
        if re.match(pattern, disorder_label):
            rel_id = model.object_properties[prop_key]
            rel_label = label
            break

    evidence = self._map_phene_mapping_code_to_eco(phene_key)

    # we actually want the association between the gene and the disease
    # to be via an alternate locus not the "wildtype" gene itself.
    # so we make an anonymous alternate locus,
    # and put that in the association.
    # but we only need to do that in the cases when it's not an NCBIGene
    # (as that is a sequence feature itself)
    if not re.match(r'OMIM:', gene_id):
        # assume it's already been added
        alt_locus = gene_id
    else:
        alt_locus = \
            '_:' + re.sub(r':', '', gene_id) + '-' + disorder_num + 'VL'
        symbol = gene_symbol.strip()
        alt_label = None
        if symbol:
            alt_label = ' '.join(
                ('some variant of', symbol, 'that', rel_label,
                 disorder_label))

        model.addIndividualToGraph(
            alt_locus, alt_label, Genotype.genoparts['variant_locus'])
        geno.addAffectedLocus(alt_locus, gene_id)
        model.addBlankNodeAnnotation(alt_locus)

    assoc = G2PAssoc(g, self.name, alt_locus, disorder_id, rel_id)
    assoc.add_evidence(evidence)
    assoc.add_association_to_graph()

    return
def _add_gene_to_graph(self, gene, variant_bnode, gene_id, relation):
    """
    Link a variant to a gene.

    When gene_id is set it is used directly; otherwise, when a gene name
    is given, a node is made from the name and used instead. With
    neither, nothing is added.

    :param gene: gene name, used when there is no gene_id
    :param variant_bnode: node of the variant (subject of the triple)
    :param gene_id: identifier of the gene, may be empty
    :param relation: predicate linking variant and gene
    :return: None
    """
    model = Model(self.graph)

    if gene_id:
        self.graph.addTriple(variant_bnode, relation, gene_id)
        return

    if not gene:
        return

    LOG.info("gene %s not mapped to NCBI gene, making blank node", gene)
    gene_bnode = self.make_id("{0}".format(gene), "_")
    model.addIndividualToGraph(gene_bnode, gene)
    self.graph.addTriple(variant_bnode, relation, gene_bnode)
def __init__(self, graph, ref_id=None, ref_type=None):
    """
    Set up a reference bound to a graph.

    :param graph: the Graph to write to
    :param ref_id: optional identifier of the reference; when it starts
        with 'http' it is also kept as the reference url
    :param ref_type: optional type of the reference; defaults to
        globaltt['document']. A warning is logged when it does not start
        with 'IAO:' or 'SIO:'
    :raises ValueError: when graph is not a Graph
    """
    if isinstance(graph, Graph):
        self.graph = graph
    else:
        # ValueError does not apply %-formatting to its arguments, so the
        # old ("%s is not a graph", graph) form never built the message
        raise ValueError("{} is not a graph".format(graph))

    # assert ref_id is not None

    self.ref_id = ref_id
    self.ref_url = None
    self.title = None
    self.year = None
    self.author_list = None
    self.short_citation = None

    self.model = Model(self.graph)
    self.globaltt = self.graph.globaltt
    self.globaltcid = self.graph.globaltcid
    self.curie_map = self.graph.curie_map

    if ref_type is None:
        self.ref_type = self.globaltt['document']
    else:
        self.ref_type = ref_type
        if ref_type[:4] not in ('IAO:', 'SIO:'):
            LOG.warning("Got Pub ref type of: %s", ref_type)

    if ref_id is not None and ref_id[:4] == 'http':
        self.ref_url = ref_id

    return
def __init__(self, graph_type, are_bnodes_skolemized):
    """
    Configure the 'decipher' ingest and its helper objects.

    :param graph_type: passed through to the parent initialiser
    :param are_bnodes_skolemized: passed through to the parent initialiser
    """
    super().__init__(
        graph_type,
        are_bnodes_skolemized,
        'decipher',
        ingest_title='Development Disorder Genotype Phenotype Database',
        ingest_url='https://decipher.sanger.ac.uk/',
        license_url='https://decipher.sanger.ac.uk/legal',
        data_rights='https://decipher.sanger.ac.uk/datasharing',
        # file_handle=None
    )

    # only the disease test ids are used by this ingest
    if 'disease' in self.all_test_ids:
        self.test_ids = self.all_test_ids['disease']
    else:
        LOG.warning("not configured with disease test ids.")
        self.test_ids = []

    self.graph = self.graph  # self-assignment kept as in the original
    self.geno = Genotype(self.graph)
    self.model = Model(self.graph)
    self.graph_type = graph_type
    self.are_bnodes_skolemized = are_bnodes_skolemized

    return
def __init__(self, graph, definedby, sub=None, obj=None, pred=None):
    """
    Set up an association bound to a graph.

    :param graph: the Graph to write to
    :param definedby: stored as the association's definedby
    :param sub: optional subject of the association
    :param obj: optional object of the association
    :param pred: optional predicate, stored as rel
    :raises ValueError: when graph is not a Graph
    """
    if isinstance(graph, Graph):
        self.graph = graph
    else:
        # was "...".graph, which raised AttributeError on the str
        # instead of the intended ValueError
        raise ValueError("{} is not a graph".format(graph))
    self.model = Model(self.graph)

    # core parts of the association
    self.definedby = definedby
    self.sub = sub
    self.obj = obj
    self.rel = pred
    self.assoc_id = None

    self.description = None
    self.source = []
    self.evidence = []
    self.date = []

    # this is going to be used for the refactored evidence/provenance
    self.provenance = []
    self.score = None
    self.score_type = None
    self.score_unit = None

    return
def __init__(self, graph):
    """
    Bind this object and its Model to a graph.

    :param graph: the Graph to write to
    :raises ValueError: when graph is not a Graph
    """
    if isinstance(graph, Graph):
        self.graph = graph
    else:
        # was "...".graph, which raised AttributeError on the str
        # instead of the intended ValueError
        raise ValueError("{} is not a graph".format(graph))
    self.model = Model(self.graph)

    return
def _get_titles(self, limit):
    """
    Read the titles file and add one class per book.

    The file is tab-delimited with a header row and exactly four
    columns, unpacked as (shortname, title, NBK number, PMID). For each
    row the NBK number is recorded in self.book_ids and, while under the
    limit, 'GeneReviews:<NBK number>' is added as a class labelled with
    the title and given the short name as a synonym.

    NOTE(review): the previous docstring described a three-column
    NBK/shortname/OMIM mapping file, which does not match the columns
    checked here - confirm which file this is meant to document.

    The process exits with -1 when the header or any row does not have
    the expected number of columns.

    :param limit: rows are only added to the graph while the line
        counter is below this; None adds every row
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['titles']['file']))

    model = Model(self.graph)
    line_counter = 0
    with open(raw, 'r', encoding='latin-1') as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        header = next(filereader)
        line_counter = 1
        colcount = len(header)
        if colcount != 4:  # ('GR_shortname', 'GR_Title', 'NBK_id', 'PMID')
            # the messages lacked a %s placeholder, so logging could not
            # format them with the extra argument
            logger.error("Unexpected Header %s", header)
            exit(-1)
        for row in filereader:
            line_counter += 1
            if len(row) != colcount:
                logger.error("Unexpected row. got: %s", row)
                logger.error("Expected data for: %s", header)
                exit(-1)
            (shortname, title, nbk_num, pmid) = row
            gr_id = 'GeneReviews:' + nbk_num

            self.book_ids.add(nbk_num)  # a global set of the book nums

            if limit is None or line_counter < limit:
                model.addClassToGraph(gr_id, title)
                model.addSynonym(gr_id, shortname)
            # TODO include the new PMID?

    return
def _get_mapped_gene_ids(self, entry, g):
    """
    Collect the gene ids listed in the entry's external links.

    The ids are remembered in self.omim_ncbigene_idmap under the OMIM
    id and, when the entry's type is the 'gene' type, each is added as
    an equivalent class ('NCBIGene:' + id) of the OMIM id.

    :param entry: mapping for a single entry; must contain 'mimNumber'
    :param g: graph handed to Model
    :return: list of gene id strings, empty when none are listed
    """
    model = Model(g)
    omimid = 'OMIM:' + str(entry['mimNumber'])

    if 'externalLinks' not in entry:
        return []

    links = entry['externalLinks']
    omimtype = self._get_omimtype(entry)
    if 'geneIDs' not in links:
        return []

    gene_ids = links['geneIDs'].split(',')
    self.omim_ncbigene_idmap[omimid] = gene_ids

    if omimtype == Genotype.genoparts['gene']:
        for ncbi_num in gene_ids:
            model.addEquivalentClass(omimid, 'NCBIGene:' + str(ncbi_num))

    return gene_ids
def _process_interactions(self, row):
    """
    Process row of CTD data from CTD_chemicals_diseases.tsv.gz
    and generate triples. Only create associations based on direct
    evidence (not using the inferred-via-gene), and unambiguous
    relationships. (Ambiguous ones will be processed in the sister
    method using the disambiguated file). There are no OMIM ids for
    diseases in these cases, so we associate with only the mesh
    disease ids.

    Args:
        :param row (list): row of CTD data, 10 columns
    Returns:
        :return None
    """
    model = Model(self.graph)
    self._check_list_len(row, 10)
    (chem_name, chem_id, cas_rn, disease_name, disease_id,
     direct_evidence, inferred_gene_symbol, inference_score, omim_ids,
     pubmed_ids) = row

    if direct_evidence == '':
        return

    # The alternation must be grouped: without the group the anchors
    # bound to one alternative each ('^therapeutic' OR
    # 'marker/mechanism$'), so any value merely starting with
    # 'therapeutic' was accepted as unambiguous evidence.
    evidence_pattern = re.compile(r'^(?:therapeutic|marker\/mechanism)$')
    # dual_evidence = re.compile(r'^marker\/mechanism\|therapeutic$')

    # filter on those diseases that are mapped to omim ids in the test set
    intersect = list(
        set(['OMIM:' + str(i) for i in omim_ids.split('|')] +
            [disease_id]) & set(self.test_diseaseids))
    if self.test_mode and len(intersect) < 1:
        return

    chem_id = 'MESH:' + chem_id
    reference_list = self._process_pubmed_ids(pubmed_ids)

    if re.match(evidence_pattern, direct_evidence):
        rel_id = self.resolve(direct_evidence)
        model.addClassToGraph(chem_id, chem_name)
        model.addClassToGraph(disease_id, None)
        self._make_association(chem_id, disease_id, rel_id, reference_list)
    else:
        # there's dual evidence, but haven't mapped the pubs
        pass
        # LOG.debug(
        #   "Dual evidence for %s (%s) and %s (%s)",
        #   chem_name, chem_id, disease_name, disease_id)

    return
def make_association(self, record):
    """
    Construct the association described by a record and add it to the
    graph, together with its references, date and evidence.

    :param record: nested mapping; read at ['subject']['id'],
        ['object']['id'], ['evidence'] and ['date'], and written at
        ['relation']['id']
    :return: None
    """
    model = Model(self.graph)

    # the relation is always "has phenotype"; it is written back to the
    # record as well
    relation = self.resolve("has phenotype")
    record['relation']['id'] = relation

    # define the triple
    gene = record['subject']['id']
    phenotype = record['object']['id']

    # instantiate the association
    g2p_assoc = Assoc(
        self.graph, self.name, sub=gene, obj=phenotype, pred=relation)

    # add the references
    # created RGDRef prefix in curie map to route to proper reference
    # URL in RGD
    references = [
        ref if 'PMID' in ref else ref.replace('RGD', 'RGDRef')
        for ref in record['evidence']['has_supporting_reference']]

    if references:
        primary = references[0]
        # make first ref in list the source
        g2p_assoc.add_source(identifier=primary)
        ref_model = Reference(
            self.graph, primary, self.globaltt['publication'])
        ref_model.addRefToGraph()

        # create equivalent source for any other refs in list
        # This seems to be specific to this source and
        # there could be non-equivalent references in this list
        for other in references[1:]:
            model.addSameIndividual(sub=primary, obj=other)

    # add the date created on
    g2p_assoc.add_date(date=record['date'])
    g2p_assoc.add_evidence(
        self.resolve(record['evidence']['type']))  # ?set where?
    g2p_assoc.add_association_to_graph()

    return
def _process_genes_kegg2ncbi(self, limit=None):
    """
    Map the KEGG gene IDs in the 'ncbi' file to NCBI Gene IDs.

    Triples created:
    <kegg_gene_id> is a class
    <ncbi_gene_id> is a class
    <kegg_gene_id> equivalentClass <ncbi_gene_id>

    :param limit: line number after which reading stops (ignored in
        test mode); None reads everything
    :return: None
    """
    LOG.info("Processing KEGG gene IDs to NCBI gene IDs")

    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)
    raw = '/'.join((self.rawdir, self.files['ncbi']['file']))

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for kegg_raw, ncbi_raw, link_type in reader:
            if self.test_mode and kegg_raw not in self.test_ids['genes']:
                continue

            # Adjust the NCBI gene ID prefix.
            ncbi_gene_id = re.sub(r'ncbi-geneid', 'NCBIGene', ncbi_raw)
            kegg_gene_id = 'KEGG-' + kegg_raw

            # Adding the KEGG gene ID to the graph here is redundant,
            # unless there happens to be additional gene IDs in this table
            # not present in the genes table.
            for curie in (kegg_gene_id, ncbi_gene_id):
                model.addClassToGraph(curie, None)
            model.addEquivalentClass(kegg_gene_id, ncbi_gene_id)

            if not self.test_mode and (
                    limit is not None and reader.line_num > limit):
                break

    LOG.info("Done with KEGG gene IDs to NCBI gene IDs")
def _add_deprecated_snp(
        self, snp_id, snp_id_current, merged, chrom_num, chrom_pos):
    """
    Record that a SNP id has been merged into a current rs id.

    When merged is '1' and a current id is given, the current id
    ('dbSNP:rs...') is stored for the location, snp_id is added as a
    deprecated individual pointing at it, and the current id is made
    the leader. Otherwise snp_id itself is made the leader.

    :param snp_id: identifier of the (possibly deprecated) SNP
    :param snp_id_current: current rs id, with or without the 'rs' prefix
    :param merged: '1' when the SNP was merged into snp_id_current
    :param chrom_num: chromosome number, used for the location curie
    :param chrom_pos: chromosome position, used for the location curie
    :return: None
    """
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)
    location = self._make_location_curie(chrom_num, chrom_pos)

    # add deprecation information
    if merged == '1' and str(snp_id_current.strip()) != '':
        # get the current rs_id
        current_rs_id = 'dbSNP:'
        if not re.match(r'rs', snp_id_current):
            current_rs_id += 'rs'
        current_rs_id += str(snp_id_current)

        if location is not None:
            # set(current_rs_id) used to split the id into a set of
            # single characters; store the whole id instead
            self.id_location_map.setdefault(
                location, set()).add(current_rs_id)

        model.addDeprecatedIndividual(snp_id, current_rs_id)
        # TODO check on this
        # should we add the annotations to the current
        # or orig?
        model.makeLeader(current_rs_id)
    else:
        model.makeLeader(snp_id)
def _process_orthologs(self, raw, limit=None):
    """
    Map the genes in the given file to their KEGG orthology classes.

    Triples created:
    <gene_id> is a class
    <orthology_class_id> is a class

    <assoc_id> has subject <gene_id>
    <assoc_id> has object <orthology_class_id>

    :param raw: path of the two-column, tab-delimited file
    :param limit: line number after which reading stops (ignored in
        test mode); None reads everything
    :return: None
    """
    LOG.info("Processing orthologs")

    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for gene_raw, orthology_raw in reader:
            orthology_class_id = 'KEGG:' + orthology_raw.strip()
            gene_id = 'KEGG:' + gene_raw.strip()

            # note that the panther_id references a group of orthologs,
            # and is not 1:1 with the rest

            # add the KO id as a gene-family grouping class
            family_assoc = OrthologyAssoc(graph, self.name, gene_id, None)
            family_assoc.add_gene_family_to_graph(orthology_class_id)

            # add gene and orthology class to graph;
            # assume labels will be taken care of elsewhere
            for curie in (gene_id, orthology_class_id):
                model.addClassToGraph(curie, None)

            if not self.test_mode \
                    and limit is not None and reader.line_num > limit:
                break

    LOG.info("Done with orthologs")
def __init__(self, graph, association):
    """
    Bind this object to a graph and keep hold of an association.

    :param graph: the Graph to write to
    :param association: stored unchanged on the instance
    :raises ValueError: when graph is not a Graph
    """
    if not isinstance(graph, Graph):
        raise ValueError("{} is not a graph".format(graph))

    self.graph = graph
    self.model = Model(self.graph)
    self.association = association

    return
def _process_all(self, limit):
    """
    Fetch the list of omim identifiers and hand them, with the entry
    transformer, to process_entries.

    Before processing, the human genome and taxon are added to the
    graph. In testMode everything is written to the testgraph instead
    of the main graph.

    :param limit: passed through to process_entries
    :return: None
    """
    omimids = self._get_omim_ids()  # store the set of omim identifiers

    g = self.testgraph if self.testMode else self.graph
    geno = Genotype(g)
    model = Model(g)

    # tax_num = '9606'   # TODO PYLINT unused
    tax_id = 'NCBITaxon:9606'
    tax_label = 'Human'

    # add genome and taxon
    geno.addGenome(tax_id, tax_label)   # tax label can get added elsewhere
    model.addClassToGraph(tax_id, None)   # label added elsewhere

    includes = {'all'}

    self.process_entries(
        omimids, self._transform_entry, includes, g, limit)

    return
def _process_qtls_genetic_location(
        self, raw, src_key, txid, common_name, limit=None):
    """
    Process one species' QTL file that carries genetic-map (cM) locations.

    For every well-formed row this adds:
    - the QTL as a feature of the taxon, located (fuzzily) on a chromosome
      of the species' genetic (linkage) map,
    - an xref from the QTL to its peak marker when that marker looks like
      an rsid (starts with "rs"),
    - the candidate gene, when it can be treated as an NCBI gene,
    - the trait class and a "is marker for" association from the QTL
      (and from the dbSNP marker, if any) to the trait, with evidence,
      publication source and p-value score when available.

    :param raw: path of the tab-delimited file to read (iso-8859-1)
    :param src_key: key into ``self.files`` giving the 'curie' prefix for
        traits and the ordered list of 'columns'
    :param txid: taxon number (string) appended to 'NCBITaxon:'
    :param common_name: species common name; used in the QTL id prefix
        and in the genetic-map build id/label
    :param limit: outside test mode, stop once more than ``limit`` lines
        have been read (this yields limit + 1 records)
    :return:

    """
    aql_curie = self.files[src_key]['curie']

    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    line_counter = 0
    geno = Genotype(graph)
    model = Model(graph)
    eco_id = self.globaltt['quantitative trait analysis evidence']

    taxon_curie = 'NCBITaxon:' + txid

    LOG.info("Processing genetic location for %s from %s", taxon_curie, raw)
    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        # no header in these files, so no header checking
        col = self.files[src_key]['columns']
        # positions of the columns actually consumed below; the file has
        # many more (flanking markers, LOD score, variance, ...) which
        # are currently ignored
        used = (
            'qtl_id', 'qtl_symbol', 'trait_name', 'chromosome',
            'position_cm', 'range_cm', 'peak_mark', 'p_values',
            'trait_id', 'pubmed_id', 'gene_id', 'gene_id_src')
        idx = {name: col.index(name) for name in used}

        for row in filereader:
            line_counter += 1
            # NOTE(review): the length is compared with self.qtl_columns
            # while the message and the indexing use `col`; confirm the
            # two always have the same length.
            if len(row) != len(self.qtl_columns):
                LOG.warning("Problem parsing in %s row %s\n"
                            "got %s cols but expected %s",
                            raw, row, len(row), len(col))
                continue

            qtl_id = row[idx['qtl_id']].strip()
            qtl_symbol = row[idx['qtl_symbol']].strip()
            trait_name = row[idx['trait_name']].strip()
            chromosome = row[idx['chromosome']].strip()
            position_cm = row[idx['position_cm']].strip()
            range_cm = row[idx['range_cm']].strip()
            peak_mark = row[idx['peak_mark']].strip()
            p_values = row[idx['p_values']].strip()
            trait_id = row[idx['trait_id']].strip()
            pubmed_id = row[idx['pubmed_id']].strip()
            gene_id = row[idx['gene_id']].strip()
            gene_id_src = row[idx['gene_id_src']].strip()

            if self.test_mode and int(qtl_id) not in self.test_ids:
                continue

            qtl_id = common_name + 'QTL:' + qtl_id.strip()
            trait_id = ':'.join((aql_curie, trait_id.strip()))

            # Add QTL to graph
            feature = Feature(graph, qtl_id, qtl_symbol, self.globaltt['QTL'])
            feature.addTaxonToFeature(taxon_curie)

            # deal with the chromosome
            chrom_id = makeChromID(chromosome, taxon_curie, 'CHR')

            # add a version of the chromosome which is defined as
            # the genetic map
            build_id = 'MONARCH:' + common_name.strip() + '-linkage'
            build_label = common_name + ' genetic map'
            geno.addReferenceGenome(build_id, build_label, taxon_curie)
            chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH')
            geno.addChromosomeInstance(
                chromosome, build_id, build_label, chrom_id)

            start = stop = None
            # range_cm sometimes ends in "(Mb)"  (i.e pig 2016 Nov);
            # keep only what precedes the first parenthesis
            range_cm = re.split(r'\(', range_cm)[0]

            if re.search(r'[0-9].*-.*[0-9]', range_cm):
                range_parts = re.split(r'-', range_cm)

                # check for poorly formed ranges
                if len(range_parts) == 2 and \
                        range_parts[0] != '' and range_parts[1] != '':
                    (start, stop) = [
                        int(float(x.strip())) for x in range_parts]
                else:
                    LOG.info(
                        "A cM range we can't handle for QTL %s: %s",
                        qtl_id, range_cm)
            elif position_cm != '':
                match = re.match(r'([0-9]*\.[0-9]*)', position_cm)
                if match is not None:
                    position_cm = match.group()
                    start = stop = int(float(position_cm))

            # FIXME remove conversion to int for start/stop
            # when schema can handle floats
            # add in the genetic location based on the range
            feature.addFeatureStartLocation(
                start, chrom_in_build_id, None,
                [self.globaltt['FuzzyPosition']])
            feature.addFeatureEndLocation(
                stop, chrom_in_build_id, None,
                [self.globaltt['FuzzyPosition']])
            feature.addFeatureToGraph()

            # sometimes there's a peak marker, like a rsid.
            # we want to add that as a variant of the gene,
            # and xref it to the qtl.
            dbsnp_id = None
            if peak_mark != '' and peak_mark != '.' and \
                    re.match(r'rs', peak_mark.strip()):
                dbsnp_id = 'dbSNP:' + peak_mark.strip()

                model.addIndividualToGraph(
                    dbsnp_id, None, self.globaltt['sequence_alteration'])
                model.addXref(qtl_id, dbsnp_id)

            gene_id = gene_id.replace('uncharacterized ', '').strip()
            # only consider single-token gene ids (no embedded spaces)
            if gene_id != '' and gene_id != '.' \
                    and re.fullmatch(r'[^ ]*', gene_id) is not None:

                # we assume if no src is provided and gene_id is an integer,
                # then it is an NCBI gene ... (okay, lets crank that back
                # a notch)
                if gene_id_src == '' and gene_id.isdigit() and \
                        gene_id in self.gene_info:
                    gene_id_src = 'NCBIgene'
                elif gene_id_src == '' and gene_id.isdigit():
                    LOG.warning(
                        'Cold & Prickely saying %s is a NCBI gene for %s',
                        gene_id, common_name)
                    gene_id_src = 'NCBIgene'
                elif gene_id_src == '':
                    LOG.error(
                        ' "%s" is a NOT NCBI gene for %s',
                        gene_id, common_name)
                    gene_id_src = None

                if gene_id_src == 'NCBIgene':
                    gene_id = 'NCBIGene:' + gene_id
                    # we will expect that these will get labels elsewhere
                    geno.addGene(gene_id, None)
                    # FIXME what is the right relationship here?
                    geno.addAffectedLocus(qtl_id, gene_id)

                    if dbsnp_id is not None:
                        # add the rsid as a seq alt of the gene_id
                        vl_id = '_:' + re.sub(
                            r':', '', gene_id) + '-' + peak_mark.strip()
                        geno.addSequenceAlterationToVariantLocus(
                            dbsnp_id, vl_id)
                        geno.addAffectedLocus(vl_id, gene_id)

            # add the trait
            model.addClassToGraph(trait_id, trait_name)

            # Add publication.  pub_id is reset on every row so that a row
            # without a publication can neither reuse the previous row's id
            # nor hit an unbound local on the first row.
            pub_id = None
            reference = None
            if re.match(r'ISU.*', pubmed_id):
                pub_id = 'AQTLPub:' + pubmed_id.strip()
                reference = Reference(graph, pub_id)
            elif pubmed_id != '':
                pub_id = 'PMID:' + pubmed_id.strip()
                reference = Reference(
                    graph, pub_id, self.globaltt['journal article'])

            if reference is not None:
                reference.addRefToGraph()

            # Parse the p-value once; it is shared by both associations.
            # str.isnumeric() is False for anything with a decimal point,
            # so real p-values were never recorded; float() is used
            # instead and unparsable values are ignored.
            score = None
            if p_values != '':
                scr = re.sub(r'<', '', p_values)
                scr = re.sub(r',', '.', scr)  # international notation
                try:
                    score = float(scr)
                except ValueError:
                    score = None

            # make the association to the QTL
            assoc = G2PAssoc(
                graph, self.name, qtl_id, trait_id,
                self.globaltt['is marker for'])
            assoc.add_evidence(eco_id)
            if pub_id is not None:
                assoc.add_source(pub_id)
            # TODO add exp_id as evidence
            if score is not None:
                assoc.set_score(score)
            # todo add score type
            # TODO add LOD score?
            assoc.add_association_to_graph()

            # make the association to the dbsnp_id, if found
            if dbsnp_id is not None:
                assoc = G2PAssoc(
                    graph, self.name, dbsnp_id, trait_id,
                    self.globaltt['is marker for'])
                assoc.add_evidence(eco_id)
                if pub_id is not None:
                    assoc.add_source(pub_id)
                # TODO add exp_id
                if score is not None:
                    assoc.set_score(score)
                # todo add score type
                # TODO add LOD score?
                assoc.add_association_to_graph()

            # off by one - the following actually gives us (limit + 1) records
            if not self.test_mode and limit is not None \
                    and line_counter > limit:
                break

    LOG.info("Done with QTL genetic info")
def __init__(
        self,
        identifier,
        data_release_version,
        ingest_name,
        ingest_title,
        ingest_url,
        ingest_logo=None,
        ingest_description=None,
        license_url=None,
        data_rights=None,
        graph_type='rdf_graph',  # rdf_graph, streamed_graph
        file_handle=None,
        distribution_type='ttl',
        dataset_curie_prefix='MonarchArchive'):
    """
    Create the dataset description graph and its summary, version and
    distribution level triples.

    :param identifier: local id of the dataset; combined with
        ``dataset_curie_prefix`` to form the dataset/graph identifier
    :param data_release_version: release version string; when None,
        today's date formatted as YYYYMMDD is used
    :param ingest_name: name of the ingest; used in the download URL
    :param ingest_title: title of the ingest; falls back to the dataset
        identifier when None
    :param ingest_url: stored on ``self.ingest_url``
    :param ingest_logo: logo file name, appended to the
        'MonarchLogoRepo' curie-map entry; None means no logo
    :param ingest_description: stored on ``self.ingest_description``
    :param license_url: stored on ``self.license_url``
    :param data_rights: stored on ``self.data_rights``
    :param graph_type: None or 'rdf_graph' for an RDFGraph,
        'streamed_graph' for a StreamedGraph
    :param file_handle: passed to StreamedGraph only
    :param distribution_type: file extension of the distribution
    :param dataset_curie_prefix: curie prefix for the dataset identifiers
    :raises ValueError: when ``graph_type`` is not one of the supported
        values (previously this surfaced later as an AttributeError on
        ``self.graph``)
    """
    dataset_curie = ':'.join([dataset_curie_prefix, identifier])

    if graph_type is None:
        self.graph = RDFGraph(None, dataset_curie)
    elif graph_type == 'streamed_graph':
        self.graph = StreamedGraph(
            True, dataset_curie, file_handle=file_handle)
    elif graph_type == 'rdf_graph':
        self.graph = RDFGraph(True, dataset_curie)
    else:
        raise ValueError(
            "Unsupported graph_type: {!r}".format(graph_type))

    if data_release_version is not None:
        self.data_release_version = data_release_version
    else:
        self.data_release_version = datetime.today().strftime("%Y%m%d")

    self.model = Model(self.graph)
    self.globaltt = self.graph.globaltt
    self.globaltcid = self.graph.globaltcid
    self.curie_map = self.graph.curie_map
    self.identifier = dataset_curie
    self.citation = set()

    self.ingest_name = ingest_name
    self.ingest_title = ingest_title
    if self.ingest_title is None:
        self.ingest_title = dataset_curie

    self.ingest_url = ingest_url
    # ingest_logo defaults to None; concatenating None to the logo
    # repository URL raised TypeError, so only build the URL when a
    # logo was actually supplied.
    # NOTE(review): confirm the triple-setting helpers below tolerate
    # self.ingest_logo being None.
    if ingest_logo is None:
        self.ingest_logo = None
    else:
        self.ingest_logo = \
            self.curie_map.get('MonarchLogoRepo') + ingest_logo
    self.ingest_description = ingest_description

    self.date_issued = None

    self.license_url = license_url
    self.data_rights = data_rights
    self.distribution_type = distribution_type

    # set HCLS resource CURIEs
    self.summary_level_curie = ':'.join(
        [dataset_curie_prefix, '#' + identifier])
    self.version_level_curie = \
        dataset_curie_prefix + ':' + \
        self.data_release_version + \
        '/#' + identifier
    self.distribution_level_turtle_curie = \
        dataset_curie_prefix + ':' + \
        self.data_release_version + \
        '/rdf/' + \
        identifier + "." + self.distribution_type

    # The following might seem a little odd, but we need to set downloadURLs
    # this way in order for them to point to where they will end up in
    # archive.MI.org as of Sept 2019. URL is:
    #  https://archive.MI.org/[release version]/[dist type]/[source].[dist type]
    self.download_url = \
        self.curie_map.get("MonarchArchive") + self.data_release_version + \
        "/rdf/" + self.ingest_name + "." + self.distribution_type

    self._set_summary_level_triples()
    self._set_version_level_triples()
    self._set_distribution_level_triples()
def process_gaf(self, file, limit, id_map=None):
    """
    Read a gzipped GAF (gene association) file and add, for every usable
    annotation row, the gene, its taxon, and a gene-to-GO association
    with evidence and references.  For IMP evidence rows with a
    non-empty with/from column, genotype-to-phenotype associations are
    derived as well.

    Rows are skipped when they are comments (start with '!'), lack a
    required field, carry a NOT qualifier, or are UniProtKB rows that
    cannot be mapped to exactly one gene through ``id_map``.

    :param file: path of the gzipped GAF file
    :param limit: outside test mode, stop once more than ``limit`` lines
        have been read; None reads everything
    :param id_map: optional mapping of UniProtKB accession to a list of
        gene curies; without it every UniProtKB row is skipped
    :return:
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph

    model = Model(g)
    geno = Genotype(g)
    logger.info("Processing Gene Associations from %s", file)
    line_counter = 0

    # helper sources used to mint reagent-targeted gene ids; each one is
    # only available when its taxon was requested
    zfin = None
    wbase = None
    if 7955 in self.tax_ids:
        zfin = ZFIN(self.graph_type, self.are_bnodes_skized)
    if 6239 in self.tax_ids:
        wbase = WormBase(self.graph_type, self.are_bnodes_skized)

    # GO aspect letter -> relationship; invariant, so built once
    aspect_rel_map = {
        'P': model.object_properties['involved_in'],  # involved in
        'F': model.object_properties['enables'],  # enables
        'C': model.object_properties['part_of']  # part of
    }

    with gzip.open(file, 'rb') as csvfile:
        filereader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""),
            delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            # comments start with exclamation
            if re.match(r'!', ''.join(row)):
                continue
            (db, gene_num, gene_symbol, qualifier, go_id, ref, eco_symbol,
             with_or_from, aspect, gene_name, gene_synonym, object_type,
             taxon, date, assigned_by, annotation_extension,
             gene_product_form_id) = row

            # test for required fields
            if (db == '' or gene_num == '' or gene_symbol == '' or
                    go_id == '' or ref == '' or eco_symbol == '' or
                    aspect == '' or object_type == '' or taxon == '' or
                    date == '' or assigned_by == ''):
                # the row is passed as an argument, never spliced into the
                # format string: a '%' in the data must not be interpreted
                logger.error(
                    "Missing required part of annotation on row %d:\n%s",
                    line_counter, '\t'.join(row))
                continue

            # deal with qualifier NOT, contributes_to, colocalizes_with
            if re.search(r'NOT', qualifier):
                continue

            db = self.clean_db_prefix(db)
            uniprotid = None
            gene_id = None
            if db == 'UniProtKB':
                # id_map is optional; look it up only when it exists
                mapped_ids = None
                if id_map is not None:
                    mapped_ids = id_map.get(gene_num)
                # keep only unambiguous 1:1 mappings
                if mapped_ids is None or len(mapped_ids) != 1:
                    continue
                gene_id = mapped_ids[0]
                uniprotid = ':'.join((db, gene_num))
                gene_num = re.sub(r'\w+\:', '', gene_id)
            elif db == 'MGI':
                gene_num = re.sub(r'MGI:', '', gene_num)
                gene_id = ':'.join((db, gene_num))
                gene_id = re.sub(r'MGI\:MGI\:', 'MGI:', gene_id)
            else:
                gene_id = ':'.join((db, gene_num))

            if self.testMode \
                    and not(
                        re.match(r'NCBIGene', gene_id) and
                        int(gene_num) in self.test_ids):
                continue

            model.addClassToGraph(gene_id, gene_symbol)
            if gene_name != '':
                model.addDescription(gene_id, gene_name)
            if gene_synonym != '':
                for s in re.split(r'\|', gene_synonym):
                    model.addSynonym(gene_id, s.strip())
            if re.search(r'\|', taxon):
                # TODO add annotations with >1 taxon
                logger.info(">1 taxon (%s) on line %d.  skipping",
                            taxon, line_counter)
            else:
                tax_id = re.sub(r'taxon:', 'NCBITaxon:', taxon)
                geno.addTaxon(tax_id, gene_id)

            assoc = Assoc(g, self.name)

            assoc.set_subject(gene_id)
            assoc.set_object(go_id)

            eco_id = self.map_go_evidence_code_to_eco(eco_symbol)
            if eco_id is not None:
                assoc.add_evidence(eco_id)

            # normalise the references once; the cleaned list is reused
            # for the derived phenotype associations below
            clean_refs = []
            for r in re.split(r'\|', ref):
                r = r.strip()
                if r == '':
                    continue
                prefix = re.split(r':', r)[0]
                r = re.sub(prefix, self.clean_db_prefix(prefix), r)
                r = re.sub(r'MGI\:MGI\:', 'MGI:', r)
                clean_refs.append(r)

                reference = Reference(g, r)
                if re.match(r'PMID', r):
                    ref_type = Reference.ref_types['journal_article']
                    reference.setType(ref_type)
                reference.addRefToGraph()
                assoc.add_source(r)

            # TODO add the source of the annotations from assigned by?

            if aspect not in aspect_rel_map:
                logger.error("Aspect not recognized: %s", aspect)

            rel = aspect_rel_map.get(aspect)
            if aspect == 'F' and re.search(r'contributes_to', qualifier):
                rel = model.object_properties['contributes_to']
            assoc.set_relationship(rel)
            if uniprotid is not None:
                assoc.set_description('Mapped from ' + uniprotid)
            # object_type should be one of:
            # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
            # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
            # If the precise product type is unknown,
            # gene_product should be used

            assoc.add_association_to_graph()

            # Derive G2P Associations from IMP annotations
            # in version 2.1 Pipe will indicate 'OR'
            # and Comma will indicate 'AND'.
            # in version 2.0, multiple values are separated by pipes
            # where the pipe has been used to mean 'AND'
            if eco_symbol == 'IMP' and with_or_from != '':
                withitems = re.split(r'\|', with_or_from)
                phenotypeid = go_id + 'PHENOTYPE'
                # create phenotype associations
                for i in withitems:
                    if i == '' or \
                            re.match(
                                r'(UniProtKB|WBPhenotype|InterPro|HGNC)', i):
                        logger.warning(
                            "Don't know what having a uniprot id " +
                            "in the 'with' column means of %s",
                            uniprotid)
                        continue
                    i = re.sub(r'MGI\:MGI\:', 'MGI:', i)
                    i = re.sub(r'WB:', 'WormBase:', i)

                    # for worms and fish, they might give a RNAi or MORPH
                    # in these cases make a reagent-targeted gene
                    if re.search('MRPHLNO|CRISPR|TALEN', i):
                        if zfin is None:
                            logger.warning(
                                "ZFIN reagent %s found but taxon 7955 "
                                "was not requested; skipping", i)
                            continue
                        targeted_gene_id = zfin.make_targeted_gene_id(
                            gene_id, i)
                        geno.addReagentTargetedGene(
                            i, gene_id, targeted_gene_id)
                        g2p = G2PAssoc(
                            g, self.name, targeted_gene_id, phenotypeid)
                    elif re.search(r'WBRNAi', i):
                        if wbase is None:
                            logger.warning(
                                "WormBase reagent %s found but taxon 6239 "
                                "was not requested; skipping", i)
                            continue
                        targeted_gene_id = \
                            wbase.make_reagent_targeted_gene_id(
                                gene_id, i)
                        geno.addReagentTargetedGene(
                            i, gene_id, targeted_gene_id)
                        g2p = G2PAssoc(
                            g, self.name, targeted_gene_id, phenotypeid)
                    else:
                        g2p = G2PAssoc(g, self.name, i, phenotypeid)

                    for r in clean_refs:
                        g2p.add_source(r)
                        # experimental phenotypic evidence
                        g2p.add_evidence("ECO:0000059")
                    g2p.add_association_to_graph()
                    # TODO should the G2PAssoc be
                    # the evidence for the GO assoc?

            if not self.testMode and \
                    limit is not None and line_counter > limit:
                break

    return
def _process_omim2disease(self, limit=None):
    """
    Read the OMIM-to-KEGG disease link file and declare an equivalence
    between a KEGG disease and an OMIM disease whenever the two map to
    each other 1:1.

    The file is first loaded into two lookups held on the instance,
    ``omim_disease_hash`` (OMIM id -> list of KEGG ids) and
    ``kegg_disease_hash`` (KEGG id -> list of OMIM ids); the equivalences
    are then derived from those lookups.

    Triples created:
    <kegg_disease_id> is a class
    <omim_disease_id> is a class
    <kegg_disease_id> equivalent class <omim_disease_id>

    :param limit: outside test mode, stop examining OMIM ids once more
        than ``limit`` of them have been looked at
    :return:

    """
    logger.info("Processing 1:1 KEGG disease to OMIM disease mappings")
    g = self.testgraph if self.testMode else self.graph
    line_counter = 0
    model = Model(g)
    raw = '/'.join((self.rawdir, self.files['omim2disease']['file']))

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            (omim_disease_id, kegg_disease_id, link_type) = row

            kegg_disease_id = 'KEGG-'+kegg_disease_id.strip()
            omim_disease_id = re.sub(r'omim', 'OMIM', omim_disease_id)

            # links from OMIM ID -> KEGG ID
            self.omim_disease_hash.setdefault(
                omim_disease_id, []).append(kegg_disease_id)

            # links from KEGG ID -> OMIM ID
            self.kegg_disease_hash.setdefault(
                kegg_disease_id, []).append(omim_disease_id)

    # Walk the OMIM side and keep only the pairs that are unique
    # in both directions.
    for omim_disease_id, kegg_ids in self.omim_disease_hash.items():
        if self.testMode and \
                omim_disease_id not in self.test_ids['disease']:
            continue

        if (not self.testMode) and (
                limit is not None and line_counter > limit):
            break
        line_counter += 1

        if len(kegg_ids) != 1:
            # TODO add xrefs if >1:1 mapping?
            continue

        kegg_disease_id = ''.join(kegg_ids)
        if len(self.kegg_disease_hash[kegg_disease_id]) != 1:
            continue

        # add ids, and deal with the labels separately
        model.addClassToGraph(kegg_disease_id, None)
        model.addClassToGraph(omim_disease_id, None)
        # TODO is this safe?
        model.addEquivalentClass(kegg_disease_id, omim_disease_id)

    logger.info("Done with KEGG disease to OMIM disease mappings.")
class Decipher(Source):
    """
    Deprecated - please see the EBIGene2Phen class, which parses the same
    file but fetches it from EBI which has clearer terms for redistribution,
    while Decipher has restrictive terms due to containing patient data in
    password protected datasets.

    The Decipher group curates and assembles the Development Disorder Genotype
    Phenotype Database (DDG2P) which is a curated list of genes reported to be
    associated with developmental disorders, compiled by clinicians as part of
    the DDD study to facilitate clinical feedback of likely causal variants.

    Beware that the redistribution of this data is a bit unclear from the
    [license](https://decipher.sanger.ac.uk/legal).
    If you intend to distribute this data, be sure to have the appropriate
    licenses in place.

    """

    files = {
        'annot': {
            'file': 'ddg2p.zip',
            'url': 'https://decipher.sanger.ac.uk/files/ddd/ddg2p.zip',
            'headers': []
        }
    }

    def __init__(self, graph_type, are_bnodes_skolemized,
                 data_release_version=None):
        """
        :param graph_type: forwarded to the Source constructor and kept
            for building the HGNC helper source in ``fetch``
        :param are_bnodes_skolemized: forwarded to the Source constructor
            (as ``are_bnodes_skized``) and kept for the HGNC helper source
        :param data_release_version: forwarded to the Source constructor
        """
        super().__init__(
            graph_type=graph_type,
            are_bnodes_skized=are_bnodes_skolemized,
            data_release_version=data_release_version,
            name='decipher',
            ingest_title='Development Disorder Genotype Phenotype Database',
            ingest_url='https://decipher.sanger.ac.uk/',
            ingest_logo='source-decipher.png',
            license_url='https://decipher.sanger.ac.uk/legal',
            data_rights='https://decipher.sanger.ac.uk/datasharing',
            # file_handle=None
        )

        if 'disease' not in self.all_test_ids:
            LOG.warning("not configured with disease test ids.")
            self.test_ids = []
        else:
            self.test_ids = self.all_test_ids['disease']

        self.geno = Genotype(self.graph)
        self.model = Model(self.graph)
        self.graph_type = graph_type
        self.are_bnodes_skolemized = are_bnodes_skolemized

    def fetch(self, is_dl_forced=False):
        """
        Download this source's files, then the HGNC files it depends on.

        :param is_dl_forced: forwarded to both downloads
        """
        self.get_files(is_dl_forced)

        # since there's a dependency on HGNC files; fetch those too
        hgnc = HGNC(self.graph_type, self.are_bnodes_skolemized)
        hgnc.fetch(is_dl_forced)

    def parse(self, limit=None):
        """
        Parse the DDG2P annotations into the graph (the test graph when
        ``self.test_only`` is set).

        :param limit: forwarded to ``_process_ddg2p_annotations``
        """
        if limit is not None:
            LOG.info("Only parsing first %s rows", limit)

        LOG.info("Parsing files...")

        if self.test_only:
            self.test_mode = True
            self.graph = self.testgraph

        self.geno = Genotype(self.graph)

        # rare disease-phenotype associations
        self._process_ddg2p_annotations(limit)

        LOG.info("Finished parsing.")

    def _process_ddg2p_annotations(self, limit):
        """
        The ddg2p annotations associate a gene symbol to an omim disease,
        along with some HPO ids and pubs. The gene symbols come from gencode,
        which in turn come from HGNC official gene symbols.  Therefore,
        we use the HGNC source class to get the id/symbol mapping for
        use in our annotations here.

        According to http://www.gencodegenes.org/faq.html,
        "Gene names are usually HGNC or MGI-approved gene symbols mapped
        to the GENCODE genes by the Ensembl xref pipeline. Sometimes,
        when there is no official gene symbol, the Havana clone-based
        name is used."

        The kind of variation that is linked to a disease is indicated
        (LOF, GOF, CNV, etc) in the source data.
        Here, we create an anonymous variant of the specified gene of
        the indicated type (mapped to the sequence ontology (SO)).

        NOTE: the HGNC symbol lookup this relies on is currently
        unavailable, so every data row is logged and counted as an
        unmapped gene and no triples are produced.

        :param limit: outside test mode, stop once the reader's line
            number exceeds this value; None reads the whole file
        :return:

        """
        graph = self.graph

        # in order for this to work, we need to map the HGNC id-symbol;
        # hgnc = HGNC(self.graph_type, self.are_bnodes_skolemized)
        # hgnc_symbol_id_map = hgnc.get_symbol_id_map()  # Does Not Exist

        zip_path = '/'.join((self.rawdir, self.files['annot']['file']))

        # use the ddg2p.txt file
        fname = 'ddg2p.txt'

        unmapped_omim_counter = 0
        unmapped_gene_count = 0
        # both handles are context managed so the archive is closed even
        # when a row fails to parse
        with ZipFile(zip_path, 'r') as myzip, myzip.open(fname, 'r') as f:
            reader = csv.reader(
                io.TextIOWrapper(f), delimiter='\t', quotechar='\"')
            for row in reader:
                # skip blank lines and comments
                if not row or row[0].startswith('#'):
                    continue

                (gencode_gene_name, mode, category, consequence, disease,
                 omim, ddg2p_id, pubmed_ids, hpo_codes) = row

                # FIXME restore once the symbol map exists:
                # hgnc_id = hgnc_symbol_id_map.get(gencode_gene_name.strip())
                hgnc_id = None
                if hgnc_id is None:
                    LOG.error(
                        "FIXME Couldn't map the gene symbol %s to HGNC.",
                        gencode_gene_name)
                    unmapped_gene_count += 1
                elif omim.strip() != '':
                    # Not reachable until hgnc_id can be resolved.
                    # add the gene
                    # self.model.addClassToGraph(hgnc_id, gencode_gene_name)

                    # TODO make VSLC with the variation
                    #   to associate with the disorder
                    # TODO use the Inheritance and Mutation consequence
                    #   to classify the VSLCs
                    # allele_id = self.make_allele_by_consequence(
                    #    consequence, hgnc_id, gencode_gene_name)
                    omim_id = 'OMIM:' + str(omim.strip())
                    # assume this is declared elsewhere in ontology
                    self.model.addClassToGraph(omim_id, None)

                    # TODO choose the relationship from `category`
                    # ('Confirmed'/'Probable' DD gene -> has phenotype,
                    #  'Possible' -> contributes to, 'Not DD gene' ->
                    #  negative annotation), then build:
                    # assoc = G2PAssoc(graph, self.name, allele_id, omim_id)
                    # NOTE(review): `assoc` is not defined while the line
                    # above stays commented out; this branch must not be
                    # enabled without it.
                    for p in re.split(r';', pubmed_ids):
                        p = p.strip()
                        if p != '':
                            pmid = 'PMID:' + str(p)
                            r = Reference(
                                graph, pmid,
                                self.globaltt['journal article'])
                            r.addRefToGraph()
                            assoc.add_source(pmid)
                    assoc.add_association_to_graph()
                else:
                    # these are unmapped to a disease id.
                    # note that some match OMIM disease labels
                    # but the identifiers are just not included.
                    # TODO consider mapping to OMIM or DOIDs in other ways
                    LOG.warning(
                        "No omim id on line %d\n%s",
                        reader.line_num, str(row))
                    unmapped_omim_counter += 1

                # TODO hpo phenotypes
                # since the DDG2P file is not documented,
                # I don't know what the HPO annotations are actually about
                # are they about the gene?  the omim disease?  something else?
                # So, we wont create associations until this is clarified

                if not self.test_mode and limit is not None \
                        and reader.line_num > limit:
                    break

        LOG.warning(
            "gene-disorder associations with no omim id: %d",
            unmapped_omim_counter)
        LOG.warning("unmapped gene count: %d", unmapped_gene_count)

    def make_allele_by_consequence(self, consequence, gene_id, gene_symbol):
        """
        Given a "consequence" label that describes a variation type,
        create an anonymous variant of the specified gene as an instance of
        that consequence type.

        :param consequence: consequence label from the source file
        :param gene_id: curie of the gene the allele belongs to
        :param gene_symbol: gene symbol, used in the allele label
        :return: allele_id
        """
        # Loss of function : Nonsense, frame-shifting indel,
        #   essential splice site mutation, whole gene deletion or any other
        #   mutation where functional analysis demonstrates clear reduction
        #   or loss of function
        # All missense/in frame : Where all the mutations described in the
        #   data source are either missense or in frame deletions and there
        #   is no evidence favoring either loss-of-function, activating or
        #   dominant negative effect
        # Dominant negative : Mutation within one allele of a gene that
        #   creates a significantly greater deleterious effect on gene
        #   product function than a monoallelic loss of function mutation
        # Activating : Mutation, usually missense that results in
        #   a constitutive functional activation of the gene product
        # Increased gene dosage : Copy number variation that increases
        #   the functional dosage of the gene
        # Cis-regulatory or promotor mutation : Mutation in cis-regulatory
        #   elements that lies outwith the known transcription unit and
        #   promotor of the controlled gene
        # Uncertain : Where the exact nature of the mutation is unclear or
        #   not recorded

        type_id = self.resolve(consequence, mandatory=False)
        # an unmapped label comes back unchanged; fall back to the
        # generic sequence variant type
        if type_id == consequence:
            LOG.warning("Consequence type unmapped: %s", str(consequence))
            type_id = self.globaltt['sequence_variant']

        # make the allele
        allele_id = ''.join((gene_id, type_id))
        allele_id = re.sub(r':', '', allele_id)
        allele_id = self.make_id(allele_id)  # make this a BNode
        allele_label = ' '.join((consequence, 'allele in', gene_symbol))

        self.model.addIndividualToGraph(allele_id, allele_label, type_id)
        self.geno.addAlleleOfGene(allele_id, gene_id)

        return allele_id
def process_gaf(self, gaffile, limit, id_map=None):
    """
    Read a gzipped GAF (gene association) file and add, for every usable
    annotation row, the gene (label, type, description, synonyms, taxon)
    and a gene-to-GO association with evidence and references.  For IMP
    evidence rows with a non-empty with/from column, genotype-to-phenotype
    associations are derived as well.

    Rows are skipped when they are comments (start with '!'), lack one of
    the required fields read here, carry a NOT qualifier, or are
    UniProtKB rows that cannot be mapped through ``id_map``.

    :param gaffile: path of the gzipped GAF file
    :param limit: outside test mode, stop once the reader's line number
        exceeds this value; None reads everything
    :param id_map: optional mapping of UniProtKB accession to a single
        gene curie; without it UniProtKB rows are skipped
    :raises SystemExit: when a non-comment row does not have exactly
        ``len(self.gaf_columns)`` columns
    :return:
    """
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph

    model = Model(graph)
    geno = Genotype(graph)
    LOG.info("Processing Gene Associations from %s", gaffile)
    uniprot_hit = 0
    uniprot_miss = 0
    col = self.gaf_columns

    with gzip.open(gaffile, 'rb') as csvfile:
        reader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""),
            delimiter='\t', quotechar='\"')
        for row in reader:
            # comments start with exclamation; blank lines carry nothing
            if not row or row[0].startswith('!'):
                continue

            if len(row) != len(col):
                LOG.error(
                    "Wrong number of columns %i, expected %i got:\n\t%s",
                    len(row), len(col), row)
                # same effect as the site builtin exit(1), without
                # depending on the site module being loaded
                raise SystemExit(1)

            dbase = row[col.index('DB')].strip()
            gene_num = row[col.index('DB_Object_ID')].strip()
            gene_symbol = row[col.index('DB_Object_Symbol')].strip()
            qualifier = row[col.index('Qualifier')]
            go_id = row[col.index('GO_ID')].strip()
            ref = row[col.index('DB:Reference')].strip()
            eco_symbol = row[col.index('Evidence Code')].strip()
            with_or_from = row[col.index('With (or) From')]
            aspect = row[col.index('Aspect')].strip()
            gene_name = row[col.index('DB_Object_Name')]
            gene_synonym = row[col.index('DB_Object_Synonym')]
            taxon = row[col.index('Taxon and Interacting taxon')].strip()
            # also present but unused: DB_Object_Type, Date, Assigned_By,
            # Annotation_Extension, Gene_Product_Form_ID

            # test for required fields.
            # (`'' in [row[:10], row[12]]` compared '' with a list and so
            # only ever looked at column 13; qualifier, with/from, name
            # and synonym are legitimately optional and are not tested.)
            if '' in (dbase, gene_num, gene_symbol, go_id, ref,
                      eco_symbol, aspect, taxon):
                LOG.error(
                    "Missing required part of annotation on row %i:\n%s",
                    reader.line_num, str(row[:-4]))
                continue

            # (Don't) deal with qualifier NOT, contributes_to,
            # colocalizes_with
            if re.search(r'NOT', qualifier):
                continue

            if dbase in self.localtt:
                dbase = self.localtt[dbase]
            uniprotid = None
            gene_id = None
            if dbase == 'UniProtKB':
                if id_map is None:
                    # nothing to map the protein to a gene with; a None
                    # subject cannot form an association
                    continue
                # try/except much faster than checking
                # for dict key membership
                try:
                    gene_id = id_map[gene_num]
                    uniprotid = ':'.join((dbase, gene_num))
                    (dbase, gene_num) = gene_id.split(':')
                    uniprot_hit += 1
                except KeyError:
                    # UniProt id without a 1:1 mapping to entrez/ensembl
                    uniprot_miss += 1
                    continue
            else:
                gene_num = gene_num.split(':')[-1]  # last
                gene_id = ':'.join((dbase, gene_num))

            # NOTE(review): with `and`, NCBIGene rows are never filtered
            # in test mode; confirm this is intended.
            if self.test_mode and gene_id[:9] != 'NCBIGene:' and \
                    gene_num not in self.test_ids:
                continue

            model.addLabel(gene_id, gene_symbol)
            model.addType(gene_id, self.globaltt['gene'])

            if gene_name != '':
                model.addDescription(gene_id, gene_name)
            if gene_synonym != '':
                for syn in re.split(r'\|', gene_synonym):
                    syn = syn.strip()
                    if syn[:10] == 'UniProtKB:':
                        model.addTriple(
                            gene_id, self.globaltt['has gene product'], syn)
                    elif re.fullmatch(graph.curie_regexp, syn) is not None \
                            and syn.split(':')[0] not in self.wont_prefix:
                        LOG.warning(
                            'possible curie "%s" as a literal synomym for %s',
                            syn, gene_id)
                        if syn != '':
                            model.addSynonym(gene_id, syn)
                    elif syn != '':
                        model.addSynonym(gene_id, syn)

            # First taxon is for the gene, after the pipe are interacting taxa
            tax_curie = taxon.split('|')[0].replace('taxon', 'NCBITaxon')
            # this is a required field but good to safe
            if tax_curie:
                geno.addTaxon(tax_curie, gene_id)

            assoc = Assoc(graph, self.name)
            assoc.set_subject(gene_id)
            assoc.set_object(go_id)

            try:
                eco_id = self.gaf_eco[eco_symbol]
                assoc.add_evidence(eco_id)
            except KeyError:
                LOG.error("Evidence code (%s) not mapped", eco_symbol)

            refs = re.split(r'\|', ref)
            for ref in refs:
                ref = ref.strip()
                if ref != '':
                    prefix = ref.split(':')[-2]  # sidestep 'MGI:MGI:'
                    if prefix in self.localtt:
                        prefix = self.localtt[prefix]
                    ref = ':'.join((prefix, ref.split(':')[-1]))
                    refg = Reference(graph, ref)
                    if prefix == 'PMID':
                        ref_type = self.globaltt['journal article']
                        refg.setType(ref_type)
                    refg.addRefToGraph()
                    assoc.add_source(ref)

            # TODO add the source of the annotations from assigned by?

            # The description has to be in place before the association
            # is added to the graph; setting it afterwards (as before)
            # had no effect on what was written.
            if uniprotid is not None:
                assoc.set_description('Mapped from ' + uniprotid)

            rel = self.resolve(aspect, mandatory=False)
            if rel is not None and aspect == rel:
                # resolve() handed the aspect back unchanged, i.e. unmapped
                if aspect == 'F' and re.search(r'contributes_to', qualifier):
                    assoc.set_relationship(self.globaltt['contributes to'])
                else:
                    LOG.error(
                        "Aspect: %s with qualifier: %s  is not recognized",
                        aspect, qualifier)
            elif rel is not None:
                assoc.set_relationship(rel)
                assoc.add_association_to_graph()
            else:
                LOG.warning("No predicate for association \n%s\n", str(assoc))

            # object_type should be one of:
            # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
            # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
            # If the precise product type is unknown,
            # gene_product should be used
            ##################################################################
            # Derive G2P Associations from IMP annotations
            # in version 2.1 Pipe will indicate 'OR'
            # and Comma will indicate 'AND'.
            # in version 2.0, multiple values are separated by pipes
            # where the pipe has been used to mean 'AND'
            if eco_symbol == 'IMP' and with_or_from != '':
                withitems = with_or_from.split('|')
                phenotypeid = go_id + 'PHENOTYPE'
                # create phenotype associations
                for itm in withitems:
                    if itm == '' or re.match(
                            r'(UniProtKB|WBPhenotype|InterPro|HGNC)', itm):
                        LOG.warning(
                            "Skipping  %s from or with %s", uniprotid, itm)
                        continue

                    # sanity check/conversion on go curie prefix
                    (pfx, lclid) = itm.split(':')[-2:]  # last prefix wins
                    if pfx in self.localtt:
                        pfx = self.localtt[pfx]
                    itm = ':'.join((pfx, lclid))

                    # for worms and fish, they might give a RNAi or MORPH
                    # in these cases make a reagent-targeted gene
                    if re.search('MRPHLNO|CRISPR|TALEN', itm):
                        targeted_gene_id = self.zfin.make_targeted_gene_id(
                            gene_id, itm)
                        geno.addReagentTargetedGene(
                            itm, gene_id, targeted_gene_id)
                        assoc = G2PAssoc(
                            graph, self.name, targeted_gene_id, phenotypeid)
                    elif re.search(r'WBRNAi', itm):
                        targeted_gene_id = \
                            self.wbase.make_reagent_targeted_gene_id(
                                gene_id, itm)
                        geno.addReagentTargetedGene(
                            itm, gene_id, targeted_gene_id)
                        assoc = G2PAssoc(
                            graph, self.name, targeted_gene_id, phenotypeid)
                    else:
                        assoc = G2PAssoc(graph, self.name, itm, phenotypeid)

                    for ref in refs:
                        ref = ref.strip()
                        if ref != '':
                            prefix = ref.split(':')[-2]
                            if prefix in self.localtt:
                                prefix = self.localtt[prefix]
                            ref = ':'.join((prefix, ref.split(':')[-1]))
                            assoc.add_source(ref)
                            # experimental phenotypic evidence
                            assoc.add_evidence(self.globaltt[
                                'experimental phenotypic evidence'])
                    assoc.add_association_to_graph()
                    # TODO should the G2PAssoc be the evidence for the GO
                    # assoc?

            if not self.test_mode and limit is not None and \
                    reader.line_num > limit:
                break

    uniprot_tot = (uniprot_hit + uniprot_miss)
    uniprot_per = 0.0
    if uniprot_tot != 0:
        uniprot_per = 100.0 * uniprot_hit / uniprot_tot
    LOG.info(
        "Uniprot: %.2f%% of %i benefited from the mapping download",
        uniprot_per, uniprot_tot)
def _add_study_provenance(
        self,
        phenotyping_center,
        colony,
        project_fullname,
        pipeline_name,
        pipeline_stable_id,
        procedure_stable_id,
        procedure_name,
        parameter_stable_id,
        parameter_name,
        statistical_method,
        resource_name):
    """
    Add a study node and the nodes it is linked to into self.graph.

    The study identifier is produced by self.make_id from the concatenation
    of the center, colony, project, pipeline, procedure, parameter,
    statistical method and resource values.  The study is then linked to
    its procedure and statistical method (study parts), the measured
    parameter, the phenotyping center (agent), the pipeline and the project.

    :param phenotyping_center: str, from self.files['all']
    :param colony: str, from self.files['all']
    :param project_fullname: str, from self.files['all']
    :param pipeline_name: str, from self.files['all']
    :param pipeline_stable_id: str, from self.files['all']
    :param procedure_stable_id: str, from self.files['all']
    :param procedure_name: str, from self.files['all']
    :param parameter_stable_id: str, from self.files['all']
    :param parameter_name: str, from self.files['all']
    :param statistical_method: str, from self.files['all']
    :param resource_name: str, from self.files['all']
    :return: study bnode
    """
    provenance_model = Provenance(self.graph)
    model = Model(self.graph)

    # The study identifier is derived from all of its distinguishing parts.
    id_parts = (
        phenotyping_center, colony, project_fullname, pipeline_stable_id,
        procedure_stable_id, parameter_stable_id, statistical_method,
        resource_name)
    study_bnode = self.make_id("{}{}{}{}{}{}{}{}".format(*id_parts), '_')
    model.addIndividualToGraph(study_bnode, None, self.globaltt['study'])

    pipeline_curie = 'IMPC-pipe:' + pipeline_stable_id
    procedure_curie = 'IMPC-proc:' + procedure_stable_id
    parameter_curie = (
        'IMPC-param:' + procedure_stable_id + '#' + parameter_stable_id)

    # Study parts: the procedure and the statistical method.
    model.addIndividualToGraph(procedure_curie, procedure_name)
    study_parts = [procedure_curie, self.resolve(statistical_method)]
    provenance_model.add_study_parts(study_bnode, study_parts)

    # Parameter/measure statement: study measures parameter.
    parameter_label = "{} ({})".format(parameter_name, procedure_name)
    model.addIndividualToGraph(parameter_curie, parameter_label)
    provenance_model.add_study_measure(
        study_bnode, parameter_curie, object_is_literal=False)

    # The colony gets its own node; this method adds no triple from the
    # study to it.
    colony_bnode = self.make_id("{}".format(colony), '_')
    model.addIndividualToGraph(colony_bnode, colony)

    # Study agent: the phenotyping center must be present in self.localtt.
    phenotyping_center_id = self.localtt[phenotyping_center]
    model.addIndividualToGraph(
        phenotyping_center_id, phenotyping_center,
        self.globaltt['organization'])
    model.addTriple(
        study_bnode, self.globaltt['has_agent'], phenotyping_center_id)

    # Pipeline the study is part of.
    model.addIndividualToGraph(pipeline_curie, pipeline_name)
    model.addTriple(study_bnode, self.globaltt['part_of'], pipeline_curie)

    # Project the study is part of; prefer the local translation table.
    project_fullname_id = (
        self.localtt[project_fullname]
        if project_fullname in self.localtt
        else self.resolve(project_fullname))
    model.addIndividualToGraph(
        project_fullname_id, project_fullname, self.globaltt['project'])
    model.addTriple(
        study_bnode, self.globaltt['part_of'], project_fullname_id)

    return study_bnode
def _process_QTLs_genomic_location(
        self, raw, taxon_id, build_id, build_label, limit=None):
    """
    Read a gzipped, tab-delimited QTL location file (nine GFF-like columns)
    and add each QTL, its trait association and its genomic location.

    Per data row this creates:
      * the QTL individual (AQTL:<QTL_ID>) with its taxon,
      * a reference for PUBMED_ID when present (AQTLPub: for ISU ids,
        PMID: otherwise),
      * a G2PAssoc from the QTL to AQTLTrait:<trait_ID> with the evidence
        code, the source and, when parseable, the P-value as score,
      * start/end locations of the QTL on the chromosome of the build.

    Rows that lack a QTL_ID or a trait_ID attribute are logged and skipped.

    :param raw: path of the gzipped input file
    :param taxon_id: taxon curie of the species
    :param build_id: identifier of the genome build
    :param build_label: label of the genome build
    :param limit: stop once more than this many lines were read
        (ignored in test mode)
    :return: None
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    model = Model(g)
    geno = Genotype(g)
    line_counter = 0
    bad_attr_count = 0
    # assume that chrs get added to the genome elsewhere

    eco_id = "ECO:0000061"  # Quantitative Trait Analysis Evidence
    # Plain non-negative decimal, optionally with an exponent.  This is
    # used instead of str.isnumeric(), which is False for any string with
    # a decimal point and therefore rejected values such as "0.05".
    number_pattern = re.compile(
        r'(?:[0-9]+\.?[0-9]*|\.[0-9]+)(?:[eE][-+]?[0-9]+)?')

    logger.info("Processing QTL locations for %s", taxon_id)
    with gzip.open(raw, 'rt', encoding='ISO-8859-1') as tsvfile:
        reader = csv.reader(tsvfile, delimiter="\t")
        for row in reader:
            line_counter += 1
            # skip blank lines and comment lines
            if not row or ' '.join(row).startswith('#'):
                continue

            (chromosome, qtl_source, qtl_type, start_bp, stop_bp,
             frame, strand, score, attr) = row

            # example:
            # Chr.Z  Animal QTLdb  Production_QTL  33954873  34023581 . . .
            # QTL_ID=2242;Name="Spleen percentage";Abbrev="SPLP";
            # PUBMED_ID=17012160;trait_ID=2234;trait="Spleen percentage";
            # breed="leghorn";"FlankMarkers=ADL0022";...;P-value="<0.05"

            # deal with poorly formed attributes: a bare, quoted
            # "FlankMarkers"; entry carries no value
            attr = attr.replace('"FlankMarkers";', '')

            # make dictionary of attributes; entries without '=' are
            # dropped.  Only the first '=' separates key from value, and on
            # repeated keys the last occurrence wins.
            attribute_dict = {}
            for item in attr.replace('"', '').split(';'):
                key, sep, value = item.partition('=')
                if sep:
                    attribute_dict[key] = value
                elif item.strip() != '':
                    bad_attr_count += 1

            qtl_num = attribute_dict.get('QTL_ID')
            trait_num = attribute_dict.get('trait_ID')
            if qtl_num is None or trait_num is None:
                logger.warning(
                    "Skipping line %d: QTL_ID or trait_ID is missing",
                    line_counter)
                continue
            if self.testMode and int(qtl_num) not in self.test_ids:
                continue

            # make association between QTL and trait
            qtl_id = 'AQTL:' + str(qtl_num)
            model.addIndividualToGraph(qtl_id, None, geno.genoparts['QTL'])
            geno.addTaxon(taxon_id, qtl_id)

            trait_id = 'AQTLTrait:' + trait_num

            # if pub is in attributes, add it to the association
            pub_id = None
            pub_num = attribute_dict.get('PUBMED_ID', '').strip()
            if pub_num != '':
                if pub_num.startswith('ISU'):
                    pub_id = 'AQTLPub:' + pub_num
                    reference = Reference(g, pub_id)
                else:
                    pub_id = 'PMID:' + pub_num
                    reference = Reference(
                        g, pub_id, Reference.ref_types['journal_article'])
                reference.addRefToGraph()

            # Add QTL to graph
            assoc = G2PAssoc(
                g, self.name, qtl_id, trait_id,
                model.object_properties['is_marker_for'])
            assoc.add_evidence(eco_id)
            if pub_id is not None:
                assoc.add_source(pub_id)
            if 'P-value' in attribute_dict:
                # drop the "<" qualifier, accept a decimal comma
                s = attribute_dict['P-value'].replace('<', '')
                s = s.replace(',', '.').strip()
                if number_pattern.fullmatch(s):
                    assoc.set_score(float(s))

            assoc.add_association_to_graph()
            # TODO make association to breed
            # (which means making QTL feature in Breed background)

            # get location of QTL
            chromosome = re.sub(r'Chr\.', '', chromosome)
            chrom_id = makeChromID(chromosome, taxon_id, 'CHR')

            chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH')
            geno.addChromosomeInstance(
                chromosome, build_id, build_label, chrom_id)
            qtl_feature = Feature(g, qtl_id, None, geno.genoparts['QTL'])
            if start_bp == '':
                start_bp = None
            qtl_feature.addFeatureStartLocation(
                start_bp, chrom_in_build_id, strand,
                [Feature.types['FuzzyPosition']])
            if stop_bp == '':
                stop_bp = None
            qtl_feature.addFeatureEndLocation(
                stop_bp, chrom_in_build_id, strand,
                [Feature.types['FuzzyPosition']])
            qtl_feature.addTaxonToFeature(taxon_id)
            qtl_feature.addFeatureToGraph()

            if not self.testMode and \
                    limit is not None and line_counter > limit:
                break

    # only complain when malformed attributes were actually seen
    if bad_attr_count > 0:
        logger.warning("Bad attribute flags in this file")
    logger.info("Done with QTL genomic mappings for %s", taxon_id)
    return
def _process_data(self, src_key, limit=None):
    """
    Read the gzipped CSV registered under self.files[src_key] and add, per
    row, the gene/allele, colony, genotypes, phenotype association and its
    provenance to the graph.

    Per row the following is built, in this order:
      1. gene, variant locus and sequence alteration,
      2. the colony (ES cell line) and its indeterminate-zygosity genotype,
      3. the sex-agnostic genotype (VSLC plus genomic background),
      4. the sex-qualified genotype,
      5. the genotype-to-phenotype association with evidence, study
         provenance and a free-text description.

    Rows without a phenotype id are logged and skipped after step 4.

    :param src_key: key into self.files naming the file to read
    :param limit: stop once the reader passed this many lines
        (ignored in test mode)
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files[src_key]['file']))
    LOG.info("Processing Data from %s", raw)

    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)
    geno = Genotype(graph)

    # Add the taxon as a class
    taxon_id = self.globaltt['Mus musculus']
    model.addClassToGraph(taxon_id, None)

    # NOTE(review): columns are read from self.files['all'] while the file
    # name comes from self.files[src_key]; presumably src_key is always
    # 'all' - confirm against the caller.
    col = self.files['all']['columns']

    # Resolve the column positions once instead of once per cell per row.
    wanted = (
        'marker_accession_id', 'marker_symbol', 'phenotyping_center',
        'colony_id', 'sex', 'zygosity', 'allele_accession_id',
        'allele_symbol', 'strain_accession_id', 'strain_name',
        'project_fullname', 'pipeline_name', 'pipeline_stable_id',
        'procedure_stable_id', 'procedure_name', 'parameter_stable_id',
        'parameter_name', 'mp_term_id', 'mp_term_name', 'p_value',
        'percentage_change', 'effect_size', 'statistical_method',
        'resource_name')
    positions = [col.index(name) for name in wanted]

    with gzip.open(raw, 'rt') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='\"')
        row = next(reader)  # presumed header
        # the outcome of the header check is deliberately not acted upon
        self.check_fileheader(col, row)

        for row in reader:
            (marker_accession_id, marker_symbol, phenotyping_center,
             colony_raw, sex, zygosity, allele_accession_id,
             allele_symbol, strain_accession_id, strain_name,
             project_fullname, pipeline_name, pipeline_stable_id,
             procedure_stable_id, procedure_name, parameter_stable_id,
             parameter_name, mp_term_id, mp_term_name, p_value,
             percentage_change, effect_size, statistical_method,
             resource_name) = (row[pos].strip() for pos in positions)

            if self.test_mode and marker_accession_id not in self.gene_ids:
                continue

            # ##### cleanup some of the identifiers ######
            zygosity_id = self.resolve(zygosity)
            if zygosity_id == zygosity:
                LOG.warning(
                    "Zygosity '%s' unmapped. detting to indeterminate",
                    zygosity)
                zygosity_id = self.globaltt['indeterminate']

            # colony ids sometimes have <> in them, spaces,
            # or other non-alphanumerics and break our system;
            # replace these with underscores
            colony_id = '_:' + re.sub(r'\W+', '_', colony_raw)

            if not re.match(r'MGI', allele_accession_id):
                allele_accession_id = '_:IMPC-' + re.sub(
                    r':', '', allele_accession_id)

            if re.search(r'EUROCURATE', strain_accession_id):
                # the eurocurate links don't resolve at IMPC
                # TODO blank nodes do not maintain identifiers
                strain_accession_id = '_:' + strain_accession_id
            elif not re.match(r'MGI', strain_accession_id):
                LOG.info(
                    "Found a strange strain accession...%s",
                    strain_accession_id)
                strain_accession_id = 'IMPC:' + strain_accession_id

            ######################
            # first, add the marker and variant to the graph as with MGI,
            # the allele is the variant locus.  IF the marker is not known,
            # we will call it a sequence alteration.  otherwise,
            # we will create a BNode for the sequence alteration.

            # extract out what's within the <> to get the symbol
            symbol_match = re.match(r'.*<(.*)>', allele_symbol)
            if symbol_match is not None:
                sequence_alteration_name = symbol_match.group(1)
            else:
                sequence_alteration_name = allele_symbol

            if marker_accession_id == '':
                LOG.warning("Marker unspecified on row %d", reader.line_num)
                marker_accession_id = None

            if marker_accession_id is not None:
                variant_locus_id = allele_accession_id
                variant_locus_name = allele_symbol
                variant_locus_type = self.globaltt['variant_locus']
                geno.addGene(
                    marker_accession_id, marker_symbol,
                    self.globaltt['gene'])
                geno.addAllele(
                    variant_locus_id, variant_locus_name,
                    variant_locus_type, None)
                geno.addAlleleOfGene(variant_locus_id, marker_accession_id)

                # TAG bnode
                sequence_alteration_id = '_:seqalt' + re.sub(
                    r':', '', allele_accession_id)
                geno.addSequenceAlterationToVariantLocus(
                    sequence_alteration_id, variant_locus_id)
            else:
                sequence_alteration_id = allele_accession_id

            # IMPC contains targeted mutations with either gene traps,
            # knockouts, insertion/intragenic deletions.
            # but I don't really know what the SeqAlt is here,
            # so I don't add it.
            geno.addSequenceAlteration(
                sequence_alteration_id, sequence_alteration_name)

            # #############    BUILD THE COLONY    #############
            # First, let's describe the colony that the animals come from
            # The Colony ID refers to the ES cell clone
            #   used to generate a mouse strain.
            # Terry sez: we use this clone ID to track
            #   ES cell -> mouse strain -> mouse phenotyping.
            # The same ES clone maybe used at multiple centers,
            # so we have to concatenate the two to have a unique ID.
            # some useful reading about generating mice from ES cells:
            # http://ki.mit.edu/sbc/escell/services/details

            # here, we'll make a genotype
            # that derives from an ES cell with a given allele.
            # the strain is not really attached to the colony.

            # the colony/clone is reflective of the allele,
            # with unknown zygosity
            stem_cell_class = self.globaltt['embryonic stem cell line']
            model.addIndividualToGraph(
                colony_id, colony_raw, stem_cell_class)

            # vslc of the colony has unknown zygosity
            # note that we will define the allele
            # (and its relationship to the marker, etc.) later
            # FIXME is it really necessary to create this vslc
            # when we always know it's unknown zygosity?
            vslc_colony = '_:' + re.sub(
                r':', '',
                allele_accession_id + self.globaltt['indeterminate'])
            vslc_colony_label = allele_symbol + '/<?>'
            # for ease of reading, we make the colony genotype variables.
            # in the future, it might be desired to keep the vslcs
            colony_genotype_id = vslc_colony
            colony_genotype_label = vslc_colony_label
            geno.addGenotype(colony_genotype_id, colony_genotype_label)
            geno.addParts(
                allele_accession_id, colony_genotype_id,
                self.globaltt['has_variant_part'])
            geno.addPartsToVSLC(
                vslc_colony, allele_accession_id, None,
                self.globaltt['indeterminate'],
                self.globaltt['has_variant_part'])
            graph.addTriple(
                colony_id, self.globaltt['has_genotype'],
                colony_genotype_id)

            # ##########    BUILD THE ANNOTATED GENOTYPE    ##########
            # now, we'll build the genotype of the individual that derives
            # from the colony/clone genotype that is attached to
            # phenotype = colony_id + strain + zygosity + sex
            # (and is derived from a colony)

            # this is a sex-agnostic genotype
            genotype_id = self.make_id(
                (colony_id + phenotyping_center + zygosity +
                 strain_accession_id))
            geno.addSequenceDerivesFrom(genotype_id, colony_id)

            # build the VSLC of the sex-agnostic genotype
            # based on the zygosity
            allele1_id = allele_accession_id
            allele2_id = allele2_rel = None
            allele1_label = allele_symbol

            # Making VSLC labels from the various parts,
            # can change later if desired.
            if zygosity == 'heterozygote':
                allele2_label = re.sub(r'<.*', '<+>', allele1_label)
            elif zygosity == 'homozygote':
                allele2_label = allele1_label
                allele2_id = allele1_id
                allele2_rel = self.globaltt['has_variant_part']
            elif zygosity == 'hemizygote':
                allele2_label = re.sub(r'<.*', '<0>', allele1_label)
            elif zygosity == 'not_applicable':
                allele2_label = re.sub(r'<.*', '<?>', allele1_label)
            else:
                LOG.warning("found unknown zygosity %s", zygosity)
                # NOTE(review): this stops processing of the whole file,
                # not only of this row - confirm that this is intended.
                break
            vslc_name = '/'.join((allele1_label, allele2_label))

            # Add the VSLC.  The marker may have been reset to None above;
            # it then contributes an empty component instead of making
            # str.join raise a TypeError.
            vslc_id = '-'.join(
                (marker_accession_id or '', allele_accession_id, zygosity))
            vslc_id = re.sub(r':', '', vslc_id)
            vslc_id = '_:' + vslc_id
            model.addIndividualToGraph(
                vslc_id, vslc_name,
                self.globaltt['variant single locus complement'])
            geno.addPartsToVSLC(
                vslc_id, allele1_id, allele2_id, zygosity_id,
                self.globaltt['has_variant_part'], allele2_rel)

            # add vslc to genotype
            geno.addVSLCtoParent(vslc_id, genotype_id)

            # note that the vslc is also the gvc
            model.addType(
                vslc_id, self.globaltt['genomic_variation_complement'])

            # Add the genomic background
            # create the genomic background id and name
            if strain_accession_id != '':
                genomic_background_id = strain_accession_id
            else:
                genomic_background_id = None

            genotype_name = vslc_name
            if genomic_background_id is not None:
                geno.addGenotype(
                    genomic_background_id, strain_name,
                    self.globaltt['genomic_background'])

                # make a phenotyping-center-specific strain
                # to use as the background
                pheno_center_strain_label = \
                    strain_name + '-' + phenotyping_center + '-' + colony_raw
                pheno_center_strain_id = '-'.join((
                    re.sub(r':', '', genomic_background_id),
                    re.sub(r'\s', '_', phenotyping_center),
                    re.sub(r'\W+', '', colony_raw)))
                if not re.match(r'^_', pheno_center_strain_id):
                    # Tag bnode
                    pheno_center_strain_id = '_:' + pheno_center_strain_id

                geno.addGenotype(
                    pheno_center_strain_id, pheno_center_strain_label,
                    self.globaltt['genomic_background'])
                geno.addSequenceDerivesFrom(
                    pheno_center_strain_id, genomic_background_id)

                # Making genotype labels from the various parts,
                # can change later if desired.
                # since the genotype is reflective of the place
                # it got made, should put that in to disambiguate
                genotype_name = \
                    genotype_name + ' [' + pheno_center_strain_label + ']'
                geno.addGenomicBackgroundToGenotype(
                    pheno_center_strain_id, genotype_id)
                geno.addTaxon(taxon_id, pheno_center_strain_id)

            # this is redundant, but it is kept in for now
            geno.addSequenceDerivesFrom(genotype_id, colony_id)
            geno.addGenotype(genotype_id, genotype_name)

            # Make the sex-qualified genotype,
            # which is what the phenotype is associated with
            sex_qualified_genotype_id = self.make_id(
                (colony_id + phenotyping_center + zygosity +
                 strain_accession_id + sex))
            sex_qualified_genotype_label = genotype_name + ' (' + sex + ')'

            sq_type_id = self.resolve(sex, False)
            if sq_type_id == sex:
                sq_type_id = self.globaltt['intrinsic_genotype']
                LOG.warning(
                    "Unknown sex qualifier %s, adding as intrinsic_genotype",
                    sex)

            geno.addGenotype(
                sex_qualified_genotype_id, sex_qualified_genotype_label,
                sq_type_id)
            geno.addParts(
                genotype_id, sex_qualified_genotype_id,
                self.globaltt['has_variant_part'])

            if genomic_background_id is not None:
                # Add the taxon to the genomic_background_id
                geno.addTaxon(taxon_id, genomic_background_id)
            else:
                # add it as the genomic background
                geno.addTaxon(taxon_id, genotype_id)

            # #############    BUILD THE G2P ASSOC    #############
            # from an old email dated July 23 2014:
            # Phenotypes associations are made to
            # imits colony_id+center+zygosity+gender

            # sometimes phenotype ids are missing.  (about 711 early 2020)
            if mp_term_id == '':
                LOG.warning(
                    "No phenotype id specified for row %d", reader.line_num)
                continue
            # hard coded ECO code
            eco_id = self.globaltt['mutant phenotype evidence']

            # the association comes as a result of a g2p from
            # a procedure in a pipeline at a center and a parameter tested
            assoc = G2PAssoc(
                graph, self.name, sex_qualified_genotype_id, mp_term_id)
            assoc.add_evidence(eco_id)
            # assoc.set_score(float(p_value))

            # TODO add evidence instance using
            # pipeline_stable_id +
            # procedure_stable_id +
            # parameter_stable_id

            assoc.add_association_to_graph()
            assoc_id = assoc.get_association_id()

            model._addSexSpecificity(assoc_id, self.resolve(sex))

            # add a free-text description; fall back to the raw strings
            # when effect size or p-value do not parse as numbers
            try:
                description = ' '.join((
                    mp_term_name, 'phenotype determined by',
                    phenotyping_center, 'in an', procedure_name,
                    'assay where', parameter_name.strip(),
                    'was measured with an effect_size of',
                    str(round(float(effect_size), 5)),
                    '(p =', "{:.4e}".format(float(p_value)), ').'))
            except ValueError:
                description = ' '.join((
                    mp_term_name, 'phenotype determined by',
                    phenotyping_center, 'in an', procedure_name,
                    'assay where', parameter_name.strip(),
                    'was measured with an effect_size of',
                    str(effect_size),
                    '(p =', "{0}".format(p_value), ').'))

            study_bnode = self._add_study_provenance(
                phenotyping_center, colony_raw, project_fullname,
                pipeline_name, pipeline_stable_id, procedure_stable_id,
                procedure_name, parameter_stable_id, parameter_name,
                statistical_method, resource_name)

            evidence_line_bnode = self._add_evidence(
                assoc_id, eco_id, p_value, percentage_change, effect_size,
                study_bnode)

            self._add_assertion_provenance(assoc_id, evidence_line_bnode)

            model.addDescription(evidence_line_bnode, description)

            # resource_id = resource_name
            # assoc.addSource(graph, assoc_id, resource_id)

            if not self.test_mode and \
                    limit is not None and reader.line_num > limit:
                break
def _process_QTLs_genetic_location(
        self, raw, taxon_id, common_name, limit=None):
    """
    Read a tab-delimited QTL file (32 columns per row) and add each QTL,
    its position on the genetic (linkage) map and its trait association.

    Per row this creates:
      * the QTL feature (AQTL:<id>) with taxon and a start/end position
        on a chromosome instance of the "<common_name> genetic map" build,
      * a dbSNP individual xref'ed to the QTL when the peak marker
        starts with "rs",
      * gene links when the gene id is an all-digit NCBI id,
      * the trait class (AQTLTrait:<id>),
      * a G2PAssoc QTL -> trait and, when a dbSNP id was found, a second
        G2PAssoc dbSNP -> trait, each with evidence, source and, when
        parseable, the p-value as score.

    :param raw: path of the input file
    :param taxon_id: taxon curie of the species
    :param common_name: common species name, used to name the map build
    :param limit: stop once more than this many lines were read
        (ignored in test mode)
    :return: None
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    line_counter = 0
    geno = Genotype(g)
    model = Model(g)
    eco_id = "ECO:0000061"  # Quantitative Trait Analysis Evidence
    # Plain non-negative decimal, optionally with an exponent.  This is
    # used instead of str.isnumeric(), which is False for any string with
    # a decimal point and therefore rejected values such as "0.05".
    number_pattern = re.compile(
        r'(?:[0-9]+\.?[0-9]*|\.[0-9]+)(?:[eE][-+]?[0-9]+)?')
    # leading integer or decimal number of a cM position
    position_pattern = re.compile(r'[0-9]+(?:\.[0-9]*)?|\.[0-9]+')

    logger.info(
        "Processing genetic location for %s from %s", taxon_id, raw)
    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            (qtl_id, qtl_symbol, trait_name, assotype, empty, chromosome,
             position_cm, range_cm, flankmark_a2, flankmark_a1, peak_mark,
             flankmark_b1, flankmark_b2, exp_id, model_id, test_base,
             sig_level, lod_score, ls_mean, p_values, f_statistics,
             variance, bayes_value, likelihood_ratio, trait_id, dom_effect,
             add_effect, pubmed_id, gene_id, gene_id_src, gene_id_type,
             empty2) = row

            if self.testMode and int(qtl_id) not in self.test_ids:
                continue

            qtl_id = 'AQTL:' + qtl_id.strip()
            trait_id = 'AQTLTrait:' + trait_id.strip()

            # Add QTL to graph
            feature = Feature(g, qtl_id, qtl_symbol, geno.genoparts['QTL'])
            feature.addTaxonToFeature(taxon_id)

            # deal with the chromosome
            chrom_id = makeChromID(chromosome, taxon_id, 'CHR')

            # add a version of the chromosome which is defined as
            # the genetic map
            build_id = 'MONARCH:' + common_name.strip() + '-linkage'
            build_label = common_name + ' genetic map'
            geno.addReferenceGenome(build_id, build_label, taxon_id)
            chrom_in_build_id = makeChromID(
                chromosome, build_id, 'MONARCH')
            geno.addChromosomeInstance(
                chromosome, build_id, build_label, chrom_id)
            start = stop = None
            # range_cm sometimes ends in "(Mb)" (i.e pig 2016 Nov)
            range_cm = range_cm.split('(')[0]

            if re.search(r'[0-9].*-.*[0-9]', range_cm):
                range_parts = range_cm.split('-')
                bounds = None
                # check for poorly formed ranges
                if len(range_parts) == 2 and \
                        range_parts[0] != '' and range_parts[1] != '':
                    try:
                        bounds = [
                            int(float(x.strip())) for x in range_parts]
                    except ValueError:
                        bounds = None
                if bounds is not None:
                    (start, stop) = bounds
                else:
                    logger.info(
                        "A cM range we can't handle for QTL %s: %s",
                        qtl_id, range_cm)
            elif position_cm != '':
                match = position_pattern.match(position_cm)
                if match is not None:
                    start = stop = int(float(match.group()))

            # FIXME remove conversion to int for start/stop
            # when schema can handle floats add in the genetic location
            # based on the range
            feature.addFeatureStartLocation(
                start, chrom_in_build_id, None,
                [Feature.types['FuzzyPosition']])
            feature.addFeatureEndLocation(
                stop, chrom_in_build_id, None,
                [Feature.types['FuzzyPosition']])
            feature.addFeatureToGraph()

            # sometimes there's a peak marker, like a rsid.
            # we want to add that as a variant of the gene,
            # and xref it to the qtl.
            dbsnp_id = None
            if peak_mark.strip().startswith('rs'):
                dbsnp_id = 'dbSNP:' + peak_mark.strip()

                model.addIndividualToGraph(
                    dbsnp_id, None, geno.genoparts['sequence_alteration'])
                model.addXref(qtl_id, dbsnp_id)

            gene_id = gene_id.replace('uncharacterized ', '')
            if gene_id not in ('', '.') and ' ' not in gene_id:
                # we assume if no src is provided
                # and gene_id is an integer, it's NCBI
                if gene_id_src in ('NCBIgene', '') and \
                        gene_id.strip().isdigit():
                    gene_id = 'NCBIGene:' + gene_id.strip()
                    # we will expect that these labels provided elsewhere
                    geno.addGene(gene_id, None)
                    # FIXME what is the right relationship here?
                    geno.addAffectedLocus(qtl_id, gene_id)

                    if dbsnp_id is not None:
                        # add the rsid as a seq alt of the gene_id
                        vl_id = '_:' + re.sub(
                            r':', '', gene_id) + '-' + peak_mark.strip()
                        geno.addSequenceAlterationToVariantLocus(
                            dbsnp_id, vl_id)
                        geno.addAffectedLocus(vl_id, gene_id)

            # add the trait
            model.addClassToGraph(trait_id, trait_name)

            # Add publication.  pub_id is reset for every row so that a row
            # without a publication neither reuses the previous row's
            # source nor hits an unbound name.
            pub_id = None
            reference = None
            if re.match(r'ISU.*', pubmed_id):
                pub_id = 'AQTLPub:' + pubmed_id.strip()
                reference = Reference(g, pub_id)
            elif pubmed_id != '':
                pub_id = 'PMID:' + pubmed_id.strip()
                reference = Reference(
                    g, pub_id, Reference.ref_types['journal_article'])

            if reference is not None:
                reference.addRefToGraph()

            # the score is shared by both associations below
            score = None
            if p_values != '':
                # drop the "<" qualifier; accept international notation
                s = p_values.replace('<', '').replace(',', '.').strip()
                if number_pattern.fullmatch(s):
                    score = float(s)

            # make the association to the QTL
            assoc = G2PAssoc(
                g, self.name, qtl_id, trait_id,
                model.object_properties['is_marker_for'])
            assoc.add_evidence(eco_id)
            if pub_id is not None:
                assoc.add_source(pub_id)

            # TODO add exp_id as evidence
            if score is not None:
                assoc.set_score(score)  # todo add score type
            # TODO add LOD score?
            assoc.add_association_to_graph()

            # make the association to the dbsnp_id, if found
            if dbsnp_id is not None:
                assoc = G2PAssoc(
                    g, self.name, dbsnp_id, trait_id,
                    model.object_properties['is_marker_for'])
                assoc.add_evidence(eco_id)
                if pub_id is not None:
                    assoc.add_source(pub_id)

                # TODO add exp_id
                if score is not None:
                    assoc.set_score(score)  # todo add score type
                # TODO add LOD score?

                assoc.add_association_to_graph()

            if not self.testMode and \
                    limit is not None and line_counter > limit:
                break

    logger.info("Done with QTL genetic info")
    return
def _process_genes(self, limit=None):
    """
    Read the tab-delimited HGNC gene file registered under
    self.files['genes'] and add each approved gene to the graph.

    Per row:
      * non-approved entries are recorded in self.withdrawn and skipped,
      * symbols ending in '@' are skipped,
      * the gene class is added when its locus type resolves to a type,
      * equivalences to NCBIGene, ENSEMBL and gene-typed OMIM ids,
      * the taxon, and "is about" triples for every PubMed id,
      * the chromosome band, or else the chromosome, parsed from the
        location column as the feature the gene is a subsequence of.

    :param limit: stop once the reader passed this many lines
        (ignored in test mode)
    :return: None
    """
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph

    src_key = 'genes'
    geno = Genotype(graph)
    model = Model(graph)

    raw = '/'.join((self.rawdir, self.files[src_key]['file']))
    col = self.files[src_key]['columns']
    LOG.info("Processing HGNC genes")

    # Chromosome name followed by an arm letter or the end of the string.
    # ('$' inside a character set would only match a literal dollar sign,
    # so locations without an arm, e.g. "X", would never match.)
    chr_pattern = re.compile(r'(\d+|X|Y|Z|W|MT)(?:[pq]|$)')
    band_pattern = re.compile(r'([pq][A-H\d]?\d?(?:\.\d+)?)')

    # Resolve the column positions once instead of once per cell per row.
    stripped_columns = (
        'hgnc_id', 'symbol', 'name', 'locus_type', 'status', 'location',
        'entrez_id', 'ensembl_gene_id', 'omim_id')
    stripped_positions = [col.index(name) for name in stripped_columns]
    pubmed_position = col.index('pubmed_id')

    with open(raw, 'r', encoding="utf8") as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')

        row = next(reader)
        # the outcome of the header check is deliberately not acted upon
        self.check_fileheader(col, row)

        for row in reader:
            # status: "Approved" or "Entry Withdrawn"
            # omim_ids and pubmed_ids are pipe separated!
            (hgnc_id, symbol, name, locus_type, status, location,
             entrez_id, ensembl_gene_id, omim_ids) = (
                 row[pos].strip() for pos in stripped_positions)
            pubmed_ids = row[pubmed_position].strip('"')

            if status != 'Approved':
                self.withdrawn[hgnc_id] = symbol
                continue

            if (self.test_mode and entrez_id != '' and
                    entrez_id not in self.gene_ids):
                continue

            if name == '':
                name = None

            if locus_type == 'withdrawn':
                model.addDeprecatedClass(
                    hgnc_id, old_id_category=blv.terms['Gene'])
            elif symbol.endswith('@'):
                # 10) region (HOX), RNA cluster, gene (PCDH)
                # endswith() also copes with an empty symbol
                continue
            else:
                gene_type_id = self.resolve(locus_type, mandatory=False)
                if gene_type_id != locus_type:
                    model.addClassToGraph(
                        hgnc_id, symbol, gene_type_id, name)
                model.makeLeader(hgnc_id)

            if entrez_id != '':
                model.addEquivalentClass(hgnc_id, 'NCBIGene:' + entrez_id)

            if ensembl_gene_id != '':
                model.addEquivalentClass(
                    hgnc_id, 'ENSEMBL:' + ensembl_gene_id)

            for omim_id in omim_ids.split('|'):
                if omim_id in self.omim_replaced:
                    repl = self.omim_replaced[omim_id]
                    LOG.warning('%s is replaced with %s', omim_id, repl)
                    for omim in repl:
                        # replacement ids of unknown type are ignored
                        if omim in self.omim_type and \
                                self.omim_type[omim] == self.globaltt['gene']:
                            omim_id = omim

                if omim_id in self.omim_type and \
                        self.omim_type[omim_id] == self.globaltt['gene']:
                    model.addEquivalentClass(hgnc_id, 'OMIM:' + omim_id)

            geno.addTaxon(self.taxon_curie, hgnc_id)

            # add pubs as "is about"
            for pubmed_id in pubmed_ids.split('|'):
                pmid = pubmed_id.strip()
                if pmid != '':
                    graph.addTriple(
                        'PMID:' + pmid, self.globaltt['is_about'], hgnc_id)

            # add the default taxon to the gene
            graph.addTriple(
                hgnc_id, self.globaltt['in taxon'], self.taxon_curie)

            # add chr location
            # sometimes two are listed, like: 10p11.2 or 17q25
            # -- there are only 2 of these FRA10A and MPFD
            # sometimes listed like "1 not on reference assembly"
            # sometimes listed like 10q24.1-q24.3
            # sometimes like 11q11 alternate reference locus
            chr_match = chr_pattern.match(location)
            if chr_match is not None:
                chrom = chr_match.group(1)
                chrom_id = makeChromID(chrom, self.taxon_curie, 'CHR')
                band_match = band_pattern.search(location)
                feat = Feature(graph, hgnc_id, None, None)
                if band_match is not None:
                    band = chrom + band_match.group(1)
                    # add the chr band as the parent to this gene
                    # as a feature but assume that the band is created
                    # as a class with properties elsewhere in Monochrom
                    band_id = makeChromID(band, self.taxon_curie, 'CHR')
                    model.addClassToGraph(band_id, None)
                    feat.addSubsequenceOfFeature(band_id)
                else:
                    model.addClassToGraph(chrom_id, None)
                    feat.addSubsequenceOfFeature(chrom_id)

            if not self.test_mode and limit is not None and \
                    reader.line_num > limit:
                break
def process_omia_phenotypes(self, limit):
    """
    Load the Monarch-curated OMIA disease-to-phenotype annotation files.

    Every ``*.txt`` file in ``<rawdir>/OMIA-disease-phenotype`` is read as a
    tab-separated table whose header row must equal
    ``self.files['omia_d2p']['columns']``.  Each row that carries a
    phenotype id becomes a ``D2PAssoc`` between
    ``OMIA:<disease>[-<species>]`` and that phenotype.  The association is
    sourced from the row's PubMed ids or, when the row has no usable
    PubMed id, from the OMIA page of the disease.  Publication
    description, breed, assay and curator notes are attached to the
    association as descriptions/comments.

    Rows without a phenotype id are counted and reported per file.

    :param limit: accepted for parity with the other processing methods;
        it is not consulted here, every row of every file is read
    :return: None
    :raises AssertionError: if a file's header differs from the
        configured column list
    """
    # process the whole directory
    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)

    LOG.info(
        "Processing Monarch OMIA Animal disease-phenotype associations")

    src_key = 'omia_d2p'

    # get file listing
    mypath = '/'.join((self.rawdir, 'OMIA-disease-phenotype'))
    # literal suffix test: the former r'.txt$' pattern had an unescaped
    # dot and therefore also accepted names such as 'notes_txt'
    file_list = [
        f for f in listdir(mypath)
        if isfile(join(mypath, f)) and f.endswith('.txt')]

    col = self.files[src_key]['columns']

    # Resolve the positions of the columns we read once, instead of
    # calling col.index() for every field of every row.
    # Columns present in the file but currently unused: Variant,
    # Inheritance, Phenotype Name, Related Entity ID/Name,
    # Abnormal ID/Name, Phenotype Desc, Frequency, Date Created.
    at = {
        name: col.index(name) for name in (
            'Disease ID', 'Species ID', 'Breed Name', 'Phenotype ID',
            'Entity ID', 'Entity Name', 'Quality ID', 'Quality Name',
            'Assay', 'Pubmed ID', 'Pub Desc', 'Curator Notes')}

    for filename in file_list:
        LOG.info("Processing %s", filename)
        count_missing = 0
        bad_rows = []
        fname = '/'.join((mypath, filename))
        with open(fname, 'r') as csvfile:
            filereader = csv.reader(
                csvfile, delimiter='\t', quotechar='\"')
            fileheader = next(filereader)
            if fileheader != col:
                LOG.error('Expected  %s to have columns: %s', fname, col)
                LOG.error(
                    'But Found %s to have columns: %s', fname, fileheader)
                raise AssertionError(
                    'Incomming data headers have changed.')

            for row in filereader:
                if len(row) != len(col):
                    LOG.info(
                        "Not enough cols %d in %s - please fix",
                        len(row), filename)
                    continue

                disease_num = row[at['Disease ID']].strip()
                species_id = row[at['Species ID']].strip()
                breed_name = row[at['Breed Name']].strip()
                phenotype_id = row[at['Phenotype ID']].strip()
                entity_id = row[at['Entity ID']].strip()
                entity_name = row[at['Entity Name']]
                quality_id = row[at['Quality ID']].strip()
                quality_name = row[at['Quality Name']]
                assay = row[at['Assay']].strip()
                pubmed_id = row[at['Pubmed ID']].strip()
                phenotype_description = row[at['Pub Desc']].strip()
                curator_notes = row[at['Curator Notes']].strip()

                if phenotype_id == '':
                    # LOG.warning('Missing phenotype in row:\n%s', row)
                    count_missing += 1
                    bad_rows.append(row)
                    continue

                # OMIA disease numbers are zero padded to six digits;
                # zfill leaves longer values untouched
                disease_num = disease_num.zfill(6)
                disease_id = 'OMIA:' + disease_num
                if species_id != '':
                    disease_id = '-'.join((disease_id, species_id))

                assoc = D2PAssoc(graph, self.name, disease_id, phenotype_id)

                # The cell may hold several ids separated by ',' or ';'.
                # Keep only fragments that still contain digits, so a
                # trailing separator or free text no longer produces the
                # empty curie 'PMID:'.
                pmids = [
                    digits for digits in (
                        re.sub(r'[^0-9]', '', pnum)
                        for pnum in re.split(r'[,;]', pubmed_id))
                    if digits != '']
                if pmids:
                    for digits in pmids:
                        assoc.add_source('PMID:' + digits)
                else:
                    # no usable publication: cite the OMIA page instead
                    assoc.add_source('/'.join((
                        self.curie_map['OMIA'] + disease_num, species_id)))

                assoc.add_association_to_graph()
                aid = assoc.get_association_id()
                if phenotype_description != '':
                    model.addDescription(aid, phenotype_description)
                if breed_name != '':
                    model.addDescription(aid, breed_name + ' [observed in]')
                if assay != '':
                    model.addDescription(aid, assay + ' [assay]')
                if curator_notes != '':
                    model.addComment(aid, curator_notes)

                # EQ statements are only reported, not modelled
                if entity_id != '' or quality_id != '':
                    LOG.info(
                        "EQ not empty for %s: %s + %s",
                        disease_id, entity_name, quality_name)

        if count_missing > 0:
            # line_num stays readable after the file has been closed
            LOG.warning(
                "We are missing %d of %d D2P annotations from id %s",
                count_missing, filereader.line_num - 1, filename)
            LOG.warning(
                "Bad rows:\n%s", '\n'.join(str(x) for x in bad_rows))
    # finish loop through all files

    return
def _get_chrbands(self, limit, taxon, genome_id=None):
    """
    For the given taxon, it will fetch the chr band file.
    We will not deal with the coordinate information with this parser.
    Here, we only are concerned with building the partonomy.

    Each data line (``chrom start stop band rtype``, tab separated) that
    names a placed chromosome yields: the chromosome class as a member of
    the genome, the band class, an optional staining-intensity
    restriction, and ``is subsequence of`` / ``has subsequence``
    restrictions linking the band to each of its parent bands and the
    outermost parent to the chromosome.

    :param limit: stop after more than this many lines have been read;
        ``None`` reads the whole file
    :param taxon: NCBI taxon number as a string; also the key into
        ``self.files``
    :param genome_id: identifier to use for the genome; when ``None`` one
        is made with ``Genotype.makeGenomeID``
    :return: None
    """
    model = Model(self.graph)
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
    LOG.info("Processing Chr bands from FILE: %s", myfile)
    geno = Genotype(self.graph)

    # build the organism's genome from the taxon
    genome_label = self.files[taxon]['genome_label']
    taxon_id = 'NCBITaxon:' + taxon

    # add the taxon as a class.  adding the class label elsewhere
    model.addClassToGraph(taxon_id, None)
    model.addSynonym(taxon_id, genome_label)

    if genome_id is None:
        genome_id = geno.makeGenomeID(taxon_id)  # makes a blank node always

    geno.addGenome(taxon_id, genome_label, genome_id)
    model.addOWLPropertyClassRestriction(
        genome_id, self.globaltt['in taxon'], taxon_id)

    # compiled once; the loop below tests every line against it
    placed_scaffold_re = re.compile(r'chr(\d+|X|Y|Z|W|MT|M)' + r'$')
    # currently unused patterns
    # unlocalized_scaffold_pattern = placed_scaffold_pattern + r'_(\w+)_random'
    # unplaced_scaffold_pattern = r'chrUn_(\w+)'

    col = ['chrom', 'start', 'stop', 'band', 'rtype']
    i_chrom = col.index('chrom')
    i_band = col.index('band')
    i_rtype = col.index('rtype')

    banded_types = [
        self.globaltt['chromosome_band'],
        self.globaltt['chromosome_subband']]

    with gzip.open(myfile, 'rb') as reader:
        for line in reader:
            line_counter += 1
            line = line.decode().strip()
            # skip comments; blank lines are skipped too, indexing
            # line[0] on an empty string used to raise IndexError
            if not line or line.startswith('#'):
                continue

            # chr13	4500000	10000000	p12	stalk
            row = line.split('\t')
            chrom = row[i_chrom]
            band = row[i_band]
            rtype = row[i_rtype]

            # NOTE
            # some less-finished genomes have placed and unplaced scaffolds
            # * Placed scaffolds:
            #    Scaffold has an oriented location within a chromosome.
            # * Unlocalized scaffolds:
            #     scaffold's chromosome is known,
            #     scaffold's position, orientation or both is not known.
            # * Unplaced scaffolds:
            #   it is not known which chromosome the scaffold belongs to.
            #
            # find out if the thing is a full on chromosome, or a scaffold:
            # ex: unlocalized scaffold: chr10_KL568008v1_random
            # ex: unplaced scaffold: chrUn_AABR07022428v1
            if placed_scaffold_re.match(chrom) is None:
                # let's skip over anything that isn't a placed_scaffold
                LOG.info("Skipping non-placed chromosome %s", chrom)
                continue

            # the chrom class, taxon as the reference
            cclassid = makeChromID(chrom, taxon, 'CHR')

            # add the chromosome as a class
            geno.addChromosomeClass(chrom, taxon_id, genome_label)
            model.addOWLPropertyClassRestriction(
                cclassid, self.globaltt['member of'], genome_id)

            # add the band(region) as a class
            maplocclass_id = cclassid + band
            maplocclass_label = makeChromLabel(chrom + band, genome_label)
            if band.strip() != '':
                region_type_id = self.map_type_of_region(rtype)
                model.addClassToGraph(
                    maplocclass_id, maplocclass_label, region_type_id)
            else:
                region_type_id = self.globaltt['chromosome']

            # add the staining intensity of the band
            if re.match(r'g(neg|pos|var)', rtype):
                if region_type_id in banded_types:
                    stain_type = self.resolve(rtype)
                    if stain_type is not None:
                        model.addOWLPropertyClassRestriction(
                            maplocclass_id,
                            self.globaltt['has_sequence_attribute'],
                            stain_type)
                else:
                    # usually happens if it's a chromosome because
                    # they don't actually have banding info
                    LOG.info("feature type %s != chr band", region_type_id)
            else:
                LOG.info('staining type not found for: %s', rtype)

            # get the parent bands, and make them unique
            parents = list(self.make_parent_bands(band, set()))
            # alphabetical sort will put them in smallest to biggest
            parents.sort(reverse=True)

            # add the parents to the graph, in hierarchical order
            # TODO this is somewhat inefficient due to
            # re-adding upper-level nodes when iterating over the file
            last = len(parents) - 1
            # the entries are unique, so the position from enumerate is
            # the same one parents.index() would find
            for position, prnt in enumerate(parents):
                parent = prnt.strip()
                if parent == "":
                    continue
                pclassid = cclassid + parent  # class chr parts
                pclass_label = makeChromLabel(chrom + parent, genome_label)
                rti = getChrPartTypeByNotation(parent, self.graph)
                model.addClassToGraph(pclassid, pclass_label, rti)

                # for canonical chromosomes,
                # then the subbands are subsequences of the full band
                # add the subsequence stuff as restrictions
                if position != last:
                    # the next, larger band in the sorted list
                    container_id = cclassid + parents[position + 1]
                else:
                    # the last one (p or q usually)
                    # is attached to the chromosome itself
                    container_id = cclassid
                model.addOWLPropertyClassRestriction(
                    pclassid, self.globaltt['is subsequence of'],
                    container_id)
                model.addOWLPropertyClassRestriction(
                    container_id, self.globaltt['has subsequence'],
                    pclassid)

            # connect the band here to the first one in the parent list
            if len(parents) > 0:
                model.addOWLPropertyClassRestriction(
                    maplocclass_id, self.globaltt['is subsequence of'],
                    cclassid + parents[0])
                model.addOWLPropertyClassRestriction(
                    cclassid + parents[0], self.globaltt['has subsequence'],
                    maplocclass_id)

            if limit is not None and line_counter > limit:
                break
def process_feature_loc(self, limit):
    """
    Add WormBase sequence features and their chromosomal location.

    Reads the gzipped, tab-separated GFF-style ``feature_loc`` file.  Rows
    whose feature type is one of a fixed list (genes, the variation types,
    RNAi reagents, ...) are turned into a ``Feature`` placed on the
    chromosome of the current WormBase build, with label, synonyms and
    descriptions taken from the ``key=value;...`` attribute column.

    :param limit: outside test mode, stop once more than this many
        accepted rows have been seen; ``None`` reads everything
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['feature_loc']['file']))

    graph = self.testgraph if self.testMode else self.graph
    model = Model(graph)
    logger.info("Processing Feature location and attributes")
    line_counter = 0
    geno = Genotype(graph)
    strain_to_variant_map = {}
    build_num = self.version_num
    build_id = 'WormBase:' + build_num

    # note biological_regions include balancers
    # other options here: promoter, regulatory_region, reagent
    wanted_feature_types = (
        'gene', 'point_mutation', 'deletion', 'RNAi_reagent',
        'duplication', 'enhancer', 'binding_site',
        'biological_region', 'complex_substitution',
        'substitution', 'insertion', 'inverted_repeat')

    # built once instead of concatenating two lists for every row
    test_ids = None
    if self.testMode:
        test_ids = set(self.test_ids['gene'] + self.test_ids['allele'])

    with gzip.open(raw, 'rb') as csvfile:
        filereader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""),
            delimiter='\t', quotechar='\"')
        for row in filereader:
            if re.match(r'\#', ''.join(row)):
                continue
            (chrom, db, feature_type_label, start, end, score, strand,
             phase, attributes) = row

            # I	interpolated_pmap_position	gene	1	559768	.	.	.	ID=gmap:spe-13;gmap=spe-13;status=uncloned;Note=-21.3602 cM (+/- 1.84 cM)
            # I	WormBase	gene	3747	3909	.	-	.	ID=Gene:WBGene00023193;Name=WBGene00023193;interpolated_map_position=-21.9064;sequence_name=Y74C9A.6;biotype=snoRNA;Alias=Y74C9A.6
            # I	absolute_pmap_position	gene	4119	10230	.	.	.	ID=gmap:homt-1;gmap=homt-1;status=cloned;Note=-21.8252 cM (+/- 0.00 cM)

            if feature_type_label not in wanted_feature_types:
                continue

            line_counter += 1

            # key=value pairs separated by ';'.  Split on the first '='
            # only and ignore fragments without one, so a trailing ';'
            # or an '=' inside a value no longer aborts the whole load.
            attribute_dict = {}
            for item in attributes.replace('"', '').split(';'):
                key, found, value = item.partition('=')
                if found:
                    attribute_dict[key] = value

            fid = flabel = desc = None
            if 'ID' in attribute_dict:
                fid = attribute_dict.get('ID')
                if re.search(r'WB(Gene|Var|sf)', fid):
                    fid = re.sub(r'^\w+:WB', 'WormBase:WB', fid)
                elif re.match(r'(gmap|landmark)', fid):
                    continue
                else:
                    logger.info('other identifier %s', fid)
                    fid = None
            elif 'variation' in attribute_dict:
                fid = 'WormBase:' + attribute_dict.get('variation')
                flabel = attribute_dict.get('public_name')
                sub = attribute_dict.get('substitution')
                ins = attribute_dict.get('insertion')
                # if it's a variation:
                # variation=WBVar00604246;public_name=gk320600;strain=VC20384;substitution=C/T
                desc = ''
                if sub is not None:
                    desc = 'substitution=' + sub
                if ins is not None:
                    desc = 'insertion=' + ins

                # keep track of the strains with this variation,
                # for later processing
                strain_list = attribute_dict.get('strain')
                if strain_list is not None:
                    for strain in strain_list.split(','):
                        strain_to_variant_map.setdefault(
                            strain.strip(), set()).add(fid)

            # if feature_type_label == 'RNAi_reagent':
            # Target=WBRNAi00096030 1 4942
            # this will tell us where the RNAi is actually binding
            # it will be the reagent-targeted-gene that has a position,
            # (i think)
            # TODO finish the RNAi binding location

            name = attribute_dict.get('Name')
            polymorphism = attribute_dict.get('polymorphism')

            if fid is None:
                if name is not None and re.match(r'WBsf', name):
                    fid = 'WormBase:' + name
                    name = None
                else:
                    continue

            if self.testMode \
                    and fid.replace('WormBase:', '') not in test_ids:
                continue

            # these really aren't that interesting
            if polymorphism is not None:
                continue

            # plain substring test: the name is data, not a pattern, and
            # may contain regex metacharacters such as '.' or '('
            if name is not None and name not in fid:
                if flabel is None:
                    flabel = name
                else:
                    model.addSynonym(fid, name)

            if desc is not None:
                model.addDescription(fid, desc)

            alias = attribute_dict.get('Alias')
            biotype = attribute_dict.get('biotype')
            note = attribute_dict.get('Note')
            other_name = attribute_dict.get('other_name')
            # add each name that is present; previously other_name was
            # added for both entries and the alias itself was never used
            for synonym in (alias, other_name):
                if synonym is not None:
                    model.addSynonym(fid, synonym)

            if feature_type_label == 'gene':
                ftype_id = self.resolve(biotype)
            else:
                # so far, they all come with SO label syntax.
                # resolve if need be.
                ftype_id = self.globaltt[feature_type_label]
            chr_id = makeChromID(chrom, build_id, 'CHR')
            geno.addChromosomeInstance(chrom, build_id, build_num)

            feature = Feature(graph, fid, flabel, ftype_id)
            feature.addFeatureStartLocation(start, chr_id, strand)
            # the end coordinate comes from the 'end' column; it used to
            # be written with 'start', giving every feature zero length
            feature.addFeatureEndLocation(end, chr_id, strand)

            # only genes are modelled as classes
            feature_is_class = feature_type_label == 'gene'
            feature.addFeatureToGraph(True, None, feature_is_class)

            if note is not None:
                model.addDescription(fid, note)

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

            # RNAi reagents:
            # I	RNAi_primary	RNAi_reagent	4184	10232	.	+	.	Target=WBRNAi00001601 1 6049 +;laboratory=YK;history_name=SA:yk326e10
            # I	RNAi_primary	RNAi_reagent	4223	10147	.	+	.	Target=WBRNAi00033465 1 5925 +;laboratory=SV;history_name=MV_SV:mv_G_YK5052
            # I	RNAi_primary	RNAi_reagent	5693	9391	.	+	.	Target=WBRNAi00066135 1 3699 +;laboratory=CH

            # TODO TF binding sites and network:
            # I	TF_binding_site_region	TF_binding_site	1861	2048	.	+	.	Name=WBsf292777;tf_id=WBTranscriptionFactor000025;tf_name=DAF-16
            # I	TF_binding_site_region	TF_binding_site	3403	4072	.	+	.	Name=WBsf331847;tf_id=WBTranscriptionFactor000703;tf_name=DPL-1

    return
def _get_orthologs(self, limit):
    """
    This will process each of the specified pairwise orthology files,
    creating orthology associations based on the specified orthology code.
    this currently assumes that each of the orthology files is identically
    formatted. Relationships are made between genes here.

    There is also a nominal amount of identifier re-formatting:
    MGI:MGI --> MGI
    Ensembl --> ENSEMBL

    we skip any genes where we don't know how to map the gene identifiers.
    For example, Gene:Huwe1 for RAT is not an identifier, so we skip any
    mappings to this identifier.  Often, there are two entries for the
    same gene (based on equivalent Uniprot id), and so we are not actually
    losing any information.

    We presently have a hard-coded filter to select only orthology
    relationships where one of the pair is in our species of interest
    (Mouse and Human, for the moment).
    This will be added as a configurable parameter in the future.

    Genes are also added to a grouping class defined with a PANTHER id.

    Triples:
    <gene1_id> RO:orthologous <gene2_id>
    <assoc_id> :hasSubject <gene1_id>
    <assoc_id> :hasObject <gene2_id>
    <assoc_id> :hasPredicate <RO:orthologous>
    <assoc_id> dc:evidence ECO:phylogenetic_evidence

    <panther_id> a DATA:gene_family
    <panther_id> RO:has_member <gene1_id>
    <panther_id> RO:has_member <gene2_id>

    :param limit: upper bound applied both to the number of rows passing
        the taxon filter and, outside test mode, to the number of data
        lines read; ``None`` means unbounded
    :return: None
    """
    LOG.info("getting orthologs")

    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)
    unprocessed_gene_ids = set()  # may be faster to make a set after

    for src_key in self.files:
        src_file = '/'.join((self.rawdir, self.files[src_key]['file']))
        matchcounter = line_counter = 0
        col = self.files[src_key]['columns']

        # the archive is opened as a context manager so its handle is
        # released on normal exit and on error; it was never closed before
        with tarfile.open(src_file, 'r:gz') as reader:
            # assume that the first entry is the item
            fname = reader.getmembers()[0]
            LOG.info("Parsing %s", fname.name)

            with reader.extractfile(fname) as csvfile:
                for line in csvfile:
                    line = line.decode()
                    # skip comment lines
                    if line.startswith('#'):
                        LOG.info("Skipping header line")
                        continue
                    line_counter += 1

                    # a little feedback to the user since there's so many
                    if line_counter % 1000000 == 0:
                        LOG.info(
                            "Processed %d lines from %s",
                            line_counter, fname.name)

                    row = line.strip().split('\t')
                    # parse each row. ancestor_taxons is unused
                    # HUMAN|Ensembl=ENSG00000184730|UniProtKB=Q0VD83
                    # MOUSE|MGI=MGI=2176230|UniProtKB=Q8VBT6
                    # LDO	Euarchontoglires	PTHR15964
                    thing1 = row[col.index('thing1')].strip()
                    thing2 = row[col.index('thing2')].strip()
                    orthology_class = \
                        row[col.index('orthology_class')].strip()
                    panther_id = row[col.index('panther_id')].strip()

                    (species_a, gene_a, protein_a) = thing1.split('|')
                    (species_b, gene_b, protein_b) = thing2.split('|')

                    # skip the entries that don't have homolog
                    # relationships with the test ids
                    if self.test_mode and not (
                            protein_a.replace('UniProtKB=', '')
                            in self.test_ids or
                            protein_b.replace('UniProtKB=', '')
                            in self.test_ids):
                        continue

                    # map the taxon abbreviations to ncbi taxon id numbers
                    taxon_a = self.resolve(species_a).split(':')[1].strip()
                    taxon_b = self.resolve(species_b).split(':')[1].strip()

                    # ### filter on the taxa of interest
                    # retain only those orthologous relationships where
                    # gene1 OR gene2 is in the taxid list (some-filter);
                    # requiring both would be the most-filter
                    if self.tax_ids is not None and \
                            (taxon_a not in self.tax_ids) and \
                            (taxon_b not in self.tax_ids):
                        continue
                    else:
                        matchcounter += 1
                        if limit is not None and matchcounter > limit:
                            break
                    # ### end code block for filtering on taxon

                    # fix the gene identifiers
                    gene_a = gene_a.replace('=', ':')
                    gene_b = gene_b.replace('=', ':')

                    clean_gene = self._clean_up_gene_id(gene_a, species_a)
                    if clean_gene is None:
                        unprocessed_gene_ids.add(gene_a)
                    gene_a = clean_gene
                    clean_gene = self._clean_up_gene_id(gene_b, species_b)
                    if clean_gene is None:
                        unprocessed_gene_ids.add(gene_b)
                    gene_b = clean_gene

                    # a special case here; mostly some rat genes
                    # they use symbols instead of identifiers.  will skip
                    if gene_a is None or gene_b is None:
                        continue

                    rel = self.resolve(orthology_class)

                    evidence_id = self.globaltt['phylogenetic evidence']

                    # add the association and relevant nodes to graph
                    assoc = OrthologyAssoc(
                        graph, self.name, gene_a, gene_b, rel)
                    assoc.add_evidence(evidence_id)

                    # add genes to graph;
                    # assume labels will be taken care of elsewhere
                    model.addClassToGraph(gene_a, None)
                    model.addClassToGraph(gene_b, None)

                    # might as well add the taxon info for completeness
                    graph.addTriple(
                        gene_a, self.globaltt['in taxon'],
                        'NCBITaxon:' + taxon_a)
                    graph.addTriple(
                        gene_b, self.globaltt['in taxon'],
                        'NCBITaxon:' + taxon_b)

                    assoc.add_association_to_graph()

                    # note this is incomplete...
                    # it won't construct the full family hierarchy,
                    # just the top-grouping
                    assoc.add_gene_family_to_graph('PANTHER:' + panther_id)

                    if not self.test_mode \
                            and limit is not None and line_counter > limit:
                        break

        # make report on unprocessed_gene_ids
        # NOTE(review): the set accumulates across files, so each report
        # repeats the ids of earlier files - confirm this is intended
        LOG.info("finished processing %s", src_file)
        LOG.warning(
            "The following gene ids were unable to be processed: %s",
            str(unprocessed_gene_ids))
def _process_nlx_157874_1_view(self, raw, limit=None):
    """
    This table contains the Elements of Morphology data that has been
    screen-scraped into DISCO.
    Note that foaf:depiction is inverse of foaf:depicts relationship.

    Since it is bad form to have two definitions,
    we concatenate the two into one string.

    Turtle:
        <eom id> a owl:Class
            rdf:label Literal(eom label)
            OIO:hasRelatedSynonym Literal(synonym list)
            IAO:definition Literal(objective_def. subjective def)
            foaf:depiction Literal(small_image_url),
                           Literal(large_image_url)
            foaf:page Literal(page_url)
            rdfs:comment Literal(long commented text)

    :param raw: path of the tab-separated export; its first line is a
        header and every following row must have exactly 20 fields
    :param limit: stop once more than this many rows have been processed;
        ``None`` reads the whole file
    :return: None
    """
    model = Model(self.graph)
    # one Reference is enough: it is only used to write foaf:page triples
    reference = Reference(self.graph)
    line_counter = 0
    with open(raw, 'r') as f1:
        f1.readline()  # read the header row; skip
        filereader = csv.reader(f1, delimiter='\t', quotechar='\"')
        for line in filereader:
            line_counter += 1
            (morphology_term_id, morphology_term_num,
             morphology_term_label, morphology_term_url,
             terminology_category_label, terminology_category_url,
             subcategory, objective_definition, subjective_definition,
             comments, synonyms, replaces, small_figure_url,
             large_figure_url, e_uid, v_uid, v_uuid,
             v_last_modified, v_status, v_lastmodified_epoch) = line

            # note:
            # e_uid v_uuid v_last_modified terminology_category_url
            # subcategory v_uid morphology_term_num
            # terminology_category_label hp_label notes
            # are currently unused.

            # Add morphology term to graph as a class
            # with label, type, and description.
            model.addClassToGraph(morphology_term_id, morphology_term_label)

            # Assemble the description text.
            # Strip before testing for the final period: testing the raw
            # text turned 'text. ' into 'text..' and a blank cell into '.'
            subjective_definition = subjective_definition.strip()
            if subjective_definition != '' \
                    and not subjective_definition.endswith('.'):
                # add a trailing period.
                subjective_definition += '.'
            objective_definition = objective_definition.strip()
            if objective_definition != '' \
                    and not objective_definition.endswith('.'):
                # add a trailing period.
                objective_definition += '.'

            definition = ' '.join(
                (objective_definition, subjective_definition)).strip()

            model.addDefinition(morphology_term_id, definition)

            # <term id> FOAF:depicted_by literal url
            # <url> type foaf:depiction

            # do we want both images?
            # morphology_term_id has depiction small_figure_url
            if small_figure_url != '':
                model.addDepiction(morphology_term_id, small_figure_url)

            # morphology_term_id has depiction large_figure_url
            if large_figure_url != '':
                model.addDepiction(morphology_term_id, large_figure_url)

            # morphology_term_id has comment comments
            if comments != '':
                model.addComment(morphology_term_id, comments.strip())

            if synonyms != '':
                for s in synonyms.split(';'):
                    model.addSynonym(
                        morphology_term_id, s.strip(),
                        self.globaltt['hasExactSynonym'])

            # morphology_term_id hasRelatedSynonym replaces (; delimited)
            if replaces != '' and replaces != synonyms:
                for s in replaces.split(';'):
                    model.addSynonym(
                        morphology_term_id, s.strip(),
                        self.globaltt['hasRelatedSynonym'])

            # morphology_term_id has page morphology_term_url
            reference.addPage(morphology_term_id, morphology_term_url)

            if limit is not None and line_counter > limit:
                break
    return
def process_gene_interaction(self, limit):
    """
    The gene interaction file includes identified interactions,
    that are between two or more gene (products).
    In the case of interactions with >2 genes, this requires creating
    groups of genes that are involved in the interaction.
    From the wormbase help list: In the example WBInteraction000007779
    it would likely be misleading to suggest that lin-12 interacts with
    (suppresses in this case) smo-1 ALONE or that lin-12 suppresses let-60
    ALONE; the observation in the paper; see Table V in paper
    PMID:15990876 was that a lin-12 allele (heterozygous lin-12(n941/+))
    could suppress the "multivulva" phenotype induced synthetically by
    simultaneous perturbation of BOTH smo-1 (by RNAi) AND let-60 (by the
    n2021 allele). So this is necessarily a three-gene interaction.

    Therefore, we can create groups of genes based on their "status" of
    Effector | Effected.

    Only interactions with exactly two participants are loaded at the
    moment; each becomes an ``InteractionAssoc`` identified by the
    WormBase interaction id and described by the row's summary.

    Status:  IN PROGRESS

    :param limit: outside test mode, stop once more than this many lines
        have been read; ``None`` reads the whole file
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['gene_interaction']['file']))

    graph = self.testgraph if self.testMode else self.graph
    model = Model(graph)
    logger.info("Processing gene interaction associations")
    line_counter = 0

    # interaction type in the file -> key of the predicate in globaltt
    # TODO deal with subtypes
    predicate_key = {
        'Genetic': 'genetically interacts with',
        'Physical': 'molecularly_interacts_with',
        'Regulatory': 'regulates',
    }

    with gzip.open(raw, 'rb') as csvfile:
        filereader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""), delimiter='\t',
            quotechar="'")

        for row in filereader:
            line_counter += 1
            if re.match(r'#', ''.join(row)):
                continue

            (interaction_num, interaction_type, interaction_subtype,
             summary, citation) = row[0:5]
            interaction_id = 'WormBase:' + interaction_num

            interaction_type_id = None
            if interaction_type in predicate_key:
                interaction_type_id = \
                    self.globaltt[predicate_key[interaction_type]]
            else:
                # NOTE(review): the row is still loaded, with a None
                # predicate - confirm whether it should be skipped
                logger.info(
                    "An interaction type I don't understand %s",
                    interaction_type)

            # after the five leading columns every participant
            # contributes three columns
            num_interactors = (len(row) - 5) / 3
            if num_interactors != 2:
                logger.info(
                    "Skipping interactions with !=2 participants:\n %s",
                    str(row))
                continue

            gene_a_num = row[5]
            gene_b_num = row[8]
            gene_a_id = 'WormBase:' + gene_a_num
            gene_b_id = 'WormBase:' + gene_b_num

            # process_feature_loc strips the 'WormBase:' prefix before
            # looking an id up in test_ids, so the bare identifiers are
            # compared here as well; testing the prefixed ids made test
            # mode skip every row
            if self.testMode \
                    and gene_a_num not in self.test_ids['gene'] \
                    and gene_b_num not in self.test_ids['gene']:
                continue

            assoc = InteractionAssoc(
                graph, self.name, gene_a_id, gene_b_id,
                interaction_type_id)
            assoc.set_association_id(interaction_id)
            assoc.add_association_to_graph()
            assoc_id = assoc.get_association_id()
            # citation is not a pmid or WBref - get this some other way
            model.addDescription(assoc_id, summary)

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

    return
def _process_omim2gene(self, limit=None):
    """
    Map OMIM identifiers onto KEGG gene identifiers.

    The handling of a row depends on its link type:
    'equivalent' rows are gene-to-gene equivalences,
    'reverse' rows become disease-to-gene associations made through
    a variant locus of the KEGG gene,
    'original' rows are only logged, as is any other link type.

    Triples created:
    <kegg_gene_id> is a Gene
    <omim_gene_id> is a Gene
    <kegg_gene_id>> hasXref <omim_gene_id>

    <assoc_id> has subject <omim_disease_id>
    <assoc_id> has object <kegg_gene_id>

    :param limit: outside test mode, stop once more than this many rows
        have been read; ``None`` reads the whole file
    :return: None
    """
    logger.info("Processing OMIM to KEGG gene")

    target_graph = self.testgraph if self.testMode else self.graph
    model = Model(target_graph)
    geno = Genotype(target_graph)

    source_path = '/'.join((self.rawdir, self.files['omim2gene']['file']))
    with open(source_path, 'r', encoding="iso-8859-1") as handle:
        rows = csv.reader(handle, delimiter='\t', quotechar='\"')
        for rows_read, record in enumerate(rows, start=1):
            (kegg_gene_id, omim_id, link_type) = record

            # in test mode only the listed genes are of interest
            if self.testMode and \
                    kegg_gene_id not in self.test_ids['genes']:
                continue

            kegg_gene_id = 'KEGG-' + kegg_gene_id.strip()
            omim_id = omim_id.replace('omim', 'OMIM')

            if link_type == 'equivalent':
                # both ids denote genes:
                # declare them, then state the equivalence
                model.addClassToGraph(omim_id, None)
                geno.addGene(kegg_gene_id, None)
                model.addEquivalentClass(kegg_gene_id, omim_id)
            elif link_type == 'reverse':
                # associate the OMIM id with a variant locus of the
                # KEGG gene; omim ids are used because
                # they are more atomic than KEGG ids
                variant_locus_id = self._make_variant_locus_id(
                    kegg_gene_id, omim_id)
                variant_locus_label = self.label_hash[variant_locus_id]
                model.addIndividualToGraph(
                    variant_locus_id, variant_locus_label,
                    geno.genoparts['variant_locus'])
                geno.addAffectedLocus(variant_locus_id, kegg_gene_id)
                model.addBlankNodeAnnotation(variant_locus_id)

                # the disease to gene relationship itself
                marker_rel = model.object_properties['is_marker_for']
                G2PAssoc(
                    target_graph, self.name, variant_locus_id, omim_id,
                    marker_rel).add_association_to_graph()
            elif link_type == 'original':
                # these are sometimes a gene, and sometimes a disease
                logger.info(
                    'Unable to handle original link for %s-%s',
                    kegg_gene_id, omim_id)
            else:
                # don't know what these are
                logger.warning(
                    'Unhandled link type for %s-%s: %s',
                    kegg_gene_id, omim_id, link_type)

            if not self.testMode \
                    and limit is not None and rows_read > limit:
                break

    logger.info("Done with OMIM to KEGG gene")
    return
class Reference:
    """
    To model references for associations
    (such as journal articles, books, etc.).

    By default, references will be typed as "documents",
    unless if the type is set otherwise.

    If a short_citation is set, this will be used for the individual's
    label.  We may wish to subclass this later.

    """

    ref_types = {
        'person': 'foaf:Person',
        'journal_article': 'IAO:0000013',
        'publication': 'IAO:0000311',  # book
        'document': 'IAO:0000310',  # document???
        'photograph': 'IAO:0000185',
        'webpage': 'SIO:000302',
    }

    annotation_properties = {'page': 'foaf:page', 'title': 'dc:title'}

    def __init__(self, graph, ref_id=None, ref_type=None):
        """
        :param graph: the Graph the reference is written to
        :param ref_id: curie or URL of the reference; a value starting
            with 'http' is also kept as the reference URL
        :param ref_type: type identifier of the reference;
            defaults to ``ref_types['document']``
        :raises ValueError: if ``graph`` is not a Graph
        """
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            # str has no attribute 'graph'; the original expression
            # raised AttributeError before the ValueError could be built
            raise ValueError("{} is not a graph".format(graph))
        self.ref_id = ref_id
        self.ref_url = None
        self.title = None
        self.year = None
        self.author_list = None
        self.short_citation = None

        self.model = Model(self.graph)

        if ref_type is None:
            self.ref_type = self.ref_types['document']
        else:
            self.ref_type = ref_type

        if ref_id is not None and re.match(r'http', ref_id):
            self.ref_url = ref_id

        return

    def setTitle(self, title):
        """Set the title written by addRefToGraph."""
        self.title = title
        return

    def setYear(self, year):
        """Set the publication year (stored, not yet written)."""
        self.year = year
        return

    def setType(self, reference_type):
        """Replace the type identifier of the reference."""
        self.ref_type = reference_type
        return

    def setAuthorList(self, author_list):
        """

        :param author_list: Array of authors
        :return:
        """

        self.author_list = author_list
        return

    def addAuthor(self, author):
        """
        Append one author to the author list.

        :param author: the author to add
        :return:
        """
        # the list starts out as None; create it on first use instead of
        # failing with a TypeError on 'None += [...]'
        if self.author_list is None:
            self.author_list = []
        self.author_list.append(author)
        return

    def setShortCitation(self, citation):
        """Set the short citation, preferred over the title as label."""
        self.short_citation = citation
        return

    def addPage(self, subject_id, page_url):
        """Add ``subject_id foaf:page page_url`` with a literal object."""
        self.graph.addTriple(
            subject_id, self.annotation_properties['page'],
            page_url, object_is_literal=True)
        return

    def addTitle(self, subject_id, title):
        """Add ``subject_id dc:title title`` with a literal object."""
        self.graph.addTriple(
            subject_id, self.annotation_properties['title'],
            title, object_is_literal=True)
        return

    def addRefToGraph(self):
        """
        Write the reference to the graph.

        The label is the short citation or, when that is unset, the
        title.  A reference with a URL is typed and labelled under that
        URL; otherwise it is added as an individual under its id.
        An error is logged when neither is available.
        """
        n = self.short_citation
        if n is None:
            n = self.title

        if self.ref_url is not None:
            # only write a title that exists, as the id branch does
            if self.title is not None:
                self.addTitle(self.ref_url, self.title)
            self.model.addType(self.ref_url, self.ref_type)
            self.model.addLabel(self.ref_url, n)
        elif self.ref_id is not None:
            self.model.addIndividualToGraph(self.ref_id, n, self.ref_type)
            if self.title is not None:
                self.addTitle(self.ref_id, self.title)
        else:
            # should never be true
            logger.error("You are missing an identifier for a reference.")

        # TODO what is the property here to add the date?
        # if self.year is not None:
        #    gu.addTriple()

        # if self.author_list is not None:
        #    for a in self.author_list:
        #        gu.addTriple(
        #           g, self.ref_id, self.props['has_author'], a, True)
        return
def _process_ortholog_classes(self, limit=None):
    """
    This method adds the KEGG orthology classes to the graph.

    If there's an embedded enzyme commission number,
    that is added as an xref.

    Triples created:
    <orthology_class_id> is a class
    <orthology_class_id> has label <orthology_symbols>
    <orthology_class_id> has description <orthology_description>

    :param limit: stop after more than this many rows were read
        (ignored in test mode)
    :return:
    """

    logger.info("Processing ortholog classes")
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    model = Model(g)
    line_counter = 0
    raw = '/'.join((self.rawdir, self.files['ortholog_classes']['file']))
    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            (orthology_class_id, orthology_class_name) = row

            if self.testMode and \
                    orthology_class_id not in \
                    self.test_ids['orthology_classes']:
                continue

            # The orthology class is essentially a KEGG gene ID
            # that is species agnostic.
            # Add the ID and label as a gene family class

            other_labels = re.split(r'[;,]', orthology_class_name)
            # the first one is the label we'll use
            orthology_label = other_labels[0]

            orthology_class_id = 'KEGG-'+orthology_class_id.strip()

            orthology_type = OrthologyAssoc.terms['gene_family']
            model.addClassToGraph(
                orthology_class_id, orthology_label, orthology_type)
            if len(other_labels) > 1:
                # add the rest as synonyms; the first entry is already
                # the class label, so it is not repeated as a synonym
                for synonym in other_labels[1:]:
                    model.addSynonym(orthology_class_id, synonym.strip())

                # add the last one as the description
                d = other_labels[-1]
                model.addDescription(orthology_class_id, d)

                # add the enzyme commission number (EC:1.2.99.5) as an xref
                # sometimes there's two, like [EC:1.3.5.1 1.3.5.4]
                # can also have a dash, like EC:1.10.3.-
                # (re.findall always returns a list, possibly empty)
                for ecm in re.findall(r'((?:\d+|\.|-){5,7})', d):
                    model.addXref(orthology_class_id, 'EC:'+ecm)

            if not self.testMode and \
                    limit is not None and line_counter > limit:
                break

    logger.info("Done with ortholog classes")
    return
class Reference:
    """
    To model references for associations
    (such as journal articles, books, etc.).

    By default, references will be typed as "documents",
    unless the type is set otherwise.

    If a short_citation is set, this will be used for the individual's label.
    We may wish to subclass this later.
    """

    def __init__(self, graph, ref_id=None, ref_type=None):
        """
        :param graph: the Graph instance the reference triples are written to
        :param ref_id: identifier of the reference; when it begins with
            'http' it is also kept as the reference URL
        :param ref_type: type CURIE; defaults to the global term 'document'.
            A warning is logged when it does not start with 'IAO:' or 'SIO:'
        :raises ValueError: if graph is not a Graph instance
        """
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            # format the message here; exceptions do not interpolate
            # logging-style arguments
            raise ValueError("{} is not a graph".format(graph))

        # assert ref_id is not None
        self.ref_id = ref_id
        self.ref_url = None
        self.title = None
        self.year = None
        self.author_list = None
        self.short_citation = None

        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map

        if ref_type is None:
            self.ref_type = self.globaltt['document']
        else:
            self.ref_type = ref_type
            if ref_type[:4] not in ('IAO:', 'SIO:'):
                LOG.warning("Got Pub ref type of:  %s", ref_type)

        if ref_id is not None and ref_id[:4] == 'http':
            self.ref_url = ref_id

    def setTitle(self, title):
        self.title = title

    def setYear(self, year):
        self.year = year

    def setType(self, reference_type):
        self.ref_type = reference_type

    def setAuthorList(self, author_list):
        """
        :param author_list: Array of authors
        :return:
        """
        self.author_list = author_list

    def addAuthor(self, author):
        """
        Append a single author, creating the author list on first use.

        :param author: the author to add
        :return:
        """
        # author_list starts out as None, which cannot be extended
        if self.author_list is None:
            self.author_list = []
        self.author_list.append(author)

    def setShortCitation(self, citation):
        self.short_citation = citation

    def addPage(self, subject_id, page_url):
        """Write <subject_id> page <page_url> with a literal object."""
        self.graph.addTriple(
            subject_id,
            self.globaltt['page'],   # foaf:page  not  <sio:web page>
            page_url,
            object_is_literal=True)

    def addTitle(self, subject_id, title):
        """
        Write <subject_id> 'title (dce)' <title> with a literal object.
        Nothing is written when the title is None or empty.
        """
        if title is not None and title != '':
            self.graph.addTriple(
                subject_id,
                self.globaltt['title (dce)'],
                title,
                object_is_literal=True)

    def addRefToGraph(self):
        """
        Write the reference to the graph.

        The label is the short citation when one is set, otherwise the title.
        A reference with a URL is typed (and, when available, titled and
        labelled) on that URL; otherwise it is added as an individual
        under its identifier.
        Without either, an error is logged and nothing is written.
        """
        cite = self.short_citation
        if cite is None and self.title is not None:
            cite = self.title

        if self.ref_url is not None:
            if self.title is not None:
                self.addTitle(self.ref_url, self.title)
            self.model.addType(self.ref_url, self.ref_type)
            if cite is not None:
                self.model.addLabel(self.ref_url, cite)
        elif self.ref_id is not None:
            self.model.addIndividualToGraph(self.ref_id, cite, self.ref_type)
            if self.title is not None:
                self.addTitle(self.ref_id, self.title)
        else:
            # should never be true
            LOG.error("You are missing an identifier for a reference.")

        # TODO what is the property here to add the date?
        # if self.year is not None:
        #    gu.addTriple()

        # if self.author_list is not None:
        #    for auth in self.author_list:
        #        gu.addTriple(
        #           graph, self.ref_id, self.props['has_author'], auth, True)
class Dataset:
    """
    This class produces metadata about a dataset that is compliant with the
    HCLS dataset specification:
    https://www.w3.org/TR/2015/NOTE-hcls-dataset-20150514/#s4_4

    Summary level: The summary level provides a description of a dataset that is
    independent of a specific version or format. (e.g. the Monarch ingest of CTD)
    CURIE for this is something like MonarchData:[SOURCE IDENTIFIER]

    Version level: The version level captures version-specific characteristics of a
    dataset. (e.g. the 01-02-2018 ingest of CTD)
    CURIE for this is something like MonarchData:[SOURCE IDENTIFIER_INGESTTIMESTAMP]

    Distribution level: The distribution level captures metadata about a specific form
    and version of a dataset (e.g. turtle file for 01-02-2018 ingest of CTD). There is
    a [distribution level resource] for each different downloadable file we emit,
    i.e. one for the TTL file, one for the ntriples file, etc.
    CURIE for this is like MonarchData:[SOURCE IDENTIFIER_INGESTTIMESTAMP].ttl
    or
    MonarchData:[SOURCE IDENTIFIER_INGESTTIMESTAMP].nt
    or
    MonarchData:[SOURCE IDENTIFIER_INGESTTIMESTAMP].[whatever file format]

    We write out at least the following triples:

    SUMMARY LEVEL TRIPLES:
    [summary level resource] - rdf:type -> dctypes:Dataset
    [summary level resource] - dct:title -> title (literal)
    [summary level resource] - dct:description -> description (literal)
                                                (use docstring from Source class)
    [summary level resource] - dcterms:source -> [source web page, e.g. omim.org]
    [summary level resource] - schemaorg:logo -> [source logo IRI]
                                                (only when a logo is supplied)
    [summary level resource] - dct:publisher -> monarchinitiative.org
        n.b: about summary level resource triples:
        -- HCLS spec says we "should" link to our logo and web page, but I'm not,
        because it would confuse the issue of whether we are pointing to our logo/page
        or the logo/page of the data source for this ingest. Same below for
        [version level resource] and [distribution level resource] - I'm not linking
        to our page/logo down there either.
        - spec says we "should" include summary level triples describing Update
        frequency and SPARQL endpoint but I'm omitting this for now, because these are
        not clearly defined at the moment

    VERSION LEVEL TRIPLES:
    [version level resource] - rdf:type -> dctypes:Dataset
    [version level resource] - dct:title -> version title (literal)
    [version level resource] - dct:description -> version description (literal)
    [version level resource] - dct:created -> ingest timestamp [ISO 8601 compliant]
    [version level resource] - pav:version -> ingest timestamp (same one above)
    [version level resource] - dct:creator	-> monarchinitiative.org
    [version level resource] - dct:publisher -> monarchinitiative.org
    [version level resource] - dct:isVersionOf -> [summary level resource]
    [version level resource] - dcterms:source -> [source file 1 IRI]
    [version level resource] - dcterms:source -> [source file 2 IRI]
    ...

    [source file 1 IRI] - pav:retrievedOn -> [download date timestamp]
    [source file 1 IRI] - pav:version -> [source version (if set, optional)]
    [source file 2 IRI] - pav:retrievedOn -> [download date timestamp]
    [source file 2 IRI] - pav:version -> [source version (if set, optional)]
    ...

    [version level resource] - pav:createdWith -> [Dipper github URI]
    [version level resource] - void:dataset -> [distribution level resource]

    [version level resource] - cito:citesAsAuthority -> [citation id 1]
    [version level resource] - cito:citesAsAuthority -> [citation id 2]
    [version level resource] - cito:citesAsAuthority -> [citation id 3]

        n.b: about version level resource triples:
        - spec says we "should" include Date of issue/dct:issued triple, but I'm not
        because it is redundant with this triple above:
        [version level resource] - dct:created -> time stamp
        and would introduce ambiguity and confusion if the two disagree. Same below
        for [distribution level resource] - dct:created -> time stamp below
        Also omitting:
          - triples linking to our logo and page, see above.
          - License/dct:license triple, because we will make this triple via the
            [distribution level resource] below
          - Language/dct:language triple b/c it seems superfluous. Same below for
            [distribution level resource] - no language triple.
        - [version level resource] - pav:version triple is also a bit redundant
        with the pav:version triple below, but the spec requires both these triples
        - I'm omitting the [version level resource] -> pav:previousVersion because
        Dipper doesn't know this info for certain at run time. Same below for
        [distribution level resource] - pav:previousVersion.

    DISTRIBUTION LEVEL TRIPLES:
    [distribution level resource] - rdf:type -> dctypes:Dataset
    [distribution level resource] - rdf:type -> dcat:Distribution
    [distribution level resource] - dct:title -> distribution title (literal)
    [distribution level resource] - dct:description -> distribution description (lit.)
    [distribution level resource] - dct:created -> ingest timestamp[ISO 8601 compliant]
    [distribution level resource] - pav:version -> ingest timestamp (same as above)
    [distribution level resource] - dct:creator -> monarchinitiative.org
    [distribution level resource] - dct:publisher -> monarchinitiative.org
    [distribution level resource] - dct:license -> [license info, if available
                    otherwise indicate unknown]
    [distribution level resource] - dcterms:rights -> [data rights IRI]
    [distribution level resource] - pav:createdWith -> [Dipper github URI]
    [distribution level resource] - dct:format -> [IRI of ttl|nt|whatever spec]
    [distribution level resource] - dct:downloadURL -> [ttl|nt URI]
    [distribution level resource] - void:triples -> [triples count (literal)]
    [distribution level resource] - void:entities -> [entities count (literal)]
    [distribution level resource] - void:distinctSubjects -> [subject count (literal)]
    [distribution level resource] - void:distinctObjects -> [object count (literal)]
    [distribution level resource] - void:properties -> [properties count (literal)]
    ...

        n.b: about distribution level resource triples:
        - omitting Vocabularies used/void:vocabulary and Standards
        used/dct:conformTo triples, because they are described in the ttl file
        - also omitting Example identifier/idot:exampleIdentifier and
        Example resource/void:exampleResource, because we don't really have one
        canonical example of either - they're all very different.
        - [distribution level resource] - dct:created should have the exact same
        time stamp as this triple above:
        [version level resource] - dct:created -> time stamp
        - this [distribution level resource] - pav:version triple should have the
        same object as [version level resource] - pav:version triple above
        - Data source provenance/dct:source triples are above in the
        [version level resource]
        - omitting Byte size/dct:byteSize, RDF File URL/void:dataDump, and
        Linkset/void:subset triples because they probably aren't necessary for MI right
        now
        - these triples "should" be emitted, but we will do this in a later iteration:
        # of classes	void:classPartition	IRI
        # of literals	void:classPartition	IRI
        # of RDF graphs	void:classPartition	IRI

    Note: Do not use blank nodes in the dataset graph. This dataset graph is added to
    the main Dipper graph in Source.write() like so

        $ mainGraph = mainGraph + datasetGraph

    which apparently in theory could lead to blank node ID collisions between the two
    graphs.

    Note also that this implementation currently does not support producing metadata
    for StreamedGraph graphs (see dipper/graph/StreamedGraph.py). StreamedGraph is
    currently not being used for any ingests, so this isn't a problem. There was
    talk of using StreamedGraph for a rewrite/refactor of the Clinvar ingest, which
    would probably require adding support here for StreamedGraph's.
    """

    def __init__(
            self,
            identifier,
            data_release_version,
            ingest_name,
            ingest_title,
            ingest_url,
            ingest_logo=None,
            ingest_description=None,
            license_url=None,
            data_rights=None,
            graph_type='rdf_graph',  # rdf_graph, streamed_graph
            file_handle=None,
            distribution_type='ttl',
            dataset_curie_prefix='MonarchArchive'):
        """
        Build the dataset graph and write the summary, version and
        distribution level triples into it.

        :param identifier: source identifier, used as the local part of the
            dataset CURIEs
        :param data_release_version: release version string; when None,
            today's date formatted as YYYYMMDD is used
        :param ingest_name: name of the ingest, used in the download URL
        :param ingest_title: title of the ingest; when None the
            "<prefix>:<identifier>" CURIE is used instead
        :param ingest_url: source web page (no source triple when None)
        :param ingest_logo: logo file name appended to the 'MonarchLogoRepo'
            prefix; no logo triple is written when None
        :param ingest_description: description literal (optional)
        :param license_url: license IRI; an "unknown license" IRI is written
            when None
        :param data_rights: data rights IRI (optional)
        :param graph_type: None, 'rdf_graph' or 'streamed_graph'
        :param file_handle: passed through to StreamedGraph
        :param distribution_type: file type suffix of the distribution
        :param dataset_curie_prefix: CURIE prefix of the dataset resources
        :raises ValueError: if graph_type is not one of the supported values
        """
        dataset_curie = ':'.join([dataset_curie_prefix, identifier])

        if graph_type is None:
            self.graph = RDFGraph(None, dataset_curie)
        elif graph_type == 'streamed_graph':
            self.graph = StreamedGraph(
                True, dataset_curie, file_handle=file_handle)
        elif graph_type == 'rdf_graph':
            self.graph = RDFGraph(True, dataset_curie)
        else:
            # without this, self.graph would silently stay unset and the
            # failure would only surface as an AttributeError further down
            raise ValueError(
                "Unknown graph_type '{}'".format(graph_type))

        if data_release_version is not None:
            self.data_release_version = data_release_version
        else:
            self.data_release_version = datetime.today().strftime("%Y%m%d")

        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map
        self.identifier = dataset_curie
        self.citation = set()

        self.ingest_name = ingest_name
        self.ingest_title = ingest_title
        if self.ingest_title is None:
            self.ingest_title = dataset_curie

        self.ingest_url = ingest_url
        # ingest_logo is optional; concatenating None would raise TypeError
        if ingest_logo is not None:
            self.ingest_logo = \
                self.curie_map.get('MonarchLogoRepo') + ingest_logo
        else:
            self.ingest_logo = None
        self.ingest_description = ingest_description

        self.date_issued = None

        self.license_url = license_url
        self.data_rights = data_rights
        self.distribution_type = distribution_type

        # set HCLS resource CURIEs
        self.summary_level_curie = ':'.join(
            [dataset_curie_prefix, '#' + identifier])
        self.version_level_curie = \
            dataset_curie_prefix + ':' + \
            self.data_release_version + \
            '/#' + identifier
        self.distribution_level_turtle_curie = \
            dataset_curie_prefix + ':' + \
            self.data_release_version + \
            '/rdf/' + \
            identifier + "." + self.distribution_type

        # The following might seem a little odd, but we need to set downloadURLs this
        # way in order for them to point to where they will end up in archive.MI.org as
        # of Sept 2019. URL is:
        #  https://archive.MI.org/[release version]/[dist type]/[source].[dist type]
        self.download_url = \
            self.curie_map.get("MonarchArchive") + self.data_release_version + \
            "/rdf/" + self.ingest_name + "." + self.distribution_type

        self._set_summary_level_triples()
        self._set_version_level_triples()
        self._set_distribution_level_triples()

    def _set_summary_level_triples(self):
        """Write the triples describing the summary level resource."""
        self.model.addType(self.summary_level_curie, self.globaltt['Dataset'])
        self.graph.addTriple(self.summary_level_curie, self.globaltt['title'],
                             self.ingest_title, True)
        self.model.addTriple(self.summary_level_curie, self.globaltt['Publisher'],
                             self.curie_map.get(""))
        if self.ingest_logo is not None:
            self.model.addTriple(self.summary_level_curie, "schemaorg:logo",
                                 self.ingest_logo)
        self.graph.addTriple(self.summary_level_curie, self.globaltt['identifier'],
                             self.summary_level_curie)
        if self.ingest_url is not None:
            self.graph.addTriple(self.summary_level_curie,
                                 self.globaltt["Source (dct)"],
                                 self.ingest_url)
        if self.ingest_description is not None:
            self.model.addDescription(self.summary_level_curie,
                                      self.ingest_description)

    def _set_version_level_triples(self):
        """Write the triples describing the version level resource."""
        self.model.addType(self.version_level_curie, self.globaltt['Dataset'])
        self.graph.addTriple(
            self.version_level_curie, self.globaltt['title'],
            self.ingest_title + " Monarch version " + self.data_release_version,
            True)
        if self.ingest_description is not None:
            self.model.addDescription(self.version_level_curie,
                                      self.ingest_description)
        self.graph.addTriple(
            self.version_level_curie, self.globaltt['created'],
            Literal(self.data_release_version, datatype=XSD.date))
        self.graph.addTriple(
            self.version_level_curie, self.globaltt['version'],
            Literal(self.data_release_version, datatype=XSD.date))
        self.graph.addTriple(self.version_level_curie, self.globaltt['creator'],
                             self.curie_map.get(""))  # eval's to MI.org
        self.graph.addTriple(self.version_level_curie, self.globaltt['Publisher'],
                             self.curie_map.get(""))  # eval's to MI.org
        self.graph.addTriple(self.version_level_curie, self.globaltt['isVersionOf'],
                             self.summary_level_curie, object_is_literal=False)
        self.graph.addTriple(self.version_level_curie,
                             self.globaltt['distribution'],
                             self.distribution_level_turtle_curie,
                             object_is_literal=False)

    def _set_distribution_level_triples(self):
        """
        Write the triples describing the distribution level resource,
        then declare the dataset as an ontology.
        """
        self.model.addType(self.distribution_level_turtle_curie,
                           self.globaltt['Dataset'])
        self.model.addType(self.distribution_level_turtle_curie,
                           self.globaltt['distribution'])
        self.graph.addTriple(
            self.distribution_level_turtle_curie, self.globaltt['title'],
            self.ingest_title + " distribution " + self.distribution_type,
            True)
        if self.ingest_description is not None:
            self.model.addDescription(self.distribution_level_turtle_curie,
                                      self.ingest_description)
        self.graph.addTriple(
            self.distribution_level_turtle_curie, self.globaltt['version'],
            Literal(self.data_release_version, datatype=XSD.date))
        self.graph.addTriple(
            self.distribution_level_turtle_curie, self.globaltt['created'],
            Literal(self.data_release_version, datatype=XSD.date))
        self.graph.addTriple(self.distribution_level_turtle_curie,
                             self.globaltt['creator'],
                             self.curie_map.get(""))  # eval's to MI.org
        self.graph.addTriple(self.distribution_level_turtle_curie,
                             self.globaltt['Publisher'],
                             self.curie_map.get(""))  # eval's to MI.org
        self.graph.addTriple(self.distribution_level_turtle_curie,
                             self.globaltt['created_with'],
                             "https://github.com/monarch-initiative/dipper")
        # NOTE(review): the format IRI is always turtle, whatever
        # distribution_type was requested
        self.graph.addTriple(self.distribution_level_turtle_curie,
                             self.globaltt['format'],
                             "https://www.w3.org/TR/turtle/")
        self.graph.addTriple(self.distribution_level_turtle_curie,
                             self.globaltt['downloadURL'],
                             self.download_url)
        if self.license_url is None:
            self.graph.addTriple(
                self.distribution_level_turtle_curie, self.globaltt['license'],
                'https://project-open-data.cio.gov/unknown-license/')
        else:
            self.graph.addTriple(
                self.distribution_level_turtle_curie, self.globaltt['license'],
                self.license_url)

        if self.data_rights is not None:
            self.graph.addTriple(
                self.distribution_level_turtle_curie, self.globaltt['rights'],
                self.data_rights)

        self._declare_as_ontology()

    def set_ingest_source_file_version_num(self, file_iri, version):
        """
        This method sets the version of a remote file or resource that is used in the
        ingest. It writes this triple:

        file_iri - 'pav:version' -> version

        Version is an untyped literal

        Note: if your version is a date or timestamp, use
        set_ingest_source_file_version_date()
        instead

        :param file_iri: a remote file or resource used in ingest
        :param version: a number or string (e.g. v1.2.3) that the source (OMIM, CTD)
        uses to refer to this version of the file/resource used during the ingest
        :return: None
        """
        self.graph.addTriple(file_iri, self.globaltt['version'], version,
                             object_is_literal=True)

    def set_ingest_source_file_version_date(self, file_iri, date,
                                            datatype=XSD.date):
        """
        This method sets the version that the source (OMIM, CTD, whatever) uses to
        refer to this version of the remote file/resource that was used in the ingest

        It writes this triple:

        file_iri - 'pav:version' -> date or timestamp

        Version is added as a literal of datatype XSD date

        Note: if file_iri was retrieved using get_files(), then the following triple
        was created and you might not need this method:

        file_iri - 'pav:retrievedOn' -> download date

        :param file_iri: a remote file or resource used in ingest
        :param date: a date in YYYYMMDD format that the source (OMIM, CTD)
        uses to refer to this version of the file/resource used during the ingest.
        You can add timestamp as a version by using a different datatype (below)
        :param datatype: an XSD literal datatype, default is XSD.date
        :return: None
        """
        self.graph.addTriple(file_iri, self.globaltt['version'], date,
                             object_is_literal=True, literal_type=datatype)

    def set_ingest_source_file_version_retrieved_on(self, file_iri, date,
                                                    datatype=XSD.date):
        """
        This method sets the date on which a remote file/resource (from OMIM, CTD, etc)
        was retrieved.

        It writes this triple:

        file_iri - 'pav:retrievedOn' -> date or timestamp

        Version is added as a literal of datatype XSD date by default

        Note: if file_iri was retrieved using get_files(), then the following triple
        was created and you might not need this method:

        file_iri - 'pav:retrievedOn' -> download date

        :param file_iri: a remote file or resource used in ingest
        :param date: the date (YYYYMMDD format) on which the file/resource
        was retrieved.
        You can add a timestamp instead by using a different datatype (below)
        :param datatype: an XSD literal datatype, default is XSD.date
        :return: None
        """
        self.graph.addTriple(file_iri, self.globaltt['retrieved_on'], date,
                             object_is_literal=True, literal_type=datatype)

    def set_ingest_source(self, url, predicate=None, is_object_literal=False):
        """
        This method writes a triple to the dataset graph indicating that the ingest
        used a file or resource at [url] during the ingest.

        Triple emitted is version_level_curie dcterms:source [url]

        This triple is likely to be redundant if Source.get_files() is used to retrieve
        the remote files/resources, since this triple should also be emitted
        as files/resources are being retrieved. This method is provided as a convenience
        method for sources that do their own downloading of files.

        :param url: a remote resource used as a source during ingest
        :param predicate: the predicate to use for the triple ["dcterms:source"]
                from spec (https://www.w3.org/TR/2015/NOTE-hcls-dataset-20150514/)
                "Use dct:source when the source dataset was used in whole or in part.
                Use pav:retrievedFrom when the source dataset was used in whole and was
                not modified from its original distribution. Use prov:wasDerivedFrom
                when the source dataset was in whole or in part and was modified from
                its original distribution."
        :param is_object_literal: whether [url] is written as a literal
        :return: None
        """
        if predicate is None:
            predicate = self.globaltt["Source (dct)"]
        self.graph.addTriple(self.version_level_curie,
                             predicate,
                             url,
                             object_is_literal=is_object_literal)

    def get_graph(self):
        """
        This method returns the dataset graph
        :param
        :return: dataset graph
        """
        return self.graph

    def get_license(self):
        """
        This method returns the license info
        :param
        :return: license info
        """
        return self.license_url

    def set_citation(self, citation_id):
        """
        This method adds [citation_id] argument to the set of citations, and also
        adds a triple indicating that version level cito:citesAsAuthority [citation_id]
        :param: citation_id
        :return: none
        """
        self.citation.add(citation_id)
        self.graph.addTriple(
            self.version_level_curie, self.globaltt['citesAsAuthority'],
            citation_id)

    def _declare_as_ontology(self, version_info=None):
        """
        Declare the summary level IRI as an ontology, and also make triple

        summary level IRI - version_iri -> version level IRI

        When version_info is given it is attached to the distribution level IRI.

        TEC: I am not convinced dipper reformatting external data as RDF triples
        makes an OWL ontology (nor that it should be considered a goal).

        Proper ontologies are built by ontologists. Dipper reformats data
        and annotates/decorates it with a minimal set of carefully arranged
        terms drawn from multiple proper ontologies.
        Which allows the whole (dipper's RDF triples and parent ontologies)
        to function as a single ontology we can reason over when combined
        in a store such as SciGraph.

        Including more than the minimal ontological terms in dipper's RDF
        output constitutes a liability as it allows greater divergence
        between dipper artifacts and the proper ontologies.

        :param version_info: a string describing version info for the ontology
        :return:

        """
        model = Model(self.graph)
        model.addOntologyDeclaration(self.summary_level_curie)
        model.addOWLVersionIRI(self.summary_level_curie,
                               self.version_level_curie)
        if version_info is not None:
            model.addOWLVersionInfo(self.distribution_level_turtle_curie,
                                    version_info)

    @staticmethod
    def make_id(long_string, prefix='MONARCH'):
        """
        A method to create DETERMINISTIC identifiers
        based on a string's digest. currently implemented with sha1
        Duplicated from Source.py to avoid circular imports.
        :param long_string: string to use to generate identifier
        :param prefix: prefix to prepend to identifier [Monarch]
        :return: a Monarch identifier
        """
        return ':'.join((prefix, Dataset.hash_id(long_string)))

    @staticmethod
    def hash_id(word):  # same as graph/GraphUtils.digest_id(wordage)
        """
        Given a string, make a hash
        Duplicated from Source.py.

        The result is the letter 'b' followed by characters 1..19 of the
        sha1 hex digest of the utf-8 encoded string.

        :param word:  str string to be hashed
        :return: hash of id
        """
        return 'b' + hashlib.sha1(word.encode('utf-8')).hexdigest()[1:20]
def _process_kegg_disease2gene(self, limit=None):
    """
    Create associations between KEGG diseases and the genes linked to them.

    We are being conservative here: only diseases that are absent from
    self.kegg_disease_hash (i.e. have no mapping to OMIM) are processed,
    and diseases whose label contains 'includ' are skipped as
    grouping classes.

    Triples created:
    <alternate_locus> is an Individual
    <alternate_locus> has type <variant_locus>
    <alternate_locus> is an allele of  <gene_id>
    <assoc_id> has subject <disease_id>
    <assoc_id> has object <gene_id>

    :param limit: stop after more than this many rows were read
        (ignored in test mode)
    :return:
    """

    logger.info("Processing KEGG disease to gene")
    target_graph = self.testgraph if self.testMode else self.graph
    model = Model(target_graph)
    geno = Genotype(target_graph)
    marker_relation = model.object_properties['is_marker_for']
    diseases_without_omim = set()
    infile = '/'.join((self.rawdir, self.files['disease_gene']['file']))

    with open(infile, 'r', encoding="iso-8859-1") as tsvfile:
        rows = csv.reader(tsvfile, delimiter='\t', quotechar='\"')
        for rows_read, row in enumerate(rows, start=1):
            (gene_id, disease_id) = row

            if self.testMode and gene_id not in self.test_ids['genes']:
                continue

            gene_id = 'KEGG-' + gene_id.strip()
            disease_id = 'KEGG-' + disease_id.strip()

            # only add diseases for which
            # there is no omim id and not a grouping class
            if disease_id not in self.kegg_disease_hash:
                if disease_id in self.label_hash:
                    disease_label = self.label_hash[disease_id]
                else:
                    disease_label = None

                # they use 'including' when it's a grouping class
                if re.search(r'includ', str(disease_label)):
                    logger.info(
                        "Skipping this association because "
                        "it's a grouping class: %s",
                        disease_label)
                    continue

                # type this disease_id as a disease
                model.addClassToGraph(disease_id, disease_label, 'DOID:4')
                diseases_without_omim.add(disease_id)

                variant_locus_id = self._make_variant_locus_id(
                    gene_id, disease_id)
                variant_locus_label = self.label_hash[variant_locus_id]
                model.addIndividualToGraph(
                    variant_locus_id, variant_locus_label,
                    geno.genoparts['variant_locus'])
                geno.addAffectedLocus(variant_locus_id, gene_id)
                model.addBlankNodeAnnotation(variant_locus_id)

                # Add the disease to gene relationship.
                association = G2PAssoc(
                    target_graph, self.name, variant_locus_id, disease_id,
                    marker_relation)
                association.add_association_to_graph()

            limit_reached = limit is not None and rows_read > limit
            if not self.testMode and limit_reached:
                break

    logger.info("Done with KEGG disease to gene")
    logger.info(
        "Found %d diseases with no omim id", len(diseases_without_omim))
    return
def _parse_g2p_file(self, limit=None):
    """
    Parse gene to XPO file, currently custom for Monarch

    For every row the gene and its taxon are added, and a gene to
    phenotype association is written together with its evidence and
    source when those columns are non-empty.

    :param limit: stop once the reader's line number exceeds this value
        (ignored in test mode)
    :return:
    """
    src_key = 'g2p_assertions'
    geno = Genotype(self.graph)
    model = Model(self.graph)

    columns = self.files[src_key]['columns']
    raw = '/'.join((self.rawdir, self.files[src_key]['file']))

    # Column positions never change between rows, so resolve them once
    # instead of searching the column list again for every row.
    subject_col = columns.index('SUBJECT')
    subject_label_col = columns.index('SUBJECT_LABEL')
    subject_taxon_col = columns.index('SUBJECT_TAXON')
    object_col = columns.index('OBJECT')
    relation_col = columns.index('RELATION')
    evidence_col = columns.index('EVIDENCE')
    source_col = columns.index('SOURCE')
    # unused columns: SUBJECT_TAXON_LABEL, OBJECT_LABEL, RELATION_LABEL,
    # EVIDENCE_LABEL, IS_DEFINED_BY, QUALIFIER

    LOG.info("Processing Gene to XPO associations")

    with open(raw, 'r', encoding="utf8") as csvfile:
        reader = csv.reader(csvfile, delimiter='\t')

        # File has headers
        header = next(reader, None)
        if header is None:
            LOG.warning("No header row found in %s, nothing to parse", raw)
            return
        if not self.check_fileheader(columns, header):
            # a header mismatch is tolerated, but no longer silently
            LOG.warning("Unexpected header in %s, parsing anyway", raw)

        for row in reader:
            gene = row[subject_col]
            gene_label = row[subject_label_col]
            gene_taxon = row[subject_taxon_col]
            phenotype_curie = row[object_col]
            relation = row[relation_col]
            evidence = row[evidence_col]
            source = row[source_col]

            # the relation column uses '_' where a CURIE has ':'
            relation_curie = relation.replace('_', ':')

            geno.addGene(gene, gene_label)
            geno.addTaxon(gene_taxon, gene)

            assoc = G2PAssoc(
                self.graph,
                self.name,
                entity_id=gene,
                phenotype_id=phenotype_curie,
                rel=relation_curie
            )

            if evidence:
                assoc.add_evidence(evidence)

            if source:
                model.addType(source, self.globaltt['journal article'])
                assoc.add_source(source)

            assoc.add_association_to_graph()

            if not self.test_mode and limit is not None \
                    and reader.line_num > limit:
                break
def _process_genes(self, limit=None):
    """
    This method processes the KEGG gene IDs.
    The label for the gene is pulled as
    the first symbol in the list of gene symbols;
    the rest are added as synonyms.
    The long-form of the gene name is added as a definition.
    This is hardcoded to just process human genes.

    Triples created:
    <gene_id> is a SO:gene
    <gene_id> rdfs:label <gene_name>

    :param limit: stop after more than this many rows were read
        (ignored in test mode)
    :return:

    """

    logger.info("Processing genes")
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    model = Model(g)
    line_counter = 0
    family = Family(g)
    geno = Genotype(g)
    raw = '/'.join((self.rawdir, self.files['hsa_genes']['file']))
    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            (gene_id, gene_name) = row

            gene_id = 'KEGG-'+gene_id.strip()

            # the gene listing has a bunch of labels
            # that are delimited, as:
            # DST, BP240, BPA, BPAG1, CATX-15, CATX15, D6S1101, DMH, DT,
            # EBSB2, HSAN6, MACF2; dystonin; K10382 dystonin
            # it looks like the list is semicolon delimited
            # (symbol, name, gene_class)
            # where the symbol is a comma-delimited list

            # here, we split them up.
            # we will take the first abbreviation and make it the symbol
            # then take the rest as synonyms

            # split on ';' (the pattern used to be the literal text 'r;',
            # which left the whole listing in one piece)
            gene_stuff = re.split(r';', gene_name)
            symbollist = re.split(r',', gene_stuff[0])
            first_symbol = symbollist[0].strip()

            # the label is recorded even for genes skipped in test mode
            if gene_id not in self.label_hash:
                self.label_hash[gene_id] = first_symbol

            if self.testMode and gene_id not in self.test_ids['genes']:
                continue

            # Add the gene as a class.
            geno.addGene(gene_id, first_symbol)

            # add the long name as the description
            if len(gene_stuff) > 1:
                description = gene_stuff[1].strip()
                model.addDefinition(gene_id, description)

            # add the rest of the symbols as synonyms
            # (the first symbol is already the gene's label)
            for symbol in symbollist[1:]:
                model.addSynonym(gene_id, symbol.strip())

            if len(gene_stuff) > 2:
                ko_part = gene_stuff[2]
                ko_match = re.search(r'K\d+', ko_part)
                if ko_match is not None:
                    # the pattern has no capture group,
                    # so the KO id is the whole match
                    ko = 'KEGG-ko:'+ko_match.group(0)
                    family.addMemberOf(gene_id, ko)

            if not self.testMode and \
                    limit is not None and line_counter > limit:
                break

    logger.info("Done with genes")
    return
def _process_trait_mappings(self, raw, src_key, limit=None):
    """
    Read the comma separated trait mapping file and add each ATO trait
    as a class, linked to the VT, LPT and CMO terms found on the same row.

    Triples created (through the Model helpers):
    the ATO trait is a class carrying its label,
    a 'VT:' term is a class equivalent to the ATO trait,
    'LPT:' and 'CMO:' terms are classes added as xrefs of the ATO trait.

    :param raw: path of the file to read
    :param src_key: key into self.files naming the expected columns
    :param limit: accepted but not used by this method
    :return:
    """
    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)
    col = self.files[src_key]['columns']

    with open(raw, 'r') as csvfile:
        filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
        header = next(filereader, None)
        self.check_fileheader(col, header)

        for line_counter, row in enumerate(filereader, start=1):
            # need to skip the last line
            if len(row) != len(col):
                LOG.info("skipping line %d: %s", line_counter, '\t'.join(row))
                continue

            vto_id = row[col.index('VT')].strip()
            pto_id = row[col.index('LPT')].strip()
            cmo_id = row[col.index('CMO')].strip()
            ato_column = row[col.index('ATO')].strip()
            # unused columns: Species, Class, Type, QTL_Count

            # the id is what precedes the first ']' (with every '['
            # removed) after rewriting the 'ATO #' prefix
            without_open_bracket = re.sub(r'\[', '', ato_column)
            before_close_bracket = re.sub(r'\].*', '', without_open_bracket)
            ato_id = re.sub(
                r'ATO #', 'AQTLTrait:', before_close_bracket).strip()

            # the label is whatever follows the last ']'
            ato_label = re.sub(r'.*\]\s*', '', ato_column)

            model.addClassToGraph(ato_id, ato_label.strip())

            linked_terms = (
                (r'VT:.*', vto_id, model.addEquivalentClass),
                (r'LPT:.*', pto_id, model.addXref),
                (r'CMO:.*', cmo_id, model.addXref),
            )
            for pattern, term_id, link_to_trait in linked_terms:
                if re.match(pattern, term_id):
                    model.addClassToGraph(term_id, None)
                    link_to_trait(ato_id, term_id)

    LOG.info("Done with trait mappings")
    return
class Assoc:
    """
    A base class for OBAN (Monarch)-style associations,
    to enable attribution of source and evidence
    on statements.

    An instance collects the subject, predicate and object of a statement
    plus optional description, evidence, sources, provenance, dates and a
    score, and writes them to the wrapped graph in
    add_association_to_graph().
    """

    def __init__(self, graph, definedby, sub=None, obj=None, pred=None):
        """
        :param graph: a Graph instance; anything else raises ValueError
        :param definedby: the (data) resource that provided the annotation
        :param sub: subject identifier
        :param obj: object identifier
        :param pred: predicate identifier
        """
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            raise ValueError("{} is not a graph".format(graph))
        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map

        # core parts of the association
        self.definedby = definedby
        self.sub = sub
        self.obj = obj
        self.rel = pred
        self.assoc_id = None

        self.description = None
        self.source = []
        self.evidence = []
        self.date = []

        # this is going to be used for the refactored evidence/provenance
        self.provenance = []
        self.score = None
        self.score_type = None
        self.score_unit = None

        return

    def _is_valid(self):
        """
        Check that subject, object and predicate are all set and that the
        subject and predicate start with a known curie prefix or with one of
        '_', 'http', 'https', 'ftp'.

        :return: True when the association is usable
        :raises ValueError: naming the missing or invalid part, with the
            subject, predicate and object interpolated into the message
        """
        # the message must be interpolated here; ValueError itself does not
        # apply %-formatting to extra positional arguments
        triple = (self.sub, self.rel, self.obj)

        # check if sub/obj/rel are none...raise error
        if self.sub is None:
            raise ValueError(
                'No subject set for this association <%s> <%s> <%s>' % triple)
        if self.obj is None:
            raise ValueError(
                'No object set for this association <%s> <%s> <%s>' % triple)
        if self.rel is None:
            raise ValueError(
                'No predicate set for this association <%s> <%s> <%s>' % triple)

        # Are subject & predicate, either a curie or IRI
        iri_schemes = ['_', 'http', 'https', 'ftp']
        pfx = self.sub.split(':')[0]
        if pfx not in self.curie_map and pfx not in iri_schemes:
            raise ValueError(
                'Invalid Subject for this association <%s> <%s> <%s>' % triple)
        pfx = self.rel.split(':')[0]
        if pfx not in self.curie_map and pfx not in iri_schemes:
            raise ValueError(
                'Invalid Predicate for this association <%s> <%s> <%s>' % triple)
        return True

    def add_association_to_graph(self):
        """
        Write the plain triple and the association node describing it,
        together with any description, evidence, sources, provenance,
        dates and score that were collected on this object.

        :raises ValueError: from _is_valid() when a core part is unusable
        """
        if not self._is_valid():
            return

        self.graph.addTriple(self.sub, self.rel, self.obj)

        if self.assoc_id is None:
            self.set_association_id()

        assert self.assoc_id is not None

        self.model.addType(self.assoc_id, self.model.globaltt['association'])

        self.graph.addTriple(
            self.assoc_id, self.globaltt['association has subject'], self.sub)
        self.graph.addTriple(
            self.assoc_id, self.globaltt['association has object'], self.obj)
        self.graph.addTriple(
            self.assoc_id, self.globaltt['association has predicate'], self.rel)

        if self.description is not None:
            self.model.addDescription(self.assoc_id, self.description)

        if self.evidence:
            for evi in self.evidence:
                self.graph.addTriple(
                    self.assoc_id, self.globaltt['has evidence'], evi)

        if self.source:
            for src in self.source:
                # TODO assume that the source is a publication? use Reference class
                self.graph.addTriple(self.assoc_id, self.globaltt['source'], src)

        if self.provenance:
            for prov in self.provenance:
                self.graph.addTriple(
                    self.assoc_id, self.globaltt['has_provenance'], prov)

        if self.date:
            for dat in self.date:
                self.graph.addTriple(
                    self.assoc_id, self.globaltt['created_on'], dat,
                    object_is_literal=True)

        if self.score is not None:
            self.graph.addTriple(
                self.assoc_id, self.globaltt['has measurement value'],
                self.score, True, 'xsd:float')
            # TODO
            # update with some kind of instance of scoring object
            # that has a unit and type

        return

    def add_predicate_object(
            self, predicate, object_node, object_type=None, datatype=None):
        """
        Attach an extra predicate/object pair to the association node.

        :param predicate: predicate identifier
        :param object_node: object identifier or literal value
        :param object_type: pass 'Literal' to write object_node as a literal
        :param datatype: optional datatype, only used for literals
        """
        if object_type == 'Literal':
            if datatype is not None:
                self.graph.addTriple(
                    self.assoc_id, predicate, object_node, True, datatype)
            else:
                self.graph.addTriple(self.assoc_id, predicate, object_node, True)
        else:
            self.graph.addTriple(self.assoc_id, predicate, object_node, False)

        return

    # This isn't java, but predecessors favored the use of property decorators
    # and CamelCase and ...
    def set_subject(self, identifier):
        self.sub = identifier
        return

    def set_object(self, identifier):
        self.obj = identifier
        return

    def set_relationship(self, identifier):
        self.rel = identifier
        return

    def set_association_id(self, assoc_id=None):
        """
        Set the association ID. When no id is supplied it is derived from
        the internal parts of the association (definedby, subject,
        predicate, object); otherwise the supplied external identifier
        is used as is.

        :param assoc_id: optional externally defined association identifier
        :return: the association id now in effect
        """
        if assoc_id is None:
            self.assoc_id = self.make_association_id(
                self.definedby, self.sub, self.rel, self.obj)
        else:
            self.assoc_id = assoc_id

        return self.assoc_id

    def get_association_id(self):
        """
        :return: the association id, generating it first if it is unset
        """
        if self.assoc_id is None:
            self.set_association_id()

        return self.assoc_id

    def set_description(self, description):
        self.description = description
        return

    def set_score(self, score, unit=None, score_type=None):
        self.score = score
        self.score_unit = unit
        self.score_type = score_type
        return

    def add_evidence(self, identifier):
        """
        Add an evidence code to the association object (maintained as a list).
        None and blank identifiers are ignored.

        :param identifier: evidence code identifier
        """
        if identifier is not None and identifier.strip() != '':
            self.evidence += [identifier]
        return

    def add_source(self, identifier):
        """
        Add a source identifier (such as publication id)
        to the association object (maintained as a list).
        None and blank identifiers are ignored.
        TODO we need to greatly expand this function!

        :param identifier: source identifier
        """
        if identifier is not None and identifier.strip() != '':
            self.source += [identifier]
        return

    def add_date(self, date):
        """
        Add a date string to the association; None and blank are ignored.
        """
        if date is not None and date.strip() != '':
            self.date += [date]
        return

    def add_provenance(self, identifier):
        """
        Add a provenance identifier; None and blank are ignored.
        """
        if identifier is not None and identifier.strip() != '':
            self.provenance += [identifier]
        return

    @staticmethod
    def make_association_id(definedby, sub, pred, obj, attributes=None):
        """
        A method to create unique identifiers for OBAN-style associations,
        based on all the parts of the association.
        Items that are None are left out; the remaining values are joined
        with '+' and digested, and the digest is prefixed with 'MONARCH:'.

        Subclasses of Assoc can submit an additional array of attributes
        that will be appended before digesting.

        :param definedby: The (data) resource that provided the annotation
        :param sub: subject identifier
        :param pred: predicate identifier
        :param obj: object identifier
        :param attributes: optional list of further strings to include
        :return: the association identifier as a curie string
        """
        items_to_hash = [definedby, sub, pred, obj]
        if attributes is not None and len(attributes) > 0:
            items_to_hash += attributes

        items_to_hash = [x for x in items_to_hash if x is not None]

        assoc_id = ':'.join(
            ('MONARCH', GraphUtils.digest_id('+'.join(items_to_hash))))

        return assoc_id
def _process_qtls_genomic_location(
        self, raw, src_key, txid, build_id, build_label, common_name, limit=None):
    """
    Read a gzipped, tab separated, header-less (GFF-like) file of QTL
    locations and add each QTL, its trait association and its position
    on a build-specific chromosome to the graph.

    Per data row this adds:
        the QTL as an individual of type 'QTL', in the given taxon
        a G2PAssoc <qtl_id> 'is marker for' <trait_id> carrying the
            'quantitative trait analysis evidence' code, the publication
            (when PUBMED_ID is present) and the P-value as a score
            (when it parses as a finite number)
        a chromosome instance for the build and the QTL feature with
            fuzzy start and end positions on it

    :param raw: path of the gzipped location file
    :param src_key: key into self.files; its 'columns' entry names the columns
    :param txid: taxon number, appended to 'NCBITaxon:'
    :param build_id: identifier of the genome build
    :param build_label: label of the genome build
    :param common_name: prefix used to form the '<common_name>QTL:' curie
    :param limit: stop after this many lines (ignored in test mode)
    :return: None
    """

    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)
    geno = Genotype(graph)
    # assume that chrs get added to the genome elsewhere

    taxon_curie = 'NCBITaxon:' + txid
    eco_id = self.globaltt['quantitative trait analysis evidence']
    LOG.info("Processing QTL locations for %s from %s", taxon_curie, raw)
    with gzip.open(raw, 'rt', encoding='ISO-8859-1') as tsvfile:
        reader = csv.reader(tsvfile, delimiter="\t")
        # no header in GFF, so no header checking
        col = self.files[src_key]['columns']

        # Non-positional mapping for consistency with the other parsers;
        # the positions are fixed for the whole file so look them up once.
        # SOURCE, FEATURE, SCORE and FRAME columns are not used.
        seqname_idx = col.index('SEQNAME')
        start_idx = col.index('START')
        end_idx = col.index('END')
        strand_idx = col.index('STRAND')
        attribute_idx = col.index('ATTRIBUTE')

        for line_counter, row in enumerate(reader, start=1):
            if re.match(r'^#', ' '.join(row)):
                continue

            if len(row) != len(col):
                LOG.warning("Problem parsing in %s row %s\n"
                            "got %s cols but expected %s",
                            raw, row, len(row), len(col))
                continue

            chromosome = row[seqname_idx].strip()
            start_bp = row[start_idx].strip()
            stop_bp = row[end_idx].strip()
            strand = row[strand_idx].strip()
            attr = row[attribute_idx].strip()

            # Example row:
            # Chr.Z  Animal QTLdb  Production_QTL  33954873  34023581  ...
            # QTL_ID=2242;Name="Spleen percentage";Abbrev="SPLP";
            # PUBMED_ID=17012160;trait_ID=2234;trait="Spleen percentage";
            # breed="leghorn";"FlankMarkers=ADL0022";VTO_name="spleen mass";
            # MO_name="spleen weight to body weight ratio";Map_Type="Linkage";
            # Model="Mendelian";Test_Base="Chromosome-wise";
            # Significance="Significant";P-value="<0.05";F-Stat="5.52";
            # Variance="2.94";Dominance_Effect="-0.002";Additive_Effect="0.01
            #
            # make dictionary of attributes
            # keys are:
            # QTL_ID,Name,Abbrev,PUBMED_ID,trait_ID,trait,FlankMarkers,
            # VTO_name,Map_Type,Significance,P-value,Model,
            # Test_Base,Variance, Bayes-value,PTO_name,gene_IDsrc,peak_cM,
            # CMO_name,gene_ID,F-Stat,LOD-score,Additive_Effect,
            # Dominance_Effect,Likelihood_Ratio,LS-means,Breed,
            # trait (duplicate with Name)

            # deal with poorly formed attributes
            if re.search(r'"FlankMarkers";', attr):
                attr = re.sub(r'FlankMarkers;', '', attr)
            attr_items = re.sub(r'"', '', attr).split(";")
            # items without '=' cannot become key/value pairs; drop them
            attr_set = {item for item in attr_items if re.search(r'=', item)}
            # split on the first '=' only, so a value that itself contains
            # '=' does not break the dict construction
            attribute_dict = dict(item.split("=", 1) for item in attr_set)

            qtl_num = attribute_dict.get('QTL_ID')
            if self.test_mode and int(qtl_num) not in self.test_ids:
                continue
            # make association between QTL and trait based on taxon

            qtl_id = common_name + 'QTL:' + str(qtl_num)
            model.addIndividualToGraph(qtl_id, None, self.globaltt['QTL'])
            geno.addTaxon(taxon_curie, qtl_id)

            # the association below needs the trait curie
            trait_id = 'AQTLTrait:' + attribute_dict.get('trait_ID')

            # if pub is in attributes, add it to the association
            pub_id = None
            if 'PUBMED_ID' in attribute_dict:
                pub_id = attribute_dict.get('PUBMED_ID')
                if re.match(r'ISU.*', pub_id):
                    pub_id = 'AQTLPub:' + pub_id.strip()
                    reference = Reference(graph, pub_id)
                else:
                    pub_id = 'PMID:' + pub_id.strip()
                    reference = Reference(
                        graph, pub_id, self.globaltt['journal article'])
                reference.addRefToGraph()

            # Add QTL to graph
            assoc = G2PAssoc(
                graph, self.name, qtl_id, trait_id,
                self.globaltt['is marker for'])
            assoc.add_evidence(eco_id)
            assoc.add_source(pub_id)
            if 'P-value' in attribute_dict:
                scr = re.sub(r'<', '', attribute_dict.get('P-value'))
                if ',' in scr:
                    scr = re.sub(r',', '.', scr)
                # str.isnumeric() is False for anything with a decimal point,
                # so parse with float() and skip values that are not numbers
                try:
                    score = float(scr)
                except ValueError:
                    score = None
                # the chained comparison is False for nan and +/-inf
                if score is not None and float('-inf') < score < float('inf'):
                    assoc.set_score(score)

            assoc.add_association_to_graph()
            # TODO make association to breed
            # (which means making QTL feature in Breed background)

            # get location of QTL
            chromosome = re.sub(r'Chr\.', '', chromosome)
            chrom_id = makeChromID(chromosome, taxon_curie, 'CHR')

            chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH')
            geno.addChromosomeInstance(
                chromosome, build_id, build_label, chrom_id)
            qtl_feature = Feature(graph, qtl_id, None, self.globaltt['QTL'])
            if start_bp == '':
                start_bp = None
            qtl_feature.addFeatureStartLocation(
                start_bp, chrom_in_build_id, strand,
                [self.globaltt['FuzzyPosition']])
            if stop_bp == '':
                stop_bp = None
            qtl_feature.addFeatureEndLocation(
                stop_bp, chrom_in_build_id, strand,
                [self.globaltt['FuzzyPosition']])
            qtl_feature.addTaxonToFeature(taxon_curie)
            qtl_feature.addFeatureToGraph()

            if not self.test_mode and limit is not None and line_counter > limit:
                break

    # LOG.warning("Bad attribute flags in this file")  # what does this even mean??
    LOG.info("Done with QTL genomic mappings for %s", taxon_curie)
    return
class Genotype():
    """
    These methods provide convenient methods to
    add items related to a genotype and its parts to a supplied graph.
    They follow the patterns set out in
    GENO https://github.com/monarch-initiative/GENO-ontology.
    For specific sequence features,
    we use the GenomicFeature class to create them.

    """

    def __init__(self, graph):
        """
        :param graph: a Graph instance; anything else raises ValueError
        """
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            raise ValueError("{} is not a graph".format(graph))
        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map
        return

    def addGenotype(
            self, genotype_id, genotype_label, genotype_type=None,
            genotype_description=None):
        """
        If a genotype_type is not supplied,
        we will default to 'intrinsic_genotype'

        :param genotype_id:
        :param genotype_label:
        :param genotype_type:
        :param genotype_description:
        :return:

        """
        if genotype_type is None:
            genotype_type = self.globaltt['intrinsic_genotype']

        self.model.addIndividualToGraph(
            genotype_id, genotype_label, genotype_type, genotype_description)
        return

    def addAllele(
            self, allele_id, allele_label, allele_type=None,
            allele_description=None):
        """
        Make an allele object.
        If no allele_type is added, it will default to a geno:allele

        :param allele_id: curie for allele (required)
        :param allele_label: label for allele (required)
        :param allele_type: id for an allele type
            (optional, recommended SO or GENO class)
        :param allele_description: a free-text description of the allele
        :return:

        """
        # TODO should we accept a list of allele types?
        if allele_type is None:
            allele_type = self.globaltt['allele']  # TODO is this a good idea?
        self.model.addIndividualToGraph(
            allele_id, allele_label, allele_type, allele_description)
        return

    def addGene(
            self, gene_id, gene_label=None, gene_type=None,
            gene_description=None):
        ''' genes are classes '''
        if gene_type is None:
            gene_type = self.globaltt['gene']
        self.model.addClassToGraph(
            gene_id, gene_label, gene_type, gene_description)
        return

    def addConstruct(
            self, construct_id, construct_label, construct_type=None,
            construct_description=None):
        """
        Add a construct as an individual; no default type is applied.
        """
        # TODO add base type for construct
        # if (constrcut_type is None):
        #    constrcut_type=self.construct_base_type
        self.model.addIndividualToGraph(
            construct_id, construct_label, construct_type,
            construct_description)
        return

    def addDerivesFrom(self, child_id, parent_id):
        """
        We add a derives_from relationship between the child and parent id.
        Examples of uses include between:
        an allele and a construct or strain here,
        a cell line and its parent genotype.
        Adding the parent and child to the graph
        should happen outside of this function call
        to ensure graph integrity.

        :param child_id:
        :param parent_id:
        :return:

        """
        self.graph.addTriple(
            child_id, self.globaltt['derives_from'], parent_id)
        return

    def addSequenceDerivesFrom(self, child_id, parent_id):
        """
        Add a sequence_derives_from triple from child to parent.
        """
        self.graph.addTriple(
            child_id, self.globaltt['sequence_derives_from'], parent_id)
        return

    def addAlleleOfGene(self, allele_id, gene_id, rel_id=None):
        """
        We make the assumption here that if the relationship is not provided,
        it is a GENO:is_allele_of.

        Here, the allele should be a variant_locus, not a sequence alteration.

        :param allele_id:
        :param gene_id:
        :param rel_id:
        :return:

        """
        if rel_id is None:
            rel_id = self.globaltt["is_allele_of"]
        self.graph.addTriple(allele_id, rel_id, gene_id)
        return

    def addAffectedLocus(self, allele_id, gene_id, rel_id=None):
        """
        We make the assumption here that if the relationship is not provided,
        it is a GENO:has_affected_feature.

        Here, the allele should be a variant_locus, not a sequence alteration.

        :param allele_id:
        :param gene_id:
        :param rel_id:
        :return:

        """
        if rel_id is None:
            rel_id = self.globaltt['has_affected_feature']
        self.graph.addTriple(allele_id, rel_id, gene_id)
        return

    def addGeneProduct(
            self, sequence_id, product_id, product_label=None,
            product_type=None):
        """
        Add gene/variant/allele has_gene_product relationship
        Can be used to either describe a gene to transcript relationship
        or gene to protein.
        The product itself is only declared when both a label and a type
        are supplied.

        :param sequence_id:
        :param product_id:
        :param product_label:
        :param product_type:
        :return:

        """
        if product_label is not None and product_type is not None:
            self.model.addIndividualToGraph(
                product_id, product_label, product_type)
        self.graph.addTriple(
            sequence_id, self.globaltt['has gene product'], product_id)

        return

    def addPolypeptide(
            self, polypeptide_id, polypeptide_label=None, transcript_id=None,
            polypeptide_type=None):
        """
        Add a polypeptide individual (default type 'polypeptide') and,
        when a transcript is given, a translates_to triple from it.

        :param polypeptide_id:
        :param polypeptide_label:
        :param polypeptide_type:
        :param transcript_id:
        :return:

        """
        if polypeptide_type is None:
            polypeptide_type = self.globaltt['polypeptide']
        self.model.addIndividualToGraph(
            polypeptide_id, polypeptide_label, polypeptide_type)
        if transcript_id is not None:
            self.graph.addTriple(
                transcript_id, self.globaltt['translates_to'], polypeptide_id)

        return

    def addPartsToVSLC(
            self, vslc_id, allele1_id, allele2_id, zygosity_id=None,
            allele1_rel=None, allele2_rel=None):
        """
        Here we add the parts to the VSLC.  While traditionally alleles
        (reference or variant loci) are traditionally added, you can add any
        node (such as sequence_alterations for unlocated variations) to a vslc
        if they are known to be paired.  However, if a sequence_alteration's
        loci is unknown, it probably should be added directly to the GVC.

        When no zygosity is supplied it is 'homozygous' if both allele ids
        are equal and 'heterozygous' otherwise.

        :param vslc_id:
        :param allele1_id:
        :param allele2_id:
        :param zygosity_id:
        :param allele1_rel:
        :param allele2_rel:
        :return:

        """
        # vslc has parts allele1/allele2
        if allele1_id is not None:
            self.addParts(allele1_id, vslc_id, allele1_rel)
        if allele2_id is not None and allele2_id.strip() != '':
            self.addParts(allele2_id, vslc_id, allele2_rel)

        # figure out zygosity if it's not supplied
        if zygosity_id is None:
            if allele1_id == allele2_id:
                zygosity_id = self.globaltt['homozygous']
            else:
                zygosity_id = self.globaltt['heterozygous']

        if zygosity_id is not None:
            self.graph.addTriple(
                vslc_id, self.globaltt['has_zygosity'], zygosity_id)

        return

    def addVSLCtoParent(self, vslc_id, parent_id):
        """
        The VSLC can either be added to a genotype or to a GVC.
        The vslc is added as a part of the parent.

        :param vslc_id:
        :param parent_id:
        :return:
        """
        self.addParts(vslc_id, parent_id, self.globaltt['has_variant_part'])

        return

    def addParts(self, part_id, parent_id, part_relationship=None):
        """
        This will add a has_part (or subproperty) relationship between
        a parent_id and the supplied part.
        By default the relationship will be BFO:has_part,
        but any relationship could be given here.

        :param part_id:
        :param parent_id:
        :param part_relationship:
        :return:
        :raises TypeError: when parent_id or part_id is None

        """
        if part_relationship is None:
            part_relationship = self.globaltt['has_part']
        # Fail loudly if parent or child identifiers are None
        if parent_id is None:
            raise TypeError('Attempt to pass None as parent')
        if part_id is None:
            raise TypeError('Attempt to pass None as child')

        self.graph.addTriple(parent_id, part_relationship, part_id)

        return

    def addSequenceAlteration(
            self, sa_id, sa_label, sa_type=None, sa_description=None):
        """
        Add a sequence alteration individual
        (default type 'sequence_alteration').
        """
        if sa_type is None:
            sa_type = self.globaltt['sequence_alteration']

        self.model.addIndividualToGraph(
            sa_id, sa_label, sa_type, sa_description)

        return

    def addSequenceAlterationToVariantLocus(self, sa_id, vl_id):
        """
        Make the sequence alteration a has_variant_part of the variant locus.
        """
        self.addParts(sa_id, vl_id, self.globaltt['has_variant_part'])
        return

    def addGenomicBackground(
            self, background_id, background_label, background_type=None,
            background_description=None):
        """
        Add a genomic background individual
        (default type 'genomic_background').
        """
        if background_type is None:
            background_type = self.globaltt['genomic_background']
        self.model.addIndividualToGraph(
            background_id, background_label, background_type,
            background_description)

        return

    def addGenomicBackgroundToGenotype(
            self, background_id, genotype_id, background_type=None):
        """
        Type the background (default 'genomic_background') and attach it to
        the genotype with has_reference_part.
        """
        if background_type is None:
            background_type = self.globaltt['genomic_background']
        self.model.addType(background_id, background_type)
        self.addParts(
            background_id, genotype_id, self.globaltt['has_reference_part'])

        return

    def addTaxon(self, taxon_id, genopart_id):
        """
        The supplied geno part will have the specified taxon added with
        RO:in_taxon relation.
        Generally the taxon is associated with a genomic_background,
        but could be added to any genotype part (including a gene,
        regulatory element, or sequence alteration).

        :param taxon_id:
        :param genopart_id:
        :return:

        """
        self.graph.addTriple(
            genopart_id, self.globaltt['in taxon'], taxon_id)

        return

    def addGeneTargetingReagentToGenotype(self, reagent_id, genotype_id):
        """
        Attach a reagent to a genotype with has_variant_part.
        """
        # for example, add a morphant reagent thingy to the genotype,
        # assuming it's a extrinsic_genotype
        self.graph.addTriple(
            genotype_id, self.globaltt['has_variant_part'], reagent_id)

        return

    def addGeneTargetingReagent(
            self, reagent_id, reagent_label, reagent_type, gene_id,
            description=None):
        """
        Here, a gene-targeting reagent is added.
        The actual targets of this reagent should be added separately.

        :param reagent_id:
        :param reagent_label:
        :param reagent_type:
        :param gene_id: the gene the reagent targets
        :param description:
        :return:

        """
        # TODO add default type to reagent_type
        self.model.addIndividualToGraph(
            reagent_id, reagent_label, reagent_type, description)

        self.graph.addTriple(
            reagent_id, self.globaltt['targets_gene'], gene_id)

        return

    def addReagentTargetedGene(
            self, reagent_id, gene_id, targeted_gene_id=None,
            targeted_gene_label=None, description=None):
        """
        This will create the instance of a gene that is targeted by a molecular
        reagent (such as a morpholino or rnai).

        If an instance id is not supplied, one is built from the gene and
        reagent ids (with the colons removed) and typed as
        GENO:reagent_targeted_gene.

        <targeted_gene_id> a GENO:reagent_targeted_gene
            rdf:label targeted_gene_label
            dc:description description
            is_expression_variant_of <gene_id>   (when gene_id is given)
            is_targeted_by <reagent_id>

        :param reagent_id:
        :param gene_id:
        :param targeted_gene_id:
        :param targeted_gene_label:
        :param description:
        :return:

        """
        # akin to a variant locus
        if targeted_gene_id is None:
            targeted_gene_id = '_' + gene_id + '-' + reagent_id
            targeted_gene_id = targeted_gene_id.replace(":", "")
        self.model.addIndividualToGraph(
            targeted_gene_id, targeted_gene_label,
            self.globaltt['reagent_targeted_gene'], description)

        if gene_id is not None:
            self.graph.addTriple(
                targeted_gene_id, self.globaltt['is_expression_variant_of'],
                gene_id)

        self.graph.addTriple(
            targeted_gene_id, self.globaltt['is_targeted_by'], reagent_id)

        return

    def addTargetedGeneSubregion(
            self, tgs_id, tgs_label, tgs_type=None, tgs_description=None):
        """
        Add a targeted gene subregion individual
        (default type 'targeted_gene_subregion').
        """
        if tgs_type is None:
            tgs_type = self.globaltt['targeted_gene_subregion']

        self.model.addIndividualToGraph(
            tgs_id, tgs_label, tgs_type, tgs_description)

    def addMemberOfPopulation(self, member_id, population_id):
        """
        Add a has_member_with_allelotype triple from population to member.
        """
        self.graph.addTriple(
            population_id, self.globaltt['has_member_with_allelotype'],
            member_id)

        return

    def addTargetedGeneComplement(
            self, tgc_id, tgc_label, tgc_type=None, tgc_description=None):
        """
        Add a targeted gene complement individual
        (default type 'targeted_gene_complement').
        """
        if tgc_type is None:
            tgc_type = self.globaltt['targeted_gene_complement']
        self.model.addIndividualToGraph(
            tgc_id, tgc_label, tgc_type, tgc_description)

        return

    def addGenome(self, taxon_num, taxon_label=None, genome_id=None):
        """
        Add a genome class labelled '<taxon_label> genome'.

        When no label is given it is looked up in the global curie-to-label
        table; if the taxon is not there a warning is logged and the bare
        taxon number is used as the label instead.

        :param taxon_num: taxon number as a string (without 'NCBITaxon:')
        :param taxon_label: optional label for the taxon
        :param genome_id: optional genome identifier;
            defaults to makeGenomeID(taxon_num)
        :return:
        """
        ncbitaxon = 'NCBITaxon:' + taxon_num
        if taxon_label is None:
            if ncbitaxon in self.globaltcid:
                taxon_label = self.globaltcid[ncbitaxon]
            else:
                logging.warning(
                    'Add %s to global translation table', ncbitaxon)
                # fall back to the number we were given so the genome
                # still gets a usable label
                taxon_label = taxon_num
        elif ncbitaxon in self.globaltcid and \
                taxon_label != self.globaltcid[ncbitaxon]:
            logging.warning(
                '"%s" may need updating from "%s" in global translation table',
                self.globaltcid[ncbitaxon], taxon_label)
            logging.warning(
                '"%s": " %s" may need to be added to a local translation table',
                taxon_label, self.globaltcid[ncbitaxon])

        genome_label = taxon_label + ' genome'
        if genome_id is None:
            genome_id = self.makeGenomeID(taxon_num)
        self.model.addClassToGraph(
            genome_id, genome_label, self.globaltt['genome'])

        return

    def addReferenceGenome(self, build_id, build_label, taxon_id):
        """
        Add a build as a reference_genome individual, type it with the
        taxon's genome id and link it to the taxon.
        A taxon_id starting with digits gets the 'NCBITaxon:' prefix.
        """
        genome_id = self.makeGenomeID(taxon_id)
        self.model.addIndividualToGraph(
            build_id, build_label, self.globaltt['reference_genome'])
        self.model.addType(build_id, genome_id)
        if re.match(r'[0-9]+', taxon_id):
            taxon_id = 'NCBITaxon:' + taxon_id
        self.addTaxon(taxon_id, build_id)

        return

    @staticmethod
    def makeGenomeID(taxon_id):
        """
        :param taxon_id:
        :return: '_:' + taxon_id + 'genome'
        """
        # scrub off the taxon prefix.  put it in base space
        # TODO: revisit as yet another BNODE?
        # should never be called if a real genome iri exists
        # should create the opaque bode and label together
        # genome_id = re.sub(r'.*\:', '_:', taxon_id) + 'genome'
        genome_id = '_:' + taxon_id + 'genome'
        return genome_id

    def addChromosome(
            self, chrom, tax_id, tax_label=None, build_id=None,
            build_label=None):
        """
        if it's just the chromosome, add it as an instance of a SO:chromosome,
        and add it to the genome. If a build is included,
        punn the chromosome as a subclass of SO:chromsome, and make the
        build-specific chromosome an instance of the supplied chr.
        The chr then becomes part of the build or genome.
        """
        family = Family(self.graph)
        # first, make the chromosome class, at the taxon level
        chr_id = makeChromID(str(chrom), tax_id)
        if tax_label is not None:
            chr_label = makeChromLabel(chrom, tax_label)
        else:
            chr_label = makeChromLabel(chrom)
        genome_id = self.makeGenomeID(tax_id)
        self.model.addClassToGraph(
            chr_id, chr_label, self.globaltt['chromosome'])
        self.addTaxon(tax_id, genome_id)  # add the taxon to the genome

        if build_id is not None:
            # the build-specific chromosome
            chrinbuild_id = makeChromID(chrom, build_id)
            if build_label is None:
                build_label = build_id
            chrinbuild_label = makeChromLabel(chrom, build_label)
            # add the build-specific chromosome as an instance of the chr class

            self.model.addIndividualToGraph(
                chrinbuild_id, chrinbuild_label, chr_id)

            # add the build-specific chromosome
            # as a member of the build (both ways)
            family.addMember(build_id, chrinbuild_id)
            family.addMemberOf(chrinbuild_id, build_id)

        return

    def addChromosomeClass(self, chrom_num, taxon_id, taxon_label):
        """
        Add the generic (taxon-level) chromosome class; the 'NCBITaxon:'
        prefix is removed from taxon_id before the id is built.
        """
        taxon = re.sub('NCBITaxon:', '', taxon_id)
        # the chrom class (generic) id
        chrom_class_id = makeChromID(chrom_num, taxon, 'CHR')
        chrom_class_label = makeChromLabel(chrom_num, taxon_label)
        self.model.addClassToGraph(
            chrom_class_id, chrom_class_label, self.globaltt['chromosome'])

        return

    def addChromosomeInstance(
            self, chr_num, reference_id, reference_label, chr_type=None):
        """
        Add the supplied chromosome as an instance within the given reference

        :param chr_num:
        :param reference_id: for example, a build id like UCSC:hg19
        :param reference_label:
        :param chr_type: this is the class that this is an instance of.
            typically a genome-specific chr
        :return:

        """
        family = Family(self.graph)
        chr_id = makeChromID(str(chr_num), reference_id, 'MONARCH')
        chr_label = makeChromLabel(str(chr_num), reference_label)

        self.model.addIndividualToGraph(
            chr_id, chr_label, self.globaltt['chromosome'])
        if chr_type is not None:
            self.model.addType(chr_id, chr_type)

        # add the build-specific chromosome
        # as a member of the build  (both ways)
        family.addMember(reference_id, chr_id)
        family.addMemberOf(chr_id, reference_id)  # usage dependent, todo: ommit

        return

    @staticmethod
    def make_variant_locus_label(gene_label, allele_label):
        """
        :param gene_label: gene label, or None for an empty gene part
        :param allele_label:
        :return: '<gene_label><<allele_label>>' with both parts stripped
        """
        if gene_label is None:
            gene_label = ''
        label = gene_label.strip() + '<' + allele_label.strip() + '>'

        return label

    def make_vslc_label(self, gene_label, allele1_label, allele2_label):
        """
        Make a Variant Single Locus Complement (VSLC) in monarch-style.

        :param gene_label:
        :param allele1_label:
        :param allele2_label:
        :return: 'top/bottom' label, or None when all three inputs are None
        """
        vslc_label = ''

        if gene_label is None and allele1_label is None and allele2_label is None:
            LOG.error("Not enough info to make vslc label")
            return None

        top = self.make_variant_locus_label(gene_label, allele1_label)
        bottom = ''
        if allele2_label is not None:
            bottom = self.make_variant_locus_label(gene_label, allele2_label)

        vslc_label = '/'.join((top, bottom))

        return vslc_label

    def make_experimental_model_with_genotype(
            self, genotype_id, genotype_label, taxon_id, taxon_label):
        """
        Add an individual of the given taxon carrying the genotype.

        :return: the generated id, '_:' + '<taxon>-with-<genotype>'
            with the colons of the two ids removed
        """
        animal_id = '-'.join((taxon_id, 'with', genotype_id))
        animal_id = re.sub(r':', '', animal_id)
        animal_id = '_:' + animal_id

        animal_label = ' '.join((genotype_label, taxon_label))
        self.model.addIndividualToGraph(animal_id, animal_label, taxon_id)
        self.graph.addTriple(
            animal_id, self.globaltt['has_genotype'], genotype_id)
        return animal_id
def _add_evidence(self, assoc_id, eco_id, p_value, percentage_change,
                  effect_size, study_bnode):
    """
    Add a line of evidence for an association, attach the available
    measurements (p-value, percentage change, effect size) to it and
    connect it to the study node.

    A measurement is only recorded when its value is neither None nor
    the empty string.

    :param assoc_id: assoc curie used to reify a
                     genotype to phenotype association,
                     generated in _process_data()
    :param eco_id: eco_id as curie, hardcoded in _process_data()
    :param p_value: str, from self.files['all']
    :param percentage_change: str, from self.files['all']
    :param effect_size: str, from self.files['all']
    :param study_bnode: str, from self.files['all']
    :return: str, evidence_line_bnode as curie
    """
    evidence_model = Evidence(self.graph, assoc_id)
    provenance_model = Provenance(self.graph)
    model = Model(self.graph)

    # Add line of evidence
    evidence_line_bnode = self.make_id(
        "{0}{1}".format(assoc_id, study_bnode), '_')
    evidence_model.add_supporting_evidence(evidence_line_bnode)
    model.addIndividualToGraph(evidence_line_bnode, None, eco_id)

    # Add supporting measurements to line of evidence
    measurements = {}
    # 'and', not 'or': the value has to be both present and non-empty
    if p_value is not None and p_value != "":
        p_value_bnode = self.make_id(
            "{0}{1}{2}".format(evidence_line_bnode, 'p_value', p_value), '_')
        model.addIndividualToGraph(
            p_value_bnode, None, self.globaltt['p-value'])
        try:
            measurements[p_value_bnode] = float(p_value)
        except ValueError:
            # keep the raw text when it is not a plain number
            measurements[p_value_bnode] = p_value

    if percentage_change is not None and percentage_change != '':
        fold_change_bnode = self.make_id(
            "{0}{1}{2}".format(
                evidence_line_bnode, 'percentage_change', percentage_change),
            '_')
        model.addIndividualToGraph(
            fold_change_bnode, None, self.resolve('percentage_change'))
        measurements[fold_change_bnode] = percentage_change

    if effect_size is not None and effect_size != "":
        effect_size_bnode = self.make_id(
            "{0}{1}{2}".format(
                evidence_line_bnode, 'effect_size', effect_size), '_')
        model.addIndividualToGraph(
            effect_size_bnode, None, self.globaltt['effect size estimate'])
        measurements[effect_size_bnode] = effect_size

    if measurements:
        evidence_model.add_supporting_data(evidence_line_bnode, measurements)

    # Link evidence to provenance by connecting to study node
    provenance_model.add_study_to_measurements(
        study_bnode, measurements.keys())
    self.graph.addTriple(
        evidence_line_bnode, self.globaltt['has_supporting_activity'],
        study_bnode)

    return evidence_line_bnode