def _process_pathway_pubmed(self, limit):
    """
    Link KEGG pathways to the publications that annotate them.

    Each row of the tab-delimited ``pathway_pubmed`` raw file is a
    (pubmed id, KEGG pathway number) pair.  For every row kept, a journal
    article reference is added to the graph together with a
    ``<pubmed id> is_about <KEGG pathway>`` triple.

    :param limit: outside of test mode, stop once the reader's line number
        exceeds this value; ``None`` reads the whole file
    :return: None
    """
    LOG.info("Processing KEGG pathways to pubmed ids")
    graph = self.testgraph if self.test_mode else self.graph
    raw_path = '/'.join((self.rawdir, self.files['pathway_pubmed']['file']))

    with open(raw_path, 'r', encoding="iso-8859-1") as handle:
        reader = csv.reader(handle, delimiter='\t', quotechar='"')
        for record in reader:
            article_id, pathway_num = record

            # in test mode only the configured pathways are kept
            if self.test_mode and pathway_num not in self.test_ids['pathway']:
                continue

            article_id = article_id.upper()
            # will look like KEGG-path:map04130
            pathway_id = 'KEGG-' + pathway_num

            article = Reference(
                graph, article_id, self.globaltt['journal article'])
            article.addRefToGraph()
            graph.addTriple(article_id, self.globaltt['is_about'], pathway_id)

            # the row limit is never applied in test mode
            if self.test_mode or limit is None:
                continue
            if reader.line_num > limit:
                break
def _process_collection(self, collection_id, label, page):
    """
    Add the Coriell repository (collection) itself to the graphs.

    The same individual is written to both the main graph and the test
    graph.

    Triples:
        Repository a ERO:collection
        rdf:label Literal(label)
        foaf:page Literal(page)

    :param collection_id: local identifier, prefixed with
        ``CoriellCollection:`` to form the repository id
    :param label: label given to the repository individual
    :param page: page attached to the repository via ``Reference.addPage``
    :return: None
    """
    # ############# BUILD THE CELL LINE REPOSITORY #############
    for target in (self.graph, self.testgraph):
        # TODO: How to devise a label for each repository?
        model = Model(target)
        reference = Reference(target)

        repo_id = 'CoriellCollection:' + collection_id
        model.addIndividualToGraph(repo_id, label, self.globaltt['collection'])
        reference.addPage(repo_id, page)

    return None
def parse(self, limit=None):
    """
    Build gene-to-phenotype associations from the cleaned ZFIN g2p file.

    Each tab-delimited row is mapped to a ZP phenotype id through the ZFIN
    parser's sextuple mapping.  Rows that map to a phenotype produce a
    G2PAssoc between the ZFIN gene and the ZP term, with evidence
    ``ECO:0000059``; when the row names a publication, it is added as a
    document reference and as the association's source.

    :param limit: accepted for interface compatibility; this method does
        not apply it and always reads the whole file
    :return: None
    """
    zfin_parser = ZFIN(self.graph_type, self.are_bnodes_skized)
    model = Model(self.graph)
    zp_file = '/'.join((self.rawdir, self.files['zpmap']['file']))
    g2p_file = '/'.join((self.rawdir, self.files['g2p_clean']['file']))
    zfin_parser.zp_map = zfin_parser._load_zp_mappings(zp_file)

    with open(g2p_file, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            # 26 columns; the second "pc_rel2" column is the label, it was
            # previously unpacked into pc_rel2_id a second time.
            (internal_id, symbol, gene_id,
             subterm1_id, subterm1_label, pc_rel_id, pc_rel_label,
             superterm1_id, superterm1_label, quality_id, quality_name,
             modifier, subterm2_id, subterm2_label, pc_rel2_id,
             pc_rel2_label, superterm2_id, superterm2_label,
             fish_id, fish_label, start_stage, end_stage, environment,
             pub_id, figure_id, unknown_field) = row

            zp_id = zfin_parser._map_sextuple_to_phenotype(
                superterm1_id, subterm1_id, quality_id,
                superterm2_id, subterm2_id, modifier)

            gene_curie = "ZFIN:{0}".format(gene_id)
            model.makeLeader(gene_curie)
            pub_curie = "ZFIN:{0}".format(pub_id)

            # rows that do not map to a ZP term yield no association
            if zp_id:
                assoc = G2PAssoc(self.graph, self.name, gene_curie, zp_id)
                if pub_id:
                    reference = Reference(
                        self.graph, pub_curie,
                        Reference.ref_types['document'])
                    reference.addRefToGraph()
                    assoc.add_source(pub_curie)

                assoc.add_evidence('ECO:0000059')
                assoc.add_association_to_graph()
def _make_association(self, subject_id, object_id, rel_id, pubmed_ids):
    """
    Build a reified association and attach its supporting publications.

    For every pubmed id given, a journal article reference is added to
    the graph, the id is recorded as a source of the association, and the
    'TAS' evidence code is added.

    Args:
        :param subject_id  id of the subject of the association (gene/chem)
        :param object_id  id of the object of the association (disease)
        :param rel_id  relationship id
        :param pubmed_ids  an array of pubmed identifiers (may be None)
    Returns:
        :return None
    """
    # TODO pass in the relevant Assoc class rather than relying on G2P
    assoc = G2PAssoc(self.g, self.name, subject_id, object_id, rel_id)

    has_pubs = pubmed_ids is not None and len(pubmed_ids) > 0
    if has_pubs:
        evidence_code = self._get_evidence_code('TAS')
        journal_article = Reference.ref_types['journal_article']
        for pubmed_id in pubmed_ids:
            Reference(self.g, pubmed_id, journal_article).addRefToGraph()
            assoc.add_source(pubmed_id)
            assoc.add_evidence(evidence_code)

    assoc.add_association_to_graph()
    return None
def _make_association(self, subject_id, object_id, rel_id, pubmed_ids):
    """
    Build a reified association and attach its supporting publications.

    For every pubmed id given, a journal article reference is added to
    the graph, the id is recorded as a source of the association, and the
    'traceable author statement' evidence term is added.

    Args:
        :param subject_id  id of the subject of the association (gene/chem)
        :param object_id  id of the object of the association (disease)
        :param rel_id  relationship id
        :param pubmed_ids  an array of pubmed identifiers (may be None)
    Returns:
        :return None
    """
    # TODO pass in the relevant Assoc class rather than relying on G2P
    assoc = G2PAssoc(self.graph, self.name, subject_id, object_id, rel_id)

    no_pubs = pubmed_ids is None or len(pubmed_ids) == 0
    if not no_pubs:
        for pubmed_id in pubmed_ids:
            article = Reference(
                self.graph, pubmed_id, self.globaltt['journal article'])
            article.addRefToGraph()
            assoc.add_source(pubmed_id)
            assoc.add_evidence(self.globaltt['traceable author statement'])

    assoc.add_association_to_graph()
    return None
def _parse_curated_chem_disease(self, limit):
    """
    Parse the curated chemical-disease publications file.

    Comment lines (starting with '#') are skipped.  Each remaining row adds
    the chemical and disease classes to the graph and creates a
    chemical-to-disease association; when the row carries a pubmed id it is
    added as a journal article reference and passed on as the
    association's source.

    :param limit: outside of test mode, stop after this many data rows;
        ``None`` reads the whole file
    :return: None
    """
    line_counter = 0
    file_path = '/'.join(
        (self.rawdir, self.static_files['publications']['file']))
    gu = GraphUtils(curie_map.get())
    with open(file_path, 'r') as tsvfile:
        reader = csv.reader(tsvfile, delimiter="\t")
        for row in reader:
            # catch comment lines
            if re.match(r'^#', ' '.join(row)):
                continue
            line_counter += 1
            self._check_list_len(row, 10)
            (pub_id, disease_label, disease_id, disease_cat, evidence,
             chem_label, chem_id, cas_rn, gene_symbol, gene_acc) = row

            rel_id = self._get_relationship_id(evidence)
            chem_id = 'MESH:' + chem_id
            gu.addClassToGraph(self.g, chem_id, chem_label)
            gu.addClassToGraph(self.g, disease_id, None)

            if pub_id != '':
                pub_id = 'PMID:' + pub_id
                r = Reference(
                    pub_id, Reference.ref_types['journal_article'])
                r.addRefToGraph(self.g)
                pubids = [pub_id]
            else:
                pubids = None

            # chem_id and pub_id are already curies here; prefixing them
            # again produced 'MESH:MESH:...' / 'PMID:PMID:...' and raised
            # TypeError for rows without a publication.
            self._make_association(chem_id, disease_id, rel_id, pubids)

            if not self.testMode and limit is not None \
                    and line_counter >= limit:
                break
    return
def process_pub_xrefs(self, limit=None):
    """
    Declare WormBase papers equivalent to their PMID / DOI cross references.

    Rows of the tab-delimited ``pub_xrefs`` file look like:
        WBPaper00000009 pmid8805<BR>
        WBPaper00000011 doi10.1139/z78-244<BR>
        WBPaper00000012 cgc12<BR>
    Only ``pmid`` and ``doi`` xrefs are used; ``cgc`` and any other kind
    are skipped, as are non-pmid xrefs containing brackets or whitespace.

    :param limit: outside of test mode, stop once more than this many rows
        have been read; ``None`` reads the whole file
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['pub_xrefs']['file']))

    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph

    gu = GraphUtils(curie_map.get())

    logger.info("Processing publication xrefs")
    line_counter = 0
    with open(raw, 'r') as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            (wb_ref, xref) = row

            if self.testMode and wb_ref not in self.test_ids['pub']:
                continue

            ref_id = 'WormBase:' + wb_ref
            xref = re.sub(r'<BR>', '', xref)
            xref = xref.strip()

            if re.match(r'pmid', xref):
                # strip only the leading prefix, not later occurrences
                xref_id = 'PMID:' + re.sub(r'^pmid\s*', '', xref)
                r = Reference(
                    xref_id, Reference.ref_types['journal_article'])
            elif re.search(r'[\(\)\<\>\[\]\s]', xref):
                continue
            elif re.match(r'doi', xref):
                # a DOI may itself contain "doi"; remove the prefix only
                xref_id = 'DOI:' + re.sub(r'^doi', '', xref)
                r = Reference(xref_id)
            elif re.match(r'cgc', xref):
                # TODO not sure what to do here with cgc xrefs
                continue
            else:
                # logger.debug("Other xrefs like %s", xref)
                continue

            # only the pmid and doi branches reach this point
            r.addRefToGraph(g)
            gu.addSameIndividual(g, ref_id, xref_id)

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break
    return
def _add_variant_trait_association(self, variant_id, mapped_trait_uri,
                                   efo_ontology, pubmed_id,
                                   description=None):
    """
    Associate a variant with each of its mapped EFO traits.

    ``mapped_trait_uri`` is a comma-separated list of trait URIs; there can
    be more than one.  For each trait the EFO ontology is queried for a
    label under ``EFO_0000651``; when found and the trait is an EFO curie,
    the trait is added to the graph as a class.  A G2PAssoc
    (variant contributes_to trait) is then added with the publication as
    source and ``ECO:0000213`` as evidence.

    :param variant_id: id of the variant (subject of the association)
    :param mapped_trait_uri: comma-separated trait URIs; blank adds nothing
    :param efo_ontology: object exposing ``query(sparql)`` over EFO
    :param pubmed_id: pubmed number (without the ``PMID:`` prefix)
    :param description: optional description set on each association
    :return: None
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    model = Model(g)

    # make associations to the EFO terms; there can be >1
    if mapped_trait_uri.strip() != '':
        for trait in re.split(r',', mapped_trait_uri):
            trait = trait.strip()

            trait_curie = trait.replace(
                "http://www.ebi.ac.uk/efo/EFO_", "EFO:")

            phenotype_query = """
                SELECT ?trait
                WHERE {{
                    <{0}> rdfs:subClassOf+ <http://www.ebi.ac.uk/efo/EFO_0000651> .
                    <{0}> rdfs:label ?trait .
                }}
            """.format(trait)

            # materialise the result once instead of once per use
            trait_rows = list(efo_ontology.query(phenotype_query))
            if trait_rows and re.match(r'^EFO', trait_curie):
                model.addClassToGraph(
                    trait_curie, trait_rows[0][0], 'UPHENO:0001001')

            pubmed_curie = 'PMID:' + pubmed_id

            ref = Reference(
                g, pubmed_curie, Reference.ref_types['journal_article'])
            ref.addRefToGraph()

            assoc = G2PAssoc(
                g, self.name, variant_id, trait_curie,
                model.object_properties['contributes_to'])
            assoc.add_source(pubmed_curie)
            # combinatorial evidence
            # used in automatic assertion
            eco_id = 'ECO:0000213'
            assoc.add_evidence(eco_id)

            if description is not None:
                assoc.set_description(description)

            # FIXME score should get added to provenance/study
            # assoc.set_score(pvalue)

            # trait_curie comes from str.replace and is never None, so the
            # association is always written
            assoc.add_association_to_graph()
def parse(self, limit=None):
    """
    Fetch gene-to-phenotype annotations from MouseMine, in batches.

    One query is issued per two-digit MGI id prefix (``MGI:10*`` ..
    ``MGI:99*``).  Every returned row becomes a G2PAssoc between the MGI
    feature and the MP term with 'experimental phenotypic evidence'; rows
    carrying a pubmed id also add a journal article reference and source.

    :param limit: stop after this many prefixes have been queried;
        ``None`` (or 0) queries all of them
    :return: None
    """
    count = 0

    # The service handle and the logger levels do not depend on the prefix,
    # so set them up once rather than on each of the 90 iterations.
    service = Service("http://www.mousemine.org/mousemine/service")
    logging.getLogger('Model').setLevel(logging.ERROR)
    logging.getLogger('JSONIterator').setLevel(logging.ERROR)

    for num in range(10, 100):
        fuzzy_gene = "MGI:{0}*".format(num)
        gene = "MGI:{0}".format(num)

        query = service.new_query("OntologyAnnotation")
        query.add_constraint("subject", "SequenceFeature")
        query.add_constraint("ontologyTerm", "MPTerm")
        query.add_view(
            "subject.primaryIdentifier", "subject.symbol",
            "subject.sequenceOntologyTerm.name",
            "ontologyTerm.identifier", "ontologyTerm.name",
            "evidence.publications.pubMedId", "evidence.comments.type",
            "evidence.comments.description"
        )
        query.add_sort_order("OntologyAnnotation.ontologyTerm.name", "ASC")
        query.add_constraint(
            "subject.organism.taxonId", "=", self.txid, code="A")
        query.add_constraint("subject", "LOOKUP", fuzzy_gene, code="B")
        query.add_constraint(
            "subject.primaryIdentifier", "CONTAINS", gene, code="C")
        query.outerjoin("evidence.comments")

        for row in query.rows():
            mgi_curie = row["subject.primaryIdentifier"]
            mp_curie = row["ontologyTerm.identifier"]
            pub_curie = "PMID:{0}".format(
                row["evidence.publications.pubMedId"])
            assoc = G2PAssoc(self.graph, self.name, mgi_curie, mp_curie)
            if row["evidence.publications.pubMedId"]:
                reference = Reference(
                    self.graph, pub_curie, self.globaltt['journal article'])
                reference.addRefToGraph()
                assoc.add_source(pub_curie)

            assoc.add_evidence(
                self.globaltt['experimental phenotypic evidence'])
            assoc.add_association_to_graph()

        if count and count % 10 == 0:
            # report the MGI prefixes actually covered; the loop counter is
            # offset from the prefix by 10
            LOG.info(
                "%s processed ids from MGI:%i* to MGI:%i*",
                datetime.datetime.now(), num - 10, num)

        count += 1
        if limit and count >= limit:
            break

    return
def _process_article_row(self, row):
    """
    Add one article row to the graph as a reference.

    The article gets an internal id, which is recorded in
    ``self.id_hash['article']``.  Title and year are set when present.
    When the row has a pubmed id, the PMID replaces the internal id in the
    hash, is declared the same individual as the internal id, and receives
    the internal id as a comment.  Nothing is done in test mode.

    :param row: mapping with the keys ``article_id``, ``journal``,
        ``title``, ``year`` and ``pubmed_id``
    :return: None
    """
    # don't bother in test mode
    if self.testMode:
        return None

    internal_id = self._make_internal_id('article', row['article_id'])
    self.id_hash['article'][row['article_id']] = internal_id

    # only rows naming a journal are typed as journal articles
    ref_type = (
        Reference.ref_types['journal_article']
        if row['journal'] != '' else None)
    article = Reference(internal_id, ref_type)

    if row['title'] is not None:
        article.setTitle(row['title'].strip())
    if row['year'] is not None:
        article.setYear(row['year'])
    article.addRefToGraph(self.g)

    if row['pubmed_id'] is None:
        return None

    pubmed_curie = 'PMID:' + str(row['pubmed_id'])
    self.id_hash['article'][row['article_id']] = pubmed_curie
    self.gu.addSameIndividual(self.g, internal_id, pubmed_curie)
    self.gu.addComment(self.g, pubmed_curie, internal_id)
    return None
def make_association(self, record):
    """
    Construct the association described by one annotation record.

    The record's relation id is overwritten with the resolved
    "has phenotype" term.  The first supporting reference becomes the
    association's source and is added to the graph as a publication; any
    further references are declared the same individual as the first.

    :param record: nested mapping with ``subject``, ``relation``,
        ``object``, ``evidence`` and ``date`` entries
    :return: None; the association is added to ``self.graph``
    """
    model = Model(self.graph)

    record['relation']['id'] = self.resolve("has phenotype")

    # define the triple and instantiate the association
    assoc = Assoc(
        self.graph, self.name,
        sub=record['subject']['id'],
        obj=record['object']['id'],
        pred=record['relation']['id'])

    # add the references
    # created RGDRef prefix in curie map to route to proper reference URL in RGD
    references = []
    for ref_id in record['evidence']['has_supporting_reference']:
        if 'PMID' in ref_id:
            references.append(ref_id)
        else:
            references.append(ref_id.replace('RGD', 'RGDRef'))

    if references:
        # make first ref in list the source
        primary = references[0]
        assoc.add_source(identifier=primary)
        Reference(
            self.graph, primary, self.globaltt['publication']
        ).addRefToGraph()

        # create equivalent source for any other refs in list
        # This seems to be specific to this source and
        # there could be non-equivalent references in this list
        for other in references[1:]:
            model.addSameIndividual(sub=primary, obj=other)

    # add the date created on
    assoc.add_date(date=record['date'])
    assoc.add_evidence(self.resolve(record['evidence']['type']))  # ?set where?
    assoc.add_association_to_graph()

    return None
def _get_pubs(self, entry, g):
    """
    Extract mentioned publications from the reference list

    Every reference of the entry is added to the graph (as a journal
    article when it has a pubmed id, otherwise under an internal id built
    from the MIM number and reference number) together with an
    ``OMIM:<mimNumber> mentions <publication>`` triple.

    :param entry: OMIM entry mapping; must contain ``mimNumber`` and may
        contain ``referenceList``
    :param g: graph the references and triples are written to
    :return: dict mapping each reference number to the publication id used
    """
    ref_to_pmid = {}
    du = DipperUtil()
    entry_num = entry['mimNumber']
    gu = GraphUtils(curie_map.get())
    if 'referenceList' in entry:
        reflist = entry['referenceList']
        for r in reflist:
            if 'pubmedID' in r['reference']:
                pub_id = 'PMID:' + str(r['reference']['pubmedID'])
                ref = Reference(
                    pub_id, Reference.ref_types['journal_article'])
            else:
                # make blank node for internal reference
                pub_id = '_OMIM' + str(entry_num) + 'ref' + \
                    str(r['reference']['referenceNumber'])
                if self.nobnodes:
                    pub_id = ':' + pub_id
                ref = Reference(pub_id)

            title = author_list = source = citation = None
            if 'title' in r['reference']:
                title = r['reference']['title']
                ref.setTitle(title)
            if 'authors' in r['reference']:
                author_list = r['reference']['authors']
                ref.setAuthorList(author_list)
                # raw string: '\.' and '\,' are not valid str escapes
                citation = re.split(r'\.\,', author_list)[0] + ' et al'
            if 'source' in r['reference']:
                source = r['reference']['source']
            citation = '; '.join(du.flatten([citation, title, source]))
            ref.setShortCitation(citation)
            ref.addRefToGraph(g)
            ref_to_pmid[r['reference']['referenceNumber']] = pub_id

            # add is_about for the pub
            omim_id = 'OMIM:' + str(entry_num)
            gu.addTriple(g, omim_id, gu.object_properties['mentions'],
                         pub_id)

    return ref_to_pmid
def _process_pathway_pubmed(self, limit):
    """
    Link KEGG pathways to the publications that annotate them.

    Each row of the tab-delimited ``pathway_pubmed`` raw file is a
    (pubmed id, KEGG pathway number) pair.  For every row kept, a journal
    article reference is added to the graph together with a
    ``<pubmed id> is_about <KEGG pathway>`` triple.

    :param limit: outside of test mode, stop once more than this many rows
        have been read; ``None`` reads the whole file
    :return: None
    """
    logger.info("Processing KEGG pathways to pubmed ids")
    g = self.testgraph if self.testMode else self.graph
    gu = GraphUtils(curie_map.get())
    raw_path = '/'.join((self.rawdir, self.files['pathway_pubmed']['file']))

    with open(raw_path, 'r', encoding="iso-8859-1") as handle:
        rows = csv.reader(handle, delimiter='\t', quotechar='"')
        for rows_read, record in enumerate(rows, start=1):
            article_id, pathway_num = record

            # in test mode only the configured pathways are kept
            if self.testMode and pathway_num not in self.test_ids['pathway']:
                continue

            article_id = article_id.upper()
            # will look like KEGG-path:map04130
            pathway_id = 'KEGG-' + pathway_num

            article = Reference(
                article_id, Reference.ref_types['journal_article'])
            article.addRefToGraph(g)
            gu.addTriple(
                g, article_id,
                GraphUtils.object_properties['is_about'], pathway_id)

            # the row limit is never applied in test mode
            if self.testMode or limit is None:
                continue
            if rows_read > limit:
                break

    return None
def _parse_curated_chem_disease(self, limit):
    """
    Parse the curated chemical-disease publications file.

    Comment lines (starting with '#') and rows lacking a disease or
    chemical id are skipped.  Each remaining row adds the chemical and
    disease classes to the graph and creates a chemical-to-disease
    association, sourced to the row's pubmed id when it has one.

    :param limit: outside of test mode, stop after this many non-comment
        rows; ``None`` reads the whole file
    :return: None
    """
    model = Model(self.graph)
    rows_seen = 0
    tsv_path = '/'.join(
        (self.rawdir, self.static_files['publications']['file']))

    with open(tsv_path, 'r') as handle:
        for fields in csv.reader(handle, delimiter="\t"):
            # catch comment lines
            if ' '.join(fields).startswith('#'):
                continue
            rows_seen += 1
            self._check_list_len(fields, 10)
            (pub_id, _disease_label, disease_id, _disease_cat, evidence,
             chem_label, chem_id, _cas_rn, _gene_symbol, _gene_acc) = fields

            # both ends of the association are required
            if not disease_id.strip() or not chem_id.strip():
                continue

            rel_id = self.resolve(evidence)
            chem_id = 'MESH:' + chem_id
            model.addClassToGraph(chem_id, chem_label)
            model.addClassToGraph(disease_id, None)

            pubids = None
            if pub_id != '':
                pub_id = 'PMID:' + pub_id
                Reference(
                    self.graph, pub_id,
                    ref_type=self.globaltt['journal article']
                ).addRefToGraph()
                pubids = [pub_id]

            self._make_association(chem_id, disease_id, rel_id, pubids)

            if self.test_mode or limit is None:
                continue
            if rows_seen >= limit:
                break
    return None
def _process_phenotype_data(self, limit):
    """
    NOTE: If a Strain carries more than one mutation,
    then each Mutation description,
    i.e., the set: (
        Mutation Type - Chromosome - Gene Symbol -
        Gene Name - Allele Symbol - Allele Name)
    will require a separate line.

    Note that MMRRC curates phenotypes to alleles,
    even though they distribute only one file with the
    phenotypes appearing to be associated with a strain.

    So, here we process the allele-to-phenotype relationships separately
    from the strain-to-allele relationships.

    The catalog file is read in one pass: each row adds the strain
    individual, its publication references and allele-to-phenotype
    associations, and records the strain's variants and genes in
    ``self.strain_hash`` (labels in ``self.id_label_hash``).  A second pass
    over ``self.strain_hash`` builds a genotype with unknown zygosity for
    each strain.

    :param limit: outside of test mode, stop once more than this many
        lines have been read; ``None`` reads the whole file
    :return: None
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    line_counter = 0
    gu = GraphUtils(curie_map.get())
    fname = '/'.join((self.rawdir, self.files['catalog']['file']))

    self.strain_hash = {}
    self.id_label_hash = {}
    genes_with_no_ids = set()
    stem_cell_class = 'CL:0000034'
    mouse_taxon = 'NCBITaxon:10090'
    geno = Genotype(g)
    with open(fname, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
        for row in filereader:
            line_counter += 1
            # skip the first 3 lines which are header, etc.
            if line_counter < 4:
                continue

            (strain_id, strain_label, strain_type_symbol, strain_state,
             mgi_allele_id, mgi_allele_symbol, mgi_allele_name,
             mutation_type, chrom, mgi_gene_id, mgi_gene_symbol,
             mgi_gene_name, sds_url, accepted_date, mp_ids, pubmed_nums,
             research_areas) = row

            if self.testMode and (strain_id not in self.test_ids):
                continue

            # strip off stuff after the dash -
            # is the holding center important?
            # MMRRC:00001-UNC --> MMRRC:00001
            strain_id = re.sub(r'-\w+$', '', strain_id)

            self.id_label_hash[strain_id] = strain_label

            # get the variant or gene to save for later building of
            # the genotype
            if strain_id not in self.strain_hash:
                self.strain_hash[strain_id] = {'variants': set(),
                                               'genes': set()}

            # clean up the bad one
            if mgi_allele_id == 'multiple mutation':
                logger.error("Erroneous gene id: %s", mgi_allele_id)
                mgi_allele_id = ''

            if mgi_allele_id != '':
                self.strain_hash[strain_id]['variants'].add(mgi_allele_id)
                self.id_label_hash[mgi_allele_id] = mgi_allele_symbol

                # use the following if needing to add the
                # sequence alteration types
                # var_type =
                #   self._get_variant_type_from_abbrev(mutation_type)
                # make a sequence alteration for this variant locus,
                # and link the variation type to it
                # sa_id = '_'+re.sub(r':','',mgi_allele_id)+'SA'
                # if self.nobnodes:
                #     sa_id = ':'+sa_id
                # gu.addIndividualToGraph(g, sa_id, None, var_type)
                # geno.addSequenceAlterationToVariantLocus(sa_id,
                #                                          mgi_allele_id)

            # scrub out any spaces
            mgi_gene_id = re.sub(r'\s+', '', mgi_gene_id)
            if mgi_gene_id.strip() != '':
                if re.match(r'Gene\s*ID:', mgi_gene_id, re.I):
                    mgi_gene_id = re.sub(r'Gene\s*ID:\s*', 'NCBIGene:',
                                         mgi_gene_id)
                elif not re.match(r'MGI', mgi_gene_id):
                    logger.info("Gene id not recognized: %s",
                                mgi_gene_id)
                    if re.match(r'\d+$', mgi_gene_id):
                        # assume that if it's all numbers, then it's MGI
                        mgi_gene_id = 'MGI:'+str(mgi_gene_id)
                        logger.info("Assuming numerics are MGI.")
                self.strain_hash[strain_id]['genes'].add(mgi_gene_id)
                self.id_label_hash[mgi_gene_id] = mgi_gene_symbol

            # catch some errors -
            # some things have gene labels, but no identifiers - report
            if mgi_gene_symbol.strip() != '' and mgi_gene_id == '':
                logger.error(
                    "Gene label with no identifier for strain %s: %s",
                    strain_id, mgi_gene_symbol)
                genes_with_no_ids.add(mgi_gene_symbol.strip())
                # make a temp id for genes that aren't identified
                # tmp_gene_id = '_'+mgi_gene_symbol
                # self.id_label_hash[tmp_gene_id] = mgi_gene_symbol
                # self.strain_hash[strain_id]['genes'].add(tmp_gene_id)

            # split apart the mp ids
            # ataxia [MP:0001393] ,hypoactivity [MP:0001402] ...
            # mp_ids are now a comma delimited list
            # with MP terms in brackets
            phenotype_ids = []
            if mp_ids != '':
                for i in re.split(r',', mp_ids):
                    i = i.strip()
                    mps = re.search(r'\[(.*)\]', i)
                    if mps is not None:
                        mp_id = mps.group(1).strip()
                        phenotype_ids.append(mp_id)

            # pubmed ids are space delimited
            pubmed_ids = []
            if pubmed_nums.strip() != '':
                # str.split() ignores leading/trailing whitespace, so no
                # empty token (and no bare 'PMID:' reference) is produced
                for i in pubmed_nums.split():
                    pmid = 'PMID:'+i.strip()
                    pubmed_ids.append(pmid)
                    r = Reference(pmid,
                                  Reference.ref_types['journal_article'])
                    r.addRefToGraph(g)

            # https://www.mmrrc.org/catalog/sds.php?mmrrc_id=00001
            # is a good example of 4 genotype parts

            gu.addClassToGraph(g, mouse_taxon, None)
            if research_areas.strip() == '':
                research_areas = None
            else:
                research_areas = 'Research Areas: '+research_areas
            strain_type = mouse_taxon
            if strain_state == 'ES':
                strain_type = stem_cell_class
            gu.addIndividualToGraph(
                g, strain_id, strain_label, strain_type,
                research_areas)  # an inst of mouse??
            gu.makeLeader(g, strain_id)

            # phenotypes are associated with the alleles
            for pid in phenotype_ids:
                # assume the phenotype label is in the ontology
                gu.addClassToGraph(g, pid, None)
                if mgi_allele_id is not None and mgi_allele_id != '':
                    assoc = G2PAssoc(self.name, mgi_allele_id, pid,
                                     gu.object_properties['has_phenotype'])
                    for p in pubmed_ids:
                        assoc.add_source(p)
                    assoc.add_association_to_graph(g)
                else:
                    logger.info("Phenotypes and no allele for %s",
                                strain_id)

            if not self.testMode and (
                    limit is not None and line_counter > limit):
                break

        # now that we've collected all of the variant information, build it
        # we don't know their zygosities
        for s in self.strain_hash:
            h = self.strain_hash.get(s)
            variants = h['variants']
            genes = h['genes']
            vl_set = set()
            # make variant loci for each gene
            if len(variants) > 0:
                for v in variants:
                    vl_id = v
                    vl_symbol = self.id_label_hash[vl_id]
                    geno.addAllele(vl_id, vl_symbol,
                                   geno.genoparts['variant_locus'])
                    vl_set.add(vl_id)
                    if len(variants) == 1 and len(genes) == 1:
                        for gene in genes:
                            geno.addAlleleOfGene(vl_id, gene)
                    else:
                        geno.addAllele(vl_id, vl_symbol)
            else:  # len(vars) == 0
                # it's just anonymous variants in some gene
                for gene in genes:
                    vl_id = '_'+gene+'-VL'
                    vl_id = re.sub(r':', '', vl_id)
                    if self.nobnodes:
                        vl_id = ':'+vl_id
                    vl_symbol = self.id_label_hash[gene]+'<?>'
                    self.id_label_hash[vl_id] = vl_symbol
                    geno.addAllele(vl_id, vl_symbol,
                                   geno.genoparts['variant_locus'])
                    geno.addGene(gene, self.id_label_hash[gene])
                    geno.addAlleleOfGene(vl_id, gene)
                    vl_set.add(vl_id)

            # make the vslcs
            vl_list = sorted(vl_set)
            vslc_list = []
            for vl in vl_list:
                # for unknown zygosity
                vslc_id = '_'+re.sub(r'^_', '', vl)+'U'
                vslc_id = re.sub(r':', '', vslc_id)
                if self.nobnodes:
                    vslc_id = ':' + vslc_id
                vslc_label = self.id_label_hash[vl] + '/?'
                self.id_label_hash[vslc_id] = vslc_label
                vslc_list.append(vslc_id)
                geno.addPartsToVSLC(
                    vslc_id, vl, None, geno.zygosity['indeterminate'],
                    geno.object_properties['has_alternate_part'], None)
                gu.addIndividualToGraph(
                    g, vslc_id, vslc_label,
                    geno.genoparts['variant_single_locus_complement'])
            if len(vslc_list) > 0:
                if len(vslc_list) > 1:
                    gvc_id = '-'.join(vslc_list)
                    gvc_id = re.sub(r':', '', gvc_id)
                    if self.nobnodes:
                        gvc_id = ':'+gvc_id
                    gvc_label = \
                        '; '.join(self.id_label_hash[v] for v in vslc_list)
                    gu.addIndividualToGraph(
                        g, gvc_id, gvc_label,
                        geno.genoparts['genomic_variation_complement'])
                    for vslc_id in vslc_list:
                        geno.addVSLCtoParent(vslc_id, gvc_id)
                else:
                    # the GVC == VSLC, so don't have to make an extra piece
                    gvc_id = vslc_list.pop()
                    gvc_label = self.id_label_hash[gvc_id]

                genotype_label = gvc_label + ' [n.s.]'
                bkgd_id = \
                    '_' + re.sub(r':', '', '-'.join(
                        (geno.genoparts['unspecified_genomic_background'],
                         s)))
                genotype_id = '-'.join((gvc_id, bkgd_id))
                if self.nobnodes:
                    bkgd_id = ':'+bkgd_id
                geno.addTaxon(mouse_taxon, bkgd_id)
                geno.addGenomicBackground(
                    bkgd_id, 'unspecified ('+s+')',
                    geno.genoparts['unspecified_genomic_background'],
                    "A placeholder for the " +
                    "unspecified genetic background for "+s)
                geno.addGenomicBackgroundToGenotype(
                    bkgd_id, genotype_id,
                    geno.genoparts['unspecified_genomic_background'])
                geno.addParts(
                    gvc_id, genotype_id,
                    geno.object_properties['has_alternate_part'])
                geno.addGenotype(genotype_id, genotype_label)
                gu.addTriple(
                    g, s, geno.object_properties['has_genotype'],
                    genotype_id)
            else:
                # logger.debug(
                #   "Strain %s is not making a proper genotype.", s)
                pass

        gu.loadProperties(
            g, G2PAssoc.object_properties, G2PAssoc.OBJECTPROP)
        gu.loadProperties(
            g, G2PAssoc.datatype_properties, G2PAssoc.DATAPROP)
        gu.loadProperties(
            g, G2PAssoc.annotation_properties, G2PAssoc.ANNOTPROP)
        gu.loadAllProperties(g)

        logger.warning(
            "The following gene symbols did not list identifiers: %s",
            str(sorted(list(genes_with_no_ids))))

    return
def _process_phenotype_tab(self, raw, limit):
    """
    Parse a tab-delimited disease-to-phenotype annotation file.

    Each row adds the disease, phenotype, evidence and (when given) onset
    classes to the graph and builds one association whose type depends on
    the annotation aspect:
        'O' / 'M'  organ abnormality or mortality -> D2PAssoc
        'I'        inheritance pattern            -> DispositionAssoc
        'C'        clinical course / onset        -> DispositionAssoc
    Rows with any other aspect are logged and skipped.  Every listed
    publication becomes a source of the association; those not starting
    with 'http' are also added as references.

    :param raw: path of the annotation file to read
    :param limit: outside of test mode, stop once more than this many rows
        have been read; ``None`` reads the whole file
    :return: None
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph

    line_counter = 0
    gu = GraphUtils(curie_map.get())
    with open(raw, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            (db, num, name, qual, pheno_id, publist, eco, onset, freq, w,
             asp, syn, date, curator) = row
            disease_id = db + ":" + str(num)

            if self.testMode and disease_id.strip() not in \
                    config.get_config()['test_ids']['disease']:
                continue

            # logger.info('adding %s', disease_id)

            gu.addClassToGraph(g, disease_id, None)
            gu.addClassToGraph(g, pheno_id, None)
            eco_id = self._map_evidence_to_codes(eco)
            gu.addClassToGraph(g, eco_id, None)
            if onset is not None and onset.strip() != '':
                gu.addClassToGraph(g, onset, None)

            # we want to do things differently depending on
            # the aspect of the annotation
            if asp == 'O' or asp == 'M':
                # organ abnormality or mortality
                assoc = D2PAssoc(
                    self.name, disease_id, pheno_id, onset, freq)
            elif asp == 'I':
                # inheritance patterns for the whole disease
                assoc = DispositionAssoc(self.name, disease_id, pheno_id)
            elif asp == 'C':
                # clinical course / onset
                assoc = DispositionAssoc(self.name, disease_id, pheno_id)
            else:
                # No association can be built for this row; without the
                # skip, `assoc` would be unbound or left over from the
                # previous row.
                logger.error("I don't know what this aspect is: %s", asp)
                continue

            assoc.add_evidence(eco_id)

            publist = publist.split(';')
            # blow these apart if there is a list of pubs
            for pub in publist:
                pub = pub.strip()
                if pub != '':
                    # URL publications are kept as sources only; no
                    # Reference is created for them
                    if not re.match(r'http', pub):
                        r = Reference(pub)
                        if re.match(r'PMID', pub):
                            r.setType(
                                Reference.ref_types['journal_article'])
                        r.addRefToGraph(g)
                    # TODO add curator
                    assoc.add_source(pub)

            assoc.add_association_to_graph(g)

            if not self.testMode and limit is not None \
                    and line_counter > limit:
                break

        Assoc(None).load_all_properties(g)

    return
def _get_pubs(self, entry, g):
    """
    Extract mentioned publications from the reference list

    Every reference of the entry is added to the graph (as a journal
    article when it has a pubmed id, otherwise as a blank node named from
    the MIM number and reference number) together with an
    ``OMIM:<mimNumber> mentions <publication>`` triple.

    :param entry: OMIM entry mapping; must contain ``mimNumber`` and may
        contain ``referenceList``
    :param g: graph the references and triples are written to
    :return: dict mapping each reference number to the publication id used
    """
    ref_to_pmid = {}
    entry_num = entry['mimNumber']
    model = Model(g)
    if 'referenceList' in entry:
        reflist = entry['referenceList']
        for r in reflist:
            if 'pubmedID' in r['reference']:
                pub_id = 'PMID:' + str(r['reference']['pubmedID'])
                ref = Reference(
                    g, pub_id, Reference.ref_types['journal_article'])
            else:
                # make blank node for internal reference
                pub_id = \
                    '_:OMIM' + str(entry_num) + 'ref' + \
                    str(r['reference']['referenceNumber'])
                ref = Reference(g, pub_id)

            title = author_list = source = citation = None
            if 'title' in r['reference']:
                title = r['reference']['title']
                ref.setTitle(title)
            if 'authors' in r['reference']:
                author_list = r['reference']['authors']
                ref.setAuthorList(author_list)
                citation = re.split(r'\.\,', author_list)[0] + ' et al'
            if 'source' in r['reference']:
                source = r['reference']['source']

            # Join whichever parts are present.  An explicit `is not None`
            # test is used because filter(None.__ne__, ...) depends on the
            # truth value of NotImplemented, which newer Pythons reject.
            citation = '; '.join(
                [part for part in (citation, title, source)
                 if part is not None])
            ref.setShortCitation(citation)
            ref.addRefToGraph()
            ref_to_pmid[r['reference']['referenceNumber']] = pub_id

            # add is_about for the pub
            omim_id = 'OMIM:' + str(entry_num)
            g.addTriple(omim_id, model.object_properties['mentions'], pub_id)

    return ref_to_pmid
def _process_phenotype_data(self, limit):
    """
    NOTE: If a Strain carries more than one mutation,
    then each Mutation description,
    i.e., the set: (
        Mutation Type - Chromosome - Gene Symbol -
        Gene Name - Allele Symbol - Allele Name)
    will require a separate line.

    Note that MMRRC curates phenotypes to alleles,
    even though they distribute only one file with the
    phenotypes appearing to be associated with a strain.

    So, here we process the allele-to-phenotype relationships separately
    from the strain-to-allele relationships.

    The catalog file is read in one pass: each row adds the strain
    individual, its publication references and allele-to-phenotype
    associations, and records the strain's variants and genes in
    ``self.strain_hash`` (labels in ``self.id_label_hash``).  A second pass
    over ``self.strain_hash`` builds a genotype with unknown zygosity for
    each strain.

    :param limit: outside of test mode, stop once the reader's line number
        exceeds this value; ``None`` reads the whole file
    :return: None
    """
    src_key = 'catalog'
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)
    fname = '/'.join((self.rawdir, self.files[src_key]['file']))

    self.strain_hash = {}
    self.id_label_hash = {}
    genes_with_no_ids = set()
    stem_cell_class = self.globaltt['stem cell']
    mouse_taxon = self.globaltt['Mus musculus']
    geno = Genotype(graph)
    with open(fname, 'r', encoding="utf8") as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='\"')
        # First line is header not date/version info. This changed recently,
        # apparently as of Sep 2019. Also, 3rd line is no longer blank.
        row = [x.strip() for x in next(reader)]  # messy messy
        col = self.files['catalog']['columns']
        strain_missing_allele = []  # to count the ones w/insufficent info
        if not self.check_fileheader(col, row):
            pass

        for row in reader:
            strain_id = row[col.index('STRAIN/STOCK_ID')].strip()
            strain_label = row[col.index('STRAIN/STOCK_DESIGNATION')]
            # strain_type_symbol = row[col.index('STRAIN_TYPE')]
            strain_state = row[col.index('STATE')]
            mgi_allele_id = row[col.index(
                'MGI_ALLELE_ACCESSION_ID')].strip()
            mgi_allele_symbol = row[col.index('ALLELE_SYMBOL')]
            # mgi_allele_name = row[col.index('ALLELE_NAME')]
            # mutation_type = row[col.index('MUTATION_TYPE')]
            # chrom = row[col.index('CHROMOSOME')]
            mgi_gene_id = row[col.index('MGI_GENE_ACCESSION_ID')].strip()
            mgi_gene_symbol = row[col.index('GENE_SYMBOL')].strip()
            mgi_gene_name = row[col.index('GENE_NAME')]
            # sds_url = row[col.index('SDS_URL')]
            # accepted_date = row[col.index('ACCEPTED_DATE')]
            mpt_ids = row[col.index('MPT_IDS')].strip()
            pubmed_nums = row[col.index('PUBMED_IDS')].strip()
            research_areas = row[col.index('RESEARCH_AREAS')].strip()

            # withdrawn genes are skipped in every mode; the test-id
            # filter only applies in test mode
            if (self.test_mode and (strain_id not in self.test_ids)) \
                    or mgi_gene_name == 'withdrawn':
                continue

            # strip off stuff after the dash -
            # is the holding center important?
            # MMRRC:00001-UNC --> MMRRC:00001
            strain_id = re.sub(r'-\w+$', '', strain_id)

            self.id_label_hash[strain_id] = strain_label

            # get the variant or gene to save for later building of
            # the genotype
            if strain_id not in self.strain_hash:
                self.strain_hash[strain_id] = {
                    'variants': set(), 'genes': set()
                }

            # flag bad ones
            if mgi_allele_id[:4] != 'MGI:' and mgi_allele_id != '':
                LOG.error("Erroneous MGI allele id: %s", mgi_allele_id)
                if mgi_allele_id[:3] == 'MG:':
                    mgi_allele_id = 'MGI:' + mgi_allele_id[3:]
                else:
                    mgi_allele_id = ''

            if mgi_allele_id != '':
                self.strain_hash[strain_id]['variants'].add(mgi_allele_id)
                self.id_label_hash[mgi_allele_id] = mgi_allele_symbol

                # use the following if needing to add the sequence alteration types
                # var_type = self.localtt[mutation_type]
                # make a sequence alteration for this variant locus,
                # and link the variation type to it
                # sa_id = '_'+re.sub(r':','',mgi_allele_id)+'SA'
                # if self.nobnodes:
                #     sa_id = ':'+sa_id
                # gu.addIndividualToGraph(g, sa_id, None, var_type)
                # geno.addSequenceAlterationToVariantLocus(sa_id, mgi_allele_id)

            # scrub out any spaces, fix known issues
            mgi_gene_id = re.sub(r'\s+', '', mgi_gene_id)
            if mgi_gene_id == 'NULL':
                mgi_gene_id = ''
            elif mgi_gene_id[:7] == 'GeneID:':
                mgi_gene_id = 'NCBIGene:' + mgi_gene_id[7:]

            if mgi_gene_id != '':
                try:
                    [curie, _localid] = mgi_gene_id.split(':')
                except ValueError as verror:
                    # Give curie a definite value so the check below
                    # neither raises NameError nor reuses the prefix of a
                    # previous row.
                    curie = None
                    LOG.warning(
                        "Problem parsing mgi_gene_id %s from file %s: %s",
                        mgi_gene_id, fname, verror)
                if curie not in ['MGI', 'NCBIGene']:
                    LOG.info("MGI Gene id not recognized: %s", mgi_gene_id)
                self.strain_hash[strain_id]['genes'].add(mgi_gene_id)
                self.id_label_hash[mgi_gene_id] = mgi_gene_symbol

            # catch some errors - too many. report summary at the end
            # some things have gene labels, but no identifiers - report
            if mgi_gene_symbol != '' and mgi_gene_id == '':
                # LOG.error(
                #    "Gene label with no MGI identifier for strain %s: %s",
                #    strain_id, mgi_gene_symbol)
                genes_with_no_ids.add(mgi_gene_symbol)
                # make a temp id for genes that aren't identified ... err wow.
                # tmp_gene_id = '_' + mgi_gene_symbol
                # self.id_label_hash[tmp_gene_id.strip()] = mgi_gene_symbol
                # self.strain_hash[strain_id]['genes'].add(tmp_gene_id)

            # split apart the mp ids
            # ataxia [MP:0001393] ,hypoactivity [MP:0001402] ...
            # mpt_ids are a comma delimited list
            # labels with MP terms following in brackets
            phenotype_ids = []
            if mpt_ids != '':
                for lb_mp in mpt_ids.split(r','):
                    lb_mp = lb_mp.strip()
                    if lb_mp[-1:] == ']' and lb_mp[-12:-8] == '[MP:':
                        # keep everything between the brackets; stopping at
                        # -2 cut off the last digit of the MP id
                        phenotype_ids.append(lb_mp[-11:-1])

            # pubmed ids are space delimited
            pubmed_ids = []
            if pubmed_nums != '':
                for pm_num in re.split(r'\s+', pubmed_nums):
                    pmid = 'PMID:' + pm_num.strip()
                    pubmed_ids.append(pmid)
                    ref = Reference(
                        graph, pmid, self.globaltt['journal article'])
                    ref.addRefToGraph()

            # https://www.mmrrc.org/catalog/sds.php?mmrrc_id=00001
            # is a good example of 4 genotype parts

            model.addClassToGraph(mouse_taxon, None)
            if research_areas == '':
                research_areas = None
            else:
                research_areas = 'Research Areas: ' + research_areas
            strain_type = mouse_taxon
            if strain_state == 'ES':
                strain_type = stem_cell_class
            model.addIndividualToGraph(   # an inst of mouse??
                strain_id, strain_label, strain_type, research_areas)
            model.makeLeader(strain_id)

            # phenotypes are associated with the alleles
            for pid in phenotype_ids:
                # assume the phenotype label is in some ontology
                model.addClassToGraph(pid, None)
                if mgi_allele_id is not None and mgi_allele_id != '':
                    assoc = G2PAssoc(
                        graph, self.name, mgi_allele_id, pid,
                        self.globaltt['has phenotype'])
                    for p in pubmed_ids:
                        assoc.add_source(p)
                    assoc.add_association_to_graph()
                else:
                    # too chatty here. report aggregate
                    # LOG.info("Phenotypes and no allele for %s", strain_id)
                    strain_missing_allele.append(strain_id)

            if not self.test_mode and (
                    limit is not None and reader.line_num > limit):
                break

        # report misses
        if strain_missing_allele:
            LOG.info("Phenotypes and no allele for %i strains",
                     len(strain_missing_allele))

        # now that we've collected all of the variant information, build it
        # we don't know their zygosities
        for s in self.strain_hash:
            h = self.strain_hash.get(s)
            variants = h['variants']
            genes = h['genes']
            vl_set = set()
            # make variant loci for each gene
            if variants:
                for var in variants:
                    vl_id = var.strip()
                    vl_symbol = self.id_label_hash[vl_id]
                    geno.addAllele(
                        vl_id, vl_symbol, self.globaltt['variant_locus'])
                    vl_set.add(vl_id)
                    if len(variants) == 1 and len(genes) == 1:
                        for gene in genes:
                            geno.addAlleleOfGene(vl_id, gene)
                    else:
                        geno.addAllele(vl_id, vl_symbol)
            else:  # len(vars) == 0
                # it's just anonymous variants in some gene
                for gene in genes:
                    vl_id = '_:' + re.sub(r':', '', gene) + '-VL'
                    vl_symbol = self.id_label_hash[gene] + '<?>'
                    self.id_label_hash[vl_id] = vl_symbol
                    geno.addAllele(
                        vl_id, vl_symbol, self.globaltt['variant_locus'])
                    geno.addGene(gene, self.id_label_hash[gene])
                    geno.addAlleleOfGene(vl_id, gene)
                    vl_set.add(vl_id)

            # make the vslcs
            vl_list = sorted(vl_set)
            vslc_list = []
            for vl in vl_list:
                # for unknown zygosity
                vslc_id = re.sub(r'^_', '', vl) + 'U'
                vslc_id = re.sub(r':', '', vslc_id)
                vslc_id = '_:' + vslc_id
                vslc_label = self.id_label_hash[vl] + '/?'
                self.id_label_hash[vslc_id] = vslc_label
                vslc_list.append(vslc_id)
                geno.addPartsToVSLC(
                    vslc_id, vl, None, self.globaltt['indeterminate'],
                    self.globaltt['has_variant_part'], None)
                model.addIndividualToGraph(
                    vslc_id, vslc_label,
                    self.globaltt['variant single locus complement'])
            if vslc_list:
                if len(vslc_list) > 1:
                    gvc_id = '-'.join(vslc_list)
                    gvc_id = re.sub(r'_|:', '', gvc_id)
                    gvc_id = '_:' + gvc_id
                    gvc_label = '; '.join(
                        self.id_label_hash[v] for v in vslc_list)
                    model.addIndividualToGraph(
                        gvc_id, gvc_label,
                        self.globaltt['genomic_variation_complement'])
                    for vslc_id in vslc_list:
                        geno.addVSLCtoParent(vslc_id, gvc_id)
                else:
                    # the GVC == VSLC, so don't have to make an extra piece
                    gvc_id = vslc_list.pop()
                    gvc_label = self.id_label_hash[gvc_id]

                genotype_label = gvc_label + ' [n.s.]'
                bkgd_id = re.sub(
                    r':', '', '-'.join(
                        (self.globaltt['unspecified_genomic_background'],
                         s)))
                genotype_id = '-'.join((gvc_id, bkgd_id))
                bkgd_id = '_:' + bkgd_id
                geno.addTaxon(mouse_taxon, bkgd_id)
                geno.addGenomicBackground(
                    bkgd_id, 'unspecified (' + s + ')',
                    self.globaltt['unspecified_genomic_background'],
                    "A placeholder for the unspecified genetic background for "
                    + s)
                geno.addGenomicBackgroundToGenotype(
                    bkgd_id, genotype_id,
                    self.globaltt['unspecified_genomic_background'])
                geno.addParts(
                    gvc_id, genotype_id, self.globaltt['has_variant_part'])
                geno.addGenotype(genotype_id, genotype_label)
                graph.addTriple(
                    s, self.globaltt['has_genotype'], genotype_id)
            else:
                # LOG.debug(
                #   "Strain %s is not making a proper genotype.", s)
                pass

        LOG.warning(
            "The following gene symbols did not list identifiers: %s",
            str(sorted(list(genes_with_no_ids))))
        LOG.error(
            '%i symbols given are missing their gene identifiers',
            len(genes_with_no_ids))

    return
def _process_qtls_genetic_location(
        self, raw, txid, common_name, limit=None):
    """
    Load one species' QTL genetic-map (cM) file into the graph.

    For every row this adds:
      * the QTL as a Feature with a taxon and fuzzy start/end positions on
        a chromosome of the per-species linkage ("genetic map") build
      * a dbSNP individual xref'd to the QTL when the peak marker starts
        with 'rs'
      * an NCBI gene as affected locus of the QTL when a usable gene id
        is present
      * the trait class and the publication reference
      * a QTL-to-trait 'is marker for' association, plus a second
        marker-to-trait association when a dbSNP peak marker was found

    :param raw: path of the tab-delimited file (read as iso-8859-1)
    :param txid: taxon number; appended to 'NCBITaxon:'
    :param common_name: species common name; selects
        ``self.files[common_name + '_cm']['curie']`` and is used to build
        the QTL and linkage-build identifiers
    :param limit: when not None (and not in test mode), stop after more
        than this many rows have been read
    :return: None
    """
    aql_curie = self.files[common_name + '_cm']['curie']

    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    line_counter = 0
    geno = Genotype(graph)
    model = Model(graph)
    eco_id = self.globaltt['quantitative trait analysis evidence']
    taxon_curie = 'NCBITaxon:' + txid

    def _parse_p_value(text):
        """Return the p-value in `text` as a float, or None if not a number."""
        scr = re.sub(r'<', '', text)
        scr = re.sub(r',', '.', scr)  # international notation
        # str.isnumeric() rejects the decimal point, so convert directly
        try:
            return float(scr)
        except ValueError:
            return None

    def _add_marker_association(subject_id, trait_id, pub_id, p_values):
        """Add one 'is marker for' association from subject_id to trait_id."""
        assoc = G2PAssoc(
            graph, self.name, subject_id, trait_id,
            self.globaltt['is marker for'])
        assoc.add_evidence(eco_id)
        # rows without a publication must not inherit an earlier row's id
        if pub_id is not None:
            assoc.add_source(pub_id)

        # TODO add exp_id as evidence
        # if exp_id != '':
        #     exp_id = 'AQTLExp:'+exp_id
        #     gu.addIndividualToGraph(g, exp_id, None, eco_id)

        if p_values != '':
            score = _parse_p_value(p_values)
            if score is not None:
                assoc.set_score(score)  # todo add score type
        # TODO add LOD score?
        assoc.add_association_to_graph()

    LOG.info("Processing genetic location for %s from %s", taxon_curie, raw)
    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            (qtl_id, qtl_symbol, trait_name, assotype, empty, chromosome,
             position_cm, range_cm, flankmark_a2, flankmark_a1, peak_mark,
             flankmark_b1, flankmark_b2, exp_id, model_id, test_base,
             sig_level, lod_score, ls_mean, p_values, f_statistics,
             variance, bayes_value, likelihood_ratio, trait_id, dom_effect,
             add_effect, pubmed_id, gene_id, gene_id_src, gene_id_type,
             empty2) = row

            if self.test_mode and int(qtl_id) not in self.test_ids:
                continue

            qtl_id = common_name + 'QTL:' + qtl_id.strip()
            trait_id = ':'.join((aql_curie, trait_id.strip()))

            # Add QTL to graph
            feature = Feature(graph, qtl_id, qtl_symbol, self.globaltt['QTL'])
            feature.addTaxonToFeature(taxon_curie)

            # deal with the chromosome
            chrom_id = makeChromID(chromosome, taxon_curie, 'CHR')

            # add a version of the chromosome which is defined as
            # the genetic map
            build_id = 'MONARCH:' + common_name.strip() + '-linkage'
            build_label = common_name + ' genetic map'
            geno.addReferenceGenome(build_id, build_label, taxon_curie)
            chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH')
            geno.addChromosomeInstance(
                chromosome, build_id, build_label, chrom_id)

            start = stop = None
            # range_cm sometimes ends in "(Mb)"  (i.e pig 2016 Nov)
            range_cm = range_cm.split('(')[0]

            if re.search(r'[0-9].*-.*[0-9]', range_cm):
                range_parts = range_cm.split('-')

                # check for poorly formed ranges
                if len(range_parts) == 2 and \
                        range_parts[0] != '' and range_parts[1] != '':
                    (start, stop) = [
                        int(float(x.strip())) for x in range_parts]
                else:
                    LOG.info(
                        "A cM range we can't handle for QTL %s: %s",
                        qtl_id, range_cm)
            elif position_cm != '':
                match = re.match(r'([0-9]*\.[0-9]*)', position_cm)
                if match is not None:
                    position_cm = match.group()
                    start = stop = int(float(position_cm))

            # FIXME remove converion to int for start/stop
            # when schema can handle floats add in the genetic location
            # based on the range
            feature.addFeatureStartLocation(
                start, chrom_in_build_id, None,
                [self.globaltt['FuzzyPosition']])
            feature.addFeatureEndLocation(
                stop, chrom_in_build_id, None,
                [self.globaltt['FuzzyPosition']])
            feature.addFeatureToGraph()

            # sometimes there's a peak marker, like a rsid.
            # we want to add that as a variant of the gene,
            # and xref it to the qtl.
            dbsnp_id = None
            if peak_mark != '' and peak_mark != '.' and \
                    re.match(r'rs', peak_mark.strip()):
                dbsnp_id = 'dbSNP:' + peak_mark.strip()

                model.addIndividualToGraph(
                    dbsnp_id, None, self.globaltt['sequence_alteration'])
                model.addXref(qtl_id, dbsnp_id)

            gene_id = gene_id.replace('uncharacterized ', '').strip()
            if gene_id != '' and gene_id != '.' \
                    and re.fullmatch(r'[^ ]*', gene_id) is not None:

                # we assume if no src is provided and gene_id is an integer,
                # then it is an NCBI gene ...
                # (okay, lets crank that back a notch)
                if gene_id_src == '' and gene_id.isdigit() and \
                        gene_id in self.gene_info:
                    # LOG.info(
                    #     'Warm & Fuzzy saying %s is a NCBI gene for %s',
                    #     gene_id, common_name)
                    gene_id_src = 'NCBIgene'
                elif gene_id_src == '' and gene_id.isdigit():
                    LOG.warning(
                        'Cold & Prickely saying %s is a NCBI gene for %s',
                        gene_id, common_name)
                    gene_id_src = 'NCBIgene'
                elif gene_id_src == '':
                    LOG.error(
                        ' "%s" is a NOT NCBI gene for %s',
                        gene_id, common_name)
                    gene_id_src = None

                if gene_id_src == 'NCBIgene':
                    gene_id = 'NCBIGene:' + gene_id
                    # we will expect that these will get labels elsewhere
                    geno.addGene(gene_id, None)
                    # FIXME what is the right relationship here?
                    geno.addAffectedLocus(qtl_id, gene_id)

                    if dbsnp_id is not None:
                        # add the rsid as a seq alt of the gene_id
                        vl_id = '_:' + re.sub(
                            r':', '', gene_id) + '-' + peak_mark.strip()
                        geno.addSequenceAlterationToVariantLocus(
                            dbsnp_id, vl_id)
                        geno.addAffectedLocus(vl_id, gene_id)

            # add the trait
            model.addClassToGraph(trait_id, trait_name)

            # Add publication; reset per row so nothing leaks between rows
            pub_id = None
            reference = None
            if re.match(r'ISU.*', pubmed_id):
                pub_id = 'AQTLPub:' + pubmed_id.strip()
                reference = Reference(graph, pub_id)
            elif pubmed_id != '':
                pub_id = 'PMID:' + pubmed_id.strip()
                reference = Reference(
                    graph, pub_id, self.globaltt['journal article'])

            if reference is not None:
                reference.addRefToGraph()

            # make the association to the QTL
            _add_marker_association(qtl_id, trait_id, pub_id, p_values)

            # make the association to the dbsnp_id, if found
            if dbsnp_id is not None:
                _add_marker_association(dbsnp_id, trait_id, pub_id, p_values)

            if not self.test_mode and limit is not None \
                    and line_counter > limit:
                break

    LOG.info("Done with QTL genetic info")
    return
def process_allele_phenotype(self, limit=None):
    """
    This file compactly lists variant to phenotype associations,
    such that in a single row, there may be >1 variant listed per phenotype
    and paper.  This indicates that each variant is individually
    associated with the given phenotype, as listed in 1+ papers.
    (Not that the combination of variants is producing the phenotype.)

    Lines starting with '!' are treated as header and skipped, as are rows
    whose qualifier is 'NOT'.  Each '|'-separated allele in a row gets its
    own association: WBRNAi items are annotated through a reagent-targeted
    gene, WBVar items are added as sequence alterations of a variant locus
    of the gene, and anything else is logged and skipped.

    :param limit: when not None (and not in test mode), stop after more
        than this many non-header rows have been read
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['allele_pheno']['file']))

    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph

    # gu = GraphUtils(curie_map.get())  # TODO unused

    logger.info("Processing Allele phenotype associations")
    line_counter = 0
    geno = Genotype(g)
    with open(raw, 'r') as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            if re.match(r'!', ''.join(row)):  # header
                continue
            line_counter += 1
            (db, gene_num, gene_symbol, is_not, phenotype_id, ref,
             eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
             gene_class, taxon, date, assigned_by, blank, blank2) = row

            if self.testMode and gene_num not in self.test_ids['gene']:
                continue

            # TODO add NOT phenotypes
            if is_not == 'NOT':
                continue

            eco_id = None
            if eco_symbol == 'IMP':
                eco_id = 'ECO:0000015'
            elif eco_symbol.strip() != '':
                logger.warning(
                    "Encountered an ECO code we don't have: %s",
                    eco_symbol)

            # according to the GOA spec, persons are not allowed to be
            # in the reference column, therefore the variant and
            # persons are swapped between the reference and with column.
            # we unswitch them here.
            temp_var = temp_ref = None
            if re.search(r'WBVar|WBRNAi', ref):
                temp_var = ref
            # move the paper from the with column into the ref
            if re.search(r'WBPerson', with_or_from):
                temp_ref = with_or_from
            if temp_var is not None or temp_ref is not None:
                with_or_from = temp_var
                ref = temp_ref

            # with_or_from is None when only the person column was moved;
            # blank tokens are dropped so an empty column counts as missing
            allele_list = [
                token.strip()
                for token in (with_or_from or '').split('|')
                if token.strip() != '']
            if not allele_list:
                logger.error(
                    "Missing alleles from phenotype assoc at line %d",
                    line_counter)

            # the same for every allele of the row
            gene_id = 'WormBase:' + gene_num
            if ref is not None and ref != '':
                ref = re.sub(r'(WB:|WB_REF:)', 'WormBase:', ref)

            for allele in allele_list:
                allele_num = re.sub(r'WB:', '', allele)
                allele_id = 'WormBase:' + allele_num

                if re.search(r'WBRNAi', allele_id):
                    # make the reagent-targeted gene,
                    # & annotate that instead of the RNAi item directly
                    rnai_num = re.sub(r'WormBase:', '', allele_id)
                    rnai_id = allele_id
                    rtg_id = self.make_reagent_targeted_gene_id(
                        gene_num, rnai_num, self.nobnodes)
                    geno.addReagentTargetedGene(
                        rnai_id, 'WormBase:' + gene_num, rtg_id)
                    geno.addGeneTargetingReagent(
                        rnai_id, None, geno.genoparts['RNAi_reagent'],
                        gene_id)
                    allele_id = rtg_id
                elif re.search(r'WBVar', allele_id):
                    # this may become deprecated by using wormmine
                    # make the allele to gene relationship
                    # the WBVars are really sequence alterations
                    # the public name will come from elsewhere
                    geno.addSequenceAlteration(allele_id, None)
                    vl_id = '_' + '-'.join((gene_num, allele_num))
                    if self.nobnodes:
                        vl_id = ':' + vl_id
                    geno.addSequenceAlterationToVariantLocus(
                        allele_id, vl_id)
                    geno.addAlleleOfGene(vl_id, gene_id)
                else:
                    logger.warning(
                        "Some kind of allele I don't recognize: %s",
                        allele_num)
                    continue

                assoc = G2PAssoc(self.name, allele_id, phenotype_id)

                if eco_id is not None:
                    assoc.add_evidence(eco_id)

                if ref is not None and ref != '':
                    reference = Reference(ref)
                    if re.search(r'Person', ref):
                        reference.setType(reference.ref_types['person'])
                        # also add
                        # inferred from background scientific knowledge
                        assoc.add_evidence('ECO:0000001')

                    reference.addRefToGraph(g)
                    assoc.add_source(ref)

                assoc.add_association_to_graph(g)

            # finished looping through all alleles
            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

    return
def add_disease_drug_variant_to_graph(self, table):
    """
    Add variant / disease / drug rows to the graph.

    Each row of ``table`` must unpack into eleven values; optional values
    can be Null:
    [[variant_key, variant_label, diagnoses_key, diagnoses,
      specific_diagnosis, organ, relationship, drug_key, drug,
      therapy_status (optional), pubmed_id(optional)]]

    For every row the variant is added as a genotype part, a phenotype
    instance of the disease is associated with the variant (with evidence
    and a PMID source when a pubmed id is given), the association is
    linked to the drug, and a disease instance carrying the variant as
    biomarker is linked to the drug and its therapy status.

    See ongoing discussion of how to best model here:
    https://github.com/monarch-initiative/mckb/issues/9

    :param table: iterable of iterables, for example, a tuple of tuples
                  from _get_disease_drug_variant_relationship
    :return: None
    """
    graph_utils = GraphUtils(curie_map.get())
    genotype = Genotype(self.graph)

    # identifiers that are the same for every row
    environment_relation = "RO:has_environment"
    quality_property = "BFO:0000159"
    biomarker_relation = "RO:has_biomarker"

    for (variant_key, variant_label, diagnoses_key, diagnoses,
         specific_diagnosis, organ, relationship, drug_key, drug_label,
         therapy_status, pubmed_id) in table:

        # the specific diagnosis wins over the general one when present
        diagnoses_label = (
            diagnoses if specific_diagnosis is None else specific_diagnosis)

        # Arbitrary IDs to be replaced by ontology mappings
        variant_id = self.make_cgd_id('variant{0}'.format(variant_key))
        disease_id = self._get_disease_id(diagnoses_key, diagnoses_label)
        therapy_status_id = self.make_cgd_id('{0}'.format(therapy_status))
        disease_quality = "CGD:{0}".format(relationship).replace(" ", "_")
        drug_id = self._get_drug_id(drug_key, drug_label)

        genotype.addGenotype(
            variant_id, variant_label,
            genotype.genoparts['sequence_alteration'])

        disease_instance_id = self.make_cgd_id(
            'disease{0}{1}'.format(diagnoses_label, variant_key))
        phenotype_instance_id = self.make_cgd_id(
            'phenotype{0}{1}{2}'.format(
                diagnoses_label, variant_key, relationship))

        if relationship == "detrimental effect":
            label_template = "{0} with therapeutic response {1} to health"
        else:
            label_template = "{0} with {1} to therapy"
        phenotype_instance_label = label_template.format(
            diagnoses_label, relationship)

        # Reified association for disease caused_by genotype
        variant_disease_annot = self.make_cgd_id(
            "assoc{0}{1}".format(variant_key, diagnoses_label))

        # Add individuals/classes
        graph_utils.addClassToGraph(
            self.graph, disease_id, diagnoses_label, 'DOID:4')
        graph_utils.addClassToGraph(
            self.graph, drug_id, drug_label, 'CHEBI:23888')
        graph_utils.addIndividualToGraph(
            self.graph, phenotype_instance_id, phenotype_instance_label,
            disease_id)
        graph_utils.loadObjectProperties(
            self.graph, {relationship: environment_relation})

        # a publication supplies both the source and the evidence code
        source_id = None
        evidence = None
        if pubmed_id is not None:
            source_id = "PMID:{0}".format(pubmed_id)
            publication = Reference(
                source_id, Reference.ref_types['journal_article'])
            publication.addRefToGraph(self.graph)
            evidence = 'ECO:0000033'

        has_phenotype = graph_utils.object_properties['has_phenotype']
        association = G2PAssoc(
            self.name, variant_id, phenotype_instance_id, has_phenotype)
        association.set_association_id(variant_disease_annot)
        if evidence:
            association.add_evidence(evidence)
        if source_id:
            association.add_source(source_id)
        association.add_association_to_graph(self.graph)

        graph_utils.addTriple(
            self.graph, variant_disease_annot, environment_relation, drug_id)
        graph_utils.addTriple(
            self.graph, phenotype_instance_id, quality_property,
            disease_quality)

        # Add therapy-disease association and approval status
        disease_instance_label = "{0} with biomarker {1}".format(
            diagnoses_label, variant_label)
        graph_utils.addIndividualToGraph(
            self.graph, disease_instance_id, disease_instance_label,
            disease_id)
        graph_utils.addTriple(
            self.graph, disease_instance_id, biomarker_relation, variant_id)

        graph_utils.addClassToGraph(
            self.graph, therapy_status_id, therapy_status)
        self._add_therapy_drug_association(
            drug_id, disease_instance_id, therapy_status_id)

    return
def _process_ddg2p_annotations(self, limit):
    """
    The ddg2p annotations associate a gene symbol to an omim disease,
    along with some HPO ids and pubs. The gene symbols come from gencode,
    which in turn come from HGNC official gene symbols.  Therefore,
    we use the HGNC source class to get the id/symbol mapping for
    use in our annotations here.

    According to http://www.gencodegenes.org/faq.html,
    "Gene names are usually HGNC or MGI-approved gene symbols mapped
    to the GENCODE genes by the Ensembl xref pipeline. Sometimes,
    when there is no official gene symbol, the Havana clone-based
    name is used."

    The kind of variation that is linked to a disease is indicated
    (LOF, GOF, CNV, etc) in the source data.
    Here, we create an anonymous variant of the specified gene of
    the indicated type (mapped to the sequence ontology (SO)).

    Rows whose gene symbol cannot be mapped to HGNC are counted and
    skipped; rows without an OMIM number are counted and get no
    association.  Both counts are logged at the end.

    :param limit: when not None (and not in test mode), stop after more
        than this many lines have been read
    :return: None
    """
    line_counter = 0
    graph = self.graph

    # in order for this to work, we need to map the HGNC id-symbol;
    hgnc = HGNC(self.graph_type, self.are_bnodes_skolemized)
    hgnc_symbol_id_map = hgnc.get_symbol_id_map()

    zip_path = '/'.join((self.rawdir, self.files['annot']['file']))

    # use the ddg2p.txt file
    fname = 'ddg2p.txt'

    unmapped_omim_counter = 0
    unmapped_gene_count = 0
    # the context managers close the archive even when a row fails
    with ZipFile(zip_path, 'r') as myzip, \
            myzip.open(fname, 'r') as zipped:
        reader = csv.reader(
            io.TextIOWrapper(zipped), delimiter='\t', quotechar='\"')
        for row in reader:
            line_counter += 1
            # skip blank lines and comments
            if not row or row[0].startswith('#'):
                continue

            (gencode_gene_name, mode, category, consequence, disease, omim,
             ddg2p_id, pubmed_ids, hpo_codes) = row

            hgnc_id = hgnc_symbol_id_map.get(gencode_gene_name.strip())
            if hgnc_id is None:
                LOG.error(
                    "Couldn't map the gene symbol %s to HGNC.",
                    gencode_gene_name)
                unmapped_gene_count += 1
                continue

            # add the gene
            self.model.addClassToGraph(hgnc_id, gencode_gene_name)

            # TODO make VSLC with the variation
            #   to associate with the disorder
            # TODO use the Inheritance and Mutation consequence
            #   to classify the VSLCs
            allele_id = self.make_allele_by_consequence(
                consequence, hgnc_id, gencode_gene_name)

            if omim.strip() != '':
                omim_id = 'OMIM:' + str(omim.strip())
                # assume this is declared elsewhere in ontology
                self.model.addClassToGraph(omim_id, None)

                # ??? rel is never used
                # if category.strip() == 'Confirmed DD gene':
                #     rel = self.self.globaltt['has phenotype']
                # elif category.strip() == 'Probable DD gene':
                #    rel = self.self.globaltt['has phenotype']
                # elif category.strip() == 'Possible DD gene':
                #    rel = self.self.globaltt['contributes to']
                # elif category.strip() == 'Not DD gene':
                #    # TODO negative annotation
                #    continue
                assoc = G2PAssoc(graph, self.name, allele_id, omim_id)
                # TODO 'rel' is assigned to but never used

                for pub in pubmed_ids.split(';'):
                    pub = pub.strip()
                    if pub != '':
                        pmid = 'PMID:' + str(pub)
                        reference = Reference(
                            graph, pmid, self.globaltt['journal article'])
                        reference.addRefToGraph()
                        assoc.add_source(pmid)

                assoc.add_association_to_graph()
            else:
                # these are unmapped to a disease id.
                # note that some match OMIM disease labels
                # but the identifiers are just not included.
                # TODO consider mapping to OMIM or DOIDs in other ways
                LOG.warning(
                    "No omim id on line %d\n%s", line_counter, str(row))
                unmapped_omim_counter += 1

            # TODO hpo phenotypes
            # since the DDG2P file is not documented,
            # I don't know what the HPO annotations are actually about
            # are they about the gene?  the omim disease?  something else?
            # So, we wont create associations until this is clarified

            if not self.test_mode and limit is not None \
                    and line_counter > limit:
                break

    LOG.warning(
        "gene-disorder associations with no omim id: %d",
        unmapped_omim_counter)
    LOG.warning("unmapped gene count: %d", unmapped_gene_count)

    return
def _process_nlx_157874_1_view(self, raw, limit=None):
    """
    This table contains the Elements of Morphology data that has been
    screen-scraped into DISCO.
    Note that foaf:depiction is inverse of foaf:depicts relationship.

    Since it is bad form to have two definitions,
    we concatenate the two into one string.

    Triples:
        <eom id> a owl:Class
            rdf:label Literal(eom label)
            OIO:hasRelatedSynonym Literal(synonym list)
            IAO:definition Literal(objective_def. subjective def)
            foaf:depiction Literal(small_image_url),
                           Literal(large_image_url)
            foaf:page Literal(page_url)
            rdfs:comment Literal(long commented text)

    :param raw: path of the tab-delimited file; its first line is a header
        and is skipped
    :param limit: when not None, stop after more than this many data rows
    :return: None
    """

    def _with_trailing_period(text):
        """Strip `text` and end it with a period unless empty or present."""
        text = text.strip()
        if text != '' and not text.endswith('.'):
            text += '.'
        return text

    model = Model(self.graph)
    line_counter = 0
    with open(raw, 'r') as f1:
        f1.readline()  # read the header row; skip
        filereader = csv.reader(f1, delimiter='\t', quotechar='\"')
        for line in filereader:
            line_counter += 1
            (morphology_term_id, morphology_term_num,
             morphology_term_label, morphology_term_url,
             terminology_category_label, terminology_category_url,
             subcategory, objective_definition, subjective_definition,
             comments, synonyms, replaces, small_figure_url,
             large_figure_url, e_uid, v_uid, v_uuid, v_last_modified,
             v_status, v_lastmodified_epoch) = line

            # note:
            # e_uid v_uuid v_last_modified terminology_category_url
            # subcategory v_uid morphology_term_num
            # terminology_category_label hp_label notes
            # are currently unused.

            # Add morphology term to graph as a class
            # with label, type, and description.
            model.addClassToGraph(morphology_term_id, morphology_term_label)

            # Assemble the description text.  The period check is made on
            # the stripped text so trailing whitespace cannot cause a
            # second period to be appended.
            definition = ' '.join((
                _with_trailing_period(objective_definition),
                _with_trailing_period(subjective_definition))).strip()

            model.addDefinition(morphology_term_id, definition)

            # <term id> FOAF:depicted_by literal url
            # <url> type foaf:depiction

            # do we want both images?
            # morphology_term_id has depiction small_figure_url
            if small_figure_url != '':
                model.addDepiction(morphology_term_id, small_figure_url)

            # morphology_term_id has depiction large_figure_url
            if large_figure_url != '':
                model.addDepiction(morphology_term_id, large_figure_url)

            # morphology_term_id has comment comments
            if comments != '':
                model.addComment(morphology_term_id, comments.strip())

            # synonyms are ';' delimited
            if synonyms != '':
                for synonym in synonyms.split(';'):
                    model.addSynonym(
                        morphology_term_id, synonym.strip(),
                        model.annotation_properties['hasExactSynonym'])

            # morphology_term_id hasRelatedSynonym replaces (; delimited)
            if replaces != '' and replaces != synonyms:
                for replaced in replaces.split(';'):
                    model.addSynonym(
                        morphology_term_id, replaced.strip(),
                        model.annotation_properties['hasRelatedSynonym'])

            # morphology_term_id has page morphology_term_url
            reference = Reference(self.graph)
            reference.addPage(morphology_term_id, morphology_term_url)

            if limit is not None and line_counter > limit:
                break
    return
def process_nbk_html(self, limit):
    """
    Here we process the gene reviews books to fetch
    the clinical descriptions to include in the ontology.
    We only use books that have been acquired manually,
    as NCBI Bookshelf does not permit automated downloads.
    This parser will only process the books that are found in
    the ```raw/genereviews/books``` directory,
    permitting partial completion.

    For each book found locally, the text of the clinical summary section
    (first paragraph plus any list items) is added as the book's
    definition, every PubMed link in the cited literature is added as a
    reference that 'is_about' the book, and the book url is registered on
    the dataset.  Books without a local html file are collected and
    reported at the end.

    :param limit: when not None, stop after more than this many book ids
        have been visited
    :return: None
    """
    c = 0
    books_not_found = set()
    book_dir = '/'.join((self.rawdir, 'books'))
    book_files = None  # directory listing; read once, on first use
    for nbk in self.book_ids:
        c += 1
        nbk_id = 'GeneReviews:' + nbk
        book_item = self.all_books.get(nbk)
        url = '/'.join((self.rawdir, book_item['file']))

        # figure out if the book is there; if so, process, otherwise skip
        if book_files is None:
            book_files = set(os.listdir(book_dir))
        if nbk + '.html' not in book_files:
            # logger.warning("No book found locally for %s; skipping", nbk)
            books_not_found.add(nbk)
            continue
        logger.info("Processing %s", nbk)

        # close the file as soon as it has been parsed
        with open(url) as page:
            soup = BeautifulSoup(page.read())

        # sec0 == clinical description
        clin_summary = soup.find('div', id=re.compile(".*Summary.sec0"))
        if clin_summary is not None:
            p = clin_summary.find('p')
            ptext = re.sub(r'\s+', ' ', p.text)

            ul = clin_summary.find('ul')
            if ul is not None:
                ptext += ' '.join(
                    re.sub(r'\s+', ' ', li.text)
                    for li in ul.find_all('li'))

            # add in the copyright and citation info to description
            ptext = ' '.join((
                ptext,
                '[GeneReviews:NBK1116, GeneReviews:NBK138602, ' +
                nbk_id + ']'))

            self.gu.addDefinition(self.graph, nbk_id, ptext.strip())

        # get the pubs
        pub_div = soup.find('div', id=re.compile(r".*Literature_Cited"))
        if pub_div is not None:
            ref_list = pub_div.find_all('div', attrs={'class': "bk_ref"})
            for cited in ref_list:
                for a in cited.find_all(
                        'a', attrs={'href': re.compile(r"pubmed")}):
                    if re.match(r'PubMed:', a.text):
                        pmnum = re.sub(r'PubMed:\s*', '', a.text)
                    else:
                        # an href without a trailing number yields no id
                        match = re.search(r'\/pubmed\/(\d+)$', a['href'])
                        pmnum = match.group(1) if match else None
                    if pmnum is not None:
                        pmid = 'PMID:' + str(pmnum)
                        self.gu.addTriple(
                            self.graph, pmid,
                            self.gu.object_properties['is_about'], nbk_id)
                        reference = Reference(
                            pmid, Reference.ref_types['journal_article'])
                        reference.addRefToGraph(self.graph)

        # TODO add author history, copyright, license to dataset

        # TODO get PMID-NBKID equivalence (near foot of page),
        # and make it "is about" link
        # self.gu.addTriple(
        #   self.graph, pmid,
        #   self.gu.object_properties['is_about'], nbk_id)
        # for example: NBK1191 PMID:20301370

        # add the book to the dataset
        self.dataset.setFileAccessUrl(book_item['url'])

        if limit is not None and c > limit:
            break

        # finish looping through books

    missing = len(books_not_found)
    if missing > 0:
        if missing > 100:
            logger.warning("There were %d books not found.", missing)
        else:
            logger.warning(
                "The following %d books were not found locally: %s",
                missing, str(books_not_found))
    logger.info(
        "Finished processing %d books for clinical descriptions",
        c - missing)

    return
def _process_phenotype_hpoa(self, file_info, limit=None):
    """
    Ingest the 'hpoa' annotation file as disease-to-phenotype associations.

    see info on format here:
    http://www.human-phenotype-ontology.org/contao/index.php/annotation-guide.html

    The first five lines are consumed before the data: a description line,
    a line holding the release date, two url lines and the column header.
    Rows qualified 'NOT' are skipped.  Aspects 'P'/'M' produce an
    association carrying onset and frequency; aspects 'I'/'C' produce a
    'has disposition' association; any other aspect is logged and the row
    is skipped.  Evidence, sex specificity and each well-formed reference
    are attached to the association.

    :param file_info: dict with the 'file' name (relative to self.rawdir)
        and optionally the 'url' the version date is recorded against
    :param limit: when not None (and not in test mode), stop once the
        reader's line number exceeds it
    :return: None
    """
    src_key = 'hpoa'

    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph

    model = Model(graph)

    raw = '/'.join((self.rawdir, file_info['file']))

    # this will cause two dates to be attached to the dataset
    # (one from the filedate, and the other from here)
    # TODO when #112 is implemented,
    # this will result in only the whole dataset being versioned

    col = self.files[src_key]['columns']
    # column positions do not change from row to row; look them up once
    (i_disease, i_qualifier, i_hpo, i_reference, i_evidence, i_onset,
     i_frequency, i_sex, i_aspect) = [
         col.index(name) for name in (
             '#DatabaseID', 'Qualifier', 'HPO_ID', 'Reference', 'Evidence',
             'Onset', 'Frequency', 'Sex', 'Aspect')]

    with open(raw, 'r', encoding="utf8") as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t', quotechar='\"')
        next(reader)  # drop Description
        # the date is sliced out of the printed form of the parsed line
        version = str(next(reader))[9:19]
        LOG.info("Ingest from %s", version)
        date = datetime.strptime(
            version.strip(), '%Y-%m-%d').strftime("%Y-%m-%d-%H-%M")
        if file_info.get("url") is not None:
            self.dataset.set_ingest_source_file_version_date(
                file_info.get("url"), date)

        next(reader)  # drop tracker url
        next(reader)  # drop release url
        header = next(reader)  # headers
        # header[0] = header[0][1:]  # uncomment; but not allways needed ?!
        # the outcome of the header check does not stop the ingest
        self.check_fileheader(col, header)

        for row in reader:
            row = [str(cell).strip() for cell in row]

            disease_id = row[i_disease]
            # 98246 OMIM
            # 68646 ORPHA
            # 297 DECIPHER

            if self.test_mode:
                try:
                    id_list = self.test_ids
                    if id_list is None or disease_id not in id_list:
                        continue
                except AttributeError:
                    continue

            # row[col.index('DiseaseName')]  unused

            if row[i_qualifier] == 'NOT':
                continue

            hpo_id = row[i_hpo]
            publist = row[i_reference]
            eco_id = self.resolve(row[i_evidence])
            onset = row[i_onset]
            freq = row[i_frequency]
            sex = row[i_sex].lower()
            # row[col.index('Modifier')]   unused
            asp = row[i_aspect]
            # row[col.index('Biocuration')]  unused

            # LOG.info(
            #    'adding <%s>-to-<%s> because <%s>',
            #    disease_id, hpo_id, eco_id)

            model.addClassToGraph(disease_id)
            model.addClassToGraph(eco_id)
            if onset is not None and onset != '':
                model.addClassToGraph(onset)

            if asp in ('P', 'M'):  # phenotype? abnormality or mortality
                model.addClassToGraph(hpo_id)
                assoc = D2PAssoc(  # default rel=self.globaltt['has phenotype']
                    graph, self.name, disease_id, hpo_id, onset, freq
                )
            elif asp in ('I', 'C'):
                # inheritance pattern or clinical course/onset
                model.addClassToGraph(hpo_id)
                assoc = D2PAssoc(
                    graph, self.name, disease_id, hpo_id,
                    rel=self.globaltt['has disposition']
                )
            else:
                LOG.error(
                    "Unknown aspect : %s at line %i", asp, reader.line_num)
                # no association was built for this row; do not touch the
                # one left over from an earlier row
                continue

            assoc.add_evidence(eco_id)
            if sex is not None and sex != '':
                # written to the same graph as the rest of the association
                graph.addTriple(
                    assoc.get_association_id(),
                    self.globaltt['has_sex_specificty'],
                    self.globaltt[sex],
                    object_category=blv.terms['BiologicalSex']
                )

            # Publication
            # cut -f 5 phenotype.hpoa | grep ";" | tr ';' '\n' |
            #   cut -f1 -d ':' | sort | uniq -c | sort -nr
            # 629 PMID
            # 63 OMIM
            # 42 ISBN-13
            # 36 http

            for pub in publist.split(';'):
                pub = pub.strip()

                # there have been several malformed PMIDs
                if pub[:4] != 'http' and \
                        graph.curie_regexp.fullmatch(pub) is None:
                    LOG.warning(
                        'Record %s has a malformed Reference %s',
                        disease_id, pub)
                    continue

                pubtype = None

                if pub[:5] == 'PMID:':
                    pubtype = self.globaltt['journal article']
                elif pub[:4] == 'ISBN':
                    pubtype = self.globaltt['publication']
                elif pub[:5] == 'OMIM:':
                    pub = 'http://omim.org/entry/' + pub[5:]
                    pubtype = self.globaltt['web page']
                elif pub[:9] == 'DECIPHER:':
                    pubtype = self.globaltt['web page']
                elif pub[:6] == 'ORPHA:':
                    pubtype = self.globaltt['web page']
                elif pub[:4] == 'http':
                    pubtype = self.globaltt['web page']
                else:
                    LOG.error(
                        'Unknown pub type for disease %s from "%s"',
                        disease_id, pub)
                    continue

                if pub is not None:
                    assoc.add_source(pub)
                    if pubtype is not None:
                        ref = Reference(graph, pub, pubtype)
                        # ref.setTitle('');  ref.setYear()
                        ref.addRefToGraph()
            # TODO add curator

            # pprint.pprint(assoc)

            assoc.add_association_to_graph()

            if not self.test_mode and limit is not None \
                    and reader.line_num > limit:
                break
    return
def _process_phenotype_data(self, limit):
    """
    Parse the catalog file into strain, allele, gene and phenotype triples.

    NOTE: If a Strain carries more than one mutation,
    then each Mutation description,
    i.e., the set: (
        Mutation Type - Chromosome - Gene Symbol -
        Gene Name - Allele Symbol - Allele Name)
    will require a separate line.

    Note that MMRRC curates phenotypes to alleles,
    even though they distribute only one file with the phenotypes appearing
    to be associated with a strain.

    So, here we process the allele-to-phenotype relationships separately
    from the strain-to-allele relationships.

    The method works in two passes.  The first pass reads the catalog row
    by row, records the variants and genes seen for each strain in
    ``self.strain_hash`` (with labels in ``self.id_label_hash``), and adds
    the strain individual, publication references and allele-to-phenotype
    associations to the graph.  The second pass walks ``self.strain_hash``
    and builds a genotype (variant loci, VSLCs, GVC, unspecified background)
    for every strain.

    :param limit: stop reading once more than this many lines were read;
        ignored when None or when running in test mode
    :return: None
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    model = Model(g)
    line_counter = 0
    fname = '/'.join((self.rawdir, self.files['catalog']['file']))

    self.strain_hash = {}
    self.id_label_hash = {}
    genes_with_no_ids = set()
    stem_cell_class = 'CL:0000034'
    mouse_taxon = 'NCBITaxon:10090'
    geno = Genotype(g)

    with open(fname, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
        for row in filereader:
            line_counter += 1
            # skip the first 3 lines which are header, etc.
            if line_counter < 4:
                continue

            (strain_id, strain_label, strain_type_symbol, strain_state,
             mgi_allele_id, mgi_allele_symbol, mgi_allele_name,
             mutation_type, chrom, mgi_gene_id, mgi_gene_symbol,
             mgi_gene_name, sds_url, accepted_date, mp_ids, pubmed_nums,
             research_areas) = row

            # 'and' binds tighter than 'or'; the parentheses only make the
            # existing behaviour explicit: withdrawn genes are always skipped
            if (self.testMode and strain_id not in self.test_ids) \
                    or mgi_gene_name == 'withdrawn':
                continue

            # strip off stuff after the dash -
            # is the holding center important?
            # MMRRC:00001-UNC --> MMRRC:00001
            strain_id = re.sub(r'-\w+$', '', strain_id)

            self.id_label_hash[strain_id] = strain_label

            # get the variant or gene to save for later building of
            # the genotype
            if strain_id not in self.strain_hash:
                self.strain_hash[strain_id] = {
                    'variants': set(), 'genes': set()}

            # clean up the bad one
            if mgi_allele_id == 'multiple mutation':
                logger.error("Erroneous gene id: %s", mgi_allele_id)
                mgi_allele_id = ''

            if mgi_allele_id != '':
                self.strain_hash[strain_id]['variants'].add(mgi_allele_id)
                self.id_label_hash[mgi_allele_id] = mgi_allele_symbol

                # use the following if needing to add the
                # sequence alteration types
                # var_type =
                #   self._get_variant_type_from_abbrev(mutation_type)
                # make a sequence alteration for this variant locus,
                # and link the variation type to it
                # sa_id = '_'+re.sub(r':','',mgi_allele_id)+'SA'
                # if self.nobnodes:
                #     sa_id = ':'+sa_id
                # gu.addIndividualToGraph(g, sa_id, None, var_type)
                # geno.addSequenceAlterationToVariantLocus(sa_id,
                #                                          mgi_allele_id)

            # scrub out any spaces
            mgi_gene_id = re.sub(r'\s+', '', mgi_gene_id)
            if mgi_gene_id.strip() != '':
                if re.match(r'Gene\s*ID:', mgi_gene_id, re.I):
                    # the substitution must be case-insensitive as well,
                    # otherwise ids matched above in another case
                    # (e.g. "geneid:") would be left unconverted
                    mgi_gene_id = re.sub(
                        r'Gene\s*ID:\s*', 'NCBIGene:', mgi_gene_id,
                        flags=re.I)
                elif not re.match(r'MGI', mgi_gene_id):
                    logger.info("Gene id not recognized: %s", mgi_gene_id)
                    if re.match(r'\d+$', mgi_gene_id):
                        # assume that if it's all numbers, then it's MGI
                        mgi_gene_id = 'MGI:' + str(mgi_gene_id)
                        logger.info("Assuming numerics are MGI.")
                self.strain_hash[strain_id]['genes'].add(mgi_gene_id)
                self.id_label_hash[mgi_gene_id] = mgi_gene_symbol

            # catch some errors -
            # some things have gene labels, but no identifiers - report
            if mgi_gene_symbol.strip() != '' and mgi_gene_id == '':
                logger.error(
                    "Gene label with no identifier for strain %s: %s",
                    strain_id, mgi_gene_symbol)
                genes_with_no_ids.add(mgi_gene_symbol.strip())
                # make a temp id for genes that aren't identified
                # tmp_gene_id = '_'+mgi_gene_symbol
                # self.id_label_hash[tmp_gene_id] = mgi_gene_symbol
                # self.strain_hash[strain_id]['genes'].add(tmp_gene_id)

            # split apart the mp ids
            # ataxia [MP:0001393] ,hypoactivity [MP:0001402] ...
            # mp_ids are now a comma delimited list
            # with MP terms in brackets
            phenotype_ids = []
            if mp_ids != '':
                for mp_item in re.split(r',', mp_ids):
                    mps = re.search(r'\[(.*)\]', mp_item.strip())
                    if mps is not None:
                        phenotype_ids.append(mps.group(1).strip())

            # pubmed ids are space delimited; str.split() without an
            # argument drops leading/trailing whitespace, so no empty
            # token can turn into a bare 'PMID:' reference
            pubmed_ids = []
            for pubmed_num in pubmed_nums.split():
                pmid = 'PMID:' + pubmed_num
                pubmed_ids.append(pmid)
                r = Reference(
                    g, pmid, Reference.ref_types['journal_article'])
                r.addRefToGraph()

            # https://www.mmrrc.org/catalog/sds.php?mmrrc_id=00001
            # is a good example of 4 genotype parts

            model.addClassToGraph(mouse_taxon, None)
            if research_areas.strip() == '':
                research_areas = None
            else:
                research_areas = 'Research Areas: ' + research_areas
            strain_type = mouse_taxon
            if strain_state == 'ES':
                strain_type = stem_cell_class
            model.addIndividualToGraph(
                strain_id, strain_label, strain_type,
                research_areas)  # an inst of mouse??
            model.makeLeader(strain_id)

            # phenotypes are associated with the alleles
            for pid in phenotype_ids:
                # assume the phenotype label is in the ontology
                model.addClassToGraph(pid, None)
                if mgi_allele_id is not None and mgi_allele_id != '':
                    assoc = G2PAssoc(
                        g, self.name, mgi_allele_id, pid,
                        model.object_properties['has_phenotype'])
                    for p in pubmed_ids:
                        assoc.add_source(p)
                    assoc.add_association_to_graph()
                else:
                    logger.info("Phenotypes and no allele for %s", strain_id)

            if not self.testMode and (
                    limit is not None and line_counter > limit):
                break

    # now that we've collected all of the variant information, build it
    # we don't know their zygosities
    for s in self.strain_hash:
        h = self.strain_hash.get(s)
        variants = h['variants']
        genes = h['genes']
        vl_set = set()
        # make variant loci for each gene
        if len(variants) > 0:
            for v in variants:
                vl_id = v
                vl_symbol = self.id_label_hash[vl_id]
                geno.addAllele(
                    vl_id, vl_symbol, geno.genoparts['variant_locus'])
                vl_set.add(vl_id)
                if len(variants) == 1 and len(genes) == 1:
                    for gene in genes:
                        geno.addAlleleOfGene(vl_id, gene)
                else:
                    geno.addAllele(vl_id, vl_symbol)
        else:  # len(vars) == 0
            # it's just anonymous variants in some gene
            for gene in genes:
                vl_id = '_:' + re.sub(r':', '', gene) + '-VL'
                vl_symbol = self.id_label_hash[gene] + '<?>'
                self.id_label_hash[vl_id] = vl_symbol
                geno.addAllele(
                    vl_id, vl_symbol, geno.genoparts['variant_locus'])
                geno.addGene(gene, self.id_label_hash[gene])
                geno.addAlleleOfGene(vl_id, gene)
                vl_set.add(vl_id)

        # make the vslcs
        vl_list = sorted(vl_set)
        vslc_list = []
        for vl in vl_list:
            # for unknown zygosity
            vslc_id = re.sub(r'^_', '', vl) + 'U'
            vslc_id = re.sub(r':', '', vslc_id)
            vslc_id = '_:' + vslc_id
            vslc_label = self.id_label_hash[vl] + '/?'
            self.id_label_hash[vslc_id] = vslc_label
            vslc_list.append(vslc_id)
            geno.addPartsToVSLC(
                vslc_id, vl, None, geno.zygosity['indeterminate'],
                geno.object_properties['has_alternate_part'], None)
            model.addIndividualToGraph(
                vslc_id, vslc_label,
                geno.genoparts['variant_single_locus_complement'])

        if len(vslc_list) > 0:
            if len(vslc_list) > 1:
                gvc_id = '-'.join(vslc_list)
                gvc_id = re.sub(r'_|:', '', gvc_id)
                gvc_id = '_:' + gvc_id
                gvc_label = \
                    '; '.join(self.id_label_hash[v] for v in vslc_list)
                model.addIndividualToGraph(
                    gvc_id, gvc_label,
                    geno.genoparts['genomic_variation_complement'])
                for vslc_id in vslc_list:
                    geno.addVSLCtoParent(vslc_id, gvc_id)
            else:
                # the GVC == VSLC, so don't have to make an extra piece
                gvc_id = vslc_list.pop()
                gvc_label = self.id_label_hash[gvc_id]

            genotype_label = gvc_label + ' [n.s.]'
            bkgd_id = \
                re.sub(r':', '', '-'.join(
                    (geno.genoparts['unspecified_genomic_background'], s)))
            # the genotype id is built from the un-prefixed background id
            genotype_id = '-'.join((gvc_id, bkgd_id))
            bkgd_id = '_:' + bkgd_id
            geno.addTaxon(mouse_taxon, bkgd_id)
            geno.addGenomicBackground(
                bkgd_id, 'unspecified (' + s + ')',
                geno.genoparts['unspecified_genomic_background'],
                "A placeholder for the " +
                "unspecified genetic background for " + s)
            geno.addGenomicBackgroundToGenotype(
                bkgd_id, genotype_id,
                geno.genoparts['unspecified_genomic_background'])
            geno.addParts(
                gvc_id, genotype_id,
                geno.object_properties['has_alternate_part'])
            geno.addGenotype(genotype_id, genotype_label)
            g.addTriple(
                s, geno.object_properties['has_genotype'], genotype_id)
        else:
            # logger.debug(
            #   "Strain %s is not making a proper genotype.", s)
            pass

    logger.warning(
        "The following gene symbols did not list identifiers: %s",
        str(sorted(genes_with_no_ids)))

    return
def _process_QTLs_genomic_location(
        self, raw, taxon_id, build_id, build_label, limit=None):
    """
    Add QTLs, their trait associations and genomic locations to the graph.

    Reads a gzipped, tab-delimited, nine-column file; lines starting
    with '#' are skipped.  For every remaining row the attribute column is
    parsed into a dictionary, and then:
      * the QTL is added as an individual with its taxon,
      * the publication (if any) is added as a reference,
      * a QTL-to-trait association is added, carrying the evidence code,
        the publication and, when it parses as a number, the P-value,
      * the QTL feature is located on the chromosome of the given build.

    :param raw: path of the gzipped input file
    :param taxon_id: taxon curie the QTLs belong to
    :param build_id: identifier of the genome build
    :param build_label: label of the genome build
    :param limit: stop once more than this many lines were read;
        ignored when None or when running in test mode
    :return: None
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    model = Model(g)
    line_counter = 0
    geno = Genotype(g)
    # assume that chrs get added to the genome elsewhere
    # genome_id = geno.makeGenomeID(taxon_id)  # TODO unused

    eco_id = "ECO:0000061"  # Quantitative Trait Analysis Evidence
    logger.info("Processing QTL locations for %s", taxon_id)
    with gzip.open(raw, 'rt', encoding='ISO-8859-1') as tsvfile:
        reader = csv.reader(tsvfile, delimiter="\t")
        for row in reader:
            line_counter += 1
            if re.match(r'^#', ' '.join(row)):
                continue

            (chromosome, qtl_source, qtl_type, start_bp, stop_bp, frame,
             strand, score, attr) = row

            # Chr.Z   Animal QTLdb    Production_QTL  33954873  34023581 . . .
            # QTL_ID=2242;Name="Spleen percentage";Abbrev="SPLP";
            # PUBMED_ID=17012160;trait_ID=2234;trait="Spleen percentage";
            # breed="leghorn";"FlankMarkers=ADL0022";VTO_name="spleen mass";
            # CMO_name="spleen weight to body weight ratio";
            # Map_Type="Linkage";Model="Mendelian";
            # Test_Base="Chromosome-wise";Significance="Significant";
            # P-value="<0.05";F-Stat="5.52";Variance="2.94";
            # Dominance_Effect="-0.002";Additive_Effect="0.01"

            # make dictionary of attributes
            # keys are:
            # QTL_ID,Name,Abbrev,PUBMED_ID,trait_ID,trait,FlankMarkers,
            # VTO_name,Map_Type,Significance,P-value,Model,
            # Test_Base,Variance, Bayes-value,PTO_name,gene_IDsrc,peak_cM,
            # CMO_name,gene_ID,F-Stat,LOD-score,Additive_Effect,
            # Dominance_Effect,Likelihood_Ratio,LS-means,Breed,
            # trait (duplicate with Name)

            # deal with poorly formed attributes
            if re.search(r'"FlankMarkers";', attr):
                attr = re.sub(r'FlankMarkers;', '', attr)
            attr_items = re.sub(r'"', '', attr).split(";")
            # Items without '=' are malformed and dropped.  Splitting on
            # the first '=' only keeps values that themselves contain '='
            # from raising, and building from the ordered list makes
            # duplicate keys resolve deterministically (last one wins).
            attribute_dict = dict(
                item.split("=", 1) for item in attr_items if "=" in item)

            qtl_num = attribute_dict.get('QTL_ID')
            if self.testMode and int(qtl_num) not in self.test_ids:
                continue

            # make association between QTL and trait
            qtl_id = 'AQTL:' + str(qtl_num)
            model.addIndividualToGraph(qtl_id, None, geno.genoparts['QTL'])
            geno.addTaxon(taxon_id, qtl_id)

            trait_id = 'AQTLTrait:' + attribute_dict.get('trait_ID')

            # if pub is in attributes, add it to the association
            pub_id = None
            if 'PUBMED_ID' in attribute_dict:
                pub_id = attribute_dict.get('PUBMED_ID')
                if re.match(r'ISU.*', pub_id):
                    pub_id = 'AQTLPub:' + pub_id.strip()
                    reference = Reference(g, pub_id)
                else:
                    pub_id = 'PMID:' + pub_id.strip()
                    reference = Reference(
                        g, pub_id, Reference.ref_types['journal_article'])
                reference.addRefToGraph()

            # Add QTL to graph
            assoc = G2PAssoc(
                g, self.name, qtl_id, trait_id,
                model.object_properties['is_marker_for'])
            assoc.add_evidence(eco_id)
            # only attach a source when the row actually names a publication
            if pub_id is not None:
                assoc.add_source(pub_id)
            if 'P-value' in attribute_dict:
                p_text = re.sub(r'<', '', attribute_dict.get('P-value'))
                if ',' in p_text:
                    p_text = re.sub(r',', '.', p_text)
                # str.isnumeric() is False for anything with a decimal
                # point, so parse with float() and skip what does not parse
                try:
                    p_value = float(p_text)
                except ValueError:
                    p_value = None
                if p_value is not None:
                    assoc.set_score(p_value)

            assoc.add_association_to_graph()
            # TODO make association to breed
            # (which means making QTL feature in Breed background)

            # get location of QTL
            chromosome = re.sub(r'Chr\.', '', chromosome)
            chrom_id = makeChromID(chromosome, taxon_id, 'CHR')

            chrom_in_build_id = \
                makeChromID(chromosome, build_id, 'MONARCH')
            geno.addChromosomeInstance(
                chromosome, build_id, build_label, chrom_id)
            qtl_feature = Feature(g, qtl_id, None, geno.genoparts['QTL'])
            if start_bp == '':
                start_bp = None
            qtl_feature.addFeatureStartLocation(
                start_bp, chrom_in_build_id, strand,
                [Feature.types['FuzzyPosition']])
            if stop_bp == '':
                stop_bp = None
            qtl_feature.addFeatureEndLocation(
                stop_bp, chrom_in_build_id, strand,
                [Feature.types['FuzzyPosition']])
            qtl_feature.addTaxonToFeature(taxon_id)
            qtl_feature.addFeatureToGraph()

            if not self.testMode and \
                    limit is not None and line_counter > limit:
                break

    # NOTE(review): this warning is emitted unconditionally, whether or not
    # any malformed attribute was seen - confirm whether that is intended
    logger.warning("Bad attribute flags in this file")
    logger.info("Done with QTL genomic mappings for %s", taxon_id)
    return
def _process_allele_phenotype(self, limit):
    r"""
    Make allele to phenotype associations using derived_pheno_class
    and derived_pheno_manifest cvterm in the flybase db, an example entry is:

    FBal0257663    @FBcv0000351:lethal@ | @FBcv0000308:female limited@,
                   with @FBal0130657:Scer\GAL4<up>dome-PG14</up>@

    The first term is the phenotype, and all follow up terms are qualifiers,
    self.globaltt['has_qualifier'])

    Our previous approach was to use the genotype id associated with
    FBal0257663/FBal0130657 , however, this required us to create blank nodes
    and was considered unnecessarily granular

    Note that sometimes identifiers do not exist for a term, eg
    @:heat sensitive | tetracycline conditional@

    derived_pheno_class - FBcv terms, these are phenotypes
    derived_pheno_manifest - Anatomy terms FBbt, we currently
    make phenotype IRI equivalents that end up in UPheno, but
    this is being developed and updated, see
    https://github.com/monarch-initiative/dipper/issues/770

    Adds triples to self.graph

    (The docstring is a raw string because the example entry above
    contains a backslash that is not a valid escape sequence.)

    :param limit: number of rows to process
    :return: None
    :raises ValueError: when the first term id, or a non-empty qualifier
        id, cannot be parsed, or when the phenotype type is unexpected
    """
    model = Model(self.graph)
    src_key = 'allele_phenotype'
    raw = '/'.join((self.rawdir, self.queries[src_key]['file']))
    LOG.info("processing allele phenotype associations")
    col = self.queries[src_key]['columns']

    transgenic_alleles = self._get_foreign_transgenic_alleles()

    # flybase terms - terms we prefix with FlyBase:
    fly_prefixes = ['FBal', 'FBti', 'FBab', 'FBba', 'FBtp']

    # a alphanumeric id followed by a colon then
    # any character but a colon bordered by @s
    term_regex = re.compile(r'@([\w]*):([^:@]*)@')
    id_regex = re.compile(r'([a-zA-Z]+)(\d+)')

    with open(raw, 'r') as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        row = next(reader)  # headers
        self.check_fileheader(col, row)

        for row in reader:
            allele_id = row[col.index('allele_id')]
            pheno_desc = row[col.index('pheno_desc')]
            pheno_type = row[col.index('pheno_type')]
            pub_id = row[col.index('pub_id')]
            pub_title = row[col.index('pub_title')]
            pmid_id = row[col.index('pmid_id')]

            # Don't get phenotypes for transgenic alleles
            if allele_id in transgenic_alleles:
                continue

            allele_curie = 'FlyBase:' + allele_id

            terms = term_regex.findall(pheno_desc)

            if not terms:
                LOG.warning(
                    'Could not parse pheno description: %s', pheno_desc)
                continue

            term_ids, term_labels = zip(*terms)
            id_match = id_regex.match(term_ids[0])

            if id_match is not None:
                # local_id is the numeric part of the term id
                prefix, local_id = id_match.group(1, 2)
            else:
                raise ValueError("Could not parse id {}".format(
                    term_ids[0]))

            # derived_pheno_class should all start with a FBcv term
            if pheno_type == 'derived_pheno_class' and prefix != 'FBcv':
                LOG.warning(
                    'derived_pheno_class does not '
                    'start with FBcv: %s', pheno_desc)
                continue

            # Create phenotype curie
            if pheno_type == 'derived_pheno_class':
                phenotype_curie = prefix + ':' + local_id
            elif pheno_type == 'derived_pheno_manifest':
                if prefix == 'GO':
                    phenotype_curie = prefix + ':' + local_id + 'PHENOTYPE'
                    phenotype_label = term_labels[0] + ' phenotype'
                    model.addClassToGraph(phenotype_curie, phenotype_label)
                else:
                    phenotype_curie = \
                        'OBO:' + prefix + local_id + 'PHENOTYPE'
            else:
                raise ValueError(
                    "Unexpected phenotype type: {}".format(pheno_type))

            # prefer the PubMed id; fall back to the FlyBase publication id
            if pmid_id:
                ref_curie = 'PMID:' + pmid_id
            else:
                ref_curie = 'FlyBase:' + pub_id
                reference = Reference(self.graph, ref_curie)
                reference.setTitle(pub_title)
                reference.addRefToGraph()

            assoc = G2PAssoc(
                self.graph, self.name, allele_curie, phenotype_curie,
                self.globaltt['has phenotype'])
            assoc.add_source(ref_curie)
            # Associations need to be disambiguated via their qualifiers
            # see http://flybase.org/reports/FBal0207398 as an example
            assoc.set_association_id(
                assoc.make_association_id(
                    self.name, allele_curie,
                    self.globaltt['has phenotype'],
                    phenotype_curie, term_ids[1:]))
            assoc.add_association_to_graph()
            assoc_id = assoc.get_association_id()

            # add the rest as qualifiers
            for term in term_ids[1:]:
                if term:
                    # FBal, GO, FBti, FBab ...
                    id_match = id_regex.match(term)
                    if id_match is not None:
                        prefix, local_id = id_match.group(1, 2)
                        if prefix in fly_prefixes:
                            term_curie = 'FlyBase:' + term
                        else:
                            term_curie = prefix + ':' + local_id
                    else:
                        raise ValueError(
                            "Could not parse id {}".format(term))
                else:
                    # There is not an id for a term,
                    # eg @:heat sensitive | tetracycline conditional@
                    continue

                self.graph.addTriple(
                    assoc_id, self.globaltt['has_qualifier'], term_curie)

            if limit is not None and reader.line_num > limit:
                break
def process_nbk_html(self, limit):
    """
    Here we process the gene reviews books to fetch
    the clinical descriptions to include in the ontology.
    We only use books that have been acquired manually,
    as NCBI Bookshelf does not permit automated downloads.
    This parser will only process the books that are found in
    the ```raw/genereviews/books``` directory,
    permitting partial completion.

    For each book found locally, the clinical summary (first paragraph
    plus any list items) is added as the definition of the book, and each
    PubMed link in the "Literature Cited" section is added as a journal
    article reference that "is about" the book.

    :param limit: stop once more than this many books were visited;
        no limit when None
    :return: None
    """
    model = Model(self.graph)
    c = 0
    books_not_found = set()
    book_dir = '/'.join((self.rawdir, 'books'))
    # directory listing is read lazily on the first book, then reused
    book_files = None
    for nbk in self.book_ids:
        c += 1
        nbk_id = 'GeneReviews:' + nbk
        book_item = self.all_books.get(nbk)
        url = '/'.join((self.rawdir, book_item['file']))

        # figure out if the book is there; if so, process, otherwise skip
        if book_files is None:
            book_files = set(os.listdir(book_dir))
        if ''.join((nbk, '.html')) not in book_files:
            # logger.warning("No book found locally for %s; skipping", nbk)
            books_not_found.add(nbk)
            continue
        logger.info("Processing %s", nbk)

        # the context manager closes the file handle once it is parsed
        with open(url) as page:
            soup = BeautifulSoup(page.read())

        # sec0 == clinical description
        clin_summary = \
            soup.find('div', id=re.compile(".*Summary.sec0"))
        if clin_summary is not None:
            p = clin_summary.find('p')
            ptext = p.text
            ptext = re.sub(r'\s+', ' ', ptext)

            ul = clin_summary.find('ul')
            if ul is not None:
                item_text = [
                    re.sub(r'\s+', ' ', li.text)
                    for li in ul.find_all('li')]
                ptext += ' '.join(item_text)

            # add in the copyright and citation info to description
            ptext = \
                ' '.join(
                    (ptext,
                     '[GeneReviews:NBK1116, GeneReviews:NBK138602, ' +
                     nbk_id+']'))

            model.addDefinition(nbk_id, ptext.strip())

        # get the pubs
        pub_div = soup.find('div', id=re.compile(r".*Literature_Cited"))
        if pub_div is not None:
            ref_list = pub_div.find_all('div', attrs={'class': "bk_ref"})
            for r in ref_list:
                for a in r.find_all(
                        'a', attrs={'href': re.compile(r"pubmed")}):
                    if re.match(r'PubMed:', a.text):
                        pmnum = re.sub(r'PubMed:\s*', '', a.text)
                    else:
                        # a link may mention pubmed without ending in
                        # /pubmed/<digits>; leave pmnum as None then so
                        # the guard below skips it instead of crashing
                        href_match = re.search(
                            r'\/pubmed\/(\d+)$', a['href'])
                        pmnum = None
                        if href_match is not None:
                            pmnum = href_match.group(1)
                    if pmnum is not None:
                        pmid = 'PMID:' + str(pmnum)
                        self.graph.addTriple(
                            pmid,
                            model.object_properties['is_about'],
                            nbk_id)
                        reference = Reference(
                            self.graph,
                            pmid, Reference.ref_types['journal_article'])
                        reference.addRefToGraph()

        # TODO add author history, copyright, license to dataset

        # TODO get PMID-NBKID equivalence (near foot of page),
        # and make it "is about" link
        # self.gu.addTriple(
        #   self.graph, pmid,
        #   self.gu.object_properties['is_about'], nbk_id)
        # for example: NBK1191 PMID:20301370

        # add the book to the dataset
        self.dataset.setFileAccessUrl(book_item['url'])

        if limit is not None and c > limit:
            break

        # finish looping through books

    missing_count = len(books_not_found)
    if missing_count > 0:
        if missing_count > 100:
            logger.warning("There were %d books not found.", missing_count)
        else:
            logger.warning(
                "The following %d books were not found locally: %s",
                missing_count, str(books_not_found))
    logger.info(
        "Finished processing %d books for clinical descriptions",
        c - missing_count)

    return
def parse(self, limit=None):
    """
    Read the cleaned gene-to-phenotype file and add gene-to-phenotype
    associations to self.graph.

    Each row must have exactly 25 tab-separated columns.  Rows whose
    modifier is not "abnormal" are skipped with a warning.  For the others
    the entity/quality terms are mapped to a phenotype id via the ZFIN
    parser's ZP mapping; when a phenotype id is found, an association
    between the gene and that phenotype is added, with the publication
    (if given) as source.

    :param limit: stop once more than this many lines were read;
        no limit when None
    :return: None
    """
    zfin_parser = ZFIN(self.graph_type, self.are_bnodes_skized)
    model = Model(self.graph)
    zp_file = '/'.join((self.rawdir, self.files['zpmap']['file']))
    g2p_file = '/'.join((self.rawdir, self.files['g2p_clean']['file']))
    zfin_parser.zp_map = zfin_parser._load_zp_mappings(zp_file)

    with open(g2p_file, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            # unpacking doubles as a check on the column count
            (internal_id, symbol, gene_id, subterm1_id, subterm1_label,
             pc_rel_id, pc_rel_label, superterm1_id, superterm1_label,
             quality_id, quality_name, modifier, subterm2_id,
             subterm2_label, pc_rel2_id, pc_rel2_label, superterm2_id,
             superterm2_label, fish_id, fish_label, start_stage,
             end_stage, environment, pub_id, figure_id) = row

            if modifier != "abnormal":
                LOG.warning(
                    "skipping phenotype with modifier != abnormal: %s",
                    modifier)
                continue

            zp_id = zfin_parser._map_octuple_to_phenotype(
                subterm1_id, pc_rel_id, superterm1_id, quality_id,
                subterm2_id, pc_rel2_id, superterm2_id, modifier)

            gene_curie = "ZFIN:{0}".format(gene_id)
            model.makeLeader(gene_curie)
            pub_curie = "ZFIN:{0}".format(pub_id)
            if zp_id:
                assoc = G2PAssoc(self.graph, self.name, gene_curie, zp_id)
                if pub_id:
                    reference = Reference(
                        self.graph, pub_curie, self.globaltt['document'])
                    reference.addRefToGraph()
                    assoc.add_source(pub_curie)

                assoc.add_evidence(
                    self.globaltt['experimental phenotypic evidence'])
                assoc.add_association_to_graph()

            # honour the caller-supplied row limit (previously ignored)
            if limit is not None and filereader.line_num > limit:
                break
def _process_qtls_genetic_location(
        self, raw, txid, common_name, limit=None):
    """
    Add QTLs with their genetic-map (cM) locations, linked genes,
    peak markers and trait associations to the graph.

    Reads a tab-delimited file with exactly 32 columns per row.
    For each row:
      * a QTL feature is created and placed on the chromosome of a
        per-species "genetic map" build, using the cM range if it is well
        formed, otherwise the cM position,
      * a peak marker starting with "rs" is added as a dbSNP sequence
        alteration and cross-referenced to the QTL,
      * a gene id recognised as an NCBI gene is linked to the QTL
        (and to the peak marker, when present),
      * the trait class, the publication reference and the QTL-to-trait
        association (plus a marker-to-trait association when a peak
        marker exists) are added.

    :param raw: path of the input file
    :param txid: taxon number, appended to 'NCBITaxon:'
    :param common_name: species name used to build QTL and build ids
    :param limit: stop once more than this many lines were read;
        ignored when None or when running in test mode
    :return: None
    """
    if self.testMode:
        graph = self.testgraph
    else:
        graph = self.graph
    line_counter = 0
    geno = Genotype(graph)
    model = Model(graph)
    eco_id = self.globaltt['quantitative trait analysis evidence']

    taxon_curie = 'NCBITaxon:' + txid

    LOG.info("Processing genetic location for %s from %s", taxon_curie, raw)
    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            (qtl_id, qtl_symbol, trait_name, assotype, empty, chromosome,
             position_cm, range_cm, flankmark_a2, flankmark_a1, peak_mark,
             flankmark_b1, flankmark_b2, exp_id, model_id, test_base,
             sig_level, lod_score, ls_mean, p_values, f_statistics,
             variance, bayes_value, likelihood_ratio, trait_id, dom_effect,
             add_effect, pubmed_id, gene_id, gene_id_src, gene_id_type,
             empty2) = row

            if self.testMode and int(qtl_id) not in self.test_ids:
                continue

            qtl_id = common_name + 'QTL:' + qtl_id.strip()
            trait_id = 'AQTLTrait:' + trait_id.strip()

            # Add QTL to graph
            feature = Feature(
                graph, qtl_id, qtl_symbol, self.globaltt['QTL'])
            feature.addTaxonToFeature(taxon_curie)

            # deal with the chromosome
            chrom_id = makeChromID(chromosome, taxon_curie, 'CHR')

            # add a version of the chromosome which is defined as
            # the genetic map
            build_id = 'MONARCH:'+common_name.strip()+'-linkage'
            build_label = common_name+' genetic map'
            geno.addReferenceGenome(build_id, build_label, taxon_curie)
            chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH')
            geno.addChromosomeInstance(
                chromosome, build_id, build_label, chrom_id)
            start = stop = None
            # range_cm sometimes ends in "(Mb)"  (i.e pig 2016 Nov)
            range_mb = re.split(r'\(', range_cm)
            if range_mb is not None:
                range_cm = range_mb[0]

            if re.search(r'[0-9].*-.*[0-9]', range_cm):
                range_parts = re.split(r'-', range_cm)

                # check for poorly formed ranges
                if len(range_parts) == 2 and\
                        range_parts[0] != '' and range_parts[1] != '':
                    (start, stop) = [
                        int(float(x.strip())) for x in range_parts]
                else:
                    LOG.info(
                        "A cM range we can't handle for QTL %s: %s",
                        qtl_id, range_cm)
            elif position_cm != '':
                match = re.match(r'([0-9]*\.[0-9]*)', position_cm)
                if match is not None:
                    position_cm = match.group()
                    start = stop = int(float(position_cm))

            # FIXME remove converion to int for start/stop
            # when schema can handle floats add in the genetic location
            # based on the range
            feature.addFeatureStartLocation(
                start, chrom_in_build_id, None,
                [self.globaltt['FuzzyPosition']])
            feature.addFeatureEndLocation(
                stop, chrom_in_build_id, None,
                [self.globaltt['FuzzyPosition']])
            feature.addFeatureToGraph()

            # sometimes there's a peak marker, like a rsid.
            # we want to add that as a variant of the gene,
            # and xref it to the qtl.
            dbsnp_id = None
            if peak_mark != '' and peak_mark != '.' and \
                    re.match(r'rs', peak_mark.strip()):
                dbsnp_id = 'dbSNP:'+peak_mark.strip()

                model.addIndividualToGraph(
                    dbsnp_id, None,
                    self.globaltt['sequence_alteration'])
                model.addXref(qtl_id, dbsnp_id)

            gene_id = gene_id.replace('uncharacterized ', '').strip()
            if gene_id is not None and gene_id != '' and gene_id != '.'\
                    and re.fullmatch(r'[^ ]*', gene_id) is not None:

                # we assume if no src is provided and gene_id is an integer,
                # then it is an NCBI gene ...
                # (okay, lets crank that back a notch)
                if gene_id_src == '' and gene_id.isdigit() and \
                        gene_id in self.gene_info:
                    # LOG.info(
                    #    'Warm & Fuzzy saying %s is a NCBI gene for %s',
                    #    gene_id, common_name)
                    gene_id_src = 'NCBIgene'
                elif gene_id_src == '' and gene_id.isdigit():
                    LOG.warning(
                        'Cold & Prickely saying %s is a NCBI gene for %s',
                        gene_id, common_name)
                    gene_id_src = 'NCBIgene'
                elif gene_id_src == '':
                    LOG.error(
                        ' "%s" is a NOT NCBI gene for %s',
                        gene_id, common_name)
                    gene_id_src = None

                if gene_id_src == 'NCBIgene':
                    gene_id = 'NCBIGene:' + gene_id
                    # we will expect that these will get labels elsewhere
                    geno.addGene(gene_id, None)
                    # FIXME what is the right relationship here?
                    geno.addAffectedLocus(qtl_id, gene_id)

                    if dbsnp_id is not None:
                        # add the rsid as a seq alt of the gene_id
                        vl_id = '_:' + re.sub(
                            r':', '', gene_id) + '-' + peak_mark.strip()
                        geno.addSequenceAlterationToVariantLocus(
                            dbsnp_id, vl_id)
                        geno.addAffectedLocus(vl_id, gene_id)

            # add the trait
            model.addClassToGraph(trait_id, trait_name)

            # Add publication.  pub_id is reset for every row so that a row
            # without a publication neither fails on an unbound name nor
            # silently reuses the publication of an earlier row.
            pub_id = None
            reference = None
            if re.match(r'ISU.*', pubmed_id):
                pub_id = 'AQTLPub:'+pubmed_id.strip()
                reference = Reference(graph, pub_id)
            elif pubmed_id != '':
                pub_id = 'PMID:' + pubmed_id.strip()
                reference = Reference(
                    graph, pub_id, self.globaltt['journal article'])

            if reference is not None:
                reference.addRefToGraph()

            # Parse the P-value once; it is attached to both associations.
            # str.isnumeric() is False for anything with a decimal point,
            # so parse with float() and skip what does not parse.
            score = None
            if p_values != '':
                scr = re.sub(r'<', '', p_values)
                scr = re.sub(r',', '.', scr)  # international notation
                try:
                    score = float(scr)
                except ValueError:
                    score = None

            # make the association to the QTL
            assoc = G2PAssoc(
                graph, self.name, qtl_id, trait_id,
                self.globaltt['is marker for'])
            assoc.add_evidence(eco_id)
            if pub_id is not None:
                assoc.add_source(pub_id)

            # create a description from the contents of the file
            # desc = ''

            # assoc.addDescription(g, assoc_id, desc)

            # TODO add exp_id as evidence
            # if exp_id != '':
            #     exp_id = 'AQTLExp:'+exp_id
            #     gu.addIndividualToGraph(g, exp_id, None, eco_id)

            if score is not None:
                assoc.set_score(score)  # todo add score type
            # TODO add LOD score?
            assoc.add_association_to_graph()

            # make the association to the dbsnp_id, if found
            if dbsnp_id is not None:
                # make the association to the dbsnp_id
                assoc = G2PAssoc(
                    graph, self.name, dbsnp_id, trait_id,
                    self.globaltt['is marker for'])
                assoc.add_evidence(eco_id)
                if pub_id is not None:
                    assoc.add_source(pub_id)

                # TODO add exp_id
                # if exp_id != '':
                #     exp_id = 'AQTLExp:'+exp_id
                #     gu.addIndividualToGraph(g, exp_id, None, eco_id)

                if score is not None:
                    assoc.set_score(score)  # todo add score type
                # TODO add LOD score?

                assoc.add_association_to_graph()

            if not self.testMode and \
                    limit is not None and line_counter > limit:
                break

    LOG.info("Done with QTL genetic info")
    return
def _process_phenotype_tab(self, raw, limit):
    """
    Add disease-to-phenotype (or disease disposition) associations
    from a tab-delimited annotation file.

    see info on format here:
    http://www.human-phenotype-ontology.org/contao/index.php/annotation-guide.html

    Each row must have exactly 14 columns.  The aspect column selects the
    association type: 'O' and 'M' make a D2PAssoc (with onset and
    frequency), 'I' and 'C' make a DispositionAssoc.  Rows with any other
    aspect are logged and skipped.  Every publication listed on the row is
    attached to the association as a source; OMIM, Orphanet and DECIPHER
    ids are first rewritten to web page URLs.

    :param raw: path of the input file
    :param limit: stop once more than this many lines were read;
        ignored when None or when running in test mode
    :return: None
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    model = Model(g)
    line_counter = 0
    with open(raw, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            row = [str(col).strip() for col in row]
            (db, num, name, qual, pheno_id, publist, eco, onset, freq, w,
             asp, syn, date, curator) = row
            disease_id = db + ":" + num

            if self.testMode:
                try:
                    id_list = self.test_ids
                    if id_list is None \
                            or disease_id not in id_list:
                        continue
                except AttributeError:
                    continue

            # logger.info('adding %s', disease_id)

            model.addClassToGraph(disease_id, None)
            model.addClassToGraph(pheno_id, None)
            eco_id = self._map_evidence_to_codes(eco)
            model.addClassToGraph(eco_id, None)
            if onset is not None and onset != '':
                model.addClassToGraph(onset, None)

            # we want to do things differently depending on
            # the aspect of the annotation
            # TODO PYLINT Redefinition of assoc type from
            #   dipper.models.assoc.D2PAssoc.D2PAssoc to
            #   dipper.models.assoc.DispositionAssoc.DispositionAssoc
            if asp == 'O' or asp == 'M':  # organ abnormality or mortality
                assoc = D2PAssoc(
                    g, self.name, disease_id, pheno_id, onset, freq)
            elif asp == 'I':  # inheritance patterns for the whole disease
                assoc = DispositionAssoc(
                    g, self.name, disease_id, pheno_id)
            elif asp == 'C':  # clinical course / onset
                assoc = DispositionAssoc(
                    g, self.name, disease_id, pheno_id)
            else:
                logger.error("I don't know what this aspect is: %s", asp)
                # No association was built for this row.  Falling through
                # would hit an unbound name on the first row, or re-use
                # (and re-add) the previous row's association.
                continue

            assoc.add_evidence(eco_id)

            publist = re.split(r'[,;]', publist)
            # blow these apart if there is a list of pubs
            for pub in publist:
                pub = pub.strip()
                pubtype = None
                if pub != '':
                    # if re.match(
                    #       r'http://www.ncbi.nlm.nih.gov/bookshelf/br\.fcgi\?book=gene',
                    #        pub):
                    #     #http://www.ncbi.nlm.nih.gov/bookshelf/br.fcgi?book=gene&part=ced
                    #     m = re.search(r'part\=(\w+)', pub)
                    #     pub_id = 'GeneReviews:'+m.group(1)
                    # elif re.search(
                    #        r'http://www.orpha.net/consor/cgi-bin/OC_Exp\.php\?lng\=en\&Expert\=',
                    #        pub):
                    #     m = re.search(r'Expert=(\d+)', pub)
                    #     pub_id = 'Orphanet:'+m.group(1)

                    if re.match(r'(PMID|ISBN-13|ISBN-10|ISBN|HPO)', pub):
                        if re.match(r'PMID', pub):
                            pubtype = \
                                Reference.ref_types['journal_article']
                        elif re.match(r'HPO', pub):
                            pubtype = Reference.ref_types['person']
                        else:
                            pubtype = Reference.ref_types['publication']
                        r = Reference(g, pub, pubtype)
                        r.addRefToGraph()
                    elif re.match(r'(OMIM|Orphanet|DECIPHER)', pub):
                        # make the pubs a reference to the website,
                        # instead of the curie
                        if re.match(r'OMIM', pub):
                            omimnum = re.sub(r'OMIM:', '', pub)
                            omimurl = '/'.join(
                                ('http://omim.org/entry',
                                 str(omimnum).strip()))
                            pub = omimurl
                        elif re.match(r'Orphanet:', pub):
                            orphanetnum = re.sub(r'Orphanet:', '', pub)
                            orphaneturl = \
                                ''.join((
                                    'http://www.orpha.net/consor/cgi-bin/OC_Exp.php?lng=en&Expert=',
                                    str(orphanetnum)))
                            pub = orphaneturl
                        elif re.match(r'DECIPHER:', pub):
                            deciphernum = re.sub(r'DECIPHER:', '', pub)
                            decipherurl = '/'.join(
                                ('https://decipher.sanger.ac.uk/syndrome',
                                 deciphernum))
                            pub = decipherurl
                        pubtype = Reference.ref_types['webpage']
                    elif re.match(r'http', pub):
                        pass
                    else:
                        logger.error(
                            'Unknown pub type for %s: %s', disease_id, pub)
                        # keep the full list available for diagnosis, but
                        # through the logger rather than stdout
                        logger.debug(
                            '%s pubs: %s', disease_id, str(publist))
                        continue

                    if pub is not None:
                        assoc.add_source(pub)

                    # TODO add curator

            assoc.add_association_to_graph()

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

    return
def process_allele_phenotype(self, limit=None):
    """
    Add variant-to-phenotype associations from the GAF-style
    'allele_pheno' file.

    This file compactly lists variant to phenotype associations,
    such that in a single row, there may be >1 variant listed per phenotype
    and paper.  This indicates that each variant is individually
    associated with the given phenotype, as listed in 1+ papers.
    (Not that the combination of variants is producing the phenotype.)

    Rows whose qualifier is 'NOT' are skipped, as are comment rows
    (starting with '!'), blank rows, and rows for which no allele column
    can be determined.

    :param limit: stop once the reader has passed this many lines;
        None disables the limit
    :return: None
    """
    src_key = 'allele_pheno'
    raw = '/'.join((self.rawdir, self.files[src_key]['file']))
    col = self.files[src_key]['columns']

    graph = self.graph
    model = Model(self.graph)

    LOG.info("Processing: %s", self.files[src_key]['file'])
    geno = Genotype(graph)

    # Resolve the column positions once instead of on every row.
    gene_num_idx = col.index('DB Object ID')
    qualifier_idx = col.index('Qualifier')
    phenotype_idx = col.index('GO ID')
    ref_idx = col.index('DB:Reference (|DB:Reference)')
    eco_idx = col.index('Evidence Code')
    with_idx = col.index('With (or) From')

    with open(raw, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        # An empty file yields no first row; treat that like a bad header
        # instead of letting StopIteration escape.
        row = next(reader, None)
        if not row or row[0] != '!gaf-version: 2.0':
            LOG.error('Not a vlaid gaf v2.0 formatted file: %s', raw)
            # raise
        for row in reader:
            # csv.reader yields [] for blank lines; skip those and comments
            if not row or row[0].startswith('!'):
                continue

            gene_num = row[gene_num_idx]
            is_not = row[qualifier_idx]
            phenotype_id = row[phenotype_idx]
            ref = row[ref_idx].strip()
            eco_symbol = row[eco_idx].strip()
            with_or_from = row[with_idx]

            # TODO add NOT phenotypes
            if is_not == 'NOT':
                continue

            eco_curie = None
            if eco_symbol != '' and eco_symbol in self.gaf_eco:
                eco_curie = self.gaf_eco[eco_symbol]
            else:
                LOG.warning(
                    'Evidence code %s is not found in the (gaf) gaf_eco',
                    eco_symbol)

            # according to the GOA spec, persons are not allowed to be
            # in the reference column, therefore the variant and
            # persons are swapped between the reference and with column.
            # we unswitch them here.
            temp_var = temp_ref = None
            if re.search(r'WBVar|WBRNAi', ref):
                temp_var = ref
            # move the paper from the with column into the ref
            if re.search(r'WBPerson', with_or_from):
                temp_ref = with_or_from
            if temp_var is not None or temp_ref is not None:
                with_or_from = temp_var
                ref = temp_ref

            # After the swap there may be no variant at all (only a person
            # matched); splitting None would raise TypeError.
            if with_or_from is None:
                LOG.error(
                    "Missing alleles from phenotype assoc at line %d",
                    reader.line_num)
                continue

            # Normalising the reference prefix is idempotent, so do it once
            # per row rather than once per allele.
            if ref is not None and ref != '':
                ref = re.sub(r'(WB:|WB_REF:)', 'WormBase:', ref)

            gene_id = 'WormBase:' + gene_num

            for allele in re.split(r'\|', with_or_from):
                allele_num = re.sub(r'WB:', '', allele.strip())
                allele_id = 'WormBase:' + allele_num

                if re.search(r'WBRNAi', allele_id):
                    # The RNAi id itself is typed as a reagent targeted
                    # gene and the phenotype is attached to it directly
                    # (simpler than the earlier blank-node model).
                    model.addIndividualToGraph(
                        allele_id, None,
                        self.globaltt['reagent_targeted_gene'])
                    self.graph.addTriple(
                        allele_id,
                        self.globaltt['is_expression_variant_of'],
                        gene_id)
                elif re.search(r'WBVar', allele_id):
                    # the WBVars are really sequence alterations;
                    # the public name will come from elsewhere.
                    # The variant is treated like an allele of the gene.
                    geno.addSequenceAlteration(allele_id, None)
                    geno.addAlleleOfGene(allele_id, gene_id)
                else:
                    LOG.warning(
                        "Some kind of allele I don't recognize: %s",
                        allele_num)
                    continue

                assoc = G2PAssoc(graph, self.name, allele_id, phenotype_id)

                if eco_curie is not None:
                    assoc.add_evidence(eco_curie)

                if ref is not None and ref != '':
                    reference = Reference(graph, ref)
                    if re.search(r'Person', ref):
                        reference.setType(self.globaltt['person'])
                        assoc.add_evidence(self.globaltt[
                            'inference from background scientific knowledge'
                        ])
                    reference.addRefToGraph()
                    assoc.add_source(ref)

                assoc.add_association_to_graph()

            # finished looping through all alleles of this row
            if limit is not None and reader.line_num > limit:
                break
def make_association(self, record):
    """
    Construct the gene-to-phenotype association for one record.

    The record is annotated in place with two derived keys:
    'experiment_type' (a list of {'id', 'term'} dicts looked up in
    self.apo_term_id) and 'pheno_obj' (entity / quality terms with their
    APO ids).  The association, its labels, its reference(s) and its
    evidence are then added to self.graph.

    :param record: dict for one row; the keys read here are
        'Experiment Type', 'Phenotype', 'SGDID', 'Gene Name' and 'Reference'
    :return: None
    """
    # ---- prepare the record ------------------------------------------
    # drop the parenthesised description, then map every listed
    # experiment type to its APO term
    type_names = record['Experiment Type'].split('(')[0].split(',')
    record['experiment_type'] = []
    for raw_name in type_names:
        type_name = raw_name.strip()
        record['experiment_type'].append({
            'id': self.apo_term_id[type_name],
            'term': type_name,
        })

    phenotype = record['Phenotype']
    entity_part = {'term': None, 'apo_id': None}
    quality_part = {'term': None, 'apo_id': None}
    pheno_obj = {
        'entity': entity_part,
        'quality': quality_part,
        # descriptive and don't bother looking for a quality
        'has_quality': False,
    }

    if ':' in phenotype:
        # "<entity>: <quality>"
        pheno_obj['has_quality'] = True
        pieces = phenotype.split(': ')
        entity_term = pieces[0]
        quality_term = pieces[1]
        entity_part['term'] = entity_term
        entity_part['apo_id'] = self.apo_term_id[entity_term]
        quality_part['term'] = quality_term
        quality_part['apo_id'] = self.apo_term_id[quality_term]
    else:
        entity_part['term'] = phenotype
        entity_part['apo_id'] = self.apo_term_id[phenotype]
    record['pheno_obj'] = pheno_obj

    # ---- begin modeling ----------------------------------------------
    model = Model(self.graph)

    # define the triple
    gene = 'SGD:{}'.format(record['SGDID'])
    relation = self.globaltt['has phenotype']

    stored = record['pheno_obj']
    if stored['has_quality']:
        # entity + quality are fused into one MONARCH identifier
        pheno_label = '{0}:{1}'.format(
            stored['entity']['term'], stored['quality']['term'])
        pheno_id = 'MONARCH:{0}{1}'.format(
            stored['entity']['apo_id'].replace(':', '_'),
            stored['quality']['apo_id'].replace(':', '_'))
    else:
        pheno_label = stored['entity']['term']
        pheno_id = stored['entity']['apo_id']

    g2p_assoc = Assoc(
        self.graph, self.name, sub=gene, obj=pheno_id, pred=relation)

    assoc_id = g2p_assoc.make_association_id(
        'yeastgenome.org', gene, relation, pheno_id)
    g2p_assoc.set_association_id(assoc_id=assoc_id)

    # add to graph to mint assoc id
    g2p_assoc.add_association_to_graph()

    model.addLabel(subject_id=gene, label=record['Gene Name'])

    # add the association triple
    model.addTriple(subject_id=gene, predicate_id=relation, obj=pheno_id)
    model.addTriple(
        subject_id=pheno_id,
        predicate_id=self.globaltt['subclass_of'],
        obj=self.globaltt['Phenotype'])

    # label the phenotype node
    model.addLabel(subject_id=pheno_id, label=pheno_label)

    g2p_assoc.description = self._make_description(record)

    # ---- references ----------------------------------------------------
    # created Ref prefix in curie map to route to proper reference URL in SGD
    references = record['Reference'].replace(' ', '').split('|')
    if references:
        # the first ref in the list is the source
        first_ref = references[0]
        g2p_assoc.add_source(identifier=first_ref)
        ref_model = Reference(
            self.graph, first_ref, self.globaltt['publication'])
        ref_model.addRefToGraph()

    # any further refs are declared equivalent to the first one
    for other_ref in references[1:]:
        model.addSameIndividual(sub=references[0], obj=other_ref)

    # ---- evidence ------------------------------------------------------
    # add experiment type as evidence
    for exp_type in record['experiment_type']:
        g2p_assoc.add_evidence(exp_type['id'])
        model.addLabel(subject_id=exp_type['id'], label=exp_type['term'])

    try:
        g2p_assoc.add_association_to_graph()
    except Exception as e:
        print(e)
    return
def _process_qtls_genomic_location(
        self, raw, txid, build_id, build_label, common_name, limit=None):
    """
    Read a gzipped, tab-delimited QTL location file and add each QTL to
    the graph.

    For every non-comment row this adds: the QTL individual and its taxon,
    an optional publication reference, a QTL-to-trait association
    ('is marker for') with evidence and, when parseable, a P-value score,
    and the QTL feature with fuzzy start/end positions on the chromosome
    of the given build.

    An example row (attribute column abbreviated):
        Chr.Z  Animal QTLdb  Production_QTL  33954873  34023581 ...
        QTL_ID=2242;Name="Spleen percentage";Abbrev="SPLP";
        PUBMED_ID=17012160;trait_ID=2234;trait="Spleen percentage";
        ...;P-value="<0.05";F-Stat="5.52";...

    :param raw: path of the gzipped input file
    :param txid: NCBI taxon number (string, without the 'NCBITaxon:' prefix)
    :param build_id: identifier of the genome build
    :param build_label: label of the genome build
    :param common_name: prefix used when building the QTL curie
    :param limit: outside of test mode, stop once more than this many rows
        have been read; None disables the limit
    :return: None
    """
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)
    line_counter = 0
    geno = Genotype(graph)
    # assume that chrs get added to the genome elsewhere

    taxon_curie = 'NCBITaxon:' + txid
    eco_id = self.globaltt['quantitative trait analysis evidence']
    LOG.info("Processing QTL locations for %s from %s", taxon_curie, raw)
    with gzip.open(raw, 'rt', encoding='ISO-8859-1') as tsvfile:
        reader = csv.reader(tsvfile, delimiter="\t")
        for row in reader:
            line_counter += 1
            if re.match(r'^#', ' '.join(row)):
                continue

            (chromosome, qtl_source, qtl_type, start_bp, stop_bp, frame,
             strand, score, attr) = row

            # deal with poorly formed attributes
            if re.search(r'"FlankMarkers";', attr):
                attr = re.sub(r'FlankMarkers;', '', attr)
            attr_items = re.sub(r'"', '', attr).split(";")

            # Keep only key=value items.  Split on the first '=' only so a
            # value that itself contains '=' cannot break dict().
            attribute_dict = dict(
                item.split("=", 1) for item in attr_items if '=' in item)

            qtl_num = attribute_dict.get('QTL_ID')
            if self.test_mode and int(qtl_num) not in self.test_ids:
                continue

            # make association between QTL and trait based on taxon
            qtl_id = common_name + 'QTL:' + str(qtl_num)
            model.addIndividualToGraph(qtl_id, None, self.globaltt['QTL'])
            geno.addTaxon(taxon_curie, qtl_id)

            # if pub is in attributes, add it to the association
            pub_id = None
            if 'PUBMED_ID' in attribute_dict:
                pub_id = attribute_dict.get('PUBMED_ID')
                if re.match(r'ISU.*', pub_id):
                    pub_id = 'AQTLPub:' + pub_id.strip()
                    reference = Reference(graph, pub_id)
                else:
                    pub_id = 'PMID:' + pub_id.strip()
                    reference = Reference(
                        graph, pub_id, self.globaltt['journal article'])
                reference.addRefToGraph()

            # The association needs a trait id, which was previously
            # referenced without ever being assigned (NameError).
            # NOTE(review): the 'AQTLTrait:' prefix is restored from the
            # formerly commented-out assignment -- confirm it is the
            # intended curie prefix.
            trait_num = attribute_dict.get('trait_ID')
            if trait_num is None:
                LOG.warning(
                    "No trait_ID for %s; skipping trait association", qtl_id)
            else:
                trait_id = 'AQTLTrait:' + trait_num

                # Add QTL to graph
                assoc = G2PAssoc(
                    graph, self.name, qtl_id, trait_id,
                    self.globaltt['is marker for'])
                assoc.add_evidence(eco_id)
                assoc.add_source(pub_id)
                if 'P-value' in attribute_dict:
                    scr = attribute_dict['P-value']
                    scr = scr.replace('<', '').replace(',', '.')
                    # str.isnumeric() is False for decimals such as
                    # "0.05", so let float() decide what is numeric.
                    try:
                        assoc.set_score(float(scr))
                    except ValueError:
                        # not a number; leave the association unscored
                        pass

                assoc.add_association_to_graph()

            # TODO make association to breed
            # (which means making QTL feature in Breed background)

            # get location of QTL
            chromosome = re.sub(r'Chr\.', '', chromosome)
            chrom_id = makeChromID(chromosome, taxon_curie, 'CHR')

            chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH')
            geno.addChromosomeInstance(
                chromosome, build_id, build_label, chrom_id)
            qtl_feature = Feature(graph, qtl_id, None, self.globaltt['QTL'])
            # an empty coordinate is passed on as None
            if start_bp == '':
                start_bp = None
            qtl_feature.addFeatureStartLocation(
                start_bp, chrom_in_build_id, strand,
                [self.globaltt['FuzzyPosition']])
            if stop_bp == '':
                stop_bp = None
            qtl_feature.addFeatureEndLocation(
                stop_bp, chrom_in_build_id, strand,
                [self.globaltt['FuzzyPosition']])
            qtl_feature.addTaxonToFeature(taxon_curie)
            qtl_feature.addFeatureToGraph()

            if not self.test_mode and limit is not None \
                    and line_counter > limit:
                break

    LOG.info("Done with QTL genomic mappings for %s", taxon_curie)
    return
def _get_pubs(self, entry, graph):
    """
    Add every publication in the entry's reference list to the graph and
    link the entry to it.

    A reference with a 'pubmedID' becomes a PMID journal article.  Any
    other reference becomes a blank node carrying whatever title, author
    list and short citation can be assembled from the entry.

    :param entry: dict for one entry; reads 'mimNumber' and, when present,
        'referenceList'
    :param graph: graph that receives the references and triples
    :return: dict mapping each reference number to the publication id
        that was minted for it
    """
    ref_to_pmid = {}
    entry_num = entry['mimNumber']

    if 'referenceList' not in entry:
        return ref_to_pmid

    omim_id = 'OMIM:' + str(entry_num)

    for item in entry['referenceList']:
        details = item['reference']

        if 'pubmedID' in details:
            pub_id = 'PMID:' + str(details['pubmedID'])
            ref = Reference(graph, pub_id, self.globaltt['journal article'])
        else:
            # make blank node for internal reference
            pub_id = '_:OMIM{0}ref{1}'.format(
                str(entry_num), str(details['referenceNumber']))
            ref = Reference(graph, pub_id)

            title = None
            source = None
            citation = None
            if 'title' in details:
                title = details['title']
                ref.setTitle(title)
            if 'authors' in details:
                authors = details['authors']
                ref.setAuthorList(authors)
                # first author only, followed by "et al"
                citation = re.split(r'\.\,', authors)[0] + ' et al'
            if 'source' in details:
                source = details['source']

            # join whichever parts are available
            parts = (citation, title, source)
            citation = '; '.join(
                part for part in parts if part is not None)
            ref.setShortCitation(citation)

        ref.addRefToGraph()
        ref_to_pmid[details['referenceNumber']] = pub_id

        # record that the entry mentions this publication
        graph.addTriple(omim_id, self.globaltt['mentions'], pub_id)

    return ref_to_pmid
def _process_nlx_157874_1_view(self, raw, limit=None):
    """
    This table contains the Elements of Morphology data that has been
    screen-scraped into DISCO.
    Note that foaf:depiction is inverse of foaf:depicts relationship.

    Since it is bad form to have two definitions,
    we concatenate the two into one string.

    Triples:
        <eom id> a owl:Class
            rdf:label Literal(eom label)
            OIO:hasRelatedSynonym Literal(synonym list)
            IAO:definition Literal(objective_def. subjective def)
            foaf:depiction Literal(small_image_url),
                           Literal(large_image_url)
            foaf:page Literal(page_url)
            rdfs:comment Literal(long commented text)

    :param raw: path of the tab-delimited input file (first line is a header)
    :param limit: stop once more than this many rows have been read;
        None disables the limit
    :return: None
    """

    def closed_sentence(text):
        # Append a trailing period to non-empty text that lacks one.
        if text != '' and not re.match(r'.+\.$', text):
            return text.strip() + '.'
        return text

    model = Model(self.graph)
    rows_seen = 0
    with open(raw, 'r') as tsv:
        tsv.readline()  # the header row is read and discarded
        for fields in csv.reader(tsv, delimiter='\t', quotechar='\"'):
            rows_seen += 1
            # note: e_uid, v_uid, v_uuid, v_last_modified,
            # terminology_category_label/url, subcategory and
            # morphology_term_num are currently unused.
            (morphology_term_id, morphology_term_num,
             morphology_term_label, morphology_term_url,
             terminology_category_label, terminology_category_url,
             subcategory, objective_definition, subjective_definition,
             comments, synonyms, replaces, small_figure_url,
             large_figure_url, e_uid, v_uid, v_uuid,
             v_last_modified, v_status, v_lastmodified_epoch) = fields

            # Add morphology term to graph as a class with its label.
            model.addClassToGraph(morphology_term_id, morphology_term_label)

            # Assemble the description text: objective part first,
            # subjective part second, each ending in a period.
            pieces = (closed_sentence(objective_definition),
                      closed_sentence(subjective_definition))
            definition = ' '.join(pieces).strip()
            model.addDefinition(morphology_term_id, definition)

            # both figures, when given, become depictions of the term
            for figure_url in (small_figure_url, large_figure_url):
                if figure_url != '':
                    model.addDepiction(morphology_term_id, figure_url)

            if comments != '':
                model.addComment(morphology_term_id, comments.strip())

            # ';'-delimited synonyms are exact synonyms
            if synonyms != '':
                exact = model.annotation_properties['hasExactSynonym']
                for syn in synonyms.split(';'):
                    model.addSynonym(morphology_term_id, syn.strip(), exact)

            # ';'-delimited replaced terms are related synonyms, unless
            # the column merely repeats the synonyms column
            if not (replaces == '' or replaces == synonyms):
                related = model.annotation_properties['hasRelatedSynonym']
                for old in replaces.split(';'):
                    model.addSynonym(
                        morphology_term_id, old.strip(), related)

            # morphology_term_id has page morphology_term_url
            page_ref = Reference(self.graph)
            page_ref.addPage(morphology_term_id, morphology_term_url)

            if limit is not None and rows_seen > limit:
                break
    return
def _get_process_allelic_variants(self, entry, graph):
    """
    Add the allelic variants of one entry to the graph.

    For a 'live' variant this adds the allele, its allele-of-gene link to
    the entry, 'is_about' triples from the publications cited in the
    variant text, dbSNP equivalents, ClinVar xrefs and the variant's page.
    A variant whose status contains 'moved' (covers 'moved' and 'removed')
    is added as a deprecated individual.  Any other status is logged.

    :param entry: dict for one entry, or None (nothing is processed)
    :param graph: graph that receives the triples
    :return: None
    """
    model = Model(graph)
    reference = Reference(graph)
    geno = Genotype(graph)
    if entry is None:
        return

    # to hold the entry-specific publication mentions
    # for the allelic variants
    publist = {}
    entry_num = entry['mimNumber']

    # process the ref list just to get the pmids
    ref_to_pmid = self._get_pubs(entry, graph)

    if 'allelicVariantList' not in entry:
        return

    for alv in entry['allelicVariantList']:
        variant = alv['allelicVariant']
        al_num = variant['number']
        al_id = 'OMIM:' + str(entry_num) + '.' + str(al_num).zfill(4)
        al_label = None
        al_description = None
        if variant['status'] == 'live':
            publist[al_id] = set()
            if 'mutations' in variant:
                al_label = variant['mutations']
            if 'text' in variant:
                al_description = variant['text']
                # citations appear in the text as "{<reference number>:"
                mch = re.findall(r'\{(\d+)\:', al_description)
                publist[al_id] = set(mch)
            geno.addAllele(
                al_id, al_label, self.globaltt['variant_locus'],
                al_description)
            geno.addAlleleOfGene(
                al_id, 'OMIM:' + str(entry_num),
                self.globaltt['is_allele_of'])

            # look up the pubmed id in the list of references
            for ref in publist[al_id]:
                pmid = ref_to_pmid.get(int(ref))
                if pmid is None:
                    # the text cites a number that is absent from the
                    # reference list; skip it rather than fail the entry
                    LOG.warning(
                        'Reference %s cited by %s is not in the '
                        'reference list', ref, al_id)
                    continue
                graph.addTriple(pmid, self.globaltt['is_about'], al_id)

            if 'dbSnps' in variant:
                for dnum in re.split(r',', variant['dbSnps']):
                    did = 'dbSNP:' + dnum.strip()
                    model.addIndividualToGraph(did, None)
                    model.addSameIndividual(al_id, did)

            # Note that RCVs are variant to disease associations
            # in ClinVar, rather than variant entries
            # so we make these xrefs instead of equivalents
            if 'clinvarAccessions' in variant:
                # clinvarAccessions triple semicolon delimited
                # each >1 like RCV000020059;;;
                for rcv in variant['clinvarAccessions'].split(';;;'):
                    rnum = rcv[:12]  # in case of more cruft
                    if rnum == '':
                        # a trailing ';;;' leaves an empty fragment, which
                        # would otherwise become an xref to 'ClinVar:'
                        continue
                    model.addXref(al_id, 'ClinVar:' + rnum)

            reference.addPage(
                al_id, "http://omim.org/entry/" + '#'.join(
                    (str(entry_num), str(al_num).zfill(4))))
        elif re.search(r'moved', variant['status']):
            # for both 'moved' and 'removed'
            moved_ids = None
            if 'movedTo' in variant:
                moved_ids = ['OMIM:' + variant['movedTo']]
            model.addDeprecatedIndividual(al_id, moved_ids)
        else:
            LOG.error(
                'Uncaught alleleic variant status %s', variant['status'])
def _get_process_allelic_variants(self, entry, g):
    """
    Add the allelic variants of one entry to the graph.

    For a 'live' variant this adds the allele, its link to the entry,
    'is_about' triples from the publications cited in the variant text,
    dbSNP equivalents, ClinVar xrefs and the variant's page.  A variant
    whose status contains 'moved' (covers 'moved' and 'removed') is added
    as a deprecated individual.  Any other status is logged.

    :param entry: dict for one entry, or None (nothing is processed)
    :param g: graph that receives the triples
    :return: None
    """
    model = Model(g)
    reference = Reference(g)
    geno = Genotype(g)
    if entry is None:
        return

    # to hold the entry-specific publication mentions
    # for the allelic variants
    publist = {}
    entry_num = entry['mimNumber']

    # process the ref list just to get the pmids
    ref_to_pmid = self._get_pubs(entry, g)

    if 'allelicVariantList' not in entry:
        return

    for al in entry['allelicVariantList']:
        variant = al['allelicVariant']
        al_num = variant['number']
        al_id = 'OMIM:'+str(entry_num)+'.'+str(al_num).zfill(4)
        al_label = None
        al_description = None
        if variant['status'] == 'live':
            publist[al_id] = set()
            if 'mutations' in variant:
                al_label = variant['mutations']
            if 'text' in variant:
                al_description = variant['text']
                # citations appear in the text as "{<reference number>:"
                m = re.findall(r'\{(\d+)\:', al_description)
                publist[al_id] = set(m)
            geno.addAllele(
                al_id, al_label, geno.genoparts['variant_locus'],
                al_description)
            geno.addAlleleOfGene(
                al_id, 'OMIM:'+str(entry_num),
                geno.object_properties[
                    'is_sequence_variant_instance_of'])

            # look up the pubmed id in the list of references
            for cited in publist[al_id]:
                pmid = ref_to_pmid[int(cited)]
                g.addTriple(
                    pmid, model.object_properties['is_about'], al_id)

            if 'dbSnps' in variant:
                for dnum in re.split(r',', variant['dbSnps']):
                    did = 'dbSNP:'+dnum.strip()
                    model.addIndividualToGraph(did, None)
                    model.addSameIndividual(al_id, did)

            if 'clinvarAccessions' in variant:
                # clinvarAccessions triple semicolon delimited
                # each >1 like RCV000020059;;;
                for fragment in re.split(
                        r';;;', variant['clinvarAccessions']):
                    found = re.match(r'(RCV\d+);*', fragment)
                    if found is None:
                        # e.g. the empty fragment left by a trailing ';;;';
                        # calling .group() on None would raise
                        continue
                    model.addXref(al_id, 'ClinVar:'+found.group(1))

            reference.addPage(
                al_id,
                "http://omim.org/entry/" +
                str(entry_num)+"#" + str(al_num).zfill(4))
        elif re.search(r'moved', variant['status']):
            # for both 'moved' and 'removed'
            moved_ids = None
            if 'movedTo' in variant:
                moved_ids = ['OMIM:'+variant['movedTo']]
            model.addDeprecatedIndividual(al_id, moved_ids)
        else:
            logger.error(
                'Uncaught alleleic variant status %s', variant['status'])
    # end loop allelicVariantList

    return
def _process_ddg2p_annotations(self, limit):
    """
    The ddg2p annotations associate a gene symbol to an omim disease,
    along with some HPO ids and pubs. The gene symbols come from gencode,
    which in turn come from HGNC official gene symbols.  Therefore,
    we use the HGNC source class to get the id/symbol mapping for use
    in our annotations here.

    According to http://www.gencodegenes.org/faq.html,
    "Gene names are usually HGNC or MGI-approved gene symbols mapped
    to the GENCODE genes by the Ensembl xref pipeline. Sometimes,
    when there is no official gene symbol, the Havana clone-based
    name is used."

    The kind of variation that is linked to a disease is indicated
    (LOF, GOF, CNV, etc) in the source data.
    Here, we create an anonymous variant of the specified gene of
    the indicated type (mapped to the sequence ontology (SO)).

    Rows are read from 'ddg2p.txt' inside the zipped annotation file.
    Rows whose gene symbol cannot be mapped to HGNC, or that carry no
    OMIM id, are counted and reported at the end.

    :param limit: outside of test mode, stop once more than this many rows
        have been read; None disables the limit
    :return: None
    """
    line_counter = 0
    if self.g is not None:
        g = self.g
    else:
        g = self.graph

    gu = GraphUtils(curie_map.get())

    # in order for this to work, we need to map the HGNC id-symbol;
    hgnc = HGNC()
    hgnc_symbol_id_map = hgnc.get_symbol_id_map()

    # use the ddg2p.txt file
    fname = 'ddg2p.txt'

    unmapped_omim_counter = 0
    unmapped_gene_count = 0
    zip_path = '/'.join((self.rawdir, self.files['annot']['file']))
    # Context managers guarantee the archive and its member are closed
    # even when a row raises.
    with ZipFile(zip_path, 'r') as myzip:
        with myzip.open(fname, 'r') as member:
            reader = csv.reader(
                io.TextIOWrapper(member), delimiter='\t', quotechar='\"')
            for row in reader:
                line_counter += 1
                # skip blank rows (csv yields []) and comments
                if not row or re.match(r'#', row[0]):
                    continue

                (gencode_gene_name, mode, category, consequence, disease,
                 omim, ddg2p_id, pubmed_ids, hpo_codes) = row

                hgnc_id = hgnc_symbol_id_map.get(gencode_gene_name.strip())
                if hgnc_id is None:
                    logger.error(
                        "Couldn't map the gene symbol %s to HGNC.",
                        gencode_gene_name)
                    unmapped_gene_count += 1
                    continue

                # add the gene
                gu.addClassToGraph(g, hgnc_id, gencode_gene_name)

                # TODO make VSLC with the variation
                #   to associate with the disorder
                # TODO use the Inheritance and Mutation consequence
                #   to classify the VSLCs
                allele_id = self.make_allele_by_consequence(
                    consequence, hgnc_id, gencode_gene_name)

                if omim.strip() != '':
                    omim_id = 'OMIM:'+str(omim.strip())
                    # assume this is declared elsewhere in ontology
                    gu.addClassToGraph(g, omim_id, None)

                    if category.strip() == 'Confirmed DD gene':
                        rel = gu.object_properties['has_phenotype']
                    elif category.strip() == 'Probable DD gene':
                        rel = gu.object_properties['has_phenotype']
                    elif category.strip() == 'Possible DD gene':
                        rel = gu.object_properties['contributes_to']
                    elif category.strip() == 'Not DD gene':
                        # TODO negative annotation
                        continue
                    assoc = G2PAssoc(self.name, allele_id, omim_id)
                    # TODO 'rel' is assigned to but never used

                    for p in re.split(r';', pubmed_ids):
                        p = p.strip()
                        if p != '':
                            pmid = 'PMID:'+str(p)
                            r = Reference(
                                pmid,
                                Reference.ref_types['journal_article'])
                            r.addRefToGraph(g)
                            assoc.add_source(pmid)

                    assoc.add_association_to_graph(g)
                else:
                    # these are unmapped to a disease id.
                    # note that some match OMIM disease labels
                    # but the identifiers are just not included.
                    # TODO consider mapping to OMIM or DOIDs in other ways
                    logger.warning(
                        "No omim id on line %d\n%s",
                        line_counter, str(row))
                    unmapped_omim_counter += 1

                # TODO hpo phenotypes
                # since the DDG2P file is not documented,
                # I don't know what the HPO annotations are actually about
                # are they about the gene?  the omim disease?
                # something else?
                # So, we wont create associations until this is clarified

                if not self.testMode and limit is not None \
                        and line_counter > limit:
                    break

    logger.warning(
        "gene-disorder associations with no omim id: %d",
        unmapped_omim_counter)
    logger.warning("unmapped gene count: %d", unmapped_gene_count)

    gu.loadProperties(g, G2PAssoc.object_properties, gu.OBJPROP)
    gu.loadProperties(g, G2PAssoc.datatype_properties, gu.DATAPROP)
    gu.loadProperties(g, G2PAssoc.annotation_properties, gu.ANNOTPROP)

    return
def process_allele_phenotype(self, limit=None):
    """
    Add variant-to-phenotype associations from the 'allele_pheno' file.

    This file compactly lists variant to phenotype associations,
    such that in a single row, there may be >1 variant listed per phenotype
    and paper.  This indicates that each variant is individually
    associated with the given phenotype, as listed in 1+ papers.
    (Not that the combination of variants is producing the phenotype.)

    Each data row must hold exactly 17 tab-separated columns.  Header rows
    (starting with '!'), blank rows, 'NOT' annotations and rows for which
    no allele column can be determined are skipped.

    :param limit: stop once more than this many data rows have been read;
        None disables the limit
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['allele_pheno']['file']))

    graph = self.graph
    model = Model(self.graph)

    LOG.info("Processing Allele phenotype associations")
    line_counter = 0
    geno = Genotype(graph)
    with open(raw, 'r') as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            # csv.reader yields [] for blank lines; '!' marks a header
            if not row or re.match(r'!', ''.join(row)):
                continue
            line_counter += 1
            (db, gene_num, gene_symbol, is_not, phenotype_id, ref,
             eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
             gene_class, taxon, date, assigned_by, blank, blank2) = row

            # TODO add NOT phenotypes
            if is_not == 'NOT':
                continue

            eco_symbol = eco_symbol.strip()
            eco_id = None
            if eco_symbol != '':
                eco_id = self.resolve(eco_symbol)

            # according to the GOA spec, persons are not allowed to be
            # in the reference column, therefore the variant and
            # persons are swapped between the reference and with column.
            # we unswitch them here.
            temp_var = temp_ref = None
            if re.search(r'WBVar|WBRNAi', ref):
                temp_var = ref
            # move the paper from the with column into the ref
            if re.search(r'WBPerson', with_or_from):
                temp_ref = with_or_from
            if temp_var is not None or temp_ref is not None:
                with_or_from = temp_var
                ref = temp_ref

            # After the swap there may be no variant at all (only a person
            # matched); splitting None would raise TypeError.
            if with_or_from is None:
                LOG.error(
                    "Missing alleles from phenotype assoc at line %d",
                    line_counter)
                continue

            # Normalising the reference prefix is idempotent, so do it once
            # per row rather than once per allele.
            if ref is not None and ref != '':
                ref = re.sub(r'(WB:|WB_REF:)', 'WormBase:', ref)

            gene_id = 'WormBase:' + gene_num

            for allele in re.split(r'\|', with_or_from):
                allele_num = re.sub(r'WB:', '', allele.strip())
                allele_id = 'WormBase:' + allele_num

                if re.search(r'WBRNAi', allele_id):
                    # The RNAi id itself is typed as a reagent targeted
                    # gene and the phenotype is attached to it directly
                    # (simpler than the earlier blank-node model).
                    model.addIndividualToGraph(
                        allele_id, None,
                        self.globaltt['reagent_targeted_gene'])
                    self.graph.addTriple(
                        allele_id,
                        self.globaltt['is_expression_variant_of'],
                        gene_id)
                elif re.search(r'WBVar', allele_id):
                    # the WBVars are really sequence alterations;
                    # the public name will come from elsewhere.
                    # The variant is treated like an allele of the gene.
                    geno.addSequenceAlteration(allele_id, None)
                    geno.addAlleleOfGene(allele_id, gene_id)
                else:
                    LOG.warning(
                        "Some kind of allele I don't recognize: %s",
                        allele_num)
                    continue

                assoc = G2PAssoc(graph, self.name, allele_id, phenotype_id)

                if eco_id is not None:
                    assoc.add_evidence(eco_id)

                if ref is not None and ref != '':
                    reference = Reference(graph, ref)
                    if re.search(r'Person', ref):
                        reference.setType(self.globaltt['person'])
                        assoc.add_evidence(
                            self.globaltt[
                                'inference from background scientific knowledge'
                            ])
                    reference.addRefToGraph()
                    assoc.add_source(ref)

                assoc.add_association_to_graph()

            # finished looping through all alleles of this row
            if limit is not None and line_counter > limit:
                break

    return
def _get_gene2pubmed(self, limit):
    """
    Loops through the gene2pubmed file and adds a simple triple to say
    that a given publication is_about a gene.
    Publications are added as NamedIndividuals.

    These are filtered on the taxon (or, in test mode, on the gene ids).
    Rows whose gene or publication column is the '-' placeholder are
    skipped.

    :param limit: outside of test mode, stop once more than this many
        accepted rows have been processed; None disables the limit
    :return: None
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    model = Model(g)
    logger.info("Processing Gene records")
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files['gene2pubmed']['file']))
    logger.info("FILE: %s", myfile)
    assoc_counter = 0
    with gzip.open(myfile, 'rb') as f:
        for line in f:
            # skip comments
            line = line.decode().strip()
            if re.match(r'^#', line):
                continue
            (tax_num, gene_num, pubmed_num) = line.split('\t')

            # '-' marks a missing value.  Reject it before the int()
            # conversions below, which would raise ValueError on it.
            if gene_num == '-' or pubmed_num == '-':
                continue

            if self.testMode and int(gene_num) not in self.gene_ids:
                continue

            if not self.testMode and int(tax_num) not in self.tax_ids:
                continue

            line_counter += 1
            gene_id = ':'.join(('NCBIGene', gene_num))
            pubmed_id = ':'.join(('PMID', pubmed_num))

            # the gene is typed as a class or as an individual,
            # according to self.class_or_indiv
            if self.class_or_indiv.get(gene_id) == 'C':
                model.addClassToGraph(gene_id, None)
            else:
                model.addIndividualToGraph(gene_id, None)
            # add the publication as a NamedIndividual
            # add type publication
            model.addIndividualToGraph(pubmed_id, None, None)
            reference = Reference(
                g, pubmed_id, Reference.ref_types['journal_article'])
            reference.addRefToGraph()
            g.addTriple(
                pubmed_id, model.object_properties['is_about'], gene_id)
            assoc_counter += 1
            if not self.testMode and \
                    limit is not None and line_counter > limit:
                break

    logger.info(
        "Processed %d pub-gene associations", assoc_counter)

    return
def _get_var_citations(self, limit):
    """
    Process the ClinVar variant-citation report and add, for every
    usable row, a journal-article reference that is_about the variant.

    Notes carried with the source file:
    Generated weekly, the first of the week.
    A tab-delimited report of citations associated with data in ClinVar,
    connected to the AlleleID, the VariationID, and either rs# from dbSNP
    or nsv in dbVar.

    AlleleID          int value (xpath //Measure/@ID )
    VariationID       ID ClinVar uses to anchor default display.
                      (xpath //MeasureSet/@ID)
    rs                rs identifier from dbSNP
    nsv               nsv identifier from dbVar
    citation_source   The source of the citation, either PubMed,
                      PubMedCentral, or the NCBI Bookshelf
    citation_id       The identifier used by that source

    Only the 'PubMed' and 'PubMedCentral' sources produce triples; any
    other source is read and ignored.

    :param limit: when not None (and not in test mode) stop once more
        than this many data rows have been read
    :return: None
    """
    logger.info("Processing Citations for variants")
    line_counter = 0
    myfile = \
        '/'.join((self.rawdir, self.files['variant_citations']['file']))
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    model = Model(g)

    with open(myfile, 'r', encoding="utf8") as f:
        filereader = csv.reader(f, delimiter='\t', quotechar='\"')

        for line in filereader:
            # skip blank rows (csv yields []) and comments
            if not line or re.match(r'^#', line[0]):
                continue
            (allele_num, variant_num, rs_num, nsv_num, citation_source,
             citation_id) = line

            line_counter += 1

            if self.testMode:
                if int(variant_num) not in self.variant_ids:
                    continue

            if citation_id.strip() == '':
                logger.info(
                    "Skipping blank citation for ClinVarVariant:%s",
                    str(variant_num))
                continue

            # the citation for a variant is made to some kind of
            # combination of the ids here.
            # but i'm not sure which, we don't know what the
            # citation is for exactly, other than the variant.
            # so use mentions
            var_id = 'ClinVarVariant:' + variant_num

            # citation source: PubMed | PubMedCentral | citation_source
            # citation id:
            # format the citation id:
            ref_id = None
            if citation_source == 'PubMed':
                ref_id = 'PMID:' + str(citation_id.replace(" ", ""))
                model.makeLeader(ref_id)
            elif citation_source == 'PubMedCentral':
                ref_id = 'PMCID:' + str(citation_id)

            if ref_id is not None:
                # Build the reference on the graph selected above (g),
                # not unconditionally on self.graph, so that test-mode
                # runs do not leak reference nodes into the main graph.
                r = Reference(g, ref_id,
                              Reference.ref_types['journal_article'])
                r.addRefToGraph()
                g.addTriple(ref_id, self.properties['is_about'], var_id)

            if not self.testMode \
                    and (limit is not None and line_counter > limit):
                break

    logger.info("Finished processing citations for variants")
def _process_data(self, raw, limit=None):
    """
    This function will process the data files from Coriell.
    We make the assumption that any alleles listed are variants
    (alternates to w.t.)

    Triples: (examples)

    :NIGMSrepository
        a CLO_0000008 #repository
        label : NIGMS Human Genetic Cell Repository
        foaf:page
         https://catalog.coriell.org/0/sections/collections/NIGMS/?SsId=8

    line_id
        a CL_0000057,  #fibroblast line
        derives_from patient_id
        part_of :NIGMSrepository
        RO:model_of OMIM:disease_id

    patient id
        a foaf:person,
        label: "fibroblast from patient 12345 with disease X"
        member_of family_id  #what is the right thing here?
        SIO:race EFO:caucasian  #subclass of EFO:0001799
        in_taxon NCBITaxon:9606
        dc:description Literal(remark)
        RO:has_phenotype OMIM:disease_id
        GENO:has_genotype genotype_id

    family_id
        a owl:NamedIndividual
        foaf:page
         "https://catalog.coriell.org/0/Sections/BrowseCatalog/FamilyTypeSubDetail.aspx?PgId=402&fam=2104&coll=GM"

    genotype_id
        a intrinsic_genotype
        GENO:has_alternate_part allelic_variant_id
        we don't necessarily know much about the genotype,
        other than the allelic variant. also there's the sex here

    pub_id mentions cell_line_id

    :param raw: path of the comma-separated Coriell export; the first
        row is treated as a header and discarded
    :param limit: when not None (and not in test mode) stop once more
        than this many non-empty rows have been read
    :return: None
    """
    logger.info("Processing Data from %s", raw)
    gu = GraphUtils(curie_map.get())

    if self.testMode:      # set the graph to build
        g = self.testgraph
    else:
        g = self.graph

    line_counter = 0
    geno = Genotype(g)
    du = DipperUtil()

    gu.loadProperties(g, geno.object_properties, gu.OBJPROP)
    gu.loadAllProperties(g)

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
        next(filereader, None)  # skip the header row
        for row in filereader:
            if not row:
                # blank line in the export; nothing to do
                continue
            line_counter += 1

            (catalog_id, description, omim_number, sample_type,
             cell_line_available, dna_in_stock, dna_ref, gender, age,
             race, ethnicity, affected, karyotype, relprob, mutation,
             gene, family_id, collection, url, cat_remark, pubmed_ids,
             family_member, variant_id, dbsnp_id, species) = row

            # example:
            # GM00003,HURLER SYNDROME,607014,Fibroblast,Yes,No,,Female,26 YR,Caucasian,,,,
            # parent,,,39,NIGMS Human Genetic Cell Repository,
            # http://ccr.coriell.org/Sections/Search/Sample_Detail.aspx?Ref=GM00003,
            # 46;XX; clinically normal mother of a child with Hurler syndrome; proband not in Repository,,
            # 2,,18343,Homo sapiens

            if self.testMode and catalog_id not in self.test_lines:
                # skip rows not in our test lines, when in test mode
                continue

            # ###########    BUILD REQUIRED VARIABLES    ###########

            # Make the cell line ID
            cell_line_id = 'Coriell:'+catalog_id.strip()

            # Map the cell/sample type
            cell_type = self._map_cell_type(sample_type)

            # Make a cell line label
            line_label = \
                collection.partition(' ')[0]+'-'+catalog_id.strip()

            # Map the repository/collection
            repository = self._map_collection(collection)

            # patients are uniquely identified by one of:
            # dbsnp id (which is == an individual haplotype)
            # family id + family member (if present) OR
            # probands are usually family member zero
            # cell line id
            # since some patients have >1 cell line derived from them,
            # we must make sure that the genotype is attached to
            # the patient, and can be inferred to the cell line
            # examples of repeated patients are:
            #   famid=1159, member=1; fam=152,member=1

            # Make the patient ID

            # make an anonymous patient
            patient_id = '_person'
            if self.nobnodes:
                patient_id = ':'+patient_id
            if family_id != '':
                patient_id = \
                    '-'.join((patient_id, family_id, family_member))
            else:
                # make an anonymous patient
                patient_id = '-'.join((patient_id, catalog_id.strip()))

            # properties of the individual patients:  sex, family id,
            # member/relproband, description descriptions are
            # really long and ugly SCREAMING text, so need to clean up
            # the control cases are so odd with this labeling scheme;
            # but we'll deal with it as-is for now.
            short_desc = (description.split(';')[0]).capitalize()
            if affected == 'Yes':
                affected = 'affected'
            elif affected == 'No':
                affected = 'unaffected'
            gender = gender.lower()
            patient_label = ' '.join((affected, gender, relprob))
            if relprob == 'proband':
                patient_label = \
                    ' '.join(
                        (patient_label.strip(), 'with', short_desc))
            else:
                patient_label = \
                    ' '.join(
                        (patient_label.strip(), 'of proband with',
                         short_desc))

            # #############    BUILD THE CELL LINE    #############

            # Adding the cell line as a typed individual.
            cell_line_reagent_id = 'CLO:0000031'

            gu.addIndividualToGraph(
                g, cell_line_id, line_label, cell_line_reagent_id)

            # add the equivalent id == dna_ref
            if dna_ref != '' and dna_ref != catalog_id:
                equiv_cell_line = 'Coriell:'+dna_ref
                # some of the equivalent ids are not defined
                # in the source data; so add them
                gu.addIndividualToGraph(
                    g, equiv_cell_line, None, cell_line_reagent_id)
                gu.addSameIndividual(g, cell_line_id, equiv_cell_line)

            # Cell line derives from patient
            geno.addDerivesFrom(cell_line_id, patient_id)
            geno.addDerivesFrom(cell_line_id, cell_type)

            # Cell line a member of repository
            gu.addMember(g, repository, cell_line_id)

            if cat_remark != '':
                gu.addDescription(g, cell_line_id, cat_remark)

            # Cell age_at_sampling
            # TODO add the age nodes when modeled properly in #78
            # if (age != ''):
            #    this would give a BNode that is an instance of Age.
            #    but i don't know how to connect
            #    the age node to the cell line? we need to ask @mbrush
            #    age_id = '_'+re.sub('\s+','_',age)
            #    gu.addIndividualToGraph(
            #       g,age_id,age,self.terms['age'])
            #    gu.addTriple(
            #       g,age_id,self.properties['has_measurement'],age,
            #       True)

            # #############    BUILD THE PATIENT    #############

            # Add the patient ID as an individual.
            gu.addPerson(g, patient_id, patient_label)
            # TODO map relationship to proband as a class
            # (what ontology?)

            # Add race of patient
            # FIXME: Adjust for subcategories based on ethnicity field
            # EDIT: There are 743 different entries for ethnicity...
            # Too many to map?
            # Add ethnicity as literal in addition to the mapped race?
            # Adjust the ethnicity txt (if using)
            # to initial capitalization to remove ALLCAPS

            # TODO race should go into the individual's background
            # and abstracted out to the Genotype class punting for now.
            # if race != '':
            #    mapped_race = self._map_race(race)
            #    if mapped_race is not None:
            #        gu.addTriple(
            #           g,patient_id,self.terms['race'],mapped_race)
            #        gu.addSubclass(
            #           g,self.terms['ethnic_group'],mapped_race)

            # #############    BUILD THE FAMILY    #############

            # Add triples for family_id, if present.
            if family_id != '':
                family_comp_id = 'CoriellFamily:'+family_id

                family_label = \
                    ' '.join(('Family of proband with', short_desc))

                # Add the family ID as a named individual
                gu.addIndividualToGraph(
                    g, family_comp_id, family_label,
                    geno.genoparts['family'])

                # Add the patient as a member of the family
                gu.addMemberOf(g, patient_id, family_comp_id)

            # #############    BUILD THE GENOTYPE   #############

            # the important things to pay attention to here are:
            # karyotype = chr rearrangements  (somatic?)
            # mutation = protein-level mutation as a label,
            # often from omim
            # gene = gene symbol - TODO get id
            # variant_id = omim variant ids (; delimited)
            # dbsnp_id = snp individual ids = full genotype?

            # note GM00633 is a good example of chromosomal variation
            # - do we have enough to capture this?
            # GM00325 has both abnormal karyotype and variation

            # make an assumption that if the taxon is blank,
            # that it is human!
            if species is None or species == '':
                species = 'Homo sapiens'
            taxon = self._map_species(species)

            # if there's a dbSNP id,
            # this is actually the individual's genotype
            genotype_id = None
            genotype_label = None
            if dbsnp_id != '':
                genotype_id = 'dbSNPIndividual:'+dbsnp_id.strip()

            omim_map = {}
            gvc_id = None
            # reset per row so a label can never leak from an earlier row
            gvc_label = None

            # some of the karyotypes are encoded
            # with terrible hidden codes. remove them here
            # i've seen a <98> character
            karyotype = du.remove_control_characters(karyotype)
            karyotype_id = None
            if karyotype.strip() != '':
                karyotype_id = \
                    '_'+re.sub('MONARCH:', '', self.make_id(karyotype))
                if self.nobnodes:
                    karyotype_id = ':'+karyotype_id
                # add karyotype as karyotype_variation_complement
                gu.addIndividualToGraph(
                    g, karyotype_id, karyotype,
                    geno.genoparts['karyotype_variation_complement'])
                # TODO break down the karyotype into parts
                # and map into GENO. depends on #77

                # place the karyotype in a location(s).
                karyo_chrs = \
                    self._get_affected_chromosomes_from_karyotype(
                        karyotype)
                for c in karyo_chrs:
                    chr_id = makeChromID(c, taxon, 'CHR')
                    # add an anonymous sequence feature,
                    # each located on chr
                    karyotype_feature_id = '-'.join((karyotype_id, c))
                    karyotype_feature_label = \
                        'some karyotype alteration on chr'+str(c)
                    f = Feature(
                        karyotype_feature_id, karyotype_feature_label,
                        geno.genoparts['sequence_alteration'])
                    f.addFeatureStartLocation(None, chr_id)
                    f.addFeatureToGraph(g)
                    f.loadAllProperties(g)
                    geno.addParts(
                        karyotype_feature_id, karyotype_id,
                        geno.object_properties['has_alternate_part'])

            # Variant label.  It is assigned on every row: previously it
            # was only set when a gene symbol was present, so a row
            # without one silently reused the preceding row's label (or
            # raised UnboundLocalError on the first such row).  Without a
            # gene symbol fall back to the bare mutation text.
            if gene != '':
                vl = gene+'('+mutation+')'
            else:
                vl = mutation.strip()

            # fix the variant_id so it's always in the same order
            vids = variant_id.split(';')
            variant_id = ';'.join(sorted(list(set(vids))))

            if karyotype.strip() != '' \
                    and not self._is_normal_karyotype(karyotype):
                mutation = mutation.strip()
                gvc_id = karyotype_id
                if variant_id != '':
                    gvc_id = '_' + variant_id.replace(';', '-') + '-' \
                        + re.sub(r'\w*:', '', karyotype_id)
                if mutation.strip() != '':
                    gvc_label = '; '.join((vl, karyotype))
                else:
                    gvc_label = karyotype
            elif variant_id.strip() != '':
                gvc_id = '_' + variant_id.replace(';', '-')
                gvc_label = vl
            else:
                # wildtype?
                pass

            if gvc_id is not None and gvc_id != karyotype_id \
                    and self.nobnodes:
                gvc_id = ':'+gvc_id

            # add the karyotype to the gvc.
            # use reference if normal karyotype
            karyo_rel = geno.object_properties['has_alternate_part']
            if self._is_normal_karyotype(karyotype):
                karyo_rel = \
                    geno.object_properties['has_reference_part']
            if karyotype_id is not None \
                    and not self._is_normal_karyotype(karyotype) \
                    and gvc_id is not None and karyotype_id != gvc_id:
                geno.addParts(karyotype_id, gvc_id, karyo_rel)

            if variant_id.strip() != '':
                # split the variants & add them as part of the genotype
                # we don't necessarily know their zygosity,
                # just that they are part of the genotype variant ids
                # are from OMIM, so prefix as such we assume that the
                # sequence alts will be defined in OMIM not here
                # TODO sort the variant_id list, if the omim prefix is
                # the same, then assume it's the locus make a hashmap
                # of the omim id to variant id list;
                # then build the genotype hashmap is also useful for
                # removing the "genes" from the list of "phenotypes"

                # will hold gene/locus id to variant list
                omim_map = {}

                for v in variant_id.split(';'):
                    # handle omim-style and odd var ids
                    # like 610661.p.R401X
                    m = re.match(r'(\d+)\.+(.*)', v.strip())
                    if m is None:
                        # Not of the form <locus>.<variant>.  Previously
                        # this either raised KeyError (no locus seen yet)
                        # or re-appended the previous variant to the
                        # previous locus; skip it instead.
                        if v.strip() != '':
                            logger.warning(
                                "Skipping unrecognized variant id '%s' "
                                "for %s", v, cell_line_id)
                        continue
                    (locus_num, var_num) = m.groups()
                    omim_map.setdefault(locus_num, []).append(var_num)

                for o in omim_map:
                    # gene_id = 'OMIM:' + o  # TODO unused
                    vslc_id = \
                        '_' + '-'.join(
                            [o + '.' + a for a in omim_map.get(o)])
                    if self.nobnodes:
                        vslc_id = ':'+vslc_id
                    vslc_label = vl
                    # we don't really know the zygosity of
                    # the alleles at all.
                    # so the vslcs are just a pot of them
                    gu.addIndividualToGraph(
                        g, vslc_id, vslc_label,
                        geno.genoparts[
                            'variant_single_locus_complement'])
                    for v in omim_map.get(o):
                        # this is actually a sequence alt
                        allele1_id = 'OMIM:'+o+'.'+v
                        geno.addSequenceAlteration(allele1_id, None)

                        # assume that the sa -> var_loc -> gene
                        # is taken care of in OMIM
                        geno.addPartsToVSLC(
                            vslc_id, allele1_id, None,
                            geno.zygosity['indeterminate'],
                            geno.object_properties[
                                'has_alternate_part'])

                    if vslc_id != gvc_id:
                        geno.addVSLCtoParent(vslc_id, gvc_id)

            if affected == 'unaffected':
                # let's just say that this person is wildtype
                gu.addType(g, patient_id, geno.genoparts['wildtype'])
            elif genotype_id is None:
                # make an anonymous genotype id
                genotype_id = '_geno'+catalog_id.strip()
                if self.nobnodes:
                    genotype_id = ':'+genotype_id

            # add the gvc
            if gvc_id is not None:
                gu.addIndividualToGraph(
                    g, gvc_id, gvc_label,
                    geno.genoparts['genomic_variation_complement'])

                # add the gvc to the genotype
                if genotype_id is not None:
                    if affected == 'unaffected':
                        rel = \
                            geno.object_properties[
                                'has_reference_part']
                    else:
                        rel = \
                            geno.object_properties[
                                'has_alternate_part']
                    geno.addParts(gvc_id, genotype_id, rel)

                if karyotype_id is not None \
                        and self._is_normal_karyotype(karyotype):
                    if gvc_label is not None and gvc_label != '':
                        genotype_label = \
                            '; '.join((gvc_label, karyotype))
                    else:
                        genotype_label = karyotype
                    if genotype_id is None:
                        genotype_id = karyotype_id
                    else:
                        geno.addParts(
                            karyotype_id, genotype_id,
                            geno.object_properties[
                                'has_reference_part'])
                else:
                    genotype_label = gvc_label
                # use the catalog id as the background
                genotype_label += ' ['+catalog_id.strip()+']'

            if genotype_id is not None and gvc_id is not None:
                # only add the genotype if it has some parts
                geno.addGenotype(
                    genotype_id, genotype_label,
                    geno.genoparts['intrinsic_genotype'])
                geno.addTaxon(taxon, genotype_id)
                # add that the patient has the genotype
                # TODO check if the genotype belongs to
                # the cell line or to the patient
                gu.addTriple(
                    g, patient_id,
                    geno.properties['has_genotype'], genotype_id)
            else:
                geno.addTaxon(taxon, patient_id)

            # TODO: Add sex/gender  (as part of the karyotype?)

            # #############    DEAL WITH THE DISEASES   #############

            # we associate the disease to the patient
            if affected == 'affected':
                if omim_number != '':
                    for d in omim_number.split(';'):
                        # strip before the omim_map lookup so that
                        # ' 123456' is recognised as the gene '123456'
                        d = d.strip()
                        if d == '':
                            continue
                        # if the omim number is in omim_map,
                        # then it is a gene not a pheno
                        if d not in omim_map:
                            disease_id = 'OMIM:'+d
                            # assume the label is taken care of
                            gu.addClassToGraph(g, disease_id, None)

                            # add the association:
                            #   the patient has the disease
                            assoc = G2PAssoc(
                                self.name, patient_id, disease_id)
                            assoc.add_association_to_graph(g)

                            # this line is a model of this disease
                            # TODO abstract out model into
                            # it's own association class?
                            gu.addTriple(
                                g, cell_line_id,
                                gu.properties['model_of'],
                                disease_id)
                        else:
                            logger.info(
                                'removing %s from disease list ' +
                                'since it is a gene', d)

            # #############    ADD PUBLICATIONS   #############

            if pubmed_ids != '':
                for s in pubmed_ids.split(';'):
                    if s.strip() == '':
                        # e.g. a trailing ';' - avoid a bare 'PMID:' node
                        continue
                    pubmed_id = 'PMID:'+s.strip()
                    ref = Reference(pubmed_id)
                    ref.setType(Reference.ref_types['journal_article'])
                    ref.addRefToGraph(g)
                    gu.addTriple(
                        g, pubmed_id,
                        gu.properties['mentions'], cell_line_id)

            if not self.testMode \
                    and (limit is not None and line_counter > limit):
                break

    Assoc(self.name).load_all_properties(g)
def _process_data(self, src_key, limit=None):
    """
    This function will process the data files from Coriell.
    We make the assumption that any alleles listed are variants
    (alternates to w.t.)

    Triples: (examples)

    :NIGMSrepository
        a CLO_0000008 #repository
        label : NIGMS Human Genetic Cell Repository
        foaf:page
         https://catalog.coriell.org/0/sections/collections/NIGMS/?SsId=8

    line_id
        a CL_0000057,  #fibroblast line
        derives_from patient_id
        part_of :NIGMSrepository
        RO:model_of OMIM:disease_id

    patient id
        a foaf:person,
        label: "fibroblast from patient 12345 with disease X"
        member_of family_id  #what is the right thing here?
        SIO:race EFO:caucasian  #subclass of EFO:0001799
        in_taxon NCBITaxon:9606
        dcterms:description Literal(remark)
        RO:has_phenotype OMIM:disease_id
        GENO:has_genotype genotype_id

    family_id
        a owl:NamedIndividual
        foaf:page
         "https://catalog.coriell.org/0/Sections/BrowseCatalog/FamilyTypeSubDetail.aspx?PgId=402&fam=2104&coll=GM"

    genotype_id
        a intrinsic_genotype
        GENO:has_alternate_part allelic_variant_id
        we don't necessarily know much about the genotype,
        other than the allelic variant. also there's the sex here

    pub_id mentions cell_line_id

    :param src_key: key into ``self.files``; the entry supplies the file
        name (``'file'``) and the expected header (``'columns'``)
    :param limit: when not None (and not in test mode) stop once
        ``line_counter`` (which starts at 1 for the header) exceeds it
    :return: None
    :raises AssertionError: when the header row of the file (lower-cased)
        differs from the expected columns, including an empty file
    """
    raw = '/'.join((self.rawdir, self.files[src_key]['file']))

    LOG.info("Processing Data from %s", raw)

    if self.test_mode:      # set the graph to build
        graph = self.testgraph
    else:
        graph = self.graph

    family = Family(graph)
    model = Model(graph)

    line_counter = 1
    geno = Genotype(graph)
    diputil = DipperUtil()
    col = self.files[src_key]['columns']
    # affords access with
    # x = row[col.index('x')].strip()

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter=',', quotechar=r'"')
        # we can keep a close watch on changing file formats
        fileheader = next(filereader, None)
        # an empty file yields None; treat it as an empty header so it is
        # reported through the same header-mismatch error below
        fileheader = [c.lower() for c in (fileheader or [])]
        if col != fileheader:  # assert
            LOG.error('Expected %s to have columns: %s', raw, col)
            LOG.error('But Found %s to have columns: %s', raw, fileheader)
            raise AssertionError('Incomming data headers have changed.')

        for row in filereader:
            line_counter += 1
            if len(row) != len(col):
                LOG.warning(
                    'Expected %i values but find %i in row %i',
                    len(col), len(row), line_counter)
                continue

            # (catalog_id, description, omim_number, sample_type,
            # cell_line_available, dna_in_stock, dna_ref, gender, age,
            # race, ethnicity, affected, karyotype, relprob, mutation,
            # gene, family_id, collection, url, cat_remark, pubmed_ids,
            # family_member, variant_id, dbsnp_id, species) = row

            # example:
            # GM00003,HURLER SYNDROME,607014,Fibroblast,Yes,No,
            #       ,Female,26 YR,Caucasian,,,,
            # parent,,,39,NIGMS Human Genetic Cell Repository,
            # http://ccr.coriell.org/Sections/Search/Sample_Detail.aspx?Ref=GM00003,
            # 46;XX; clinically normal mother of a child with Hurler syndrome;
            #       proband not in Repository,,
            # 2,,18343,Homo sapiens

            catalog_id = row[col.index('catalog_id')].strip()

            if self.test_mode and catalog_id not in self.test_lines:
                # skip rows not in our test lines, when in test mode
                continue

            # ###########    BUILD REQUIRED VARIABLES    ###########

            # Make the cell line ID
            cell_line_id = 'Coriell:' + catalog_id
            # Map the cell/sample type
            cell_type = self.resolve(row[col.index('sample_type')].strip())
            # on fail cell_type = self.globaltt['cell'] ?

            # Make a cell line label
            collection = row[col.index('collection')].strip()
            line_label = collection.partition(' ')[0] + '-' + catalog_id

            # Map the repository/collection
            repository = self.localtt[collection]

            # patients are uniquely identified by one of:
            # dbsnp id (which is == an individual haplotype)
            # family id + family member (if present) OR
            # probands are usually family member zero
            # cell line id
            # since some patients have >1 cell line derived from them,
            # we must make sure that the genotype is attached to
            # the patient, and can be inferred to the cell line
            # examples of repeated patients are:
            #   famid=1159, member=1; fam=152,member=1

            # Make the patient ID

            # make an anonymous patient
            patient_id = '_:person'
            fam_id = row[col.index('fam')].strip()
            fammember = row[col.index('fammember')].strip()
            if fam_id != '':
                patient_id = '-'.join((patient_id, fam_id, fammember))
            else:
                # make an anonymous patient
                patient_id = '-'.join((patient_id, catalog_id))

            # properties of the individual patients:  sex, family id,
            # member/relproband, description descriptions are
            # really long and ugly SCREAMING text, so need to clean up
            # the control cases are so odd with this labeling scheme;
            # but we'll deal with it as-is for now.
            description = row[col.index('description')].strip()
            short_desc = (description.split(';')[0]).capitalize()

            gender = row[col.index('gender')].strip().lower()
            affected = row[col.index('affected')].strip()
            relprob = row[col.index('relprob')].strip()

            if affected == '':
                affected = 'unspecified'
            elif affected in self.localtt:
                affected = self.localtt[affected]
            else:
                LOG.warning(
                    'Novel Affected status %s at row: %i of %s',
                    affected, line_counter, raw)
            patient_label = ' '.join((affected, gender, relprob))
            if relprob == 'proband':
                patient_label = ' '.join(
                    (patient_label.strip(), 'with', short_desc))
            else:
                patient_label = ' '.join(
                    (patient_label.strip(), 'of proband with', short_desc))

            # #############    BUILD THE CELL LINE    #############

            # Adding the cell line as a typed individual.
            cell_line_reagent_id = self.globaltt['cell line']

            model.addIndividualToGraph(
                cell_line_id, line_label, cell_line_reagent_id)

            # add the equivalent id == dna_ref
            dna_ref = row[col.index('dna_ref')].strip()
            if dna_ref != '' and dna_ref != catalog_id:
                equiv_cell_line = 'Coriell:' + dna_ref
                # some of the equivalent ids are not defined
                # in the source data; so add them
                model.addIndividualToGraph(
                    equiv_cell_line, None, cell_line_reagent_id)
                model.addSameIndividual(cell_line_id, equiv_cell_line)

            # Cell line derives from patient
            geno.addDerivesFrom(cell_line_id, patient_id)
            geno.addDerivesFrom(cell_line_id, cell_type)

            # Cell line a member of repository
            family.addMember(repository, cell_line_id)

            cat_remark = row[col.index('cat_remark')].strip()

            if cat_remark != '':
                model.addDescription(cell_line_id, cat_remark)

            # Cell age_at_sampling
            # TODO add the age nodes when modeled properly in #78
            # if (age != ''):
            #    this would give a BNode that is an instance of Age.
            #    but i don't know how to connect
            #    the age node to the cell line? we need to ask @mbrush
            #    age_id = '_'+re.sub('\s+','_',age)
            #    gu.addIndividualToGraph(
            #       graph,age_id,age,self.globaltt['age'])
            #    gu.addTriple(
            #       graph,age_id,self.globaltt['has measurement value'],age,
            #       True)

            # #############    BUILD THE PATIENT    #############

            # Add the patient ID as an individual.
            model.addPerson(patient_id, patient_label)
            # TODO map relationship to proband as a class
            # (what ontology?)

            # Add race of patient
            # FIXME: Adjust for subcategories based on ethnicity field
            # EDIT: There are 743 different entries for ethnicity...
            # Too many to map?
            # Add ethnicity as literal in addition to the mapped race?
            # Adjust the ethnicity txt (if using)
            # to initial capitalization to remove ALLCAPS

            # TODO race should go into the individual's background
            # and abstracted out to the Genotype class punting for now.
            # if race != '':
            #    mapped_race = self.resolve(race)
            #    if mapped_race is not None:
            #        gu.addTriple(
            #           g,patient_id,self.globaltt['race'], mapped_race)
            #        model.addSubClass(
            #           mapped_race,self.globaltt['ethnic_group'])

            # #############    BUILD THE FAMILY    #############

            # Add triples for family_id, if present.
            if fam_id != '':
                family_comp_id = 'CoriellFamily:' + fam_id

                family_label = ' '.join(
                    ('Family of proband with', short_desc))

                # Add the family ID as a named individual
                model.addIndividualToGraph(
                    family_comp_id, family_label, self.globaltt['family'],
                    ind_category=blv.terms[
                        'PopulationOfIndividualOrganisms'])

                # Add the patient as a member of the family
                family.addMemberOf(patient_id, family_comp_id)

            # #############    BUILD THE GENOTYPE   #############

            # the important things to pay attention to here are:
            # karyotype = chr rearrangements  (somatic?)
            # mutation = protein-level mutation as a label,
            # often from omim
            # gene = gene symbol - TODO get id
            # variant_id = omim variant ids (; delimited)
            # dbsnp_id = snp individual ids = full genotype?

            # note GM00633 is a good example of chromosomal variation
            # - do we have enough to capture this?
            # GM00325 has both abnormal karyotype and variation

            # make an assumption that if the taxon is blank,
            # that it is human!
            species = row[col.index('species')].strip()
            if species is None or species == '':
                species = 'Homo sapiens'
            taxon = self.resolve(species)

            # if there's a dbSNP id,
            # this is actually the individual's genotype
            genotype_id = None
            genotype_label = None

            dbsnp_id = row[col.index('dbsnp_id')].strip()
            if dbsnp_id != '':
                genotype_id = 'dbSNPIndividual:' + dbsnp_id

            omim_map = {}
            gvc_id = None
            # reset per row so a label can never leak from an earlier row
            gvc_label = None

            # some of the karyotypes are encoded
            # with terrible hidden codes. remove them here
            # i've seen a <98> character
            karyotype = row[col.index('karyotype')].strip()
            karyotype = diputil.remove_control_characters(karyotype)
            karyotype_id = None
            if karyotype.strip() != '':
                karyotype_id = '_:' + re.sub(
                    'MONARCH:', '', self.make_id(karyotype))
                # add karyotype as karyotype_variation_complement
                model.addIndividualToGraph(
                    karyotype_id, karyotype,
                    self.globaltt['karyotype_variation_complement'])
                # TODO break down the karyotype into parts
                # and map into GENO. depends on #77

                # place the karyotype in a location(s).
                karyo_chrs = self._get_affected_chromosomes_from_karyotype(
                    karyotype)
                for chrom in karyo_chrs:
                    chr_id = makeChromID(chrom, taxon, 'CHR')
                    # add an anonymous sequence feature,
                    # each located on chr
                    karyotype_feature_id = '-'.join((karyotype_id, chrom))
                    karyotype_feature_label = \
                        'some karyotype alteration on chr' + str(chrom)
                    feat = Feature(
                        graph, karyotype_feature_id,
                        karyotype_feature_label,
                        self.globaltt['sequence_alteration'])
                    feat.addFeatureStartLocation(None, chr_id)
                    feat.addFeatureToGraph()
                    geno.addParts(
                        karyotype_feature_id, karyotype_id,
                        self.globaltt['has_variant_part'])

            gene = row[col.index('gene')].strip()
            mutation = row[col.index('mutation')].strip()
            # Variant label.  It is assigned on every row: previously it
            # was only set when a gene symbol was present, so a row
            # without one silently reused the preceding row's label (or
            # raised UnboundLocalError on the first such row).  Without a
            # gene symbol fall back to the bare mutation text.
            if gene != '':
                varl = gene + '(' + mutation + ')'
            else:
                varl = mutation

            # fix the variant_id so it's always in the same order
            variant_id = row[col.index('variant_id')].strip()
            vids = variant_id.split(';')
            variant_id = ';'.join(sorted(list(set(vids))))

            if karyotype.strip() != '' and not self._is_normal_karyotype(
                    karyotype):

                gvc_id = karyotype_id
                if variant_id != '':
                    gvc_id = '_:' + variant_id.replace(';', '-') + '-' \
                        + re.sub(r'\w*:', '', karyotype_id)
                if mutation.strip() != '':
                    gvc_label = '; '.join((varl, karyotype))
                else:
                    gvc_label = karyotype
            elif variant_id.strip() != '':
                gvc_id = '_:' + variant_id.replace(';', '-')
                gvc_label = varl
            else:
                # wildtype?
                pass

            # add the karyotype to the gvc.
            # use reference if normal karyotype
            karyo_rel = self.globaltt['has_variant_part']
            if self._is_normal_karyotype(karyotype):
                karyo_rel = self.globaltt['has_reference_part']
            if karyotype_id is not None \
                    and not self._is_normal_karyotype(karyotype) \
                    and gvc_id is not None and karyotype_id != gvc_id:
                geno.addParts(karyotype_id, gvc_id, karyo_rel)

            if variant_id.strip() != '':
                # split the variants & add them as part of the genotype
                # we don't necessarily know their zygosity,
                # just that they are part of the genotype variant ids
                # are from OMIM, so prefix as such we assume that the
                # sequence alts will be defined in OMIM not here
                # TODO sort the variant_id list, if the omim prefix is
                # the same, then assume it's the locus make a hashmap
                # of the omim id to variant id list;
                # then build the genotype hashmap is also useful for
                # removing the "genes" from the list of "phenotypes"

                # will hold gene/locus id to variant list
                omim_map = {}

                for var in variant_id.split(';'):
                    # handle omim-style and odd var ids
                    # like 610661.p.R401X
                    mch = re.match(r'(\d+)\.+(.*)', var.strip())
                    if mch is None:
                        # Not of the form <locus>.<variant>.  Previously
                        # this either raised KeyError (no locus seen yet)
                        # or re-appended the previous variant to the
                        # previous locus; skip it instead.
                        if var.strip() != '':
                            LOG.warning(
                                "Skipping unrecognized variant id '%s' "
                                "at row %i of %s", var, line_counter, raw)
                        continue
                    (locus_num, var_num) = mch.groups()
                    omim_map.setdefault(locus_num, []).append(var_num)

                for omim in omim_map:
                    # gene_id = 'OMIM:' + omim  # TODO unused
                    vslc_id = '_:' + '-'.join(
                        [omim + '.' + a for a in omim_map.get(omim)])
                    vslc_label = varl
                    # we don't really know the zygosity of
                    # the alleles at all.
                    # so the vslcs are just a pot of them
                    model.addIndividualToGraph(
                        vslc_id, vslc_label,
                        self.globaltt['variant single locus complement'])
                    for var in omim_map.get(omim):
                        # this is actually a sequence alt
                        allele1_id = 'OMIM:' + omim + '.' + var
                        geno.addSequenceAlteration(allele1_id, None)

                        # assume that the sa -> var_loc -> gene
                        # is taken care of in OMIM
                        geno.addPartsToVSLC(
                            vslc_id, allele1_id, None,
                            self.globaltt['indeterminate'],
                            self.globaltt['has_variant_part'])

                    if vslc_id != gvc_id:
                        geno.addVSLCtoParent(vslc_id, gvc_id)

            if affected == 'unaffected':
                # let's just say that this person is wildtype
                model.addType(patient_id, self.globaltt['wildtype'])
            elif genotype_id is None:
                # make an anonymous genotype id (aka blank node)
                genotype_id = '_:geno' + catalog_id.strip()

            # add the gvc
            if gvc_id is not None:
                model.addIndividualToGraph(
                    gvc_id, gvc_label,
                    self.globaltt['genomic_variation_complement'])

                # add the gvc to the genotype
                if genotype_id is not None:
                    if affected == 'unaffected':
                        rel = self.globaltt['has_reference_part']
                    else:
                        rel = self.globaltt['has_variant_part']
                    geno.addParts(gvc_id, genotype_id, rel)

                if karyotype_id is not None \
                        and self._is_normal_karyotype(karyotype):
                    if gvc_label is not None and gvc_label != '':
                        genotype_label = '; '.join((gvc_label, karyotype))
                    elif karyotype is not None:
                        genotype_label = karyotype
                    if genotype_id is None:
                        genotype_id = karyotype_id
                    else:
                        geno.addParts(
                            karyotype_id, genotype_id,
                            self.globaltt['has_reference_part'])
                else:
                    genotype_label = gvc_label
                # use the catalog id as the background
                genotype_label += ' [' + catalog_id.strip() + ']'

            if genotype_id is not None and gvc_id is not None:
                # only add the genotype if it has some parts
                geno.addGenotype(
                    genotype_id, genotype_label,
                    self.globaltt['intrinsic_genotype'])
                geno.addTaxon(taxon, genotype_id)
                # add that the patient has the genotype
                # TODO check if the genotype belongs to
                # the cell line or to the patient
                graph.addTriple(
                    patient_id, self.globaltt['has_genotype'], genotype_id)
            else:
                geno.addTaxon(taxon, patient_id)

            # TODO: Add sex/gender  (as part of the karyotype?)
            # = row[col.index('')].strip()
            # #############    DEAL WITH THE DISEASES   #############
            omim_num = row[col.index('omim_num')].strip()

            # we associate the disease to the patient
            if affected == 'affected' and omim_num != '':
                for disease in omim_num.split(';'):
                    # strip before the omim_map lookup so that
                    # ' 123456' is recognised as the gene '123456'
                    disease = disease.strip()
                    if disease == '':
                        continue
                    # if the omim number is in omim_map,
                    # then it is a gene not a pheno

                    # TEC - another place to use the mimTitle omim
                    # classifier omia & genereviews are using

                    if disease not in omim_map:
                        disease_id = 'OMIM:' + disease
                        # assume the label is taken care of in OMIM
                        model.addClassToGraph(disease_id, None)

                        # add the association:
                        #   the patient has the disease
                        assoc = G2PAssoc(
                            graph, self.name, patient_id, disease_id)
                        assoc.add_association_to_graph()

                        # this line is a model of this disease
                        # TODO abstract out model into
                        # it's own association class?
                        graph.addTriple(
                            cell_line_id, self.globaltt['is model of'],
                            disease_id)
                    else:
                        LOG.info('drop gene %s from disease list', disease)

            # #############    ADD PUBLICATIONS   #############
            pubmed_ids = row[col.index('pubmed_ids')].strip()
            if pubmed_ids != '':
                for pmid in pubmed_ids.split(';'):
                    if pmid.strip() == '':
                        # e.g. a trailing ';' - avoid a bare 'PMID:' node
                        continue
                    pubmed_id = 'PMID:' + pmid.strip()
                    ref = Reference(graph, pubmed_id)
                    ref.setType(self.globaltt['journal article'])
                    ref.addRefToGraph()
                    graph.addTriple(
                        pubmed_id, self.globaltt['mentions'], cell_line_id)

            if not self.test_mode and (
                    limit is not None and line_counter > limit):
                break
def process_gaf(self, gaffile, limit, id_map=None, eco_map=None):
    """
    Ingest a gzipped, tab-delimited GO annotation file (GAF).

    For each annotation row this adds the gene as a class (plus its
    description, synonyms and taxa), builds a gene-to-GO-term association
    with evidence and sources, and, for 'IMP' rows that carry a
    'With (or) From' value, derives gene-to-phenotype associations.

    Rows are skipped when they are comments, lack a required value,
    carry a NOT qualifier, or are UniProtKB rows without an id mapping.

    :param gaffile: path of the gzipped GAF file to read
    :param limit: stop once the reader's line number exceeds this value;
        None means no limit. Ignored in test mode.
    :param id_map: optional dict of UniProtKB accession to one gene curie
        of the form 'prefix:localid'
    :param eco_map: optional dict of GO evidence code to ECO id; codes
        that cannot be mapped are logged and the evidence is omitted
    :return: None
    """
    graph = self.testgraph if self.test_mode else self.graph

    model = Model(graph)
    geno = Genotype(graph)
    LOG.info("Processing Gene Associations from %s", gaffile)
    uniprot_hit = 0
    uniprot_miss = 0
    col = self.gaf_columns

    with gzip.open(gaffile, 'rb') as csvfile:
        reader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""), delimiter='\t', quotechar='\"')
        for row in reader:
            # csv yields [] for a blank line; indexing it would raise
            if not row:
                continue
            # comments start with exclamation
            if row[0].startswith('!'):
                continue

            if len(row) != len(col):
                # report the count actually found, then the expected one
                LOG.error(
                    "Wrong number of columns %i, expected %i got:\n\t%s",
                    len(row), len(col), row)
                exit(1)

            dbase = row[col.index('DB')].strip()
            gene_num = row[col.index('DB_Object_ID')].strip()
            gene_symbol = row[col.index('DB_Object_Symbol')].strip()
            qualifier = row[col.index('Qualifier')]
            go_id = row[col.index('GO_ID')].strip()
            ref = row[col.index('DB:Reference')].strip()
            eco_symbol = row[col.index('Evidence Code')].strip()
            with_or_from = row[col.index('With (or) From')]
            aspect = row[col.index('Aspect')].strip()
            gene_name = row[col.index('DB_Object_Name')]
            gene_synonym = row[col.index('DB_Object_Synonym')]
            taxon = row[col.index('Taxon and Interacting taxon')].strip()
            # columns not used here: DB_Object_Type, Date, Assigned_By,
            # Annotation_Extension, Gene_Product_Form_ID

            # test for required fields.
            # The previous test `'' in [row[:10], row[12]]` compared '' with
            # a list and so only ever looked at column 12; check the
            # required values this function actually consumes instead.
            required = (
                dbase, gene_num, gene_symbol, go_id, ref, eco_symbol,
                aspect, taxon)
            if '' in required:
                LOG.error(
                    "Missing required part of annotation on row %i:\n%s",
                    reader.line_num, str(row[:-4]))
                continue

            # (Don't) deal with qualifier NOT, contributes_to, colocalizes_with
            if re.search(r'NOT', qualifier):
                continue

            if dbase in self.localtt:
                dbase = self.localtt[dbase]
            uniprotid = None
            gene_id = None
            if dbase == 'UniProtKB':
                if id_map is not None and gene_num in id_map:
                    gene_id = id_map[gene_num]
                    uniprotid = ':'.join((dbase, gene_num))
                    (dbase, gene_num) = gene_id.split(':')
                    uniprot_hit += 1
                else:
                    # UniProt id without a 1:1 mapping to a gene; skip
                    uniprot_miss += 1
                    continue
            else:
                gene_num = gene_num.split(':')[-1]  # last
                gene_id = ':'.join((dbase, gene_num))

            # NOTE(review): this only skips ids that are neither NCBIGene
            # nor in test_ids; confirm that `and` (not `or`) is intended.
            if self.test_mode and gene_id[:9] != 'NCBIGene:' and \
                    gene_num not in self.test_ids:
                continue

            model.addClassToGraph(gene_id, gene_symbol)
            if gene_name != '':
                model.addDescription(gene_id, gene_name)
            if gene_synonym != '':
                for syn in gene_synonym.split('|'):
                    syn = syn.strip()
                    if syn[:10] == 'UniProtKB:':
                        model.addTriple(
                            gene_id, self.globaltt['has gene product'], syn)
                    elif re.fullmatch(graph.curie_regexp, syn) is not None:
                        LOG.warning(
                            'possible curie "%s" as a literal synomym for %s',
                            syn, gene_id)
                        model.addSynonym(gene_id, syn)
                    else:
                        model.addSynonym(gene_id, syn)

            for txid in taxon.split('|'):
                tax_curie = re.sub(r'taxon:', 'NCBITaxon:', txid)
                geno.addTaxon(tax_curie, gene_id)

            assoc = Assoc(graph, self.name)
            assoc.set_subject(gene_id)
            assoc.set_object(go_id)

            # eco_map defaults to None; subscripting None would raise a
            # TypeError that the former `except KeyError` did not catch
            if eco_map is not None and eco_symbol in eco_map:
                assoc.add_evidence(eco_map[eco_symbol])
            else:
                LOG.error("Evidence code (%s) not mapped", eco_symbol)

            # normalise every reference once; reused for the G2P section
            sources = []
            for raw_ref in ref.split('|'):
                raw_ref = raw_ref.strip()
                if raw_ref == '':
                    continue
                prefix = raw_ref.split(':')[0]  # sidestep 'MGI:MGI:'
                if prefix in self.localtt:
                    prefix = self.localtt[prefix]
                sources.append(
                    (prefix, ':'.join((prefix, raw_ref.split(':')[-1]))))

            for prefix, ref_curie in sources:
                refg = Reference(graph, ref_curie)
                if prefix == 'PMID':
                    refg.setType(self.globaltt['journal article'])
                refg.addRefToGraph()
                assoc.add_source(ref_curie)

            # TODO add the source of the annotations from assigned by?

            # Set the provenance note BEFORE the association is written;
            # it used to be set afterwards, which presumably left it out
            # of the graph (NOTE(review): confirm against Assoc).
            if uniprotid is not None:
                assoc.set_description('Mapped from ' + uniprotid)

            rel = self.resolve(aspect, mandatory=False)
            if rel is not None and aspect == rel:
                # resolve() handed the aspect back unchanged
                if aspect == 'F' and re.search(r'contributes_to', qualifier):
                    assoc.set_relationship(self.globaltt['contributes to'])
                else:
                    LOG.error(
                        "Aspect: %s with qualifier: %s is not recognized",
                        aspect, qualifier)
            elif rel is not None:
                assoc.set_relationship(rel)
                assoc.add_association_to_graph()
            else:
                LOG.warning("No predicate for association \n%s\n", str(assoc))

            # object_type should be one of:
            # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
            # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
            # If the precise product type is unknown,
            # gene_product should be used

            ##################################################################
            # Derive G2P Associations from IMP annotations
            # in version 2.1 Pipe will indicate 'OR'
            # and Comma will indicate 'AND'.
            # in version 2.0, multiple values are separated by pipes
            # where the pipe has been used to mean 'AND'
            if eco_symbol == 'IMP' and with_or_from != '':
                phenotypeid = go_id + 'PHENOTYPE'
                # create phenotype associations
                for itm in with_or_from.split('|'):
                    if itm == '' or re.match(
                            r'(UniProtKB|WBPhenotype|InterPro|HGNC)', itm):
                        LOG.warning(
                            "Skipping %s from or with %s", uniprotid, itm)
                        continue
                    itm = re.sub(r'MGI\:MGI\:', 'MGI:', itm)
                    itm = re.sub(r'WB:', 'WormBase:', itm)

                    # for worms and fish, they might give a RNAi or MORPH
                    # in these cases make a reagent-targeted gene
                    if re.search('MRPHLNO|CRISPR|TALEN', itm):
                        targeted_gene_id = self.zfin.make_targeted_gene_id(
                            gene_id, itm)
                        geno.addReagentTargetedGene(
                            itm, gene_id, targeted_gene_id)
                        # `assoc` is deliberately rebound from Assoc to
                        # G2PAssoc for the derived association
                        assoc = G2PAssoc(
                            graph, self.name, targeted_gene_id, phenotypeid)
                    elif re.search(r'WBRNAi', itm):
                        targeted_gene_id = \
                            self.wbase.make_reagent_targeted_gene_id(
                                gene_id, itm)
                        geno.addReagentTargetedGene(
                            itm, gene_id, targeted_gene_id)
                        assoc = G2PAssoc(
                            graph, self.name, targeted_gene_id, phenotypeid)
                    else:
                        assoc = G2PAssoc(graph, self.name, itm, phenotypeid)

                    for _, ref_curie in sources:
                        assoc.add_source(ref_curie)
                        # experimental phenotypic evidence
                        assoc.add_evidence(
                            self.globaltt['experimental phenotypic evidence'])
                    assoc.add_association_to_graph()
                    # TODO should the G2PAssoc be the evidence for the GO assoc?

            if not self.test_mode and limit is not None and \
                    reader.line_num > limit:
                break

    uniprot_tot = (uniprot_hit + uniprot_miss)
    uniprot_per = 0.0
    if uniprot_tot != 0:
        uniprot_per = 100.0 * uniprot_hit / uniprot_tot
    LOG.info(
        "Uniprot: %.2f%% of %i benefited from the 1/4 day id mapping download",
        uniprot_per, uniprot_tot)
def process_gaf(self, file, limit, id_map=None):
    """
    Ingest a gzipped, tab-delimited GO annotation file (GAF).

    Each row must hold exactly 17 columns. For each usable row this adds
    the gene as a class (plus description, synonyms and taxon), builds a
    gene-to-GO-term association with evidence and sources, and, for 'IMP'
    rows that carry a with/from value, derives gene-to-phenotype
    associations.

    :param file: path of the gzipped GAF file to read
    :param limit: stop once more than this many lines have been read;
        None means no limit. Ignored in test mode.
    :param id_map: optional dict of UniProtKB accession to a list of gene
        curies; UniProtKB rows are only used when exactly one gene is
        mapped, and are skipped when no map is supplied
    :return: None
    """
    g = self.testgraph if self.testMode else self.graph

    model = Model(g)
    geno = Genotype(g)
    logger.info("Processing Gene Associations from %s", file)
    line_counter = 0

    # Helper sources for reagent-targeted genes. These used to be chained
    # with if/elif, which left `wbase` unbound whenever both taxa were
    # requested; any helper still missing is created on first use below.
    zfin = None
    wbase = None
    if 7955 in self.tax_ids:
        zfin = ZFIN(self.graph_type, self.are_bnodes_skized)
    if 6239 in self.tax_ids:
        wbase = WormBase(self.graph_type, self.are_bnodes_skized)

    # loop invariant, so built once rather than for every row
    aspect_rel_map = {
        'P': model.object_properties['involved_in'],  # involved in
        'F': model.object_properties['enables'],  # enables
        'C': model.object_properties['part_of']  # part of
    }

    def clean_ref(ref_text):
        # swap the reference's db prefix for its cleaned form
        prefix = ref_text.split(':')[0]
        cleaned = re.sub(
            re.escape(prefix), self.clean_db_prefix(prefix), ref_text)
        return re.sub(r'MGI\:MGI\:', 'MGI:', cleaned)

    with gzip.open(file, 'rb') as csvfile:
        filereader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""),
            delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            # comments start with exclamation
            if ''.join(row).startswith('!'):
                continue
            (db, gene_num, gene_symbol, qualifier, go_id, ref, eco_symbol,
             with_or_from, aspect, gene_name, gene_synonym, object_type,
             taxon, date, assigned_by, annotation_extension,
             gene_product_form_id) = row

            # test for required fields
            required = (
                db, gene_num, gene_symbol, go_id, ref, eco_symbol, aspect,
                object_type, taxon, date, assigned_by)
            if '' in required:
                # the row is passed as an argument: concatenating it into
                # the format string broke on any '%' in the data
                logger.error(
                    "Missing required part of annotation on row %d:\n%s",
                    line_counter, '\t'.join(row))
                continue

            # deal with qualifier NOT, contributes_to, colocalizes_with
            if re.search(r'NOT', qualifier):
                continue

            db = self.clean_db_prefix(db)
            uniprotid = None
            gene_id = None
            if db == 'UniProtKB':
                # id_map defaults to None; look it up only when present
                mapped_ids = None
                if id_map is not None:
                    mapped_ids = id_map.get(gene_num)
                # only an unambiguous 1:1 mapping is usable; an empty list
                # used to fall through with gene_id still None
                if mapped_ids is None or len(mapped_ids) != 1:
                    continue
                gene_id = mapped_ids[0]
                uniprotid = ':'.join((db, gene_num))
                gene_num = re.sub(r'\w+\:', '', gene_id)
            elif db == 'MGI':
                gene_num = re.sub(r'MGI:', '', gene_num)
                gene_id = ':'.join((db, gene_num))
                gene_id = re.sub(r'MGI\:MGI\:', 'MGI:', gene_id)
            else:
                gene_id = ':'.join((db, gene_num))

            if self.testMode and not (
                    re.match(r'NCBIGene', gene_id) and
                    int(gene_num) in self.test_ids):
                continue

            model.addClassToGraph(gene_id, gene_symbol)
            if gene_name != '':
                model.addDescription(gene_id, gene_name)
            if gene_synonym != '':
                for synonym in gene_synonym.split('|'):
                    model.addSynonym(gene_id, synonym.strip())
            if '|' in taxon:
                # TODO add annotations with >1 taxon
                logger.info(
                    ">1 taxon (%s) on line %d. skipping", taxon, line_counter)
            else:
                tax_id = re.sub(r'taxon:', 'NCBITaxon:', taxon)
                geno.addTaxon(tax_id, gene_id)

            assoc = Assoc(g, self.name)
            assoc.set_subject(gene_id)
            assoc.set_object(go_id)

            eco_id = self.map_go_evidence_code_to_eco(eco_symbol)
            if eco_id is not None:
                assoc.add_evidence(eco_id)

            refs = ref.split('|')
            for raw_ref in refs:
                raw_ref = raw_ref.strip()
                if raw_ref == '':
                    continue
                ref_curie = clean_ref(raw_ref)
                reference = Reference(g, ref_curie)
                if re.match(r'PMID', ref_curie):
                    reference.setType(Reference.ref_types['journal_article'])
                reference.addRefToGraph()
                assoc.add_source(ref_curie)

            # TODO add the source of the annotations from assigned by?

            if aspect not in aspect_rel_map:
                logger.error("Aspect not recognized: %s", aspect)
            rel = aspect_rel_map.get(aspect)
            if aspect == 'F' and re.search(r'contributes_to', qualifier):
                rel = model.object_properties['contributes_to']
            assoc.set_relationship(rel)
            if uniprotid is not None:
                assoc.set_description('Mapped from ' + uniprotid)
            # object_type should be one of:
            # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
            # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
            # If the precise product type is unknown,
            # gene_product should be used
            assoc.add_association_to_graph()

            # Derive G2P Associations from IMP annotations
            # in version 2.1 Pipe will indicate 'OR'
            # and Comma will indicate 'AND'.
            # in version 2.0, multiple values are separated by pipes
            # where the pipe has been used to mean 'AND'
            if eco_symbol == 'IMP' and with_or_from != '':
                phenotypeid = go_id + 'PHENOTYPE'
                # create phenotype associations
                for item in with_or_from.split('|'):
                    if item == '' or re.match(
                            r'(UniProtKB|WBPhenotype|InterPro|HGNC)', item):
                        logger.warning(
                            "Don't know what having a uniprot id "
                            "in the 'with' column means of %s", uniprotid)
                        continue
                    item = re.sub(r'MGI\:MGI\:', 'MGI:', item)
                    item = re.sub(r'WB:', 'WormBase:', item)

                    # for worms and fish, they might give a RNAi or MORPH
                    # in these cases make a reagent-targeted gene
                    if re.search('MRPHLNO|CRISPR|TALEN', item):
                        if zfin is None:
                            zfin = ZFIN(
                                self.graph_type, self.are_bnodes_skized)
                        targeted_gene_id = zfin.make_targeted_gene_id(
                            gene_id, item)
                        geno.addReagentTargetedGene(
                            item, gene_id, targeted_gene_id)
                        # `assoc` is deliberately rebound from Assoc to
                        # G2PAssoc for the derived association
                        assoc = G2PAssoc(
                            g, self.name, targeted_gene_id, phenotypeid)
                    elif re.search(r'WBRNAi', item):
                        if wbase is None:
                            wbase = WormBase(
                                self.graph_type, self.are_bnodes_skized)
                        targeted_gene_id = \
                            wbase.make_reagent_targeted_gene_id(gene_id, item)
                        geno.addReagentTargetedGene(
                            item, gene_id, targeted_gene_id)
                        assoc = G2PAssoc(
                            g, self.name, targeted_gene_id, phenotypeid)
                    else:
                        assoc = G2PAssoc(g, self.name, item, phenotypeid)

                    for raw_ref in refs:
                        raw_ref = raw_ref.strip()
                        if raw_ref != '':
                            assoc.add_source(clean_ref(raw_ref))
                            # experimental phenotypic evidence
                            assoc.add_evidence("ECO:0000059")
                    assoc.add_association_to_graph()
                    # TODO should the G2PAssoc be
                    # the evidence for the GO assoc?

            if not self.testMode and \
                    limit is not None and line_counter > limit:
                break

    return
def _process_phenotype_hpoa(self, raw, limit):
    """
    Ingest disease-to-phenotype annotations from a phenotype.hpoa file.

    see info on format here:
    http://www.human-phenotype-ontology.org/contao/index.php/annotation-guide.html

    The file starts with four comment lines (the second holds the release
    date) followed by a header line. Each data row becomes a D2PAssoc
    with evidence, optional sex specificity and its publications.
    Rows with a NOT qualifier or an unknown aspect are skipped.

    :param raw: path of the tab-delimited hpoa file
    :param limit: stop once the reader's line number exceeds this value;
        None means no limit. Ignored in test mode.
    :return: None
    """
    src_key = 'hpoa'

    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)

    filedate = datetime.utcfromtimestamp(
        os.stat(raw)[ST_CTIME]).strftime("%Y-%m-%d")

    # this will cause two dates to be attached to the dataset
    # (one from the filedate, and the other from here)
    # TODO when #112 is implemented,
    # this will result in only the whole dataset being versioned

    col = self.files[src_key]['columns']
    with open(raw, 'r', encoding="utf8") as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t', quotechar='\"')
        next(reader)  # drop Description
        # characters 9..18 of the stringified second row hold the date
        date_text = str(next(reader))[9:19]
        LOG.info("Ingest from %s", date_text)
        date = datetime.strptime(
            date_text.strip(), '%Y-%m-%d').strftime("%Y-%m-%d-%H-%M")
        self.dataset.setVersion(filedate, date)
        next(reader)  # drop tracker url
        next(reader)  # drop release url
        header = next(reader)  # headers
        header[0] = header[0][1:]  # uncomment
        if col != header:
            LOG.info(
                '%s\nExpected Headers:\t%s\nRecived Headers:\t%s\n',
                src_key, col, header)
            LOG.info(set(col) - set(header))

        for row in reader:
            # csv yields [] for a blank line; indexing it would raise
            if not row:
                continue
            if row[0].startswith('#') or row[0] == 'DatabaseID':  # headers
                continue

            row = [str(cell).strip() for cell in row]

            disease_id = row[col.index('DatabaseID')]
            # 98246 OMIM
            # 68646 ORPHA
            # 297 DECIPHER

            if self.test_mode:
                id_list = getattr(self, 'test_ids', None)
                if id_list is None or disease_id not in id_list:
                    continue

            # row[col.index('DiseaseName')] unused

            if row[col.index('Qualifier')] == 'NOT':
                continue

            pheno_id = row[col.index('HPO_ID')]
            publist = row[col.index('Reference')]
            eco_id = self.resolve(row[col.index('Evidence')])
            onset = row[col.index('Onset')]
            freq = row[col.index('Frequency')]
            sex = row[col.index('Sex')].lower()
            # row[col.index('Modifier')] unused
            asp = row[col.index('Aspect')]
            # row[col.index('Biocuration')] unused

            model.addClassToGraph(disease_id)
            model.addClassToGraph(pheno_id)
            model.addClassToGraph(eco_id)
            if onset:
                model.addClassToGraph(onset)

            if asp in ('P', 'M'):  # phenotype? abnormality or mortality
                assoc = D2PAssoc(  # default rel=self.globaltt['has phenotype']
                    graph, self.name, disease_id, pheno_id, onset, freq)
            elif asp in ('I', 'C'):
                # inheritance pattern or clinical course/onset
                assoc = D2PAssoc(
                    graph, self.name, disease_id, pheno_id,
                    rel=self.globaltt['has disposition'])
            else:
                LOG.error(
                    "Unknown aspect : %s at line %i", asp, reader.line_num)
                # No association was built for this row. Falling through
                # would raise NameError on the first row, or silently
                # mutate the previous row's association.
                continue

            assoc.add_evidence(eco_id)

            if sex:
                # write to the graph selected above, so a test-mode run
                # does not leak this triple into the main graph
                graph.addTriple(
                    assoc.get_association_id(),
                    self.globaltt['has_sex_specificty'],
                    self.globaltt[sex])

            # Publication
            # cut -f 5 phenotype.hpoa | grep ";" | tr ';' '\n' | cut -f1 -d ':' |\
            # sort | uniq -c | sort -nr
            # 629 PMID
            # 63 OMIM
            # 42 ISBN-13
            # 36 http

            for pub in publist.split(';'):
                pub = pub.strip()

                # there have been several malformed PMIDs
                if pub[:4] != 'http' and \
                        graph.curie_regexp.fullmatch(pub) is None:
                    LOG.warning(
                        'Record %s has a malformed Reference %s',
                        disease_id, pub)
                    continue

                if pub[:5] == 'PMID:':
                    pubtype = self.globaltt['journal article']
                elif pub[:4] == 'ISBN':
                    pubtype = self.globaltt['publication']
                elif pub[:5] == 'OMIM:':
                    pub = 'http://omim.org/entry/' + pub[5:]
                    pubtype = self.globaltt['web page']
                elif pub[:9] == 'DECIPHER:':
                    pubtype = self.globaltt['web page']
                elif pub[:6] == 'ORPHA:':
                    pubtype = self.globaltt['web page']
                elif pub[:4] == 'http':
                    pubtype = self.globaltt['web page']
                else:
                    LOG.error(
                        'Unknown pub type for disease %s from "%s"',
                        disease_id, pub)
                    continue

                assoc.add_source(pub)
                ref = Reference(graph, pub, pubtype)
                # ref.setTitle(''); ref.setYear()
                ref.addRefToGraph()

            # TODO add curator

            assoc.add_association_to_graph()

            if not self.test_mode and limit is not None \
                    and reader.line_num > limit:
                break
    return
def process_allele_phenotype(self, limit=None):
    """
    Ingest variant-to-phenotype associations.

    This file compactly lists variant to phenotype associations,
    such that in a single row, there may be >1 variant listed per
    phenotype and paper. This indicates that each variant is
    individually associated with the given phenotype,
    as listed in 1+ papers.
    (Not that the combination of variants is producing the phenotype.)

    Rows flagged NOT, and rows left without any allele after the
    reference/with columns are un-swapped, are skipped.

    :param limit: stop once more than this many data rows have been read;
        None means no limit. Ignored in test mode.
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['allele_pheno']['file']))

    g = self.testgraph if self.testMode else self.graph

    logger.info("Processing Allele phenotype associations")
    line_counter = 0
    geno = Genotype(g)
    with open(raw, 'r') as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            if ''.join(row).startswith('!'):  # header
                continue
            line_counter += 1
            (db, gene_num, gene_symbol, is_not, phenotype_id, ref,
             eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
             gene_class, taxon, date, assigned_by, blank, blank2) = row

            if self.testMode and gene_num not in self.test_ids['gene']:
                continue

            # TODO add NOT phenotypes
            if is_not == 'NOT':
                continue

            eco_id = None
            if eco_symbol == 'IMP':
                eco_id = 'ECO:0000015'
            elif eco_symbol.strip() != '':
                logger.warning(
                    "Encountered an ECO code we don't have: %s", eco_symbol)

            # according to the GOA spec, persons are not allowed to be
            # in the reference column, therefore the variant and
            # persons are swapped between the reference and with column.
            # we unswitch them here.
            temp_var = temp_ref = None
            if re.search(r'WBVar|WBRNAi', ref):
                temp_var = ref
            # move the paper from the with column into the ref
            if re.search(r'WBPerson', with_or_from):
                temp_ref = with_or_from
            if temp_var is not None or temp_ref is not None:
                with_or_from = temp_var
                ref = temp_ref

            # with_or_from is None when only a person was found above;
            # splitting None used to raise TypeError. Blank entries are
            # dropped too, which lets the "missing alleles" check fire
            # (splitting always returned at least one element before).
            allele_list = [
                allele.strip()
                for allele in (with_or_from or '').split('|')
                if allele.strip() != '']
            if not allele_list:
                logger.error(
                    "Missing alleles from phenotype assoc at line %d",
                    line_counter)
                continue

            gene_id = 'WormBase:' + gene_num
            if ref is not None and ref != '':
                ref = re.sub(r'(WB:|WB_REF:)', 'WormBase:', ref)

            for allele in allele_list:
                allele_num = re.sub(r'WB:', '', allele)
                allele_id = 'WormBase:' + allele_num

                if re.search(r'WBRNAi', allele_id):
                    # make the reagent-targeted gene,
                    # & annotate that instead of the RNAi item directly
                    rnai_num = re.sub(r'WormBase:', '', allele_id)
                    rnai_id = allele_id
                    rtg_id = self.make_reagent_targeted_gene_id(
                        gene_num, rnai_num)
                    geno.addReagentTargetedGene(rnai_id, gene_id, rtg_id)
                    geno.addGeneTargetingReagent(
                        rnai_id, None, geno.genoparts['RNAi_reagent'],
                        gene_id)
                    allele_id = rtg_id
                elif re.search(r'WBVar', allele_id):
                    # this may become deprecated by using wormmine
                    # make the allele to gene relationship
                    # the WBVars are really sequence alterations
                    # the public name will come from elsewhere
                    geno.addSequenceAlteration(allele_id, None)
                    vl_id = '_:' + '-'.join((gene_num, allele_num))
                    geno.addSequenceAlterationToVariantLocus(
                        allele_id, vl_id)
                    geno.addAlleleOfGene(vl_id, gene_id)
                else:
                    logger.warning(
                        "Some kind of allele I don't recognize: %s",
                        allele_num)
                    continue

                assoc = G2PAssoc(g, self.name, allele_id, phenotype_id)

                if eco_id is not None:
                    assoc.add_evidence(eco_id)

                if ref is not None and ref != '':
                    reference = Reference(g, ref)
                    if re.search(r'Person', ref):
                        reference.setType(reference.ref_types['person'])
                        # also add
                        # inferred from background scientific knowledge
                        assoc.add_evidence('ECO:0000001')
                    reference.addRefToGraph()
                    assoc.add_source(ref)

                assoc.add_association_to_graph()

            # finished looping through all alleles
            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

    return
def process_gaf(self, file, limit, id_map=None, eco_map=None):
    """
    Ingest a gzipped, tab-delimited GO annotation file (GAF).

    Rows with 15 or 16 columns are padded with empty strings to 17.
    For each usable row this adds the gene as a class (plus description,
    synonyms and taxon), builds a gene-to-GO-term association with
    evidence and sources, and, for 'IMP' rows that carry a with/from
    value, derives gene-to-phenotype associations. A summary of how many
    UniProtKB rows could be mapped is logged at the end.

    :param file: path of the gzipped GAF file to read
    :param limit: stop once more than this many lines have been read;
        None means no limit. Ignored in test mode.
    :param id_map: optional dict of UniProtKB accession to one gene curie
        of the form 'prefix:localid'
    :param eco_map: optional dict of GO evidence code to ECO id; codes
        that cannot be mapped are logged and the evidence is omitted
    :return: None
    """
    graph = self.testgraph if self.testMode else self.graph

    model = Model(graph)
    geno = Genotype(graph)
    logger.info("Processing Gene Associations from %s", file)
    line_counter = 0
    uniprot_hit = 0
    uniprot_miss = 0

    # Helper sources for reagent-targeted genes. They are created up
    # front for the requested taxa; previously the names were unbound
    # otherwise, so they now start as None and are created on first use.
    zfin = None
    wbase = None
    if 7955 in self.tax_ids:
        zfin = ZFIN(self.graph_type, self.are_bnodes_skized)
    if 6239 in self.tax_ids:
        wbase = WormBase(self.graph_type, self.are_bnodes_skized)

    with gzip.open(file, 'rb') as csvfile:
        filereader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""),
            delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            # comments start with exclamation
            if ''.join(row).startswith('!'):
                continue

            if len(row) > 17 or len(row) < 15:
                logger.warning(
                    "Wrong number of columns %s, expected 15 or 17\n%s",
                    len(row), row)
                continue

            # pad the two optional trailing columns
            row += [""] * (17 - len(row))

            (db, gene_num, gene_symbol, qualifier, go_id, ref, eco_symbol,
             with_or_from, aspect, gene_name, gene_synonym, object_type,
             taxon, date, assigned_by, annotation_extension,
             gene_product_form_id) = row

            # test for required fields
            required = (
                db, gene_num, gene_symbol, go_id, ref, eco_symbol, aspect,
                object_type, taxon, date, assigned_by)
            if '' in required:
                # the row is passed as an argument: concatenating it into
                # the format string broke on any '%' in the data
                logger.error(
                    "Missing required part of annotation on row %d:\n%s",
                    line_counter, '\t'.join(row))
                continue

            # deal with qualifier NOT, contributes_to, colocalizes_with
            if re.search(r'NOT', qualifier):
                continue

            if db in self.localtt:
                db = self.localtt[db]
            uniprotid = None
            gene_id = None
            if db == 'UniProtKB':
                if id_map is not None and gene_num in id_map:
                    gene_id = id_map[gene_num]
                    uniprotid = ':'.join((db, gene_num))
                    (db, gene_num) = gene_id.split(':')
                    uniprot_hit += 1
                else:
                    # UniProt id without a 1:1 mapping to a gene; skip
                    uniprot_miss += 1
                    continue
            else:
                gene_num = gene_num.split(':')[-1]  # last
                gene_id = ':'.join((db, gene_num))

            if self.testMode and not (
                    re.match(r'NCBIGene', gene_id) and
                    int(gene_num) in self.test_ids):
                continue

            model.addClassToGraph(gene_id, gene_symbol)
            if gene_name != '':
                model.addDescription(gene_id, gene_name)
            if gene_synonym != '':
                for synonym in gene_synonym.split('|'):
                    model.addSynonym(gene_id, synonym.strip())
            if '|' in taxon:
                # TODO add annotations with >1 taxon
                logger.info(
                    ">1 taxon (%s) on line %d. skipping", taxon, line_counter)
            else:
                tax_id = re.sub(r'taxon:', 'NCBITaxon:', taxon)
                geno.addTaxon(tax_id, gene_id)

            assoc = Assoc(graph, self.name)
            assoc.set_subject(gene_id)
            assoc.set_object(go_id)

            # eco_map defaults to None; subscripting None would raise a
            # TypeError that the former `except KeyError` did not catch
            if eco_map is not None and eco_symbol in eco_map:
                assoc.add_evidence(eco_map[eco_symbol])
            else:
                logger.error("Evidence code (%s) not mapped", eco_symbol)

            # normalise every reference once; reused for the G2P section
            sources = []
            for raw_ref in ref.split('|'):
                raw_ref = raw_ref.strip()
                if raw_ref == '':
                    continue
                prefix = raw_ref.split(':')[0]  # sidestep 'MGI:MGI:'
                if prefix in self.localtt:
                    prefix = self.localtt[prefix]
                sources.append(
                    (prefix, ':'.join((prefix, raw_ref.split(':')[-1]))))

            for prefix, ref_curie in sources:
                refg = Reference(graph, ref_curie)
                if prefix == 'PMID':
                    refg.setType(self.globaltt['journal article'])
                refg.addRefToGraph()
                assoc.add_source(ref_curie)

            # TODO add the source of the annotations from assigned by?

            # Set the provenance note BEFORE the association is written;
            # it used to be set afterwards, which presumably left it out
            # of the graph (NOTE(review): confirm against Assoc).
            if uniprotid is not None:
                assoc.set_description('Mapped from ' + uniprotid)

            rel = self.resolve(aspect, mandatory=False)
            if rel is not None and aspect == rel:
                # resolve() handed the aspect back unchanged
                if aspect == 'F' and re.search(r'contributes_to', qualifier):
                    assoc.set_relationship(self.globaltt['contributes to'])
                else:
                    logger.error(
                        "Aspect: %s with qualifier: %s is not recognized",
                        aspect, qualifier)
            elif rel is not None:
                assoc.set_relationship(rel)
                assoc.add_association_to_graph()
            else:
                logger.warning(
                    "No predicate for association \n%s\n", str(assoc))

            # object_type should be one of:
            # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
            # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
            # If the precise product type is unknown,
            # gene_product should be used

            ##################################################################
            # Derive G2P Associations from IMP annotations
            # in version 2.1 Pipe will indicate 'OR'
            # and Comma will indicate 'AND'.
            # in version 2.0, multiple values are separated by pipes
            # where the pipe has been used to mean 'AND'
            if eco_symbol == 'IMP' and with_or_from != '':
                phenotypeid = go_id+'PHENOTYPE'
                # create phenotype associations
                for item in with_or_from.split('|'):
                    if item == '' or re.match(
                            r'(UniProtKB|WBPhenotype|InterPro|HGNC)', item):
                        logger.warning(
                            "Don't know what having a uniprot id "
                            "in the 'with' column means of %s", uniprotid)
                        continue
                    item = re.sub(r'MGI\:MGI\:', 'MGI:', item)
                    item = re.sub(r'WB:', 'WormBase:', item)

                    # for worms and fish, they might give a RNAi or MORPH
                    # in these cases make a reagent-targeted gene
                    if re.search('MRPHLNO|CRISPR|TALEN', item):
                        if zfin is None:
                            zfin = ZFIN(
                                self.graph_type, self.are_bnodes_skized)
                        targeted_gene_id = zfin.make_targeted_gene_id(
                            gene_id, item)
                        geno.addReagentTargetedGene(
                            item, gene_id, targeted_gene_id)
                        # `assoc` is deliberately rebound from Assoc to
                        # G2PAssoc for the derived association
                        assoc = G2PAssoc(
                            graph, self.name, targeted_gene_id, phenotypeid)
                    elif re.search(r'WBRNAi', item):
                        if wbase is None:
                            wbase = WormBase(
                                self.graph_type, self.are_bnodes_skized)
                        targeted_gene_id = \
                            wbase.make_reagent_targeted_gene_id(gene_id, item)
                        geno.addReagentTargetedGene(
                            item, gene_id, targeted_gene_id)
                        assoc = G2PAssoc(
                            graph, self.name, targeted_gene_id, phenotypeid)
                    else:
                        assoc = G2PAssoc(graph, self.name, item, phenotypeid)

                    for _, ref_curie in sources:
                        assoc.add_source(ref_curie)
                        # experimental phenotypic evidence
                        assoc.add_evidence(
                            self.globaltt['experimental phenotypic evidence'])
                    assoc.add_association_to_graph()
                    # TODO should the G2PAssoc be
                    # the evidence for the GO assoc?

            if not self.testMode and limit is not None \
                    and line_counter > limit:
                break

    uniprot_tot = (uniprot_hit + uniprot_miss)
    uniprot_per = 0.0
    if uniprot_tot != 0:
        uniprot_per = 100.0 * uniprot_hit / uniprot_tot
    # the format used to read '%f.2%%', which printed six decimals
    # followed by a literal '.2%'
    logger.info(
        "Uniprot: %.2f%% of %i benefited from the 1/4 day id mapping download",
        uniprot_per, uniprot_tot)
    return
def _process_nlx_157874_1_view(self, raw, limit=None):
    """
    Ingest the Elements of Morphology table.

    Note that foaf:depiction is the inverse of the foaf:depicts
    relationship.

    Since it is bad form to have two definitions,
    the objective and subjective definitions are joined into one string.

    Turtle:
        <eom id> a owl:Class
            rdf:label Literal(eom label)
            oboInOwl:has_related_synonym Literal(synonym list)
            IAO:definition Literal(objective_def. subjective def)
            foaf:depiction Literal(small_image_url),
                           Literal(large_image_url)
            foaf:page Literal(page_url)
            rdfs:comment Literal(long commented text)

    TEC_note: URL are not literals.

    :param raw: path of the tab-delimited table; the first line is a header
    :param limit: stop once the reader's line number exceeds this value;
        None means no limit
    :return: None
    """
    src_key = 'tables'
    model = Model(self.graph)
    col = self.resources[src_key]['columns']
    with open(raw, 'r') as rawread:
        reader = csv.reader(rawread, delimiter='\t', quotechar='\"')
        header = next(reader)
        # the outcome is deliberately not acted on; the call is kept
        # for whatever it reports about a header mismatch
        self.check_fileheader(col, header)

        for row in reader:
            morphology_term_id = row[col.index('morphology_term_id')].strip()
            morphology_term_label = row[
                col.index('morphology_term_label')].strip()
            morphology_term_url = row[
                col.index('morphology_term_url')].strip()
            objective_definition = row[
                col.index('objective_definition')].strip()
            subjective_definition = row[
                col.index('subjective_definition')].strip()
            comments = row[col.index('comments')].strip()
            synonyms = row[col.index('synonyms')].strip()
            replaces = row[col.index('replaces')].strip()
            small_figure_url = row[col.index('small_figure_url')].strip()
            large_figure_url = row[col.index('large_figure_url')].strip()
            # columns not used here: morphology_term_num,
            # terminology_category_label, terminology_category_url,
            # subcategory, e_uid, v_uid, v_uuid, v_lastmodified,
            # v_status, v_lastmodified_epoch

            # Add morphology term to graph as a class
            # with label, type, and description.
            model.addClassToGraph(morphology_term_id, morphology_term_label)

            # Assemble the description text: each non-empty part gets a
            # trailing period. endswith() is used because the previous
            # single-line regex doubled the period on multi-line text.
            parts = []
            for text in (objective_definition, subjective_definition):
                if text == '':
                    continue
                if not text.endswith('.'):
                    text = text + '.'
                parts.append(text)
            # joining only the non-empty parts avoids a stray leading or
            # trailing space, and a lone ' ' when both are empty
            if parts:
                model.addDefinition(morphology_term_id, ' '.join(parts))

            # <term id> FOAF:depicted_by literal url
            # <url> type foaf:depiction

            # do we want both images?
            # morphology_term_id has depiction small_figure_url
            if small_figure_url != '':
                model.addDepiction(morphology_term_id, small_figure_url)

            # morphology_term_id has depiction large_figure_url
            if large_figure_url != '':
                model.addDepiction(morphology_term_id, large_figure_url)

            # morphology_term_id has comment comments
            if comments != '':
                model.addComment(morphology_term_id, comments)

            # ''.split(';') yields [''], so blank entries are dropped
            # rather than being added as empty synonyms
            for syn in synonyms.split(';'):
                syn = syn.strip()
                if syn != '':
                    model.addSynonym(
                        morphology_term_id, syn,
                        self.globaltt['has_exact_synonym'])

            # morphology_term_id has_related_synonym replaces (; delimited)
            if replaces not in ['', synonyms]:
                for syn in replaces.split(';'):
                    syn = syn.strip()
                    if syn != '':
                        model.addSynonym(
                            morphology_term_id, syn,
                            self.globaltt['has_related_synonym'])

            # <morphology_term_id> <foaf:page> morphology_term_url
            # (the id is always a string here, so no None test is needed)
            reference = Reference(
                self.graph, morphology_term_id, self.globaltt['web page'])

            # TEC 201905:
            # Not so sure we need explicit <eom_uri> <webpage> <eom_url>.
            # since <eom_uri> IS the <eom_url>.
            reference.addPage(morphology_term_id, morphology_term_url)

            if limit is not None and reader.line_num > limit:
                break
def _process_phenotype_tab(self, raw, limit):
    """
    Ingest disease-to-phenotype annotations from phenotype_annotation.tab.

    see info on format here:
    http://www.human-phenotype-ontology.org/contao/index.php/annotation-guide.html

    Each row becomes a D2PAssoc (aspects O and M) or a DispositionAssoc
    (aspects I and C) with its evidence and publications. Rows with any
    other aspect, or with an unexpected number of columns, are logged
    and skipped.

    :param raw: path of the tab-delimited annotation file
    :param limit: stop once more than this many lines have been read;
        None means no limit. Ignored in test mode.
    :return: None
    """
    g = self.testgraph if self.testMode else self.graph
    model = Model(g)
    line_counter = 0
    with open(raw, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            row = [str(cell).strip() for cell in row]

            # Note from Seb in Dec 2017, a 15th column was added
            # inadvertently and will be removed in the winter 2018
            # release of hpo data. Accept both layouts instead of
            # failing to unpack once the extra column disappears.
            if len(row) == 14:
                row.append('')
            if len(row) != 15:
                logger.error(
                    "Expected 14 or 15 columns but found %d on line %d",
                    len(row), line_counter)
                continue
            (db, num, name, qual, pheno_id, publist, eco, onset, freq, w,
             asp, syn, date, curator, extra) = row
            disease_id = db + ":" + num

            if self.testMode:
                id_list = getattr(self, 'test_ids', None)
                if id_list is None or disease_id not in id_list:
                    continue

            model.addClassToGraph(disease_id, None)
            model.addClassToGraph(pheno_id, None)
            eco_id = self._map_evidence_to_codes(eco)
            model.addClassToGraph(eco_id, None)
            if onset is not None and onset != '':
                model.addClassToGraph(onset, None)

            # we want to do things differently depending on
            # the aspect of the annotation
            if asp in ('O', 'M'):  # organ abnormality or mortality
                assoc = D2PAssoc(
                    g, self.name, disease_id, pheno_id, onset, freq)
            elif asp in ('I', 'C'):
                # inheritance patterns for the whole disease,
                # or clinical course / onset
                assoc = DispositionAssoc(
                    g, self.name, disease_id, pheno_id)
            else:
                logger.error("I don't know what this aspect is: %s", asp)
                # No association was built for this row. Falling through
                # would raise NameError on the first row, or silently
                # mutate the previous row's association.
                continue

            assoc.add_evidence(eco_id)

            # blow these apart if there is a list of pubs
            publist = re.split(r'[,;]', publist)
            for pub in publist:
                pub = pub.strip()
                if pub == '':
                    continue

                if re.match(r'(PMID|ISBN-13|ISBN-10|ISBN|HPO)', pub):
                    if re.match(r'PMID', pub):
                        pubtype = Reference.ref_types['journal_article']
                    elif re.match(r'HPO', pub):
                        pubtype = Reference.ref_types['person']
                    else:
                        pubtype = Reference.ref_types['publication']
                    reference = Reference(g, pub, pubtype)
                    reference.addRefToGraph()
                elif re.match(r'(OMIM|Orphanet|DECIPHER)', pub):
                    # make the pubs a reference to the website,
                    # instead of the curie
                    if re.match(r'OMIM', pub):
                        omimnum = re.sub(r'OMIM:', '', pub)
                        pub = '/'.join(
                            ('http://omim.org/entry', str(omimnum).strip()))
                    elif re.match(r'Orphanet:', pub):
                        orphanetnum = re.sub(r'Orphanet:', '', pub)
                        pub = ''.join((
                            'http://www.orpha.net/consor/cgi-bin/OC_Exp.php?lng=en&Expert=',
                            str(orphanetnum)))
                    elif re.match(r'DECIPHER:', pub):
                        deciphernum = re.sub(r'DECIPHER:', '', pub)
                        pub = '/'.join(
                            ('https://decipher.sanger.ac.uk/syndrome',
                             deciphernum))
                    # NOTE(review): web pages are only attached as a
                    # source below; no Reference is added for them.
                elif re.match(r'http', pub):
                    pass
                else:
                    # logged rather than printed: this is library code
                    logger.error(
                        'Unknown pub type for %s: %s', disease_id, pub)
                    logger.error('%s pubs: %s', disease_id, str(publist))
                    continue

                assoc.add_source(pub)

                # TODO add curator

            assoc.add_association_to_graph()

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

    return
def make_association(self, record):
    """
    Construct the gene-to-phenotype association for one record.

    The record is enriched in place with two keys:

    * ``experiment_type`` - list of ``{'id', 'term'}`` dicts, one per
      comma separated experiment type (text from the first ``(`` on is
      dropped), with ids looked up in ``self.apo_term_id``
    * ``pheno_obj`` - the phenotype split into entity and, when written
      as ``"entity: quality"``, quality

    The association, labels, the subclass triple to ``UPHENO:0001001``,
    references and evidence are then written to ``self.graph``.

    :param record: dict with at least the keys ``Experiment Type``,
        ``Phenotype``, ``SGDID``, ``Gene Name`` and ``Reference``
    :return: None
    :raises KeyError: if an experiment type or phenotype term is not in
        ``self.apo_term_id``
    """
    # prep record
    # remove description and map Experiment Type to apo term
    experiment_terms = record['Experiment Type'].split('(')[0].split(',')
    record['experiment_type'] = []
    for exp_type in experiment_terms:
        exp_type = exp_type.strip()
        record['experiment_type'].append({
            'id': self.apo_term_id[exp_type],
            'term': exp_type,
        })

    phenotype = record['Phenotype']
    pheno_obj = {
        'entity': {
            'term': None,
            'apo_id': None
        },
        'quality': {
            'term': None,
            'apo_id': None
        },
        # False = phenotype was descriptive,
        # don't bother looking for a quality
        'has_quality': False
    }

    # Split on the same separator that is tested for: a bare ':' without
    # a following space yields a single part and must not be indexed.
    ent_qual = phenotype.split(': ')
    if len(ent_qual) > 1:
        entity = ent_qual[0]
        quality = ent_qual[1]
        pheno_obj['has_quality'] = True
        pheno_obj['entity']['term'] = entity
        pheno_obj['entity']['apo_id'] = self.apo_term_id[entity]
        pheno_obj['quality']['term'] = quality
        pheno_obj['quality']['apo_id'] = self.apo_term_id[quality]
    else:
        pheno_obj['entity']['term'] = phenotype
        pheno_obj['entity']['apo_id'] = self.apo_term_id[phenotype]
    record['pheno_obj'] = pheno_obj

    # begin modeling
    model = Model(self.graph)

    # define the triple
    gene = 'SGD:{}'.format(record['SGDID'])
    relation = Model.object_properties['has_phenotype']  # has phenotype

    if pheno_obj['has_quality']:
        pheno_label = '{0}:{1}'.format(
            pheno_obj['entity']['term'],
            pheno_obj['quality']['term'])
        pheno_id = 'MONARCH:{0}{1}'.format(
            pheno_obj['entity']['apo_id'].replace(':', '_'),
            pheno_obj['quality']['apo_id'].replace(':', '_'))
    else:
        pheno_label = pheno_obj['entity']['term']
        pheno_id = pheno_obj['entity']['apo_id']

    g2p_assoc = Assoc(
        self.graph, self.name, sub=gene, obj=pheno_id, pred=relation)

    assoc_id = g2p_assoc.make_association_id(
        definedby='yeastgenome.org', subject=gene, predicate=relation,
        object=pheno_id)
    g2p_assoc.set_association_id(assoc_id=assoc_id)

    # add to graph to mint assoc id
    g2p_assoc.add_association_to_graph()

    model.addLabel(subject_id=gene, label=record['Gene Name'])

    # add the association triple
    model.addTriple(subject_id=gene, predicate_id=relation, obj=pheno_id)

    # make pheno subclass of UPHENO:0001001
    model.addTriple(
        subject_id=pheno_id,
        predicate_id=Model.object_properties['subclass_of'],
        obj='UPHENO:0001001')

    # label nodes
    # pheno label
    model.addLabel(subject_id=pheno_id, label=pheno_label)

    g2p_assoc.description = self._make_description(record)

    # add the references; str.split always returns at least one element,
    # so drop empty entries instead of relying on the list length
    references = [
        ref for ref in record['Reference'].replace(' ', '').split('|')
        if ref]

    if references:
        # make first ref in list the source
        primary_ref = references[0]
        g2p_assoc.add_source(identifier=primary_ref)
        ref_model = Reference(
            self.graph, primary_ref, Reference.ref_types['publication'])
        ref_model.addRefToGraph()

        # create equivalent source for any other refs in list
        for ref in references[1:]:
            model.addSameIndividual(sub=primary_ref, obj=ref)

    # add experiment type as evidence
    for exp_type in record['experiment_type']:
        g2p_assoc.add_evidence(exp_type['id'])
        model.addLabel(subject_id=exp_type['id'], label=exp_type['term'])

    # second write picks up description, sources and evidence;
    # kept best-effort as before
    try:
        g2p_assoc.add_association_to_graph()
    except Exception as e:
        print(e)
    return
def _get_gene2pubmed(self, limit):
    """
    Loops through the gene2pubmed file and adds a simple triple to say
    that a given publication is_about a gene.
    Publications are added as NamedIndividuals.

    Outside test mode rows are kept only when their taxon is in
    ``self.tax_ids``; in test mode only when the integer gene number is
    in ``self.gene_ids``.  Rows whose gene or PubMed column is ``-`` and
    blank or ``#`` comment lines are skipped.

    :param limit: outside test mode, stop once more than ``limit`` data
        lines have been read; ``None`` means no limit
    :return: None
    """
    src_key = 'gene2pubmed'
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)
    LOG.info("Processing Gene records")
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files[src_key]['file']))
    LOG.info("FILE: %s", myfile)
    assoc_counter = 0
    col = self.files[src_key]['columns']

    # column positions do not change per row, look them up once
    tax_idx = col.index('tax_id')
    gene_idx = col.index('GeneID')
    pubmed_idx = col.index('PubMed_ID')

    with gzip.open(myfile, 'rb') as tsv:
        row = tsv.readline().decode().strip().split('\t')
        row[0] = row[0][1:]  # strip comment
        # the outcome of the header check is not acted upon here
        self.check_fileheader(col, row)

        for line in tsv:
            line_counter += 1
            row = line.decode().strip().split('\t')

            # skip blank lines and comments
            # (indexing row[0][0] fails on an empty first field)
            if not row[0] or row[0].startswith('#'):
                continue

            tax_num = row[tax_idx].strip()
            gene_num = row[gene_idx].strip()
            pubmed_num = row[pubmed_idx].strip()

            # placeholders first, so int() below never sees '-'
            if gene_num == '-' or pubmed_num == '-':
                continue

            if self.test_mode and int(gene_num) not in self.gene_ids:
                continue

            if not self.test_mode and tax_num not in self.tax_ids:
                continue

            gene_id = ':'.join(('NCBIGene', gene_num))
            pubmed_id = ':'.join(('PMID', pubmed_num))

            if self.class_or_indiv.get(gene_id) == 'C':
                model.addClassToGraph(gene_id, None)
            else:
                model.addIndividualToGraph(gene_id, None)

            # add the publication as a NamedIndividual
            # add type publication
            model.addIndividualToGraph(pubmed_id, None, None)
            reference = Reference(
                graph, pubmed_id, self.globaltt['journal article'])
            reference.addRefToGraph()
            graph.addTriple(pubmed_id, self.globaltt['is_about'], gene_id)
            assoc_counter += 1

            if not self.test_mode and \
                    limit is not None and line_counter > limit:
                break

    LOG.info("Processed %d pub-gene associations", assoc_counter)
def _get_var_citations(self, limit):
    """
    Link ClinVar variants to the publications that cite them.

    The input is generated weekly, the first of the week: a
    tab-delimited report of citations associated with data in ClinVar,
    connected to the AlleleID, the VariationID, and either rs# from
    dbSNP or nsv in dbVar.

    Columns:
        AlleleID        int value (xpath //Measure/@ID )
        VariationID     ID ClinVar uses to anchor default display.
                        (xpath //MeasureSet/@ID)
        rs              rs identifier from dbSNP
        nsv             nsv identifier from dbVar
        citation_source The source of the citation, either PubMed,
                        PubMedCentral, or the NCBI Bookshelf
        citation_id     The identifier used by that source

    Only ``PubMed`` and ``PubMedCentral`` citations produce a reference
    and an ``is_about`` triple; other sources are ignored.

    :param limit: outside test mode, stop once more than ``limit`` data
        rows have been read; ``None`` means no limit
    :return: None
    """
    logger.info("Processing Citations for variants")
    line_counter = 0
    myfile = \
        '/'.join((self.rawdir, self.files['variant_citations']['file']))
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    model = Model(g)

    with open(myfile, 'r', encoding="utf8") as f:
        filereader = csv.reader(f, delimiter='\t', quotechar='\"')

        for line in filereader:
            # csv yields an empty list for a blank line; skip those
            # together with comments
            if not line or line[0].startswith('#'):
                continue

            (_allele_num, variant_num, _rs_num, _nsv_num,
             citation_source, citation_id) = line

            line_counter += 1

            if self.testMode and int(variant_num) not in self.variant_ids:
                continue

            if citation_id.strip() == '':
                logger.info(
                    "Skipping blank citation for ClinVarVariant:%s",
                    str(variant_num))
                continue

            # the citation for a variant is made to some kind of
            # combination of the ids here.
            # but i'm not sure which, we don't know what the
            # citation is for exactly, other than the variant.
            # so use mentions
            var_id = 'ClinVarVariant:' + variant_num

            # citation source: PubMed | PubMedCentral | citation_source
            # format the citation id:
            ref_id = None
            if citation_source == 'PubMed':
                ref_id = 'PMID:' + str(citation_id.replace(" ", ""))
                model.makeLeader(ref_id)
            elif citation_source == 'PubMedCentral':
                ref_id = 'PMCID:' + str(citation_id)

            if ref_id is not None:
                # write the reference to the same graph as the triple
                # (the test graph when in test mode)
                r = Reference(
                    g, ref_id, Reference.ref_types['journal_article'])
                r.addRefToGraph()
                g.addTriple(ref_id, self.properties['is_about'], var_id)

            if not self.testMode \
                    and (limit is not None and line_counter > limit):
                break

    logger.info("Finished processing citations for variants")
    return
def _process_qtls_genetic_location(
        self, raw, src_key, txid, common_name, limit=None):
    """
    Process one species' QTL file with genetic (cM) locations.

    For every usable row this adds:

    * the QTL as a feature with its taxon and a fuzzy start/end position
      on the chromosome of the species' linkage ("genetic map") build
    * a dbSNP individual and xref when the peak marker starts with ``rs``
    * the gene and its affected-locus links when the gene id resolves to
      an NCBI gene
    * the trait class, the publication reference and a QTL-to-trait
      association (plus the same association for the dbSNP id, if any)

    :param raw: path of the tab-delimited file to read (no header)
    :param src_key: key into ``self.files`` supplying ``curie`` and
        ``columns`` for this file
    :param txid: NCBI taxon number, appended to ``NCBITaxon:``
    :param common_name: species common name, used as id/label prefix
    :param limit: outside test mode, stop once the reader's line number
        exceeds ``limit``; ``None`` means no limit
    :return: None
    """

    def _parse_score(text):
        """Return the P-value text as a float, or None if not a number."""
        # drop the '<' qualifier and accept a decimal comma
        # (international notation)
        cleaned = text.replace('<', '').replace(',', '.')
        try:
            # str.isnumeric() rejects '0.05', so let float() decide
            return float(cleaned)
        except ValueError:
            return None

    aql_curie = self.files[src_key]['curie']
    common_name = common_name.strip()
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    geno = Genotype(graph)
    model = Model(graph)
    eco_id = self.globaltt['quantitative trait analysis evidence']
    taxon_curie = 'NCBITaxon:' + txid

    LOG.info("Processing genetic location for %s from %s", taxon_curie, raw)
    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        # no header in these files, so no header checking
        col = self.files[src_key]['columns']
        col_len = len(col)

        # positions of the columns actually used; the remaining columns
        # of the file are not read
        at = {
            name: col.index(name) for name in (
                'QTL_ID', 'QTL_symbol', 'Trait_name', 'Chromosome',
                'Position_cm', 'range_cm', 'Peak_Mark', 'P_values',
                'TRAIT_ID', 'PUBMED_ID', 'geneID', 'geneIDsrc')}

        for row in reader:
            if len(row) != col_len and ''.join(row[col_len:]) != '':
                LOG.warning(
                    "Problem parsing %s line %i containing: \n%s\n"
                    "got %i cols but expected %i",
                    raw, reader.line_num, row, len(row), col_len)
                continue

            qtl_id = row[at['QTL_ID']].strip()
            qtl_symbol = row[at['QTL_symbol']].strip()
            trait_name = row[at['Trait_name']].strip()
            chromosome = row[at['Chromosome']].strip()
            position_cm = row[at['Position_cm']].strip()
            range_cm = row[at['range_cm']].strip()
            peak_mark = row[at['Peak_Mark']].strip()
            p_values = row[at['P_values']].strip()
            trait_id = row[at['TRAIT_ID']].strip()
            pubmed_id = row[at['PUBMED_ID']].strip()
            gene_id = row[at['geneID']].strip()
            gene_id_src = row[at['geneIDsrc']].strip()

            if self.test_mode and int(qtl_id) not in self.test_ids:
                continue

            qtl_id = common_name + 'QTL:' + qtl_id
            trait_id = ':'.join((aql_curie, trait_id))

            # Add QTL to graph
            feature = Feature(graph, qtl_id, qtl_symbol, self.globaltt['QTL'])
            feature.addTaxonToFeature(taxon_curie)

            # deal with the chromosome
            chrom_id = makeChromID(chromosome, taxon_curie, 'CHR')

            # add a version of the chromosome which is defined as
            # the genetic map
            build_id = 'MONARCH:' + common_name + '-linkage'
            build_label = common_name + ' genetic map'
            geno.addReferenceGenome(build_id, build_label, taxon_curie)
            chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH')
            geno.addChromosomeInstance(
                chromosome, build_id, build_label, chrom_id)

            start = stop = None
            # range_cm sometimes ends in "(Mb)"  (i.e pig 2016 Nov)
            range_cm = range_cm.split('(')[0]

            if re.search(r'[0-9].*-.*[0-9]', range_cm):
                range_parts = range_cm.split('-')
                # check for poorly formed ranges
                if len(range_parts) == 2 and \
                        range_parts[0] != '' and range_parts[1] != '':
                    (start, stop) = [
                        int(float(x.strip())) for x in range_parts]
                else:
                    LOG.info(
                        "A cM range we can't handle for QTL %s: %s",
                        qtl_id, range_cm)
            elif position_cm != '':
                match = re.match(r'([0-9]*\.[0-9]*)', position_cm)
                if match is not None:
                    position_cm = match.group()
                    start = stop = int(float(position_cm))

            # FIXME remove conversion to int for start/stop
            # when schema can handle floats
            # add in the genetic location based on the range
            feature.addFeatureStartLocation(
                start, chrom_in_build_id, None,
                [self.globaltt['FuzzyPosition']])
            feature.addFeatureEndLocation(
                stop, chrom_in_build_id, None,
                [self.globaltt['FuzzyPosition']])
            feature.addFeatureToGraph()

            # sometimes there's a peak marker, like a rsid.
            # we want to add that as a variant of the gene,
            # and xref it to the qtl.
            dbsnp_id = None
            if peak_mark.startswith('rs'):
                dbsnp_id = 'dbSNP:' + peak_mark
                model.addIndividualToGraph(
                    dbsnp_id, None, self.globaltt['sequence_alteration'])
                model.addXref(
                    qtl_id, dbsnp_id,
                    xref_category=blv.terms['SequenceVariant'])

            gene_id = gene_id.replace('uncharacterized ', '').strip()
            gene_id = gene_id.strip(',')  # for "100157483," in pig_QTLdata.txt
            if gene_id != '' and gene_id != '.' and ' ' not in gene_id:
                # we assume if no src is provided and gene_id is an integer,
                # then it is an NCBI gene ... (okay, lets crank that back a notch)
                if gene_id_src == '':
                    if gene_id.isdigit():
                        if gene_id not in self.gene_info:
                            LOG.warning(
                                'Cold & Prickely saying %s is a NCBI gene for %s',
                                gene_id, common_name)
                        gene_id_src = 'NCBIgene'
                    else:
                        LOG.error(
                            ' "%s" is a NOT NCBI gene for %s',
                            gene_id, common_name)
                        gene_id_src = None

                if gene_id_src == 'NCBIgene':
                    gene_id = 'NCBIGene:' + gene_id
                    # we will expect that these will get labels elsewhere
                    geno.addGene(gene_id, None)
                    # FIXME what is the right relationship here?
                    geno.addAffectedLocus(qtl_id, gene_id)

                    if dbsnp_id is not None:
                        # add the rsid as a seq alt of the gene_id as a bnode
                        vl_id = self.make_id(
                            re.sub(r':', '', gene_id) + '-' + peak_mark, '_')
                        geno.addSequenceAlterationToVariantLocus(
                            dbsnp_id, vl_id)
                        geno.addAffectedLocus(vl_id, gene_id)

            # add the trait
            model.addClassToGraph(
                trait_id, trait_name,
                class_category=blv.terms['PhenotypicFeature'])

            # Add publication; pub_id stays None when the row has none,
            # so a previous row's id is never reused
            pub_id = None
            reference = None
            if pubmed_id.startswith('ISU'):
                pub_id = 'AQTLPub:' + pubmed_id
                reference = Reference(graph, pub_id)
            elif pubmed_id != '':
                pub_id = 'PMID:' + pubmed_id
                reference = Reference(
                    graph, pub_id, self.globaltt['journal article'])

            if reference is not None:
                reference.addRefToGraph()

            score = _parse_score(p_values) if p_values != '' else None

            # make the association to the QTL and,
            # if found, the same association to the dbsnp_id
            for marker_id in (qtl_id, dbsnp_id):
                if marker_id is None:
                    continue
                assoc = G2PAssoc(
                    graph, self.name, marker_id, trait_id,
                    self.globaltt['is marker for'])
                assoc.add_evidence(eco_id)
                if pub_id is not None:
                    assoc.add_source(pub_id)

                # TODO add exp_id as evidence
                if score is not None:
                    assoc.set_score(score)  # todo add score type
                # TODO add LOD score?
                assoc.add_association_to_graph()

            # off by one - the following actually gives us (limit + 1) records
            if not self.test_mode and limit is not None \
                    and reader.line_num > limit:
                break

    LOG.info("Done with QTL genetic info")
def _process_qtls_genomic_location(
        self, raw, src_key, txid, build_id, build_label, common_name,
        limit=None):
    """
    Process one species' gzipped GFF file of QTL genomic (bp) locations.

    For every usable row this adds the QTL individual and its taxon, the
    publication reference (when ``PUBMED_ID`` is among the attributes),
    a QTL-to-trait association carrying evidence, source and P-value
    score, and the QTL feature with fuzzy start/end positions on the
    chromosome of the given build.

    :param raw: path of the gzipped, tab-delimited GFF file
    :param src_key: key into ``self.files`` supplying ``columns``
    :param txid: NCBI taxon number, appended to ``NCBITaxon:``
    :param build_id: identifier of the genome build
    :param build_label: label of the genome build
    :param common_name: species common name, used as QTL id prefix
    :param limit: outside test mode, stop once the reader's line number
        exceeds ``limit``; ``None`` means no limit
    :return: None
    """
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)
    geno = Genotype(graph)
    # assume that chrs get added to the genome elsewhere

    taxon_curie = 'NCBITaxon:' + txid
    eco_id = self.globaltt['quantitative trait analysis evidence']
    LOG.info("Processing QTL locations for %s from %s", taxon_curie, raw)
    with gzip.open(raw, 'rt', encoding='ISO-8859-1') as tsvfile:
        reader = csv.reader(tsvfile, delimiter="\t")
        # no header in GFF, so no header checking
        col = self.files[src_key]['columns']
        col_len = len(col)

        # positions of the columns actually used
        seqname_idx = col.index('SEQNAME')
        start_idx = col.index('START')
        end_idx = col.index('END')
        strand_idx = col.index('STRAND')
        attr_idx = col.index('ATTRIBUTE')

        for row in reader:
            # csv yields an empty list for a blank line; skip those
            # together with comment rows
            if not row or row[0].startswith('#'):
                continue

            if len(row) != col_len and ''.join(row[col_len:]) != '':
                LOG.warning(
                    "Problem parsing in %s row %s\n"
                    "got %s cols but expected %s",
                    raw, reader.line_num, len(row), col_len)
                LOG.info(row)
                continue

            chromosome = row[seqname_idx].strip()
            start_bp = row[start_idx].strip()
            stop_bp = row[end_idx].strip()
            strand = row[strand_idx].strip()
            attr = row[attr_idx].strip()

            # Example row:
            # Chr.Z   Animal QTLdb    Production_QTL  33954873  34023581...
            # QTL_ID=2242;Name="Spleen percentage";Abbrev="SPLP";
            # PUBMED_ID=17012160;trait_ID=2234;trait="Spleen percentage";
            # breed="leghorn";"FlankMarkers=ADL0022";VTO_name="spleen mass";
            # MO_name="spleen weight to body weight ratio";
            # Map_Type="Linkage";Model="Mendelian";
            # Test_Base="Chromosome-wise";Significance="Significant";
            # P-value="<0.05";F-Stat="5.52";Variance="2.94";
            # Dominance_Effect="-0.002";Additive_Effect="0.01

            # make dictionary of attributes
            # keys are:
            # QTL_ID,Name,Abbrev,PUBMED_ID,trait_ID,trait,FlankMarkers,
            # VTO_name,Map_Type,Significance,P-value,Model,
            # Test_Base,Variance, Bayes-value,PTO_name,gene_IDsrc,peak_cM,
            # CMO_name,gene_ID,F-Stat,LOD-score,Additive_Effect,
            # Dominance_Effect,Likelihood_Ratio,LS-means,Breed,
            # trait (duplicate with Name)

            # deal with poorly formed attributes
            if re.search(r'"FlankMarkers";', attr):
                attr = re.sub(r'FlankMarkers;', '', attr)

            # Keep only key=value items, in file order, splitting on the
            # first '=' so that values may themselves contain '='.
            attribute_dict = {}
            for item in attr.replace('"', '').split(';'):
                if '=' not in item:
                    continue
                key, value = item.split('=', 1)
                attribute_dict[key] = value

            qtl_num = attribute_dict.get('QTL_ID')
            if self.test_mode and int(qtl_num) not in self.test_ids:
                continue

            # make association between QTL and trait based on taxon
            qtl_id = common_name + 'QTL:' + str(qtl_num)
            model.addIndividualToGraph(qtl_id, None, self.globaltt['QTL'])
            geno.addTaxon(taxon_curie, qtl_id)

            trait_id = 'AQTLTrait:' + attribute_dict.get('trait_ID')

            # if pub is in attributes, add it to the association
            pub_id = None
            if 'PUBMED_ID' in attribute_dict:
                pub_id = attribute_dict['PUBMED_ID']
                if pub_id.startswith('ISU'):
                    pub_id = 'AQTLPub:' + pub_id.strip()
                    reference = Reference(graph, pub_id)
                else:
                    pub_id = 'PMID:' + pub_id.strip()
                    reference = Reference(
                        graph, pub_id, self.globaltt['journal article'])
                reference.addRefToGraph()

            # Add QTL to graph
            assoc = G2PAssoc(
                graph, self.name, qtl_id, trait_id,
                self.globaltt['is marker for'])
            assoc.add_evidence(eco_id)
            if pub_id is not None:
                assoc.add_source(pub_id)
            if 'P-value' in attribute_dict:
                # drop the '<' qualifier and accept a decimal comma
                scr = attribute_dict['P-value']
                scr = scr.replace('<', '').replace(',', '.')
                try:
                    # str.isnumeric() rejects '0.05', so let float() decide
                    assoc.set_score(float(scr))
                except ValueError:
                    pass  # not a number, leave the score unset

            assoc.add_association_to_graph()
            # TODO make association to breed
            # (which means making QTL feature in Breed background)

            # get location of QTL
            chromosome = re.sub(r'Chr\.', '', chromosome)
            chrom_id = makeChromID(chromosome, taxon_curie, 'CHR')

            chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH')
            geno.addChromosomeInstance(
                chromosome, build_id, build_label, chrom_id)
            qtl_feature = Feature(graph, qtl_id, None, self.globaltt['QTL'])
            if start_bp == '':
                start_bp = None
            qtl_feature.addFeatureStartLocation(
                start_bp, chrom_in_build_id, strand,
                [self.globaltt['FuzzyPosition']])
            if stop_bp == '':
                stop_bp = None
            qtl_feature.addFeatureEndLocation(
                stop_bp, chrom_in_build_id, strand,
                [self.globaltt['FuzzyPosition']])
            qtl_feature.addTaxonToFeature(taxon_curie)
            qtl_feature.addFeatureToGraph()

            if not self.test_mode and limit is not None \
                    and reader.line_num > limit:
                break

    LOG.info("Done with QTL genomic mappings for %s", taxon_curie)
def _get_gene2pubmed(self, limit):
    """
    Loops through the gene2pubmed file and adds a simple triple to say
    that a given publication is_about a gene.
    Publications are added as NamedIndividuals.

    Each data line holds three tab-separated values: taxon number, gene
    number and PubMed number.  Outside test mode rows are kept only when
    the integer taxon is in ``self.tax_ids``; in test mode only when the
    integer gene number is in ``self.gene_ids``.  Blank lines, ``#``
    comment lines and rows whose gene or PubMed value is ``-`` are
    skipped.

    :param limit: outside test mode, stop once more than ``limit`` rows
        have been accepted; ``None`` means no limit
    :return: None
    """
    if self.testMode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)
    logger.info("Processing Gene records")
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files['gene2pubmed']['file']))
    logger.info("FILE: %s", myfile)
    assoc_counter = 0
    with gzip.open(myfile, 'rb') as f:
        for line in f:
            line = line.decode().strip()

            # skip blank lines (they cannot be unpacked) and comments
            if not line or line.startswith('#'):
                continue

            (tax_num, gene_num, pubmed_num) = line.split('\t')

            # placeholders first, so int(gene_num) never sees '-'
            if gene_num == '-' or pubmed_num == '-':
                continue

            if self.testMode and int(gene_num) not in self.gene_ids:
                continue

            if not self.testMode and int(tax_num) not in self.tax_ids:
                continue

            line_counter += 1
            gene_id = ':'.join(('NCBIGene', gene_num))
            pubmed_id = ':'.join(('PMID', pubmed_num))

            if self.class_or_indiv.get(gene_id) == 'C':
                model.addClassToGraph(gene_id, None)
            else:
                model.addIndividualToGraph(gene_id, None)

            # add the publication as a NamedIndividual
            # add type publication
            model.addIndividualToGraph(pubmed_id, None, None)
            reference = Reference(
                graph, pubmed_id, self.globaltt['journal article'])
            reference.addRefToGraph()
            graph.addTriple(pubmed_id, self.globaltt['is_about'], gene_id)
            assoc_counter += 1

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

    logger.info("Processed %d pub-gene associations", assoc_counter)

    return