def _get_mappedids(self, entry, g):
    """
    Add the Orphanet and UMLS ids of an OMIM entry as cross references.

    Reads ``entry['externalLinks']`` when present.  Every Orphanet disease
    and every UMLS id listed there is added to the graph as a class and
    linked to the OMIM id through ``Model.addXref``.

    :param entry: dict describing one OMIM entry; must hold ``mimNumber``
    :param g: graph handed to ``Model``
    :return: None
    :raises ValueError: when an Orphanet item does not split into exactly
        three ``;;``-delimited parts
    """
    model = Model(g)
    omimid = 'OMIM:' + str(entry['mimNumber'])

    if 'externalLinks' not in entry:
        return
    links = entry['externalLinks']

    if 'orphanetDiseases' in links:
        # triple semi-colon delimited list of
        # double semi-colon delimited orphanet ID/disease pairs
        # 2970;;566;;Prune belly syndrome
        for item in links['orphanetDiseases'].split(';;;'):
            # the middle (internal) number is not used
            (orpha_num, _internal_num, orpha_label) = item.split(';;')
            orpha_id = 'Orphanet:' + orpha_num.strip()
            model.addClassToGraph(orpha_id, orpha_label.strip())
            model.addXref(omimid, orpha_id)

    if 'umlsIDs' in links:
        for umls_num in links['umlsIDs'].split(','):
            umls_id = 'UMLS:' + umls_num
            model.addClassToGraph(umls_id, None)
            model.addXref(omimid, umls_id)

    return
def _get_phenotypicseries_parents(entry, g):
    """
    Add the phenotypic-series parents of an OMIM entry as superclasses.

    Series numbers are collected from ``entry['phenotypeMapList']`` and
    ``entry['geneMap']['phenotypeMapList']``, but only when
    ``entry['phenotypicSeriesExists']`` is ``True``.  Each series is added
    to the graph as a class and the entry is made a subclass of it.

    :param entry: dict describing one OMIM entry; must hold ``mimNumber``
    :param g: graph handed to ``Model``
    :return: None
    """
    model = Model(g)
    omimid = 'OMIM:' + str(entry['mimNumber'])

    # the phenotypic series mappings
    serieslist = []
    if entry.get('phenotypicSeriesExists') is True:
        phenolists = []
        if 'phenotypeMapList' in entry:
            phenolists.append(entry['phenotypeMapList'])
        if 'geneMap' in entry and 'phenotypeMapList' in entry['geneMap']:
            phenolists.append(entry['geneMap']['phenotypeMapList'])

        for phenolist in phenolists:
            for pheno in phenolist:
                phenotype_map = pheno['phenotypeMap']
                # a phenotype map may lack a series number; skip it in both
                # lists rather than raising KeyError for only one of them
                if 'phenotypicSeriesNumber' in phenotype_map:
                    serieslist.append(
                        phenotype_map['phenotypicSeriesNumber'])

    # add this entry as a subclass of the series entry
    for ser in serieslist:
        series_id = 'OMIM:' + ser
        model.addClassToGraph(series_id, None)
        model.addSubClass(omimid, series_id)

    return
def _map_eom_terms(self, raw, limit=None):
    """
    Load the HP id mappings of the morphology terms from a local tsv file.

    Triples:
        <eom id> owl:equivalentClass <hp id>

    :param raw: path of the tab-separated mapping file; its first row is
        treated as a header and ignored
    :param limit: when not None, reading stops once more than ``limit``
        data rows have been handled
    :return: None
    """
    model = Model(self.graph)

    with open(raw, 'r') as mapping_file:
        mapping_file.readline()  # header row, not used
        for rows_read, line in enumerate(mapping_file, start=1):
            # exactly five columns are expected per row
            (morphology_term_id, morphology_term_label,
             hp_id, hp_label, notes) = line.split('\t')

            # the file writes HP ids with an underscore; make them curies
            hp_id = hp_id.replace('_', ':')

            if 'HP:' in hp_id:
                # the HP term becomes a class, equivalent to the EOM term
                model.addClassToGraph(hp_id, None)
                model.addEquivalentClass(morphology_term_id, hp_id)
            else:
                LOG.warning('No matching HP term for %s',
                            morphology_term_label)

            if limit is not None and rows_read > limit:
                break

    return
def _add_variant_trait_association(self, variant_id, mapped_trait_uri,
                                   efo_ontology, pubmed_id,
                                   description=None):
    """
    Associate a variant with every trait it was mapped to.

    ``mapped_trait_uri`` may hold several comma-separated trait URIs.  For
    each of them a ``G2PAssoc`` (relation ``contributes_to``, evidence
    ECO:0000213, source the given PubMed id) is added to the graph.  When
    the trait curie starts with ``EFO`` and the ontology query returns at
    least one label for it, the trait is also added as a class with that
    label and ``'UPHENO:0001001'`` as third argument.

    :param variant_id: id of the variant (subject of the association)
    :param mapped_trait_uri: comma-separated trait URIs; blank means none
    :param efo_ontology: object offering ``query(sparql_text)``
        (presumably an rdflib graph of EFO -- confirm with the caller)
    :param pubmed_id: PubMed number without the ``PMID:`` prefix
    :param description: optional text set on every association
    :return: None
    """
    g = self.testgraph if self.testMode else self.graph
    model = Model(g)

    # make associations to the EFO terms; there can be >1
    if mapped_trait_uri.strip() == '':
        return

    pubmed_curie = 'PMID:' + pubmed_id

    for trait in mapped_trait_uri.split(','):
        trait = trait.strip()
        trait_curie = trait.replace(
            "http://www.ebi.ac.uk/efo/EFO_", "EFO:")

        phenotype_query = """
            SELECT ?trait
            WHERE {{
                <{0}> rdfs:subClassOf+ <http://www.ebi.ac.uk/efo/EFO_0000651> .
                <{0}> rdfs:label ?trait .
            }}
        """.format(trait)

        # materialise the result once: it is needed both for the emptiness
        # test and for the label of the first row
        label_rows = list(efo_ontology.query(phenotype_query))
        if label_rows and re.match(r'^EFO', trait_curie):
            model.addClassToGraph(
                trait_curie, label_rows[0][0], 'UPHENO:0001001')

        ref = Reference(
            g, pubmed_curie, Reference.ref_types['journal_article'])
        ref.addRefToGraph()

        assoc = G2PAssoc(
            g, self.name, variant_id, trait_curie,
            model.object_properties['contributes_to'])
        assoc.add_source(pubmed_curie)

        # combinatorial evidence
        # used in automatic assertion
        eco_id = 'ECO:0000213'
        assoc.add_evidence(eco_id)

        if description is not None:
            assoc.set_description(description)

        # FIXME score should get added to provenance/study
        # assoc.set_score(pvalue)

        # trait_curie is always a str here, so the association is
        # added unconditionally
        assoc.add_association_to_graph()
def process_gene_ids(self, limit):
    """
    Add the genes listed in the gzipped ``gene_ids`` csv file to the graph.

    Each row must hold six columns, e.g.
    ``6239,WBGene00000001,aap-1,Y110A7A.10,Live,protein_coding_gene``.
    Every gene becomes a class (labelled with its symbol, or with the
    synonym when the symbol is blank), is tied to its taxon, is marked
    deprecated when its status is ``Dead`` and receives its synonym.

    :param limit: outside test mode, stop once more than ``limit`` rows
        were read (None means no limit)
    :return: None
    """
    gene_file = self.files['gene_ids']['file']
    raw = '/'.join((self.rawdir, gene_file))

    g = self.testgraph if self.testMode else self.graph
    model = Model(g)

    logger.info("Processing: %s", gene_file)

    geno = Genotype(g)
    with gzip.open(raw, 'rb') as gzfile:
        reader = csv.reader(
            io.TextIOWrapper(gzfile, newline=""),
            delimiter=',', quotechar='\"')

        for rows_read, row in enumerate(reader, start=1):
            (taxon_num, gene_num, gene_symbol,
             gene_synonym, live, gene_type) = row

            # in test mode only the configured test genes are kept
            if self.testMode and gene_num not in self.test_ids['gene']:
                continue

            gene_id = 'WormBase:' + gene_num
            taxon_id = 'NCBITaxon:' + taxon_num

            # symbol first, then synonym, else no label at all
            gene_label = gene_symbol or gene_synonym or None

            model.addClassToGraph(
                gene_id, gene_label, Genotype.genoparts['gene'])
            if live == 'Dead':
                model.addDeprecatedClass(gene_id)
            geno.addTaxon(taxon_id, gene_id)
            if gene_synonym:
                model.addSynonym(gene_id, gene_synonym)

            limit_reached = limit is not None and rows_read > limit
            if limit_reached and not self.testMode:
                break

    return
def _get_titles(self, limit):
    """
    Add a GeneReviews class for the books listed in the titles file.

    The tab-separated file is read as latin-1 and must have one header row
    followed by rows of four columns:
    ``GR_shortname``, ``GR_Title``, ``NBK_id``, ``PMID``.

    Every NBK number is recorded in ``self.book_ids``.  While the running
    line count is below ``limit`` (or ``limit`` is None), the book is also
    added to the graph as class ``GeneReviews:<NBK_id>`` labelled with its
    title and with the short name as synonym.

    The process exits with status -1 when the header or a row does not
    have the expected number of columns.

    :param limit: maximum line count for which classes are written,
        or None for no limit
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['titles']['file']))
    model = Model(self.graph)

    with open(raw, 'r', encoding='latin-1') as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        header = next(filereader)
        line_counter = 1
        colcount = len(header)
        if colcount != 4:  # ('GR_shortname', 'GR_Title', 'NBK_id', 'PMID')
            # the placeholder is required: logging %-formats its arguments
            logger.error("Unexpected Header %s", header)
            exit(-1)

        for row in filereader:
            line_counter += 1
            if len(row) != colcount:
                logger.error("Unexpected row. got: %s", row)
                logger.error("Expected data for: %s", header)
                exit(-1)

            (shortname, title, nbk_num, pmid) = row
            gr_id = 'GeneReviews:' + nbk_num

            self.book_ids.add(nbk_num)  # a global set of the book nums

            if limit is None or line_counter < limit:
                model.addClassToGraph(gr_id, title)
                model.addSynonym(gr_id, shortname)
            # TODO include the new PMID?

    return
def _process_orthologs(self, raw, limit=None):
    """
    Map the genes of one species to their KEGG orthology classes.

    Triples created:
        <gene_id> is a class
        <orthology_class_id> is a class
        <assoc_id> has subject <gene_id>
        <assoc_id> has object <orthology_class_id>

    :param raw: path of the two-column, tab-separated file
        (gene id, orthology class id)
    :param limit: outside test mode, stop once the reader has passed
        ``limit`` lines (None means no limit)
    :return: None
    """
    LOG.info("Processing orthologs")
    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for gene_num, orthology_num in reader:
            gene_id = 'KEGG:' + gene_num.strip()
            orthology_class_id = 'KEGG:' + orthology_num.strip()

            # the KO id acts as a gene-family grouping class; it names a
            # group of orthologs and is not 1:1 with the genes
            assoc = OrthologyAssoc(graph, self.name, gene_id, None)
            assoc.add_gene_family_to_graph(orthology_class_id)

            # both ends become classes;
            # labels are assumed to be supplied elsewhere
            model.addClassToGraph(gene_id, None)
            model.addClassToGraph(orthology_class_id, None)

            past_limit = limit is not None and reader.line_num > limit
            if past_limit and not self.test_mode:
                break

    LOG.info("Done with orthologs")
def _process_genes_kegg2ncbi(self, limit=None):
    """
    Map the KEGG human gene ids to the corresponding NCBI Gene ids.

    Triples created:
        <kegg_gene_id> is a class
        <ncbi_gene_id> is a class
        <kegg_gene_id> equivalentClass <ncbi_gene_id>

    :param limit: outside test mode, stop once the reader has passed
        ``limit`` lines (None means no limit)
    :return: None
    """
    LOG.info("Processing KEGG gene IDs to NCBI gene IDs")
    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)

    raw = '/'.join((self.rawdir, self.files['ncbi']['file']))
    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for kegg_num, ncbi_num, link_type in reader:
            # the test filter compares the id as written in the file
            if self.test_mode and kegg_num not in self.test_ids['genes']:
                continue

            # swap the source prefix for the NCBIGene curie prefix
            ncbi_gene_id = ncbi_num.replace('ncbi-geneid', 'NCBIGene')
            kegg_gene_id = 'KEGG-' + kegg_num

            # The KEGG gene is added again here in case this table lists
            # genes that are absent from the genes table.
            model.addClassToGraph(kegg_gene_id, None)
            model.addClassToGraph(ncbi_gene_id, None)
            model.addEquivalentClass(kegg_gene_id, ncbi_gene_id)

            past_limit = limit is not None and reader.line_num > limit
            if past_limit and not self.test_mode:
                break

    LOG.info("Done with KEGG gene IDs to NCBI gene IDs")
def _process_interactions(self, row):
    """
    Process row of CTD data from CTD_chemicals_diseases.tsv.gz
    and generate triples.

    Only create associations based on direct evidence
    (not using the inferred-via-gene), and unambiguous relationships,
    i.e. a direct evidence of exactly ``therapeutic`` or
    ``marker/mechanism``.  (Ambiguous ones will be processed in the
    sister method using the disambiguated file.)
    There are no OMIM ids for diseases in these cases,
    so we associate with only the mesh disease ids.

    Args:
        :param row (list): row of CTD data; must hold ten columns
    Returns:
        :return None
    """
    model = Model(self.graph)
    self._check_list_len(row, 10)
    (chem_name, chem_id, cas_rn, disease_name, disease_id,
     direct_evidence, inferred_gene_symbol, inference_score,
     omim_ids, pubmed_ids) = row

    if direct_evidence == '':
        return

    # in test mode, keep only rows whose disease (or one of its OMIM
    # mappings) belongs to the test set
    if self.test_mode:
        candidate_ids = {'OMIM:' + str(i) for i in omim_ids.split('|')}
        candidate_ids.add(disease_id)
        if not candidate_ids & set(self.test_diseaseids):
            return

    chem_id = 'MESH:' + chem_id
    reference_list = self._process_pubmed_ids(pubmed_ids)

    # Exact comparison on purpose: in the former pattern
    # r'^therapeutic|marker\/mechanism$' the anchors applied to one
    # alternative each, so any value merely starting with "therapeutic"
    # (e.g. a dual "therapeutic|marker/mechanism") was let through.
    if direct_evidence in ('therapeutic', 'marker/mechanism'):
        rel_id = self.resolve(direct_evidence)
        model.addClassToGraph(chem_id, chem_name)
        model.addClassToGraph(disease_id, None)
        self._make_association(chem_id, disease_id, rel_id, reference_list)
    # else: there's dual evidence, but the pubs haven't been mapped

    return
def _process_all(self, limit):
    """
    Fetch the OMIM identifiers and hand them over for transformation.

    The ids come from ``self._get_omim_ids()``.  The human genome and
    taxon are added to the graph, then ``self.process_entries`` is called
    with ``self._transform_entry`` as the per-entry handler and ``'all'``
    as the only include.  What is written for each entry is decided by
    that handler, which is not part of this method.

    In testMode the test graph is used instead of the main graph.

    :param limit: passed through to ``process_entries``
    :return: None
    """
    # the set of omim identifiers to work on
    omimids = self._get_omim_ids()

    g = self.testgraph if self.testMode else self.graph
    geno = Genotype(g)
    model = Model(g)

    # tax_num = '9606'   # TODO PYLINT unused
    tax_id = 'NCBITaxon:9606'
    tax_label = 'Human'

    # genome and taxon; the taxon class gets its label elsewhere
    geno.addGenome(tax_id, tax_label)
    model.addClassToGraph(tax_id, None)

    includes = {'all'}

    self.process_entries(
        omimids, self._transform_entry, includes, g, limit)

    return
def _process_diseases(self, limit=None):
    """
    Add the KEGG diseases to the graph.

    Triples created:
        <disease_id> is a class
        <disease_id> rdfs:label <disease_name>

    Each disease label is also remembered in ``self.label_hash`` (first
    label seen wins), including diseases skipped by the test-mode filter.

    :param limit: outside test mode, stop once the reader has passed
        ``limit`` lines (None means no limit)
    :return: None
    """
    LOG.info("Processing diseases")
    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)

    raw = '/'.join((self.rawdir, self.files['disease']['file']))
    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for disease_num, disease_name in reader:
            disease_id = 'KEGG-' + disease_num.strip()

            if disease_id not in self.label_hash:
                self.label_hash[disease_id] = disease_name

            if self.test_mode and disease_id not in self.test_ids['disease']:
                continue

            # Not all of these are available from MONDO yet, see:
            # https://github.com/monarch-initiative/human-disease-ontology/issues/3
            # The diseases are deliberately not typed as DOID:4,
            # to avoid bulking up the graph.
            model.addClassToGraph(disease_id, disease_name)

            past_limit = limit is not None and reader.line_num > limit
            if past_limit and not self.test_mode:
                break

    LOG.info("Done with diseases")
def _get_titles(self, limit):
    """
    Add a GeneReviews class for the books listed in the titles file.

    The tab-separated file is read as latin-1.  Its header row, with the
    first character removed, is checked by ``self.check_fileheader``
    against the expected columns
    ``GR_shortname``, ``GR_Title``, ``NBK_id``, ``PMID``.

    Every NBK number is recorded in ``self.book_ids``.  While the reader's
    line number is below ``limit`` (or ``limit`` is None), the book is
    also added to the graph as class ``GeneReviews:<NBK_id>`` labelled
    with its title and with the short name as synonym.

    The process exits with status -1 on a rejected header or on a row
    whose column count differs from the expected one.

    :param limit: line number below which classes are written,
        or None for no limit
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['titles']['file']))
    model = Model(self.graph)

    col = ['GR_shortname', 'GR_Title', 'NBK_id', 'PMID']
    # column positions are fixed, so look them up once
    shortname_at = col.index('GR_shortname')
    title_at = col.index('GR_Title')
    nbk_at = col.index('NBK_id')

    with open(raw, 'r', encoding='latin-1') as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')

        header = next(filereader)
        # drop the leading character of the first header cell
        # (presumably a '#' comment marker -- confirm against the file)
        header[0] = header[0][1:]
        colcount = len(col)
        if not self.check_fileheader(col, header):
            exit(-1)

        for row in filereader:
            if len(row) != colcount:
                LOG.error("Unexpected row. got: %s", row)
                LOG.error("Expected data for: %s", col)
                exit(-1)

            nbk_num = row[nbk_at]
            gr_id = 'GeneReviews:' + nbk_num
            self.book_ids.add(nbk_num)  # a global set of the book nums

            if limit is not None and filereader.line_num >= limit:
                continue
            model.addClassToGraph(gr_id, row[title_at])
            model.addSynonym(gr_id, row[shortname_at])
def _process_phenotypicseries(self, limit):
    """
    Creates classes from the OMIM phenotypic series list.

    These are grouping classes to hook the more granular OMIM diseases.
    The first five lines of the file are skipped as header; every
    remaining non-blank line must hold ``<label>\\t<series number>``.

    :param limit: outside testMode, stop once more than ``limit`` series
        were read (None means no limit)
    :return: None
    :raises ValueError: when a data line does not split into two
        tab-separated fields
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    logger.info("getting phenotypic series titles")
    model = Model(g)
    line_counter = 0
    with open(
            '/'.join(
                (self.rawdir,
                 self.files['phenotypicSeries']['file']))) as f:
        # the file starts with five header lines: title, download date,
        # copyright, a blank line and the column names
        for _ in range(5):
            f.readline()

        for line in f:
            # Skip blank lines.  The former test (r'^\w*$' alone) did not
            # match lines made of spaces or tabs, which then failed the
            # two-way unpack below.  Lines holding one bare word are
            # still skipped, as before.
            if not line.strip() or re.match(r'^\w*$', line):
                continue
            line = line.strip()
            line_counter += 1
            (ps_label, ps_num) = line.split('\t')
            omim_id = 'OMIM:' + ps_num
            model.addClassToGraph(omim_id, ps_label)

            if not self.testMode and \
                    limit is not None and line_counter > limit:
                break

    return
def _process_trait_mappings(self, raw, limit=None):
    """
    Add the ATO traits of a csv mapping file and link them to other
    vocabularies.

    Every row with at least eight columns yields an ``AQTLTrait:`` class
    taken from the bracketed ``[ATO #...]`` prefix of the fourth column.
    That class is made equivalent to the ``VT:`` id and cross-referenced
    to the ``LPT:`` and ``CMO:`` ids when those columns carry such ids.

    :param raw: path of the comma-separated file; first row is a header
    :param limit: not used by this method
    :return: None
    """
    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)

    with open(raw, 'r') as csvfile:
        filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
        next(filereader, None)  # header line, ignored
        for row_num, row in enumerate(filereader, start=1):
            # short rows (such as the last line of the file) are skipped
            if len(row) < 8:
                LOG.info("skipping line %d: %s", row_num, '\t'.join(row))
                continue

            (vto_id, pto_id, cmo_id, ato_column,
             species, trait_class, trait_type, qtl_count) = row

            # "[ATO #n] label": remove the brackets and everything after
            # the closing one, then turn the prefix into a curie
            without_open = re.sub(r'\[', '', ato_column)
            bracket_part = re.sub(r'\].*', '', without_open)
            ato_id = re.sub(r'ATO #', 'AQTLTrait:', bracket_part).strip()
            # the label is whatever follows the closing bracket
            ato_label = re.sub(r'.*\]\s*', '', ato_column).strip()

            model.addClassToGraph(ato_id, ato_label)

            if vto_id.startswith('VT:'):
                model.addClassToGraph(vto_id, None)
                model.addEquivalentClass(ato_id, vto_id)
            if pto_id.startswith('LPT:'):
                model.addClassToGraph(pto_id, None)
                model.addXref(ato_id, pto_id)
            if cmo_id.startswith('CMO:'):
                model.addClassToGraph(cmo_id, None)
                model.addXref(ato_id, cmo_id)

    LOG.info("Done with trait mappings")
    return
def _process_pathway(self, row):
    """
    Process row of CTD data from CTD_genes_pathways.tsv.gz
    and generate triples.

    The gene is added as a class, the pathway is registered through
    ``self.pathway`` and the gene is attached to it.  A few overly generic
    pathways are ignored.

    Args:
        :param row (list): row of CTD data; must hold four columns
    Returns:
        :return None
    """
    model = Model(self.graph)
    self._check_list_len(row, 4)
    (gene_symbol, gene_id, pathway_name, pathway_id) = row

    # in test mode only the configured test genes are kept
    if self.test_mode and (int(gene_id) not in self.test_geneids):
        return

    entrez_id = 'NCBIGene:' + gene_id

    # generic "pathways" that are not worth keeping
    pathways_to_scrub = (
        'REACT:REACT_116125',  # disease
        "REACT:REACT_111045",  # developmental biology
        "REACT:REACT_200794",  # Mus musculus biological processes
        "REACT:REACT_13685",   # neuronal system ?
    )
    if pathway_id in pathways_to_scrub:
        return

    # convert KEGG pathway ids... KEGG:12345 --> KEGG-path:map12345
    if pathway_id.startswith('KEGG'):
        pathway_id = pathway_id.replace('KEGG:', 'KEGG-path:map')

    # just in case, add the gene as a class
    model.addClassToGraph(entrez_id, None)

    self.pathway.addPathway(pathway_id, pathway_name)
    self.pathway.addGeneToPathway(entrez_id, pathway_id)

    return
def _parse_curated_chem_disease(self, limit):
    """
    Add the chemical-disease associations of the ``publications`` file.

    The tab-separated file must hold ten columns per data row; rows whose
    joined text starts with ``#`` are treated as comments.  Rows lacking a
    disease id or a chemical id are ignored.  For the others the chemical
    and the disease are added as classes, the publication (when given) is
    added as a journal article reference, and the association is made
    through ``self._make_association``.

    :param limit: outside testMode, stop once ``limit`` data rows were
        read (None means no limit)
    :return: None
    """
    model = Model(self.g)
    rows_read = 0
    file_path = '/'.join(
        (self.rawdir, self.static_files['publications']['file']))

    with open(file_path, 'r') as tsvfile:
        reader = csv.reader(tsvfile, delimiter="\t")
        for row in reader:
            # comment lines are not counted
            if ' '.join(row).startswith('#'):
                continue
            rows_read += 1

            self._check_list_len(row, 10)
            (pub_id, disease_label, disease_id, disease_cat, evidence,
             chem_label, chem_id, cas_rn, gene_symbol, gene_acc) = row

            # both ends of the association are required
            if not (disease_id.strip() != '' and chem_id.strip() != ''):
                continue

            rel_id = self._get_relationship_id(evidence)
            chem_id = 'MESH:' + chem_id
            model.addClassToGraph(chem_id, chem_label)
            model.addClassToGraph(disease_id, None)

            pubids = None
            if pub_id != '':
                pub_id = 'PMID:' + pub_id
                reference = Reference(
                    pub_id, Reference.ref_types['journal_article'])
                reference.addRefToGraph(self.g)
                pubids = [pub_id]

            self._make_association(chem_id, disease_id, rel_id, pubids)

            limit_reached = limit is not None and rows_read >= limit
            if limit_reached and not self.testMode:
                break

    return
def _get_chrbands(self, limit, taxon):
    """
    Read the gzipped chromosome-band file of one taxon and add its
    chromosomes, scaffolds and bands to the graph.

    The work is done in two passes:

    1. every line of the file is parsed into ``mybands``, a dict keyed by
       chromosome / scaffold / band name that stores the coordinates,
       parent, stain and type of each feature (parent bands are created
       on the fly and their extent is widened as sub-bands are seen);
    2. every entry of ``mybands`` is written to the graph as a
       ``Feature`` with start and end locations.

    :param limit: not used by this method (see the TODO below)
    :param taxon: key into ``self.files``; also used as the number of the
        ``NCBITaxon:`` curie
    :return: None

    """
    model = Model(self.graph)
    # TODO PYLINT figure out what limit was for and why it is unused
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
    logger.info("Processing Chr bands from FILE: %s", myfile)
    geno = Genotype(self.graph)
    monochrom = Monochrom(self.graph_type, self.are_bnodes_skized)

    # used to hold band definitions for a chr
    # in order to compute extent of encompasing bands

    mybands = {}
    # build the organism's genome from the taxon
    genome_label = self.files[taxon]['genome_label']
    taxon_id = 'NCBITaxon:'+taxon

    # add the taxon as a class.  adding the class label elsewhere
    model.addClassToGraph(taxon_id, None)
    model.addSynonym(taxon_id, genome_label)

    geno.addGenome(taxon_id, genome_label)

    # add the build and the taxon it's in
    build_num = self.files[taxon]['build_num']
    build_id = 'UCSC:'+build_num
    geno.addReferenceGenome(build_id, build_num, taxon_id)

    # process the bands
    with gzip.open(myfile, 'rb') as f:
        for line in f:
            # skip comments
            line = line.decode().strip()
            if re.match('^#', line):
                continue

            # chr13	4500000	10000000	p12	stalk
            # (exactly five tab-separated fields are required)
            (scaffold, start, stop, band_num, rtype) = line.split('\t')
            line_counter += 1

            # NOTE some less-finished genomes have
            # placed and unplaced scaffolds
            # * Placed scaffolds:
            #       the scaffolds have been placed within a chromosome.
            # * Unlocalized scaffolds:
            #   although the chromosome within which the scaffold occurs
            #   is known, the scaffold's position or orientation
            #   is not known.
            # * Unplaced scaffolds:
            #   it is not known which chromosome the scaffold belongs to
            #
            # find out if the thing is a full on chromosome, or a scaffold:
            # ex: unlocalized scaffold: chr10_KL568008v1_random
            # ex: unplaced scaffold: chrUn_AABR07022428v1
            placed_scaffold_pattern = r'(chr(?:\d+|X|Y|Z|W|M))'
            unlocalized_scaffold_pattern = \
                placed_scaffold_pattern+r'_(\w+)_random'
            unplaced_scaffold_pattern = r'chr(Un(?:_\w+)?)'

            m = re.match(placed_scaffold_pattern+r'$', scaffold)
            if m is not None and len(m.groups()) == 1:
                # the chromosome is the first match of the pattern
                chrom_num = m.group(1)
            else:
                # skip over anything that isn't a placed_scaffold
                # at the class level
                logger.info("Found non-placed chromosome %s", scaffold)
                chrom_num = None

            m_chr_unloc = re.match(unlocalized_scaffold_pattern, scaffold)
            m_chr_unplaced = re.match(unplaced_scaffold_pattern, scaffold)

            # scaffold_num stays None for a fully placed chromosome
            scaffold_num = None
            if m:
                pass
            elif m_chr_unloc is not None and\
                    len(m_chr_unloc.groups()) == 2:
                # unlocalized: the chromosome is known, the position is not
                chrom_num = m_chr_unloc.group(1)
                scaffold_num = chrom_num+'_'+m_chr_unloc.group(2)
            elif m_chr_unplaced is not None and\
                    len(m_chr_unplaced.groups()) == 1:
                # unplaced: chrom_num remains None
                scaffold_num = m_chr_unplaced.group(1)
            else:
                logger.error(
                    "There's a chr pattern that we aren't matching: %s",
                    scaffold)

            if chrom_num is not None:
                # the chrom class (generic) id
                chrom_class_id = makeChromID(chrom_num, taxon, 'CHR')

                # first, add the chromosome class (in the taxon)
                geno.addChromosomeClass(
                    chrom_num, taxon_id, self.files[taxon]['genome_label'])

                # then, add the chromosome instance (from the given build)
                geno.addChromosomeInstance(chrom_num, build_id, build_num,
                                           chrom_class_id)

                # add the chr to the hashmap of coordinates for this build
                # the chromosome coordinate space is itself
                if chrom_num not in mybands.keys():
                    mybands[chrom_num] = {
                        'min': 0,
                        'max': int(stop),
                        'chr': chrom_num,
                        'ref': build_id,
                        'parent': None,
                        'stain': None,
                        'type': Feature.types['chromosome']}

            if scaffold_num is not None:
                # this will put the coordinates of the scaffold
                # in the scaffold-space and make sure that the scaffold
                # is part of the correct parent.
                # if chrom_num is None,
                # then it will attach it to the genome,
                # just like a reg chrom
                # NOTE(review): 'min'/'max' are stored here as the raw
                # strings read from the file, not as ints
                mybands[scaffold_num] = {
                    'min': start,
                    'max': stop,
                    'chr': scaffold_num,
                    'ref': build_id,
                    'parent': chrom_num,
                    'stain': None,
                    'type': Feature.types['assembly_component'],
                    'synonym': scaffold}

            if band_num is not None and band_num.strip() != '':
                # add the specific band
                # NOTE(review): chrom_num+band_num raises TypeError when
                # chrom_num is None (unplaced scaffold with a band name)
                mybands[chrom_num+band_num] = {'min': start,
                                               'max': stop,
                                               'chr': chrom_num,
                                               'ref': build_id,
                                               'parent': None,
                                               'stain': None,
                                               'type': None}

                # add the staining intensity of the band
                if re.match(r'g(neg|pos|var)', rtype):
                    mybands[chrom_num+band_num]['stain'] = \
                        Feature.types.get(rtype)

                # get the parent bands, and make them unique
                parents = list(
                    monochrom.make_parent_bands(band_num, set()))
                # alphabetical sort will put them in smallest to biggest,
                # so we reverse
                parents.sort(reverse=True)
                # print('parents of',chrom,band,':',parents)

                if len(parents) > 0:
                    mybands[chrom_num+band_num]['parent'] = \
                        chrom_num+parents[0]
            else:
                # TODO PYLINT why is 'parent'
                # a list() a couple of lines up and a set() here?
                # (the set is empty, so the loop below does not run)
                parents = set()

            # loop through the parents and add them to the hash
            # add the parents to the graph, in hierarchical order
            # TODO PYLINT Consider using enumerate
            # instead of iterating with range and len
            for i in range(len(parents)):
                rti = getChrPartTypeByNotation(parents[i])

                pnum = chrom_num+parents[i]
                sta = int(start)
                sto = int(stop)
                if pnum not in mybands.keys():
                    # add the parental band to the hash
                    b = {'min': min(sta, sto),
                         'max': max(sta, sto),
                         'chr': chrom_num,
                         'ref': build_id,
                         'parent': None,
                         'stain': None,
                         'type': rti}
                    mybands[pnum] = b
                else:
                    # band already in the hash means it's a grouping band
                    # need to update the min/max coords
                    b = mybands.get(pnum)
                    b['min'] = min(sta, sto, b['min'])
                    b['max'] = max(sta, sto, b['max'])
                    mybands[pnum] = b

                    # also, set the max for the chrom
                    c = mybands.get(chrom_num)
                    c['max'] = max(sta, sto, c['max'])
                    mybands[chrom_num] = c

                # add the parent relationships to each
                if i < len(parents) - 1:
                    mybands[pnum]['parent'] = chrom_num+parents[i+1]
                else:
                    # add the last one (p or q usually)
                    # as attached to the chromosome
                    mybands[pnum]['parent'] = chrom_num

    f.close()  # end looping through file

    # loop through the hash and add the bands to the graph
    for b in mybands.keys():
        myband = mybands.get(b)
        # ids/labels of the generic (taxon-level) class ...
        band_class_id = makeChromID(b, taxon, 'CHR')
        band_class_label = makeChromLabel(b, genome_label)
        # ... and of the build-specific feature
        band_build_id = makeChromID(b, build_num, 'MONARCH')
        band_build_label = makeChromLabel(b, build_num)
        # the build-specific chrom
        chrom_in_build_id = makeChromID(
            myband['chr'], build_num, 'MONARCH')
        # if it's != part, then add the class
        if myband['type'] != Feature.types['assembly_component']:
            model.addClassToGraph(band_class_id, band_class_label,
                                  myband['type'])
            bfeature = Feature(self.graph, band_build_id, band_build_label,
                               band_class_id)
        else:
            # assembly components get no generic class; the feature is
            # built with the component type itself
            bfeature = Feature(self.graph, band_build_id, band_build_label,
                               myband['type'])
            if 'synonym' in myband:
                model.addSynonym(band_build_id, myband['synonym'])

        if myband['parent'] is None:
            if myband['type'] == Feature.types['assembly_component']:
                # since we likely don't know the chr,
                # add it as a part of the build
                geno.addParts(band_build_id, build_id)
        elif myband['type'] == Feature.types['assembly_component']:
            # geno.addParts(band_build_id, chrom_in_build_id)
            parent_chrom_in_build = makeChromID(myband['parent'],
                                                build_num, 'MONARCH')
            bfeature.addSubsequenceOfFeature(parent_chrom_in_build)

        # add the band as a feature
        # (which also instantiates the owl:Individual)
        bfeature.addFeatureStartLocation(myband['min'], chrom_in_build_id)
        bfeature.addFeatureEndLocation(myband['max'], chrom_in_build_id)
        if 'stain' in myband and myband['stain'] is not None:
            # TODO 'has_staining_intensity' being dropped by MB
            bfeature.addFeatureProperty(
                Feature.properties['has_staining_intensity'],
                myband['stain'])

        # type the band as a faldo:Region directly (add_region=False)
        # bfeature.setNoBNodes(self.nobnodes)
        # to come when we merge in ZFIN.py
        bfeature.addFeatureToGraph(False)

    return
def _process_genes(self, taxid, limit=None):
    """
    Add the Ensembl genes of one taxon, with their mappings, to the graph.

    Each row of the tab-separated file must hold at least seven columns:
    Ensembl gene id, gene name, description, biotype, Entrez gene id,
    Ensembl peptide id and UniProt/SwissProt id.  For human
    (``taxid == '9606'``) an eighth column holds the HGNC id.

    Per row the gene is added as a class and tied to the taxon; the Entrez
    and HGNC ids become equivalent classes, the peptide and UniProt ids
    become gene products.  Blank ids are skipped, so no node is created
    for a bare ``ENSEMBL:`` or ``UniProtKB:`` prefix.

    :param taxid: taxon number as a string; key into ``self.files``
    :param limit: outside testMode, stop once more than ``limit`` rows
        were processed (None means no limit)
    :return: None
    :raises ValueError: when a row has fewer than seven columns
    """
    g = self.testgraph if self.testMode else self.graph
    model = Model(g)
    geno = Genotype(g)

    raw = '/'.join((self.rawdir, self.files[taxid]['file']))
    line_counter = 0
    logger.info("Processing Ensembl genes for tax %s", taxid)
    with open(raw, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t')
        for row in filereader:
            # seven columns are unpacked below, so require them here and
            # report the file and line in the message
            if len(row) < 7:
                raise ValueError(
                    "Data error for file %s (line %d): "
                    "expected at least 7 columns, got %d"
                    % (raw, filereader.line_num, len(row)))
            (ensembl_gene_id, external_gene_name, description,
             gene_biotype, entrezgene, peptide_id,
             uniprot_swissprot) = row[0:7]

            # in the case of human genes, we also get the hgnc id,
            # and is the last col
            if taxid == '9606':
                hgnc_id = row[7]
            else:
                hgnc_id = None

            if self.testMode and entrezgene != '' \
                    and int(entrezgene) not in self.gene_ids:
                continue

            line_counter += 1
            gene_id = 'ENSEMBL:' + ensembl_gene_id
            peptide_curie = 'ENSEMBL:{}'.format(peptide_id)
            uniprot_curie = 'UniProtKB:{}'.format(uniprot_swissprot)
            entrez_curie = 'NCBIGene:{}'.format(entrezgene)

            if description == '':
                description = None
            # gene_type_id = self._get_gene_type(gene_biotype)
            gene_type_id = None
            model.addClassToGraph(
                gene_id, external_gene_name, gene_type_id, description)

            if entrezgene != '':
                model.addEquivalentClass(gene_id, entrez_curie)
            if hgnc_id is not None and hgnc_id != '':
                model.addEquivalentClass(gene_id, hgnc_id)
            geno.addTaxon('NCBITaxon:' + taxid, gene_id)

            # products are only emitted for ids that are actually present
            if peptide_id != '':
                model.addIndividualToGraph(
                    peptide_curie, None,
                    self._get_gene_type("polypeptide"))
                geno.addGeneProduct(gene_id, peptide_curie)
            if uniprot_swissprot != '':
                model.addIndividualToGraph(
                    uniprot_curie, None,
                    self._get_gene_type("polypeptide"))
                geno.addGeneProduct(gene_id, uniprot_curie)
                # the xref needs both ends
                if peptide_id != '':
                    model.addXref(peptide_curie, uniprot_curie)

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

    return
def _fill_provenance_graph(self, limit):
    """
    Add an assay and a G2P association for every strain measurement whose
    z-score reaches the threshold.

    Walks ``self.strain_scores_by_measure`` (strain -> sex -> measure).
    A measure passes when ``abs(zscore) >= self.stdevthreshold``.  Passing
    measures whose assay has ontology terms in ``self.assayhash`` get the
    assay added to the graph and are handed to ``self._add_g2p_assoc``.
    Three counters are logged at the end.

    :param limit: not used by this method
    :return: None
    """
    logger.info("Building graph ...")
    g = self.testgraph if self.testMode else self.graph
    model = Model(g)

    taxon_id = 'NCBITaxon:10090'  # hardcode to Mus musculus
    model.addClassToGraph(taxon_id, None)

    passing = 0
    passing_with_ontologies = 0
    not_passing = 0

    # loop through all the strains,
    # and make G2P assoc for those with scores beyond threshold
    for strain_num in self.strain_scores_by_measure:
        if self.testMode and 'MPD:'+str(strain_num) not in self.test_ids:
            continue
        strain_id = 'MPD-strain:' + str(strain_num)

        scores_by_sex = self.strain_scores_by_measure[strain_num]
        for sex in scores_by_sex:
            measures = scores_by_sex[sex]
            for m in measures:
                assay_id = 'MPD-assay:' + str(m)

                # TODO consider using the means
                # instead of precomputed zscores
                if 'zscore' not in measures[m]:
                    continue
                zscore = measures[m]['zscore']

                # written as a negated ">=" so that every score failing
                # the original comparison lands in this branch
                if not abs(zscore) >= self.stdevthreshold:
                    not_passing += 1
                    continue
                passing += 1

                # NOTE(review): built on self.graph even in testMode,
                # while the association below uses g -- confirm intent
                prov = Provenance(self.graph)
                try:
                    assay = self.assayhash[m]
                    assay_label = assay['assay_label']
                    assay_description = assay['description']
                    ont_term_ids = assay.get('ont_terms')
                    comment = ' '.join(
                        (assay_label, '(zscore='+str(zscore)+')'))
                except KeyError:
                    assay_label = None
                    assay_description = None
                    ont_term_ids = None

                if assay_label is not None:
                    assay_label += ' ('+str(m)+')'

                # TODO unused
                # assay_type = self.assayhash[m]['assay_type']
                assay_type_id = Provenance.provenance_types['assay']

                if ont_term_ids is None:
                    continue
                passing_with_ontologies += 1
                prov.add_assay_to_graph(
                    assay_id, assay_label, assay_type_id,
                    assay_description)
                self._add_g2p_assoc(
                    g, strain_id, sex, assay_id, ont_term_ids, comment)

    logger.info("Scores passing threshold: %d", passing)
    logger.info("Scores passing threshold with ontologies: %d",
                passing_with_ontologies)
    logger.info("Scores not passing threshold: %d", not_passing)

    return
def _fill_provenance_graph(self, limit):
    """
    Add assay provenance and genotype-to-phenotype associations to the graph.

    Iterates ``self.strain_scores_by_measure`` (strain -> sex -> measure).
    For each measure carrying a precomputed ``zscore`` whose magnitude is at
    least ``self.stdevthreshold``, the assay is looked up in
    ``self.assayhash``; when it lists ontology terms the assay is added via
    ``Provenance.add_assay_to_graph`` and ``self._add_g2p_assoc`` is called.
    Three running counts are logged at the end.

    In test mode only strains whose ``MPD:<num>`` id is in ``self.test_ids``
    are processed and ``self.testgraph`` is the target graph.

    :param limit: not used by this method
    :return: None
    """
    LOG.info("Building graph ...")
    graph = self.testgraph if self.test_mode else self.graph

    model = Model(graph)
    taxon_id = self.globaltt['Mus musculus']
    model.addClassToGraph(taxon_id, None)

    # Provenance must target the same graph as the associations; it used to
    # be bound to self.graph, which misplaced assay nodes in test mode.
    # Built once: it does not depend on the strain/measure being visited.
    prov = Provenance(graph)

    scores_passing_threshold_count = 0
    scores_passing_threshold_with_ontologies_count = 0
    scores_not_passing_threshold_count = 0

    # loop through all the strains,
    # and make G2P assoc for those with scores beyond threshold
    for strain_num, scores_by_sex in self.strain_scores_by_measure.items():
        if self.test_mode and 'MPD:' + str(strain_num) not in self.test_ids:
            continue
        strain_id = 'MPD-strain:' + str(strain_num)
        for sex, measures in scores_by_sex.items():
            for m, measure in measures.items():
                assay_id = 'MPD-assay:' + str(m)
                # TODO consider using the means
                # instead of precomputed zscores
                if 'zscore' not in measure:
                    continue
                zscore = measure['zscore']
                if abs(zscore) >= self.stdevthreshold:
                    scores_passing_threshold_count += 1
                else:
                    scores_not_passing_threshold_count += 1
                    continue

                # An assay missing from assayhash (or missing one of the
                # required keys) is counted as passing but not emitted.
                try:
                    assay_info = self.assayhash[m]
                    assay_label = assay_info['assay_label']
                    assay_description = assay_info['description']
                    ont_term_ids = assay_info.get('ont_terms')
                    comment = ' '.join(
                        (assay_label, '(zscore=' + str(zscore) + ')'))
                except KeyError:
                    continue

                if ont_term_ids is None:
                    continue

                if assay_label is not None:
                    assay_label += ' (' + str(m) + ')'
                assay_type_id = self.globaltt['assay']

                scores_passing_threshold_with_ontologies_count += 1
                prov.add_assay_to_graph(
                    assay_id, assay_label, assay_type_id, assay_description)
                self._add_g2p_assoc(
                    graph, strain_id, sex, assay_id, ont_term_ids, comment)

    LOG.info("Scores passing threshold: %d",
             scores_passing_threshold_count)
    LOG.info("Scores passing threshold with ontologies: %d",
             scores_passing_threshold_with_ontologies_count)
    LOG.info("Scores not passing threshold: %d",
             scores_not_passing_threshold_count)
    return
class Feature():
    """
    Dealing with genomic features here.  By default they are all faldo:Regions.
    We use SO for typing genomic features. At the moment,
    RO:has_subsequence is the default relationship
    between the regions, but this should be tested/verified.

    TODO:
    the graph additions are in the addXToFeature functions,
    but should be separated.
    TODO:
    this will need to be extended to properly deal with
    fuzzy positions in faldo.
    """

    def __init__(
            self, graph, feature_id=None, label=None, feature_type=None,
            description=None):
        """
        :param graph: the Graph instance every triple is written to
        :param feature_id: identifier of the feature
        :param label: label of the feature
        :param feature_type: type identifier of the feature
        :param description: free-text description
        :raises ValueError: if ``graph`` is not a ``Graph`` instance
        """
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            raise ValueError("{} is not a graph".format(graph))
        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map
        self.fid = feature_id
        self.label = label
        self.ftype = feature_type
        self.description = description
        # start/stop hold the dicts built by _getLocation, or None
        self.start = None
        self.stop = None
        self.taxon = None

    def addFeatureStartLocation(
            self, coordinate, reference_id, strand=None, position_types=None):
        """
        Adds coordinate details for the start of this feature.
        :param coordinate:
        :param reference_id:
        :param strand:
        :param position_types:
        :return:
        """
        # make an object for the start, which has:
        # {coordinate : integer, reference : reference_id, types = []}
        self.start = self._getLocation(
            coordinate, reference_id, strand, position_types)

    def addFeatureEndLocation(
            self, coordinate, reference_id, strand=None, position_types=None):
        """
        Adds the coordinate details for the end of this feature
        :param coordinate:
        :param reference_id:
        :param strand:
        :param position_types:
        :return:
        """
        self.stop = self._getLocation(
            coordinate, reference_id, strand, position_types)

    def _getLocation(self, coordinate, reference_id, strand, position_types):
        """
        Make an object for the location, which has:
        {coordinate : integer, reference : reference_id, types = []}
        where the strand is indicated in the type array
        :param coordinate:
        :param reference_id:
        :param strand:
        :param position_types:
        :return: dict with keys 'coordinate', 'reference' and 'type'
        """
        loc = {
            'coordinate': coordinate,
            'reference': reference_id,
            'type': [],
        }
        strand_id = self._getStrandType(strand)
        if strand_id is not None:
            loc['type'].append(strand_id)
        if position_types is not None:
            loc['type'] += position_types
        # an explicitly empty type list gets the generic Position type
        if position_types == []:
            loc['type'].append(self.globaltt['Position'])
        return loc

    def _getStrandType(self, strand):
        """
        Map a strand symbol ('+', '-', '.') to its translation-table id.
        :param strand:
        :return: the mapped id, or None for None/unrecognised input
        """
        # TODO make this a dictionary/enum:  PLUS, MINUS, BOTH, UNKNOWN
        strand_id = None
        if strand == '+':
            strand_id = self.globaltt['plus_strand']
        elif strand == '-':
            strand_id = self.globaltt['minus_strand']
        elif strand == '.':
            strand_id = self.globaltt['both_strand']
        elif strand is None:  # assume this is Unknown
            pass
        else:
            LOG.warning("strand type could not be mapped: %s", str(strand))
        return strand_id

    def addFeatureToGraph(
            self, add_region=True, region_id=None, feature_as_class=False):
        """
        We make the assumption here that all features are instances.
        The features are located on a region,
        which begins and ends with faldo:Position
        The feature locations leverage the Faldo model,
        which has a general structure like:
        Triples:
        feature_id a feature_type (individual)
        faldo:location region_id
        region_id a faldo:region
        faldo:begin start_position
        faldo:end end_position
        start_position a
        (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
        faldo:position Integer(numeric position)
        faldo:reference reference_id
        end_position a
        (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
        faldo:position Integer(numeric position)
        faldo:reference reference_id

        :param add_region: when False (or when neither start nor stop is
            set) the feature itself is typed as the region
        :param region_id: identifier to use for the region; generated from
            the reference, coordinates and strand when None
        :param feature_as_class: add the feature as a class rather than
            an individual
        :return:
        """
        if feature_as_class:
            self.model.addClassToGraph(
                self.fid, self.label, self.ftype, self.description)
        else:
            self.model.addIndividualToGraph(
                self.fid, self.label, self.ftype, self.description)

        if self.start is None and self.stop is None:
            add_region = False

        if add_region:
            if region_id is None:
                # Only one endpoint may be known; take the reference from
                # whichever is present (previously self.start was
                # dereferenced unconditionally and an end-only feature
                # raised TypeError).
                anchor = self.start if self.start is not None else self.stop
                regionchr = re.sub(r'\w+\:_?', '', anchor['reference'])

                # in case the values are undefined
                # if we know only one of the coordinates,
                # then we'll add an "unknown" other.
                st = sp = 'UN'
                strand = None
                if self.start is not None \
                        and self.start['coordinate'] is not None:
                    st = str(self.start['coordinate'])
                    strand = self._getStrandStringFromPositionTypes(
                        self.start['type'])
                if self.stop is not None \
                        and self.stop['coordinate'] is not None:
                    sp = str(self.stop['coordinate'])
                    # Fall back to the stop's strand only when the start
                    # did not supply one.  The old guard (is not None) could
                    # only overwrite or erase a known strand.
                    if strand is None:
                        strand = self._getStrandStringFromPositionTypes(
                            self.stop['type'])
                # assume that the strand is the same for both start and stop.
                # this will need to be fixed in the future
                region_items = [regionchr, st, sp]
                if strand is not None:
                    region_items += [strand]
                rid = '-'.join(region_items)
                rid = re.sub(r'\w+\:', '', rid, count=1)  # replace the id prefix
                region_id = '_:' + rid + "-Region"

            self.graph.addTriple(
                self.fid, self.globaltt['location'], region_id)
            self.model.addIndividualToGraph(
                region_id, None, self.globaltt['Region'])
        else:
            region_id = self.fid
            self.model.addType(region_id, self.globaltt['region'])

        # add the start/end positions to the region
        beginp = endp = None
        if self.start is not None:
            beginp = self._makePositionId(
                self.start['reference'], self.start['coordinate'],
                self.start['type'])
            self.addPositionToGraph(
                self.start['reference'], self.start['coordinate'],
                self.start['type'])

        if self.stop is not None:
            endp = self._makePositionId(
                self.stop['reference'], self.stop['coordinate'],
                self.stop['type'])
            self.addPositionToGraph(
                self.stop['reference'], self.stop['coordinate'],
                self.stop['type'])

        self.addRegionPositionToGraph(region_id, beginp, endp)

    def _getStrandStringFromPositionTypes(self, tylist):
        """
        :param tylist: list of position type ids
        :return: 'plus', 'minus' or 'both' for the first strand type found
            (checked in that order), else None
        """
        strand = None
        if self.globaltt['plus_strand'] in tylist:
            strand = 'plus'
        elif self.globaltt['minus_strand'] in tylist:
            strand = 'minus'
        elif self.globaltt['both_strand'] in tylist:
            strand = 'both'
        else:
            strand = None  # it is stranded, but we don't know what it is
        return strand

    def _makePositionId(self, reference, coordinate, types=None):
        """
        Note that positions should have a reference (we will enforce).
        Only exact positions need a coordinate.
        :param reference:
        :param coordinate:
        :param types:
        :return: blank-node id of the form ``_:<ref>[-<coord>][-<strand>]``,
            or None when no reference is given
        """
        if reference is None:
            LOG.error("Trying to make position with no reference.")
            return None

        curie = '_:'
        # drop the first "prefix:" so only the local part remains
        reference = re.sub(r'\w+\:', '', reference, count=1)
        if re.match(r'^_', reference):
            # this is in the case if the reference is a bnode
            reference = re.sub(r'^_', '', reference)
        curie += reference
        if coordinate is not None:
            # just in case it isn't a string already
            curie = '-'.join((curie, str(coordinate)))
        if types is not None:
            tstring = self._getStrandStringFromPositionTypes(types)
            if tstring is not None:
                curie = '-'.join((curie, tstring))
        return curie

    def addRegionPositionToGraph(
            self, region_id, begin_position_id, end_position_id):
        """
        Attach the begin/end position ids to the region; a None id is
        silently skipped.
        :param region_id:
        :param begin_position_id:
        :param end_position_id:
        :return:
        """
        if begin_position_id is not None:
            self.graph.addTriple(
                region_id, self.globaltt['begin'], begin_position_id)
        # else: LOG.warn("No begin position specified for region %s", region_id)

        if end_position_id is not None:
            self.graph.addTriple(
                region_id, self.globaltt['end'], end_position_id)
        # else: LOG.warn("No end position specified for region %s", region_id)

    def addPositionToGraph(
            self, reference_id, position, position_types=None, strand=None):
        """
        Add the positional information to the graph, following the faldo model.
        We assume that if the strand is None,
        we give it a generic "Position" only.
        Triples:
        my_position a
        (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
        faldo:position Integer(numeric position)
        faldo:reference reference_id

        :param reference_id:
        :param position:
        :param position_types:
        :param strand:
        :return: Identifier of the position created
        """
        pos_id = self._makePositionId(reference_id, position, position_types)
        if position is not None:
            self.graph.addTriple(
                pos_id, self.globaltt['position'], position,
                object_is_literal=True, literal_type="xsd:integer")
        self.graph.addTriple(pos_id, self.globaltt['reference'], reference_id)
        if position_types is not None:
            for pos_type in position_types:
                self.model.addType(pos_id, pos_type)

        strnd = None
        if strand is not None:
            strnd = strand
            if not re.match(r'faldo', strand):
                # not already mapped to faldo, so expect we need to map it
                strnd = self._getStrandType(strand)
        # else:
        #    strnd = self.globaltt['both_strand']
        if strnd is None and (position_types is None or position_types == []):
            strnd = self.globaltt['Position']

        if strnd is not None:
            self.model.addType(pos_id, strnd)
        return pos_id

    def addSubsequenceOfFeature(self, parentid):
        """
        This will add reciprocal triples like:
        feature <is subsequence of> parent
        parent has_subsequence feature
        :param parentid:
        :return:
        """
        self.graph.addTriple(
            self.fid, self.globaltt['is subsequence of'], parentid)
        # this should be expected to be done in reasoning not ETL
        self.graph.addTriple(
            parentid, self.globaltt['has subsequence'], self.fid)

    def addTaxonToFeature(self, taxonid):
        """
        Given the taxon id, this will add the following triple:
        feature in_taxon taxonid
        :param taxonid:
        :return:
        """
        self.taxon = taxonid
        self.graph.addTriple(self.fid, self.globaltt['in taxon'], self.taxon)

    def addFeatureProperty(self, property_type, feature_property):
        """
        Add the triple: feature <property_type> <feature_property>
        :param property_type:
        :param feature_property:
        :return:
        """
        self.graph.addTriple(self.fid, property_type, feature_property)
def _process_phenotype_tab(self, raw, limit):
    """
    Turn each row of the phenotype annotation file into an association.

    see info on format here:
    http://www.human-phenotype-ontology.org/contao/index.php/annotation-guide.html

    For every row the disease, phenotype, evidence code and (when present)
    onset are added as classes.  The aspect column selects the association
    type: 'O'/'M' -> D2PAssoc, 'I'/'C' -> DispositionAssoc.  Rows with any
    other aspect are logged and skipped.  Each publication in the
    comma/semicolon separated list is attached as a source.

    :param raw: path of the tab-delimited annotation file (15 columns)
    :param limit: in non-test mode, stop once more than this many rows
        have been handled; None means no limit
    :return: None
    """
    g = self.testgraph if self.testMode else self.graph
    model = Model(g)
    line_counter = 0
    with open(raw, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            # Checked before taking the next row (equivalent to checking
            # after the previous one) so that no `continue` below can
            # bypass the limit.
            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break
            line_counter += 1
            row = [str(col).strip() for col in row]
            # Note from Seb in Dec 2017, a 15th column was added
            # inadverterntly and will be removed in the winter 2018
            # release of hpo data
            (db, num, name, qual, pheno_id, publist, eco, onset, freq, w,
             asp, syn, date, curator, extra) = row
            disease_id = db + ":" + num

            if self.testMode:
                try:
                    id_list = self.test_ids
                    if id_list is None or disease_id not in id_list:
                        continue
                except AttributeError:
                    continue

            # logger.info('adding %s', disease_id)

            model.addClassToGraph(disease_id, None)
            model.addClassToGraph(pheno_id, None)
            eco_id = self._map_evidence_to_codes(eco)
            model.addClassToGraph(eco_id, None)
            if onset is not None and onset != '':
                model.addClassToGraph(onset, None)

            # we want to do things differently depending on
            # the aspect of the annotation
            # TODO PYLINT Redefinition of assoc type from
            #   dipper.models.assoc.D2PAssoc.D2PAssoc to
            #   dipper.models.assoc.DispositionAssoc.DispositionAssoc
            if asp == 'O' or asp == 'M':  # organ abnormality or mortality
                assoc = D2PAssoc(
                    g, self.name, disease_id, pheno_id, onset, freq)
            elif asp == 'I':  # inheritance patterns for the whole disease
                assoc = DispositionAssoc(
                    g, self.name, disease_id, pheno_id)
            elif asp == 'C':  # clinical course / onset
                assoc = DispositionAssoc(
                    g, self.name, disease_id, pheno_id)
            else:
                logger.error("I don't know what this aspect is: %s", asp)
                # No association was built for this row.  Falling through
                # would hit an unbound `assoc` on the first row, or reuse
                # the previous row's association on later ones.
                continue

            assoc.add_evidence(eco_id)

            publist = re.split(r'[,;]', publist)
            # blow these apart if there is a list of pubs
            for pub in publist:
                pub = pub.strip()
                pubtype = None
                if pub == '':
                    continue

                # if re.match(
                #       r'http://www.ncbi.nlm.nih.gov/bookshelf/br\.fcgi\?book=gene',
                #        pub):
                #     #http://www.ncbi.nlm.nih.gov/bookshelf/br.fcgi?book=gene&part=ced
                #     m = re.search(r'part\=(\w+)', pub)
                #     pub_id = 'GeneReviews:'+m.group(1)
                # elif re.search(
                #        r'http://www.orpha.net/consor/cgi-bin/OC_Exp\.php\?lng\=en\&Expert\=',
                #        pub):
                #     m = re.search(r'Expert=(\d+)', pub)
                #     pub_id = 'Orphanet:'+m.group(1)

                if re.match(r'(PMID|ISBN-13|ISBN-10|ISBN|HPO)', pub):
                    if re.match(r'PMID', pub):
                        pubtype = Reference.ref_types['journal_article']
                    elif re.match(r'HPO', pub):
                        pubtype = Reference.ref_types['person']
                    else:
                        pubtype = Reference.ref_types['publication']
                    r = Reference(g, pub, pubtype)
                    r.addRefToGraph()
                elif re.match(r'(OMIM|Orphanet|DECIPHER)', pub):
                    # make the pubs a reference to the website,
                    # instead of the curie
                    if re.match(r'OMIM', pub):
                        omimnum = re.sub(r'OMIM:', '', pub)
                        omimurl = '/'.join(
                            ('http://omim.org/entry', str(omimnum).strip()))
                        pub = omimurl
                    elif re.match(r'Orphanet:', pub):
                        orphanetnum = re.sub(r'Orphanet:', '', pub)
                        orphaneturl = ''.join((
                            'http://www.orpha.net/consor/cgi-bin/OC_Exp.php?lng=en&Expert=',
                            str(orphanetnum)))
                        pub = orphaneturl
                    elif re.match(r'DECIPHER:', pub):
                        deciphernum = re.sub(r'DECIPHER:', '', pub)
                        decipherurl = '/'.join(
                            ('https://decipher.sanger.ac.uk/syndrome',
                             deciphernum))
                        pub = decipherurl
                    pubtype = Reference.ref_types['webpage']
                elif re.match(r'http', pub):
                    pass
                else:
                    logger.error(
                        'Unknown pub type for %s: %s', disease_id, pub)
                    print(disease_id, 'pubs:', str(publist))
                    continue

                if pub is not None:
                    assoc.add_source(pub)

                # TODO add curator

            assoc.add_association_to_graph()

    return
def _process_kegg_disease2gene(self, limit=None):
    """
    Associate KEGG diseases with their genes via a variant-locus node.

    We are being conservative here: a row is only turned into an
    association when its disease is absent from ``self.kegg_disease_hash``
    (i.e. has no OMIM mapping) and its label does not mark it as a
    grouping class.

    Triples created:
    <alternate_locus> is an Individual
    <alternate_locus> has type <variant_locus>
    <alternate_locus> is an allele of  <gene_id>
    <assoc_id> has subject <disease_id>
    <assoc_id> has object <gene_id>

    :param limit: in non-test mode, stop after the reader has passed this
        many lines; None means no limit
    :return:
    """
    src_key = 'disease_gene'
    LOG.info("Processing KEGG disease to gene")

    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)
    geno = Genotype(graph)
    rel = self.globaltt['is marker for']
    noomimset = set()

    raw = '/'.join((self.rawdir, self.files[src_key]['file']))
    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for raw_gene, raw_disease in reader:
            # the test-id filter is applied to the un-prefixed gene id
            if self.test_mode and raw_gene not in self.test_ids['genes']:
                continue

            gene_id = 'KEGG-' + raw_gene.strip()
            disease_id = 'KEGG-' + raw_disease.strip()

            # only add diseases for which
            # there is no omim id and not a grouping class
            lacks_omim = disease_id not in self.kegg_disease_hash
            if lacks_omim:
                disease_label = (
                    self.label_hash[disease_id]
                    if disease_id in self.label_hash else None)

                # they use 'including' when it's a grouping class
                if re.search(r'includ', str(disease_label)):
                    LOG.info(
                        "Skipping association because it's a grouping class: %s",
                        disease_label)
                    continue

                # type this disease_id as a disease
                model.addClassToGraph(
                    disease_id, disease_label,
                    class_category=blv.terms['Disease'])
                # , class_type=self.globaltt['disease'])
                noomimset.add(disease_id)

                alt_locus_id = self._make_variant_locus_id(
                    gene_id, disease_id)
                alt_label = self.label_hash[alt_locus_id]
                model.addIndividualToGraph(
                    alt_locus_id, alt_label, self.globaltt['variant_locus'])
                geno.addAffectedLocus(alt_locus_id, gene_id)
                model.addBlankNodeAnnotation(alt_locus_id)

                # Add the disease to gene relationship.
                G2PAssoc(
                    graph, self.name, alt_locus_id, disease_id, rel
                ).add_association_to_graph()

            over_limit = limit is not None and reader.line_num > limit
            if not self.test_mode and over_limit:
                break

    LOG.info("Done with KEGG disease to gene")
    LOG.info("Found %d diseases with no omim id", len(noomimset))
def _process_nlx_157874_1_view(self, raw, limit=None):
    """
    This table contains the Elements of Morphology data .
    Note that foaf:depiction is inverse of foaf:depicts relationship.

    Since it is bad form to have two definitions,
    we concatenate the two into one string.

    Turtle:
        <eom id> a owl:Class
            rdfs:label Literal(eom label)
            oboInOwl:has_related_synonym Literal(synonym list)
            IAO:definition Literal(objective_def. subjective def)
            foaf:depiction Literal(small_image_url),
                           Literal(large_image_url)
            foaf:page Literal(page_url)
            rdfs:comment Literal(long commented text)

    TEC_note: URL are not literals.

    :param raw: path of the tab-delimited table; the first row is a header
    :param limit: stop after the reader has passed this many lines;
        None means no limit
    :return:
    """
    src_key = 'tables'
    model = Model(self.graph)
    col = self.resources[src_key]['columns']
    used_columns = (
        'morphology_term_id', 'morphology_term_label', 'morphology_term_url',
        'objective_definition', 'subjective_definition', 'comments',
        'synonyms', 'replaces', 'small_figure_url', 'large_figure_url')

    with open(raw, 'r') as rawread:
        reader = csv.reader(rawread, delimiter='\t', quotechar='\"')
        row = next(reader)
        if not self.check_fileheader(col, row):
            pass

        # Column positions do not change between rows; look them up once.
        pos = {name: col.index(name) for name in used_columns}

        for row in reader:
            # head -1 dvp.pr_nlx_157874_1|tr '\t' '\n'|
            # sed "s|\(.*\)|# \1 = row[col.index('\1')]|g"
            morphology_term_id = row[pos['morphology_term_id']].strip()
            morphology_term_label = row[pos['morphology_term_label']].strip()
            morphology_term_url = row[pos['morphology_term_url']].strip()
            objective_definition = row[pos['objective_definition']].strip()
            subjective_definition = row[pos['subjective_definition']].strip()
            comments = row[pos['comments']].strip()
            synonyms = row[pos['synonyms']].strip()
            replaces = row[pos['replaces']].strip()
            small_figure_url = row[pos['small_figure_url']].strip()
            large_figure_url = row[pos['large_figure_url']].strip()

            # Add morphology term to graph as a class
            # with label, type, and description.
            model.addClassToGraph(
                morphology_term_id, morphology_term_label,
                blv.terms['PhenotypicFeature'])

            # Assemble the description text
            if subjective_definition != '' and not (
                    re.match(r'.+\.$', subjective_definition)):
                # add a trailing period.
                subjective_definition = subjective_definition + '.'
            if objective_definition != '' and not (
                    re.match(r'.+\.$', objective_definition)):
                # add a trailing period.
                objective_definition = objective_definition + '.'

            definition = ' '.join(
                (objective_definition, subjective_definition))

            model.addDefinition(
                morphology_term_id, definition,
                class_category=blv.terms['PhenotypicFeature'])

            # <term id> FOAF:depicted_by literal url
            # <url> type foaf:depiction

            # do we want both images?
            # morphology_term_id has depiction small_figure_url
            if small_figure_url != '':
                model.addDepiction(morphology_term_id, small_figure_url)

            # morphology_term_id has depiction large_figure_url
            if large_figure_url != '':
                model.addDepiction(morphology_term_id, large_figure_url)

            # morphology_term_id has comment comments
            if comments != '':
                model.addComment(morphology_term_id, comments)

            # ''.split(';') yields [''], so blank pieces are skipped rather
            # than emitted as empty-string synonyms.
            for syn in synonyms.split(';'):
                syn = syn.strip()
                if syn != '':
                    model.addSynonym(
                        morphology_term_id, syn,
                        self.globaltt['has_exact_synonym'])

            # morphology_term_id has_related_synonym replaces (; delimited)
            if replaces not in ['', synonyms]:
                for syn in replaces.split(';'):
                    # str.strip returns a new string; the result used to be
                    # discarded, leaving padded/whitespace-only synonyms.
                    syn = syn.strip()
                    if syn != '':
                        model.addSynonym(
                            morphology_term_id, syn,
                            self.globaltt['has_related_synonym'])

            # <morphology_term_id> <foaf:page> morphology_term_url
            if morphology_term_id is not None:
                reference = Reference(
                    self.graph, morphology_term_id,
                    self.globaltt['web page'])

                # TEC 201905:
                # Not so sure we need explicit   <eom_uri> <webpage> <eom_url>.
                # since <eom_uri> IS the <eom_url>.

                reference.addPage(morphology_term_id, morphology_term_url)

            if limit is not None and reader.line_num > limit:
                break
def _get_equivids(self, limit):
    """
    The file processed here is of the format:
    #NBK_id GR_shortname    OMIM
    NBK1103 trimethylaminuria       136132
    NBK1103 trimethylaminuria       602079
    NBK1104 cdls    122470
    Where each of the rows represents a mapping between
    a gr id and an omim id. These are a 1:many relationship,
    and some of the omim ids are genes(not diseases).
    Therefore, we need to create a loose coupling here.
    We make the assumption that these NBKs are generally higher-level
    grouping classes; therefore the OMIM ids are treated as subclasses.

    Reads ``self.omim_replaced`` and ``self.omim_type``; both are used
    here keyed by bare OMIM numbers (they are intersected with the numbers
    read from the file).

    :param limit: in non-test mode, stop after the reader has passed this
        many lines; None means no limit
    """
    raw = '/'.join((self.rawdir, self.files['idmap']['file']))
    model = Model(self.graph)
    LOG.info('Looping over %s', raw)
    # we look some stuff up in OMIM, so initialize here
    # omim = OMIM(self.graph_type, self.are_bnodes_skized)
    id_map = {}
    allomimids = set()
    col = ['NBK_id', 'GR_shortname', 'OMIM']

    with open(raw, 'r', encoding="utf8") as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        row = next(reader)
        row[0] = row[0][1:]  # drop the leading '#' of the header row
        if not self.check_fileheader(col, row):
            pass

        for row in reader:
            nbk_num = row[col.index('NBK_id')]
            shortname = row[col.index('GR_shortname')]
            omim_num = row[col.index('OMIM')]
            gr_id = 'GeneReviews:' + nbk_num
            omim_id = 'OMIM:' + omim_num

            # in test mode keep only the rows for the test ids
            if self.test_mode and not (
                    len(self.test_ids) > 0 and omim_id in self.test_ids):
                continue

            # sometimes there's bad omim nums
            omim_num = omim_num.strip()
            if len(omim_num) != 6:
                LOG.warning(
                    "OMIM number incorrectly formatted in row %i; skipping:\n%s",
                    reader.line_num, '\t'.join(row))
                continue

            # build up a hashmap of the mappings; then process later
            id_map.setdefault(nbk_num, set()).add(omim_num)

            # add the class along with the shortname
            model.addClassToGraph(gr_id, None)
            model.addSynonym(gr_id, shortname)

            allomimids.add(omim_num)

            if not self.test_mode and limit is not None \
                    and reader.line_num > limit:
                break
        # end looping through file

    # given all_omim_ids from GR,
    # we want to update any which are changed or removed
    # before deciding which are disease / phenotypes
    replaced = allomimids & self.omim_replaced.keys()
    if replaced:
        LOG.warning(
            "These OMIM ID's are past their pull date: %s", str(replaced))
        for oid in replaced:
            allomimids.remove(oid)
            replacements = self.omim_replaced[oid]
            # set.update() iterates its argument, so feeding it one id
            # string at a time (as before) inserted single characters.
            # NOTE(review): values are presumably collections of id
            # strings; a lone string is accepted too -- confirm upstream.
            if isinstance(replacements, str):
                replacements = [replacements]
            allomimids.update(replacements)

    # guard against omim identifiers which have been removed
    obsolete = {
        o for o in self.omim_type
        if self.omim_type[o] == self.globaltt['obsolete']}
    removed = allomimids & obsolete
    if removed:
        LOG.warning("These OMIM ID's are gone: %s", str(removed))
        for oid in removed:
            allomimids.remove(oid)

    # filter for disease /phenotype types (we can argue about what is included)
    phenotype_types = (
        self.globaltt['phenotype'],
        self.globaltt['has_affected_feature'],  # both a gene and a phenotype
        self.globaltt['heritable_phenotypic_marker'])  # probable phenotype
    omim_phenotypes = {
        omim for omim in self.omim_type
        if self.omim_type[omim] in phenotype_types}
    LOG.info(
        "Have %i omim_ids globally typed as phenotypes from OMIM",
        len(omim_phenotypes))

    entries_that_are_phenotypes = allomimids & omim_phenotypes
    LOG.info(
        "Filtered out %d/%d entries that are genes or features",
        len(allomimids - entries_that_are_phenotypes), len(allomimids))

    for nbk_num in self.book_ids:
        gr_id = 'GeneReviews:' + nbk_num
        for omim_num in id_map.get(nbk_num, ()):
            # add the gene reviews as a superclass to the omim id,
            # but only if the omim id is not a gene.
            # entries_that_are_phenotypes holds bare numbers, so the bare
            # number is what must be tested; testing the 'OMIM:' curie
            # could never match and no subclass axiom was ever written.
            if omim_num in entries_that_are_phenotypes:
                omim_id = 'OMIM:' + omim_num
                model.addClassToGraph(omim_id, None)
                model.addSubClass(omim_id, gr_id)
        # add this as a generic subclass  -- TEC: this is the job of inference
        model.addSubClass(gr_id, self.globaltt['disease or disorder'])
def _process_ortholog_classes(self, limit=None):
    """
    This method add the KEGG orthology classes to the graph.

    If there's an embedded enzyme commission number,
    that is added as an xref.

    Triples created:
    <orthology_class_id> is a class
    <orthology_class_id> has label <orthology_symbols>
    <orthology_class_id> has description <orthology_description>

    :param limit: in non-test mode, stop after the reader has passed this
        many lines; None means no limit
    :return:
    """
    src_key = 'ortholog_classes'
    LOG.info("Processing ortholog classes")

    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)

    raw = '/'.join((self.rawdir, self.files[src_key]['file']))
    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for raw_id, raw_name in reader:
            # the test-id filter is applied to the un-prefixed id
            if self.test_mode and raw_id not in self.test_ids[src_key]:
                continue

            # The orthology class is essentially a KEGG gene ID
            # that is species agnostic.
            # Add the ID and label as a gene family class
            labels = re.split(r'[;,]', raw_name)
            class_curie = 'KEGG-' + raw_id.strip()
            # the first one is the label we'll use
            model.addClassToGraph(
                class_curie, labels[0], self.globaltt['gene_family'])

            if len(labels) > 1:
                # add the rest as synonyms
                # todo skip the first
                for label in labels:
                    model.addSynonym(class_curie, label.strip())

                # add the last one as the description
                description = labels[-1]
                model.addDescription(class_curie, description)

                # add the enzyme commission number (EC:1.2.99.5)as an xref
                # sometimes there's two, like [EC:1.3.5.1 1.3.5.4]
                # can also have a dash, like EC:1.10.3.-
                for ec_number in re.findall(
                        r'((?:\d+|\.|-){5,7})', description):
                    model.addXref(class_curie, 'EC:' + ec_number)

            past_limit = limit is not None and reader.line_num > limit
            if not self.test_mode and past_limit:
                break

    LOG.info("Done with ortholog classes")
def _get_chrbands(self, limit, taxon, genome_id=None):
    """
    For the given taxon, it will fetch the chr band file.
    We will not deal with the coordinate information with this parser.
    Here, we only are concerned with building the partonomy.

    Each data line is tab-delimited: chrom, start, stop, band, rtype.
    Lines starting with '#' and blank lines are skipped, as is any
    chromosome name that is not a placed scaffold (chr<N>, chrX, ...).

    :param limit: stop once more than this many lines have been read;
        None means no limit
    :param taxon: taxon number as a string; key into ``self.files``
    :param genome_id: genome identifier; generated from the taxon when None
    :return:
    """
    model = Model(self.graph)
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
    LOG.info("Processing Chr bands from FILE: %s", myfile)
    geno = Genotype(self.graph)

    # build the organism's genome from the taxon
    genome_label = self.files[taxon]['genome_label']
    taxon_id = 'NCBITaxon:' + taxon

    # add the taxon as a class.  adding the class label elsewhere
    model.addClassToGraph(taxon_id, None)
    model.addSynonym(taxon_id, genome_label)

    if genome_id is None:
        genome_id = geno.makeGenomeID(taxon_id)  # makes a blank node allways

    geno.addGenome(taxon_id, genome_label, genome_id)
    model.addOWLPropertyClassRestriction(
        genome_id, self.globaltt['in taxon'], taxon_id)

    placed_scaffold_pattern = r'chr(\d+|X|Y|Z|W|MT|M)'
    # currently unused patterns
    # unlocalized_scaffold_pattern = placed_scaffold_pattern + r'_(\w+)_random'
    # unplaced_scaffold_pattern = r'chrUn_(\w+)'

    col = ['chrom', 'start', 'stop', 'band', 'rtype']
    with gzip.open(myfile, 'rb') as reader:
        for line in reader:
            line_counter += 1
            # skip comments; a blank line used to raise IndexError on
            # line[0], so it is skipped here as well
            line = line.decode().strip()
            if not line or line[0] == '#':
                continue
            # chr13	4500000	10000000	p12	stalk
            row = line.split('\t')
            chrom = row[col.index('chrom')]
            band = row[col.index('band')]
            rtype = row[col.index('rtype')]
            # NOTE
            # some less-finished genomes have placed and unplaced scaffolds
            # * Placed scaffolds:
            #    Scaffold has an oriented location within a chromosome.
            # * Unlocalized scaffolds:
            #     scaffold 's chromosome  is known,
            #     scaffold's position, orientation or both is not known.
            # *Unplaced scaffolds:
            #   it is not known which chromosome the scaffold belongs to.
            # find out if the thing is a full on chromosome, or a scaffold:
            # ex: unlocalized scaffold: chr10_KL568008v1_random
            # ex: unplaced scaffold: chrUn_AABR07022428v1

            mch = re.match(placed_scaffold_pattern+r'$', chrom)
            if mch is None or len(mch.groups()) != 1:
                # let's skip over anything that isn't a placed_scaffold
                LOG.info("Skipping non-placed chromosome %s", chrom)
                continue
            # the chromosome is the first match of the pattern
            # chrom = m.group(1)  # TODO unused

            # the chrom class, taxon as the reference
            cclassid = makeChromID(chrom, taxon, 'CHR')

            # add the chromosome as a class
            geno.addChromosomeClass(chrom, taxon_id, genome_label)
            model.addOWLPropertyClassRestriction(
                cclassid, self.globaltt['member of'], genome_id)

            # add the band(region) as a class
            maplocclass_id = cclassid+band
            maplocclass_label = makeChromLabel(chrom+band, genome_label)
            if band is not None and band.strip() != '':
                region_type_id = self.map_type_of_region(rtype)
                model.addClassToGraph(
                    maplocclass_id, maplocclass_label, region_type_id)
            else:
                region_type_id = self.globaltt['chromosome']

            # add the staining intensity of the band
            if re.match(r'g(neg|pos|var)', rtype):
                if region_type_id in [
                        self.globaltt['chromosome_band'],
                        self.globaltt['chromosome_subband']]:
                    # resolve once and reuse the result for the restriction
                    stain_type = self.resolve(rtype)
                    if stain_type is not None:
                        model.addOWLPropertyClassRestriction(
                            maplocclass_id,
                            self.globaltt['has_sequence_attribute'],
                            stain_type)
                else:
                    # usually happens if it's a chromosome because
                    # they don't actually have banding info
                    LOG.info("feature type %s != chr band", region_type_id)
            else:
                LOG.info('staining type not found for: %s', rtype)

            # get the parent bands, and make them unique
            parents = list(self.make_parent_bands(band, set()))
            # alphabetical sort will put them in smallest to biggest
            parents.sort(reverse=True)

            # print("PARENTS of", maplocclass_id, "=", parents)
            # add the parents to the graph, in hierarchical order
            # TODO this is somewhat inefficient due to
            # re-adding upper-level nodes when iterating over the file
            for prnt in parents:
                parent = prnt.strip()
                if parent is None or parent == "":
                    continue
                pclassid = cclassid + parent  # class chr parts
                pclass_label = makeChromLabel(chrom + parent, genome_label)
                rti = getChrPartTypeByNotation(parent, self.graph)
                model.addClassToGraph(pclassid, pclass_label, rti)

                # for canonical chromosomes,
                # then the subbands are subsequences of the full band
                # add the subsequence stuff as restrictions
                if prnt != parents[-1]:
                    # the next (larger) band in the sorted list
                    grandparent = 1 + parents.index(prnt)
                    pid = cclassid + parents[grandparent]   # the instance
                    model.addOWLPropertyClassRestriction(
                        pclassid, self.globaltt['is subsequence of'], pid)
                    model.addOWLPropertyClassRestriction(
                        pid, self.globaltt['has subsequence'], pclassid)
                else:
                    # add the last one (p or q usually)
                    #   as attached to the chromosome
                    model.addOWLPropertyClassRestriction(
                        pclassid, self.globaltt['is subsequence of'],
                        cclassid)
                    model.addOWLPropertyClassRestriction(
                        cclassid, self.globaltt['has subsequence'],
                        pclassid)

            # connect the band here to the first one in the parent list
            if len(parents) > 0:
                model.addOWLPropertyClassRestriction(
                    maplocclass_id, self.globaltt['is subsequence of'],
                    cclassid + parents[0])
                model.addOWLPropertyClassRestriction(
                    cclassid + parents[0], self.globaltt['has subsequence'],
                    maplocclass_id)

            if limit is not None and line_counter > limit:
                break
def _process_data(self, raw, limit=None):
    """
    Read the gzipped, comma-delimited file at ``raw`` and add one
    genotype-to-phenotype association (plus its supporting nodes) per row.

    For every data row this adds, in order: the gene, allele and sequence
    alteration; the colony and its indeterminate-zygosity genotype; the
    sex-agnostic genotype with its VSLC and (when a strain is given) a
    phenotyping-center-specific genomic background; the sex-qualified
    genotype; and finally the association with its evidence, provenance
    and free-text description.

    A row is skipped when, in test mode, its marker is not in
    ``self.gene_ids``, or when it carries no phenotype id (the genotype
    nodes for such a row have already been added by then).

    :param raw: path of the gzipped CSV file to read
    :param limit: when not None and not in test mode, stop after the row
        on which the csv reader's ``line_num`` exceeds this value
    :return: None
    """
    LOG.info("Processing Data from %s", raw)

    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)
    geno = Genotype(graph)

    # Add the taxon as a class
    taxon_id = self.globaltt['Mus musculus']
    model.addClassToGraph(taxon_id, None)

    col = self.files['all']['columns']
    # the only columns of the file that are read below
    used_columns = (
        'marker_accession_id', 'marker_symbol', 'phenotyping_center',
        'colony_id', 'sex', 'zygosity', 'allele_accession_id',
        'allele_symbol', 'strain_accession_id', 'strain_name',
        'project_fullname', 'pipeline_name', 'pipeline_stable_id',
        'procedure_stable_id', 'procedure_name', 'parameter_stable_id',
        'parameter_name', 'mp_term_id', 'mp_term_name', 'p_value',
        'percentage_change', 'effect_size', 'statistical_method',
        'resource_name')

    with gzip.open(raw, 'rt') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='\"')
        row = next(reader)  # presumed header
        if not self.check_fileheader(col, row):
            # a mismatch is tolerated (best effort), but no longer silently
            LOG.warning(
                "Header of %s differs from the expected columns; "
                "reading by the configured column order anyway", raw)

        # resolve each column position once rather than once per cell
        position = {name: col.index(name) for name in used_columns}

        for row in reader:
            cell = {name: row[idx].strip() for name, idx in position.items()}
            marker_accession_id = cell['marker_accession_id']
            marker_symbol = cell['marker_symbol']
            phenotyping_center = cell['phenotyping_center']
            colony_raw = cell['colony_id']
            sex = cell['sex']
            zygosity = cell['zygosity']
            allele_accession_id = cell['allele_accession_id']
            allele_symbol = cell['allele_symbol']
            strain_accession_id = cell['strain_accession_id']
            strain_name = cell['strain_name']
            project_fullname = cell['project_fullname']
            pipeline_name = cell['pipeline_name']
            pipeline_stable_id = cell['pipeline_stable_id']
            procedure_stable_id = cell['procedure_stable_id']
            procedure_name = cell['procedure_name']
            parameter_stable_id = cell['parameter_stable_id']
            parameter_name = cell['parameter_name']
            mp_term_id = cell['mp_term_id']
            mp_term_name = cell['mp_term_name']
            p_value = cell['p_value']
            percentage_change = cell['percentage_change']
            effect_size = cell['effect_size']
            statistical_method = cell['statistical_method']
            resource_name = cell['resource_name']

            if self.test_mode and marker_accession_id not in self.gene_ids:
                continue

            # ##### cleanup some of the identifiers ######
            zygosity_id = self.resolve(zygosity)
            if zygosity_id == zygosity:
                # resolve() handed the input back, i.e. nothing was mapped
                LOG.warning(
                    "Zygosity '%s' unmapped. setting to indeterminate", zygosity)
                zygosity_id = self.globaltt['indeterminate']

            # colony ids sometimes have <> in them, spaces,
            # or other non-alphanumerics and break our system;
            # replace these with underscores
            colony_id = '_:' + re.sub(r'\W+', '_', colony_raw)

            if not re.match(r'MGI', allele_accession_id):
                allele_accession_id = '_:IMPC-' + re.sub(
                    r':', '', allele_accession_id)

            if re.search(r'EUROCURATE', strain_accession_id):
                # the eurocurate links don't resolve at IMPC
                # TODO blank nodes do not maintain identifiers
                strain_accession_id = '_:' + strain_accession_id
            elif not re.match(r'MGI', strain_accession_id):
                LOG.info(
                    "Found a strange strain accession...%s", strain_accession_id)
                strain_accession_id = 'IMPC:' + strain_accession_id

            ######################
            # first, add the marker and variant to the graph as with MGI,
            # the allele is the variant locus.  IF the marker is not known,
            # we will call it a sequence alteration.  otherwise,
            # we will create a BNode for the sequence alteration.

            # extract out what's within the <> to get the symbol;
            # without angle brackets the whole allele symbol is used
            bracketed = re.match(r'.*<(.*)>', allele_symbol)
            if bracketed is not None:
                sequence_alteration_name = bracketed.group(1)
            else:
                sequence_alteration_name = allele_symbol

            if marker_accession_id == '':
                LOG.warning("Marker unspecified on row %d", reader.line_num)
                marker_accession_id = None

            if marker_accession_id is not None:
                variant_locus_id = allele_accession_id
                variant_locus_name = allele_symbol
                variant_locus_type = self.globaltt['variant_locus']
                geno.addGene(
                    marker_accession_id, marker_symbol, self.globaltt['gene'])
                geno.addAllele(
                    variant_locus_id, variant_locus_name, variant_locus_type,
                    None)
                geno.addAlleleOfGene(variant_locus_id, marker_accession_id)

                # TAG bnode
                sequence_alteration_id = '_:seqalt' + re.sub(
                    r':', '', allele_accession_id)
                geno.addSequenceAlterationToVariantLocus(
                    sequence_alteration_id, variant_locus_id)
            else:
                sequence_alteration_id = allele_accession_id

            # IMPC contains targeted mutations with either gene traps,
            # knockouts, insertion/intragenic deletions.
            # but I don't really know what the SeqAlt is here,
            # so I don't add it.
            geno.addSequenceAlteration(
                sequence_alteration_id, sequence_alteration_name)

            # #############    BUILD THE COLONY    #############
            # First, let's describe the colony that the animals come from
            # The Colony ID refers to the ES cell clone
            #   used to generate a mouse strain.
            # Terry sez: we use this clone ID to track
            #   ES cell -> mouse strain -> mouse phenotyping.
            # The same ES clone maybe used at multiple centers,
            # so we have to concatenate the two to have a unique ID.
            # some useful reading about generating mice from ES cells:
            # http://ki.mit.edu/sbc/escell/services/details

            # here, we'll make a genotype
            # that derives from an ES cell with a given allele.
            # the strain is not really attached to the colony.

            # the colony/clone is reflective of the allele, with unknown zygosity
            stem_cell_class = self.globaltt['embryonic stem cell line']
            model.addIndividualToGraph(colony_id, colony_raw, stem_cell_class)

            # vslc of the colony has unknown zygosity
            # note that we will define the allele
            # (and it's relationship to the marker, etc.) later
            # FIXME is it really necessary to create this vslc
            # when we always know it's unknown zygosity?
            vslc_colony = '_:' + re.sub(
                r':', '', allele_accession_id + self.globaltt['indeterminate'])
            vslc_colony_label = allele_symbol + '/<?>'
            # for ease of reading, we make the colony genotype variables.
            # in the future, it might be desired to keep the vslcs
            colony_genotype_id = vslc_colony
            colony_genotype_label = vslc_colony_label
            geno.addGenotype(colony_genotype_id, colony_genotype_label)
            geno.addParts(
                allele_accession_id, colony_genotype_id,
                self.globaltt['has_variant_part'])
            geno.addPartsToVSLC(
                vslc_colony, allele_accession_id, None,
                self.globaltt['indeterminate'],
                self.globaltt['has_variant_part'])
            graph.addTriple(
                colony_id, self.globaltt['has_genotype'], colony_genotype_id)

            # ##########    BUILD THE ANNOTATED GENOTYPE    ##########
            # now, we'll build the genotype of the individual that derives
            # from the colony/clone genotype that is attached to
            # phenotype = colony_id + strain + zygosity + sex
            # (and is derived from a colony)

            # this is a sex-agnostic genotype
            genotype_id = self.make_id(
                (colony_id + phenotyping_center + zygosity +
                 strain_accession_id))
            geno.addSequenceDerivesFrom(genotype_id, colony_id)

            # build the VSLC of the sex-agnostic genotype
            # based on the zygosity
            allele1_id = allele_accession_id
            allele2_id = allele2_rel = None
            allele1_label = allele_symbol
            allele2_label = '<?>'
            # Making VSLC labels from the various parts,
            # can change later if desired.
            if zygosity == 'heterozygote':
                allele2_label = re.sub(r'<.*', '<+>', allele1_label)
                allele2_id = None
            elif zygosity == 'homozygote':
                allele2_label = allele1_label
                allele2_id = allele1_id
                allele2_rel = self.globaltt['has_variant_part']
            elif zygosity == 'hemizygote':
                allele2_label = re.sub(r'<.*', '<0>', allele1_label)
                allele2_id = None
            elif zygosity == 'not_applicable':
                allele2_label = re.sub(r'<.*', '<?>', allele1_label)
                allele2_id = None
            else:
                # NOTE(review): this abandons every remaining row of the
                # file, not just this one -- confirm that is intended
                LOG.warning("found unknown zygosity %s", zygosity)
                break
            vslc_name = '/'.join((allele1_label, allele2_label))

            # Add the VSLC
            # the marker is None when the row had none; leave it out of the
            # id instead of letting str.join() raise a TypeError on it
            vslc_id = '-'.join(
                part for part in (
                    marker_accession_id, allele_accession_id, zygosity)
                if part is not None)
            vslc_id = re.sub(r':', '', vslc_id)
            vslc_id = '_:' + vslc_id
            model.addIndividualToGraph(
                vslc_id, vslc_name,
                self.globaltt['variant single locus complement'])
            geno.addPartsToVSLC(
                vslc_id, allele1_id, allele2_id, zygosity_id,
                self.globaltt['has_variant_part'], allele2_rel)

            # add vslc to genotype
            geno.addVSLCtoParent(vslc_id, genotype_id)

            # note that the vslc is also the gvc
            model.addType(
                vslc_id, self.globaltt['genomic_variation_complement'])

            # Add the genomic background
            # create the genomic background id and name
            if strain_accession_id != '':
                genomic_background_id = strain_accession_id
            else:
                genomic_background_id = None

            genotype_name = vslc_name
            if genomic_background_id is not None:
                geno.addGenotype(
                    genomic_background_id, strain_name,
                    self.globaltt['genomic_background'])

                # make a phenotyping-center-specific strain
                # to use as the background
                pheno_center_strain_label = strain_name + '-' + \
                    phenotyping_center + '-' + colony_raw
                pheno_center_strain_id = '-'.join((
                    re.sub(r':', '', genomic_background_id),
                    re.sub(r'\s', '_', phenotyping_center),
                    re.sub(r'\W+', '', colony_raw)))
                if not re.match(r'^_', pheno_center_strain_id):
                    # Tag bnode
                    pheno_center_strain_id = '_:' + pheno_center_strain_id

                geno.addGenotype(
                    pheno_center_strain_id, pheno_center_strain_label,
                    self.globaltt['genomic_background'])
                geno.addSequenceDerivesFrom(
                    pheno_center_strain_id, genomic_background_id)

                # Making genotype labels from the various parts,
                # can change later if desired.
                # since the genotype is reflective of the place
                # it got made, should put that in to disambiguate
                genotype_name = \
                    genotype_name + ' [' + pheno_center_strain_label + ']'
                geno.addGenomicBackgroundToGenotype(
                    pheno_center_strain_id, genotype_id)
                geno.addTaxon(taxon_id, pheno_center_strain_id)
            # this is redundant, but i'll keep in in for now
            geno.addSequenceDerivesFrom(genotype_id, colony_id)
            geno.addGenotype(genotype_id, genotype_name)

            # Make the sex-qualified genotype,
            # which is what the phenotype is associated with
            sex_qualified_genotype_id = self.make_id(
                (colony_id + phenotyping_center + zygosity +
                 strain_accession_id + sex))
            sex_qualified_genotype_label = genotype_name + ' (' + sex + ')'

            sq_type_id = self.resolve(sex, False)
            if sq_type_id == sex:
                sq_type_id = self.globaltt['intrinsic_genotype']
                LOG.warning(
                    "Unknown sex qualifier %s, adding as intrinsic_genotype",
                    sex)

            geno.addGenotype(
                sex_qualified_genotype_id, sex_qualified_genotype_label,
                sq_type_id)
            geno.addParts(
                genotype_id, sex_qualified_genotype_id,
                self.globaltt['has_variant_part'])

            if genomic_background_id is not None and genomic_background_id != '':
                # Add the taxon to the genomic_background_id
                geno.addTaxon(taxon_id, genomic_background_id)
            else:
                # add it as the genomic background
                geno.addTaxon(taxon_id, genotype_id)

            # #############    BUILD THE G2P ASSOC    #############
            # from an old email dated July 23 2014:
            # Phenotypes associations are made to
            # imits colony_id+center+zygosity+gender
            phenotype_id = mp_term_id

            # it seems that sometimes phenotype ids are missing.
            # indicate here
            if phenotype_id is None or phenotype_id == '':
                LOG.warning(
                    "No phenotype id specified for row %d: %s",
                    reader.line_num, str(row))
                continue
            # hard coded ECO code
            eco_id = self.globaltt['mutant phenotype evidence']

            # the association comes as a result of a g2p from
            # a procedure in a pipeline at a center and parameter tested
            assoc = G2PAssoc(
                graph, self.name, sex_qualified_genotype_id, phenotype_id)
            assoc.add_evidence(eco_id)
            # assoc.set_score(float(p_value))

            # TODO add evidence instance using
            # pipeline_stable_id +
            # procedure_stable_id +
            # parameter_stable_id

            assoc.add_association_to_graph()
            assoc_id = assoc.get_association_id()

            model._addSexSpecificity(assoc_id, self.resolve(sex))

            # add a free-text description; fall back to the raw strings
            # when effect size or p-value are not parseable as floats
            try:
                description = ' '.join((
                    mp_term_name, 'phenotype determined by',
                    phenotyping_center, 'in an', procedure_name,
                    'assay where', parameter_name,
                    'was measured with an effect_size of',
                    str(round(float(effect_size), 5)),
                    '(p =', "{:.4e}".format(float(p_value)), ').'))
            except ValueError:
                description = ' '.join((
                    mp_term_name, 'phenotype determined by',
                    phenotyping_center, 'in an', procedure_name,
                    'assay where', parameter_name,
                    'was measured with an effect_size of',
                    str(effect_size),
                    '(p =', "{0}".format(p_value), ').'))

            study_bnode = self._add_study_provenance(
                phenotyping_center, colony_raw, project_fullname,
                pipeline_name, pipeline_stable_id, procedure_stable_id,
                procedure_name, parameter_stable_id, parameter_name,
                statistical_method, resource_name)

            evidence_line_bnode = self._add_evidence(
                assoc_id, eco_id, p_value, percentage_change, effect_size,
                study_bnode)

            self._add_assertion_provenance(assoc_id, evidence_line_bnode)

            model.addDescription(evidence_line_bnode, description)

            # resource_id = resource_name
            # assoc.addSource(graph, assoc_id, resource_id)

            if not self.test_mode and limit is not None and \
                    reader.line_num > limit:
                break
def _get_identifiers(self, limit):
    """
    This will process the id mapping file provided by Biogrid.
    The file has a very large header, which we scan past,
    then pull the identifiers, and make equivalence axioms

    Only lines whose organism is in the species filter are considered.
    For those, an identifier whose mapped prefix is in the gene id type
    filter becomes an equivalent class of the BIOGRID id, and an
    OFFICIAL_SYMBOL is attached to the BIOGRID class as its label.

    :param limit: when not None and not in test mode, stop once more than
        this many species-matching lines have been seen
    :return: None
    """
    logger.info("getting identifier mapping")
    line_counter = 0
    f = '/'.join((self.rawdir, self.files['identifiers']['file']))
    foundheader = False

    # TODO align this species filter with the one above
    # speciesfilters = 'Homo sapiens,Mus musculus,Drosophila melanogaster,
    # Danio rerio, Caenorhabditis elegans,Xenopus laevis'.split(',')
    # NOTE: the human entry must be spelled out exactly as it appears in the
    # ORGANISM_OFFICIAL_NAME column, otherwise no human line ever matches.
    speciesfilters = 'Homo sapiens,Mus musculus'.split(',')

    # TODO make these filters available as commandline options
    # geneidtypefilters='NCBIGene,OMIM,MGI,FlyBase,ZFIN,MGI,HGNC,
    #                   WormBase,XenBase,ENSEMBL,miRBase'.split(',')
    geneidtypefilters = 'NCBIGene,MGI,ENSEMBL,ZFIN,HGNC'.split(',')
    # proteinidtypefilters='HPRD,Swiss-Prot,NCBIProtein'

    # the target graph does not change from line to line; pick it once
    if self.testMode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)

    # the context manager closes the archive even if a line fails to parse
    with ZipFile(f, 'r') as myzip:
        # assume that the first entry is the item
        fname = myzip.namelist()[0]
        with myzip.open(fname, 'r') as csvfile:
            for line in csvfile:
                # skip header lines, up to and including the column header
                if not foundheader:
                    if re.match(r'BIOGRID_ID', line.decode()):
                        foundheader = True
                    continue

                line = line.decode().strip()
                # BIOGRID_ID
                # IDENTIFIER_VALUE
                # IDENTIFIER_TYPE
                # ORGANISM_OFFICIAL_NAME
                # 1	814566	ENTREZ_GENE	Arabidopsis thaliana
                (biogrid_num, id_num, id_type, organism_label) = \
                    line.split('\t')

                # skip any genes that don't match our test set
                if self.testMode and \
                        int(biogrid_num) not in self.biogrid_ids:
                    continue

                # for each one of these,
                # create the node and add equivalent classes
                biogrid_id = 'BIOGRID:' + biogrid_num
                prefix = self.localtt[id_type]

                if organism_label.strip() in speciesfilters:
                    line_counter += 1
                    if prefix in geneidtypefilters:
                        mapped_id = ':'.join((prefix, id_num))
                        model.addEquivalentClass(biogrid_id, mapped_id)
                    # this symbol will only get attached to the biogrid class
                    elif id_type == 'OFFICIAL_SYMBOL':
                        model.addClassToGraph(biogrid_id, id_num)
                    # elif (id_type == 'SYNONYM'):
                    #   FIXME - i am not sure these are synonyms, altids?
                    #   gu.addSynonym(g,biogrid_id,id_num)

                if not self.testMode and limit is not None and \
                        line_counter > limit:
                    break

    return
def _process_phenotype_hpoa(self, raw, limit):
    """
    see info on format here:
    http://www.human-phenotype-ontology.org/contao/index.php/annotation-guide.html

    Reads the tab-delimited annotation file, sets the dataset version from
    the file's ctime and from the date found on the file's second line,
    then adds one disease-to-phenotype association per data row together
    with its evidence code, optional sex specificity and publications.

    Rows with an unrecognised Aspect are logged and skipped.

    :param raw: path of the annotation file
    :param limit: when not None and not in test mode, stop after the row
        on which the csv reader's ``line_num`` exceeds this value
    :return: None
    """
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)

    filedate = datetime.utcfromtimestamp(
        os.stat(raw)[ST_CTIME]).strftime("%Y-%m-%d")

    # this will cause two dates to be attached to the dataset
    # (one from the filedate, and the other from here)
    # TODO when #112 is implemented,
    # this will result in only the whole dataset being versioned

    col = self.files['hpoa']['columns']
    with open(raw, 'r', encoding="utf8") as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t', quotechar='\"')
        next(reader)  # first line is dropped
        # characters 9..18 of the second line's list repr hold the date;
        # presumably the line reads '#date: YYYY-MM-DD' -- TODO confirm
        vers = str(next(reader))[9:19]
        LOG.info("HPOA file version date: %s", vers)
        date = datetime.strptime(
            vers.strip(), '%Y-%m-%d').strftime("%Y-%m-%d-%H-%M")
        self.dataset.setVersion(filedate, date)

        for row in reader:
            # blank rows, comment rows and the column header carry no data
            if not row or not row[0] or row[0][0] == '#' \
                    or row[0] == 'DatabaseID':
                continue
            row = [str(value).strip() for value in row]

            disease_id = row[col.index('DatabaseID')]
            # 98246 OMIM
            # 68646 ORPHA
            # 297 DECIPHER

            if self.test_mode:
                try:
                    id_list = self.test_ids
                    if id_list is None or disease_id not in id_list:
                        continue
                except AttributeError:
                    continue

            pheno_id = row[col.index('HPO_ID')]
            eco_id = self.resolve(row[col.index('Evidence')])
            onset = row[col.index('Onset')]
            asp = row[col.index('Aspect')]
            freq = row[col.index('Frequency')]
            publist = row[col.index('Reference')]
            sex = row[col.index('Sex')].lower()

            # LOG.info(
            #    'adding <%s>-to-<%s> because <%s>', disease_id, pheno_id, eco_id)

            model.addClassToGraph(disease_id)
            model.addClassToGraph(pheno_id)
            model.addClassToGraph(eco_id)
            if onset:
                model.addClassToGraph(onset)

            if asp in ('P', 'M'):  # phenotype? abnormality or mortality
                assoc = D2PAssoc(  # default rel=self.globaltt['has phenotype']
                    graph, self.name, disease_id, pheno_id, onset, freq)
            elif asp in ('I', 'C'):
                # inheritance pattern or clinical course/onset
                assoc = D2PAssoc(
                    graph, self.name, disease_id, pheno_id,
                    rel=self.globaltt['has disposition'])
            else:
                LOG.error(
                    "Unknown aspect : %s at line %i", asp, reader.line_num)
                # no association was built for this row; going on would hit
                # an unbound name or reuse the previous row's association
                continue

            assoc.add_evidence(eco_id)

            if sex:
                # written to the same graph as the association itself
                graph.addTriple(
                    assoc.get_association_id(),
                    self.globaltt['has_sex_specificty'],
                    self.globaltt[sex])

            # Publication
            # cut -f 5 phenotype.hpoa | grep ";" | tr ';' '\n' | cut -f1 -d ':' |\
            # sort | uniq -c | sort -nr
            # 629 PMID
            # 63 OMIM
            # 42 ISBN-13
            # 36 http

            for pub in publist.split(';'):
                pub = pub.strip()
                if pub.startswith('PMID:'):
                    pubtype = self.globaltt['journal article']
                elif pub.startswith('ISBN'):
                    pubtype = self.globaltt['publication']
                elif pub.startswith('OMIM:'):
                    pub = 'http://omim.org/entry/' + pub[5:]
                    pubtype = self.globaltt['web page']
                elif pub.startswith('DECIPHER:'):
                    pubtype = self.globaltt['web page']
                elif pub.startswith('ORPHA:'):
                    pubtype = self.globaltt['web page']
                elif pub.startswith('http'):
                    pubtype = self.globaltt['web page']
                else:
                    LOG.error(
                        'Unknown pub type for disease %s from "%s"',
                        disease_id, pub)
                    continue

                assoc.add_source(pub)
                if pubtype is not None:
                    ref = Reference(graph, pub, pubtype)
                    # ref.setTitle('');  ref.setYear()
                    ref.addRefToGraph()
            # TODO add curator

            # pprint.pprint(assoc)

            assoc.add_association_to_graph()

            if not self.test_mode and limit is not None and \
                    reader.line_num > limit:
                break
    return
def _process_omim2gene(self, limit=None):
    """
    Map OMIM ids to KEGG gene ids, branching on each row's link type.

    * ``equivalent``: the OMIM id and the KEGG gene are both added, and an
      equivalence is asserted when the (possibly replaced) OMIM id is
      typed as a gene.
    * ``reverse``: a variant locus of the KEGG gene is created and
      associated with the OMIM id.
    * ``original``: logged and skipped.
    * anything else: logged as unhandled.

    Triples created:
    <kegg_gene_id> is a Gene
    <omim_gene_id> is a Gene
    <kegg_gene_id>> hasXref <omim_gene_id>

    <assoc_id> has subject <omim_disease_id>
    <assoc_id> has object <kegg_gene_id>

    <kegg_gene_id> biolink:category biolink:Gene
    <omim_gene_id> biolink:category biolink:Gene

    :param limit: when not None and not in test mode, stop after the row
        on which the csv reader's ``line_num`` exceeds this value
    :return: None
    """
    src_key = 'omim2gene'
    LOG.info("Processing OMIM to KEGG gene")

    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)
    geno = Genotype(graph)

    def is_omim_gene(candidate):
        # True only for ids that are known and typed as a gene
        return candidate in self.omim_type and \
            self.omim_type[candidate] == self.globaltt['gene']

    raw = '/'.join((self.rawdir, self.files[src_key]['file']))
    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for kegg_raw, omim_raw, link_type in reader:
            # the test filter is applied to the id as found in the file
            if self.test_mode and kegg_raw not in self.test_ids['genes']:
                continue

            kegg_gene_id = 'KEGG-' + kegg_raw.strip()
            omim_id = omim_raw.replace('omim', 'OMIM')

            if link_type == 'equivalent':
                # these are genes!
                # so add them as a class then make equivalence
                model.addClassToGraph(
                    omim_id, None, class_category=blv.terms['Gene'])
                geno.addGene(kegg_gene_id, None)

                # previous: if omim type is not disease-ish then use
                # now is:   if omim type is gene then use
                if omim_id in self.omim_replaced:
                    # the last gene-typed replacement wins
                    for replacement in self.omim_replaced[omim_id]:
                        if is_omim_gene(replacement):
                            omim_id = replacement
                if is_omim_gene(omim_id):
                    model.addEquivalentClass(kegg_gene_id, omim_id)

            elif link_type == 'reverse':
                # make an association between an OMIM ID & the KEGG gene ID
                # we do this with omim ids because
                # they are more atomic than KEGG ids
                alt_locus_id = self._make_variant_locus_id(
                    kegg_gene_id, omim_id)
                alt_label = self.label_hash[alt_locus_id]
                model.addIndividualToGraph(
                    alt_locus_id, alt_label, self.globaltt['variant_locus'])
                geno.addAffectedLocus(alt_locus_id, kegg_gene_id)
                model.addBlankNodeAnnotation(alt_locus_id)

                # Add the disease to gene relationship.
                rel = self.globaltt['is marker for']
                G2PAssoc(
                    graph, self.name, alt_locus_id, omim_id, rel
                ).add_association_to_graph()

            elif link_type == 'original':
                # these are sometimes a gene, and sometimes a disease
                LOG.info(
                    'Unable to handle original link for %s-%s',
                    kegg_gene_id, omim_id)
            else:
                # don't know what these are
                LOG.warning(
                    'Unhandled link type for %s-%s: %s',
                    kegg_gene_id, omim_id, link_type)

            if not self.test_mode and limit is not None \
                    and reader.line_num > limit:
                break

    LOG.info("Done with OMIM to KEGG gene")
def _process_genes(self, limit=None):
    """
    Read the tab-delimited HGNC gene file and add each gene to the graph.

    For every row this adds the gene class (or a deprecated class for
    withdrawn entries), its NCBIGene/ENSEMBL/OMIM equivalents, its taxon,
    the publications that are about it, and the chromosome band (or the
    bare chromosome) it is a subsequence of.

    :param limit: when not None and not in test mode, stop after the row
        on which the csv reader's ``line_num`` exceeds this value
    :return: None
    :raises SystemExit: when the file header does not pass
        ``self.check_fileheader``
    """
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph

    geno = Genotype(graph)
    model = Model(graph)

    raw = '/'.join((self.rawdir, self.files['genes']['file']))
    col = self.files['genes']['columns']
    LOG.info("Processing HGNC genes")

    # A chromosome name followed by an arm letter or by the end of the
    # string.  ('$' inside a character class is a literal dollar sign, so
    # the end-of-string case has to be spelled as an alternation.)
    chr_pattern = re.compile(r'(\d+|X|Y|Z|W|MT)(?:[pq]|$)')
    band_pattern = re.compile(r'([pq][A-H\d]?\d?(?:\.\d+)?)')

    # Only these columns are read; the file carries many more
    # (all of them are named in self.files['genes']['columns']).
    used_columns = (
        'hgnc_id', 'symbol', 'name', 'locus_type', 'location',
        'entrez_id', 'ensembl_gene_id', 'pubmed_id', 'omim_id')

    with open(raw, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')

        row = next(filereader)
        if not self.check_fileheader(col, row):
            # same exit status as before, without relying on the
            # site-provided exit() builtin
            raise SystemExit(-1)

        # resolve each column position once rather than once per cell
        position = {name: col.index(name) for name in used_columns}

        for row in filereader:
            cell = {name: row[idx].strip() for name, idx in position.items()}
            hgnc_id = cell['hgnc_id']
            symbol = cell['symbol']
            name = cell['name']
            locus_type = cell['locus_type']
            location = cell['location']
            entrez_id = cell['entrez_id']
            ensembl_gene_id = cell['ensembl_gene_id']
            pubmed_ids = cell['pubmed_id']  # pipe separated!
            omim_ids = cell['omim_id']  # pipe separated!

            if self.test_mode and entrez_id != '' and \
                    entrez_id not in self.gene_ids:
                continue

            if name == '':
                name = None

            if locus_type == 'withdrawn':
                model.addDeprecatedClass(hgnc_id)
            else:
                gene_type_id = self.resolve(locus_type, False)  # withdrawn -> None?
                # resolve() returning its input means nothing was mapped
                if gene_type_id != locus_type:
                    model.addClassToGraph(hgnc_id, symbol, gene_type_id, name)
                model.makeLeader(hgnc_id)

            if entrez_id != '':
                model.addEquivalentClass(hgnc_id, 'NCBIGene:' + entrez_id)

            if ensembl_gene_id != '':
                model.addEquivalentClass(hgnc_id, 'ENSEMBL:' + ensembl_gene_id)

            for omim_id in omim_ids.split('|'):
                if omim_id in self.omim_replaced:
                    repl = self.omim_replaced[omim_id]
                    LOG.warning('%s is replaced with %s', omim_id, repl)
                    for omim in repl:
                        # a replacement id may itself be unknown to
                        # omim_type; guard the lookup instead of raising
                        if omim in self.omim_type and \
                                self.omim_type[omim] == self.globaltt['gene']:
                            omim_id = omim

                if omim_id in self.omim_type and \
                        self.omim_type[omim_id] == self.globaltt['gene']:
                    model.addEquivalentClass(hgnc_id, 'OMIM:' + omim_id)

            geno.addTaxon(self.hs_txid, hgnc_id)

            # add pubs as "is about"
            for pubmed_id in pubmed_ids.split('|'):
                # an empty cell splits to [''], which must not become a
                # bare 'PMID:' node
                if pubmed_id == '':
                    continue
                graph.addTriple(
                    'PMID:' + pubmed_id, self.globaltt['is_about'], hgnc_id)

            # add chr location
            # sometimes two are listed, like: 10p11.2 or 17q25
            # -- there are only 2 of these FRA10A and MPFD
            # sometimes listed like "1 not on reference assembly"
            # sometimes listed like 10q24.1-q24.3
            # sometimes like 11q11 alternate reference locus
            chr_match = chr_pattern.match(location)
            if chr_match is not None:
                chrom = chr_match.group(1)
                chrom_id = makeChromID(chrom, self.hs_txid, 'CHR')
                band_match = band_pattern.search(location)
                feat = Feature(graph, hgnc_id, None, None)
                if band_match is not None:
                    band = chrom + band_match.group(1)
                    # add the chr band as the parent to this gene
                    # as a feature but assume that the band is created
                    # as a class with properties elsewhere in Monochrom
                    band_id = makeChromID(band, self.hs_txid, 'CHR')
                    model.addClassToGraph(band_id, None)
                    feat.addSubsequenceOfFeature(band_id)
                else:
                    # no band given: attach the gene to the chromosome
                    model.addClassToGraph(chrom_id, None)
                    feat.addSubsequenceOfFeature(chrom_id)

            if not self.test_mode and limit is not None and \
                    filereader.line_num > limit:
                break
def _process_omim2disease(self, limit=None):
    """
    This method maps the KEGG disease IDs to
    the corresponding OMIM disease IDs.
    Currently this only maps KEGG diseases and OMIM diseases that are 1:1.

    The whole file is first read into ``self.omim_disease_hash`` and
    ``self.kegg_disease_hash``; only then are the 1:1 pairs emitted.

    Triples created:
    <kegg_disease_id> is a class
    <omim_disease_id> is a class
    <kegg_disease_id> hasXref <omim_disease_id>

    <kegg_disease_id> biolink:category biolink:Disease
    <omim_disease_id> biolink:category biolink:Disease

    :param limit: when not None and not in test mode, at most this many
        OMIM ids are considered for the 1:1 mapping
    :return: None
    """
    src_key = 'omim'
    LOG.info("Processing 1:1 KEGG disease to OMIM disease mappings")
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)
    raw = '/'.join((self.rawdir, self.files[src_key]['file']))
    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in reader:
            (omim_disease_id, kegg_disease_id, link_type) = row

            kegg_disease_id = 'KEGG-' + kegg_disease_id.strip()
            omim_disease_id = re.sub(r'omim', 'OMIM', omim_disease_id)

            # Create hash for the links from OMIM ID -> KEGG ID
            if omim_disease_id not in self.omim_disease_hash:
                self.omim_disease_hash[omim_disease_id] = [kegg_disease_id]
            else:
                self.omim_disease_hash[omim_disease_id].append(
                    kegg_disease_id)

            # Create hash for the links from KEGG ID -> OMIM ID
            if kegg_disease_id not in self.kegg_disease_hash:
                self.kegg_disease_hash[kegg_disease_id] = [omim_disease_id]
            else:
                self.kegg_disease_hash[kegg_disease_id].append(
                    omim_disease_id)

    # Now process the disease hashes
    # and only pass 1:1 omim disease:KEGG disease entries.
    # The limit is counted here, over the ids actually considered: the
    # reader is exhausted by now, so its line_num no longer advances and
    # cannot serve as a progress counter.
    considered = 0
    for omim_disease_id in self.omim_disease_hash:
        if self.test_mode and \
                omim_disease_id not in self.test_ids['disease']:
            continue

        considered += 1
        if (not self.test_mode) and limit is not None and considered > limit:
            break

        kegg_ids = self.omim_disease_hash[omim_disease_id]
        if len(kegg_ids) == 1:
            kegg_disease_id = kegg_ids[0]
            if len(self.kegg_disease_hash[kegg_disease_id]) == 1:
                # add ids, and deal with the labels separately
                model.addClassToGraph(
                    kegg_disease_id, None,
                    class_category=blv.terms['Disease'])
                model.addClassToGraph(omim_disease_id, None)
                # TODO is this safe?
                model.addEquivalentClass(
                    kegg_disease_id, omim_disease_id,
                    subject_category=blv.terms['Disease'])
        # else:
        #     gu.addXref(g, omim_disease_id, kegg_disease_id)
        # TODO add xrefs if >1:1 mapping?

    LOG.info("Done with KEGG disease to OMIM disease mappings.")
def add_orthologs_by_gene_group(self, graph, gene_ids):
    """
    Add orthology associations for the given genes, based on the
    gene_group annotation pipeline from NCBI, which relates human genes
    to those of other vertebrate genomes.
    More information can be learned here:
    http://www.ncbi.nlm.nih.gov/news/03-13-2014-gene-provides-orthologs-regions/
    The method for associations is described in
    [PMCID:3882889](http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3882889/)
    == [PMID:24063302](http://www.ncbi.nlm.nih.gov/pubmed/24063302/).
    Because these are only between human and vertebrate genomes,
    they will certainly miss out on very distant orthologies,
    and should not be considered complete.

    We do not run this within the NCBI parser itself;
    rather it is a convenience function for others parsers to call.

    :param graph: graph the classes, taxa and associations are added to
    :param gene_ids: Gene ids to fetch the orthology
    :return: None
    """
    logger.info("getting gene groups")
    line_counter = 0
    found_counter = 0
    f = '/'.join((self.rawdir, self.files['gene_group']['file']))

    # because many of the orthologous groups are grouped by human gene,
    # we need to do this by generating two-way hash
    #   group_id => orthologs
    #   ortholog id => group
    # this will be the fastest approach, though not memory-efficient.
    geno = Genotype(graph)
    model = Model(graph)
    group_to_orthology = {}
    gene_to_group = {}
    gene_to_taxon = {}

    with gzip.open(f, 'rb') as csvfile:
        filereader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""),
            delimiter='\t', quotechar='\"')

        for row in filereader:
            # skip comment lines
            if ''.join(row).startswith('#'):
                continue
            line_counter += 1
            tax_a, gene_a, rel, tax_b, gene_b = row

            if rel != 'Ortholog':
                continue

            members = group_to_orthology.setdefault(gene_a, set())
            members.add(gene_b)
            gene_to_group.setdefault(gene_b, set()).add(gene_a)

            gene_to_taxon[gene_a] = tax_a
            gene_to_taxon[gene_b] = tax_b

            # also add the group lead as a member of the group
            members.add(gene_a)
        # end loop through gene_group file

    logger.debug("Finished hashing gene groups")
    logger.debug("Making orthology associations")
    for gid in gene_ids:
        gene_num = gid.replace('NCBIGene:', '')
        # genes that belong to no group simply yield nothing
        for group_num in gene_to_group.get(gene_num, ()):
            for ortholog in group_to_orthology.get(group_num, ()):
                oid = 'NCBIGene:' + str(ortholog)
                model.addClassToGraph(oid, None, Genotype.genoparts['gene'])
                otaxid = 'NCBITaxon:' + str(gene_to_taxon[ortholog])
                geno.addTaxon(otaxid, oid)
                assoc = OrthologyAssoc(graph, self.name, gid, oid)
                assoc.add_source('PMID:24063302')
                assoc.add_association_to_graph()
                # todo get gene label for orthologs -
                # this could get expensive
                found_counter += 1
        # finish loop through annotated genes

    logger.info(
        "Made %d orthology relationships for %d genes",
        found_counter, len(gene_ids))
    return
def _get_gene_info(self, limit):
    """
    Loop through the gene_info file and add the genes to the graph.

    Each row becomes a class typed with the resolved ``type_of_gene``,
    or an individual when that type resolves to 'sequence_feature'.
    The symbol is used as the label, the description as the
    description; the full name, synonyms and other designations are
    added as synonyms, and the dbXrefs are handed to
    ``self._add_gene_equivalencies``.
    When the chromosome is unambiguous (or is the human X|Y case),
    the chromosome class is added and the gene is made a subsequence of
    either the chromosome band or the chromosome itself.

    ``self.class_or_indiv`` is filled for every row that passes the
    taxon/gene filter, even beyond ``limit``, so that other parsers can
    look up whether a gene is a class or an individual.

    :param limit: outside of test mode, rows past this line count are
        recorded in ``self.class_or_indiv`` but not added to the graph;
        None means no limit
    :return:

    """
    src_key = 'gene_info'
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph

    geno = Genotype(graph)
    model = Model(graph)

    # not unzipping the file
    LOG.info("Processing 'Gene Info' records")
    line_counter = 0
    gene_info = '/'.join((self.rawdir, self.files[src_key]['file']))
    LOG.info("FILE: %s", gene_info)

    band_regex = re.compile(r'[0-9A-Z]+[pq](\d+)?(\.\d+)?$')

    # Add taxa and genome classes for those in our filter
    for tax_num in self.tax_ids:
        tax_id = ':'.join(('NCBITaxon', tax_num))
        # tax label can get added elsewhere
        geno.addGenome(tax_id, tax_num)
        # label added elsewhere
        model.addClassToGraph(tax_id, None)

    col = self.files[src_key]['columns']
    # resolve the positions of the columns we read once,
    # rather than searching the column list again for every row
    pos = {name: col.index(name) for name in (
        'tax_id', 'GeneID', 'Symbol', 'Synonyms', 'dbXrefs',
        'chromosome', 'map_location', 'description', 'type_of_gene',
        'Full_name_from_nomenclature_authority', 'Other_designations')}

    with gzip.open(gene_info, 'rb') as tsv:
        row = tsv.readline().decode().strip().split('\t')
        row[0] = row[0][1:]  # strip comment
        # NOTE(review): a header mismatch is ignored here; presumably
        # check_fileheader reports it itself -- confirm
        if not self.check_fileheader(col, row):
            pass

        for line in tsv:
            line_counter += 1
            # The file is read in binary mode, so decode before testing
            # for a comment: indexing bytes yields an int, which never
            # compares equal to '#'.
            line = line.decode().strip()
            if not line or line.startswith('#'):  # skip blanks/comments
                continue
            row = line.split('\t')

            tax_num = row[pos['tax_id']]
            gene_num = row[pos['GeneID']]
            symbol = row[pos['Symbol']]
            synonyms = row[pos['Synonyms']].strip()
            dbxrefs = row[pos['dbXrefs']].strip()
            chrom = row[pos['chromosome']].strip()
            map_loc = row[pos['map_location']].strip()
            desc = row[pos['description']]
            gtype = row[pos['type_of_gene']].strip()
            name = row[pos['Full_name_from_nomenclature_authority']]
            other_designations = row[pos['Other_designations']].strip()

            if self.test_mode and int(gene_num) not in self.gene_ids:
                continue
            if not self.test_mode and tax_num not in self.tax_ids:
                continue

            tax_id = ':'.join(('NCBITaxon', tax_num))
            gene_id = ':'.join(('NCBIGene', gene_num))
            gene_type_id = self.resolve(gtype)

            if symbol == 'NEWENTRY':
                label = None
            else:
                label = symbol

            # sequence feature, not a gene
            if gene_type_id == self.globaltt['sequence_feature']:
                self.class_or_indiv[gene_id] = 'I'
            else:
                self.class_or_indiv[gene_id] = 'C'

            # past the limit we keep filling class_or_indiv (above)
            # but stop adding anything to the graph
            if not self.test_mode and \
                    limit is not None and line_counter > limit:
                continue

            if self.class_or_indiv[gene_id] == 'C':
                model.addClassToGraph(gene_id, label, gene_type_id, desc)
                # NCBI will be the default leader (for non mods),
                # so we will not add the leader designation here.
            else:
                model.addIndividualToGraph(
                    gene_id, label, gene_type_id, desc)
                # in this case, they aren't genes.
                # so we want someone else to be the leader

            if name != '-':
                model.addSynonym(gene_id, name)
            if synonyms != '-':
                for syn in synonyms.split('|'):
                    model.addSynonym(
                        gene_id, syn.strip(),
                        model.globaltt['has_related_synonym'])
            if other_designations != '-':
                for syn in other_designations.split('|'):
                    model.addSynonym(
                        gene_id, syn.strip(),
                        model.globaltt['has_related_synonym'])
            if dbxrefs != '-':
                self._add_gene_equivalencies(dbxrefs, gene_id, tax_id)

            # The taxon is added before the chromosome handling, because
            # that section skips to the next row when the chromosome
            # placement is ambiguous; the gene should keep its taxon
            # in that case too.
            geno.addTaxon(tax_id, gene_id)

            # edge cases of id | symbol | chr | map_loc:
            # 263     AMD1P2    X|Y  with   Xq28 and Yq12
            # 438     ASMT      X|Y  with   Xp22.3 or Yp11.3    # in PAR
            # no idea why there's two bands listed - possibly 2 assemblies
            # 419     ART3      4    with   4q21.1|4p15.1-p14
            # 28227   PPP2R3B   X|Y  Xp22.33; Yp11.3            # in PAR
            # this is of "unknown" type == susceptibility
            # 619538  OMS     10|19|3 10q26.3;19q13.42-q13.43;3p25.3
            # unlocated scaffold
            # 101928066       LOC101928066    1|Un    -\
            # mouse --> 2C3
            # 11435   Chrna1  2       2 C3|2 43.76 cM
            # mouse --> 11B1.1
            # 11548   Adra1b  11      11 B1.1|11 25.81 cM
            # 11717   Ampd3   7       7 57.85 cM|7 E2-E3        # mouse
            # 14421   B4galnt1        10      10 D3|10 74.5 cM  # mouse
            # 323212  wu:fb92e12      19|20   -                 # fish
            # 323368  ints10  6|18    -                         # fish
            # 323666  wu:fc06e02      11|23   -                 # fish

            # feel that the chr placement can't be trusted in this table
            # when there is > 1 listed
            # with the exception of human X|Y,
            # we will only take those that align to one chr

            # FIXME remove the chr mapping below
            # when we pull in the genomic coords
            if chrom == '-' or chrom == '':
                continue

            if re.search(r'\|', chrom) and chrom not in ['X|Y', 'X; Y']:
                # means that there's uncertainty in the mapping.
                # so skip it
                # TODO we'll need to figure out how to deal with
                # >1 loc mapping
                LOG.info(
                    '%s is non-uniquely mapped to %s. Skipping for now.',
                    gene_id, chrom)
                continue
                # X|Y	Xp22.33;Yp11.3

            if chrom == 'X; Y':
                chrom = 'X|Y'  # rewrite the PAR regions for processing

            # the band test only depends on map_loc,
            # so do it once for all chromosomes of this row
            band_match = re.match(band_regex, map_loc)
            has_band = band_match is not None and \
                len(band_match.groups()) > 0

            # do this in a loop to allow PAR regions like X|Y
            for chromosome in re.split(r'\|', chrom):
                # assume that the chromosome label is added elsewhere
                geno.addChromosomeClass(chromosome, tax_id, None)
                mychrom = makeChromID(chromosome, tax_num, 'CHR')
                # temporarily use taxnum for the disambiguating label
                mychrom_syn = makeChromLabel(chromosome, tax_num)
                model.addSynonym(mychrom, mychrom_syn)

                if has_band:
                    # this matches the regular kind of chrs,
                    # so make that kind of band
                    # not sure why this matches?
                    #   chrX|Y or 10090chr12|Un"
                    # TODO we probably need a different regex
                    # per organism
                    # the maploc_id already has the numeric chromosome
                    # in it, strip it first
                    bid = re.sub(r'^' + chromosome, '', map_loc)
                    # the generic location (no coordinates)
                    maploc_id = makeChromID(
                        chromosome + bid, tax_num, 'CHR')
                    # Assume its type will be added elsewhere
                    band = Feature(graph, maploc_id, None, None)
                    band.addFeatureToGraph()
                    # add the band as the containing feature
                    graph.addTriple(
                        gene_id, self.globaltt['is subsequence of'],
                        maploc_id)
                else:
                    # TODO handle these cases: examples are:
                    # 15q11-q22,Xp21.2-p11.23,15q22-qter,10q11.1-q24,
                    # 12p13.3-p13.2|12p13-p12,1p13.3|1p21.3-p13.1,
                    # 12cen-q21,22q13.3|22q13.3
                    LOG.debug(
                        'not regular band pattern for %s: %s',
                        gene_id, map_loc)
                    # add the gene as a subsequence of the chromosome
                    graph.addTriple(
                        gene_id, self.globaltt['is subsequence of'],
                        mychrom)
def _get_gene2pubmed(self, limit):
    """
    Loop through the gene2pubmed file and add a simple triple to say
    that a given publication is_about a gene.
    Publications are added as NamedIndividuals.

    Rows are filtered on the taxon (or, in test mode, on the test gene
    ids). Rows whose gene or PubMed id is the '-' placeholder are
    skipped.

    :param limit: outside of test mode, stop after the first processed
        row whose line count exceeds this number; None means no limit
    :return:

    """
    src_key = 'gene2pubmed'
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)
    LOG.info("Processing Gene records")
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files[src_key]['file']))
    LOG.info("FILE: %s", myfile)
    assoc_counter = 0

    col = self.files[src_key]['columns']
    # resolve the column positions once rather than for every row
    tax_idx = col.index('tax_id')
    gene_idx = col.index('GeneID')
    pubmed_idx = col.index('PubMed_ID')

    with gzip.open(myfile, 'rb') as tsv:
        row = tsv.readline().decode().strip().split('\t')
        row[0] = row[0][1:]  # strip comment
        # NOTE(review): a header mismatch is ignored here; presumably
        # check_fileheader reports it itself -- confirm
        if not self.check_fileheader(col, row):
            pass

        for line in tsv:
            line_counter += 1
            line = line.decode().strip()
            # skip comments; blank lines are skipped too,
            # instead of failing on an empty first field
            if not line or line.startswith('#'):
                continue
            row = line.split('\t')

            tax_num = row[tax_idx].strip()
            gene_num = row[gene_idx].strip()
            pubmed_num = row[pubmed_idx].strip()

            # This has to come before the test-mode filter:
            # int('-') would raise a ValueError there.
            if gene_num == '-' or pubmed_num == '-':
                continue

            if self.test_mode and int(gene_num) not in self.gene_ids:
                continue
            if not self.test_mode and tax_num not in self.tax_ids:
                continue

            gene_id = ':'.join(('NCBIGene', gene_num))
            pubmed_id = ':'.join(('PMID', pubmed_num))

            if self.class_or_indiv.get(gene_id) == 'C':
                model.addClassToGraph(gene_id, None)
            else:
                model.addIndividualToGraph(gene_id, None)

            # add the publication as a NamedIndividual
            # add type publication
            model.addIndividualToGraph(pubmed_id, None, None)
            reference = Reference(
                graph, pubmed_id, self.globaltt['journal article'])
            reference.addRefToGraph()
            graph.addTriple(
                pubmed_id, self.globaltt['is_about'], gene_id)
            assoc_counter += 1

            if not self.test_mode and \
                    limit is not None and line_counter > limit:
                break

    LOG.info("Processed %d pub-gene associations", assoc_counter)
def _process_diseasegene(self, limit):
    """
    Parse the disease-gene XML file and add, for every Disorder element:
    the disorder class, its associated genes (with synonyms and
    equivalent Ensembl/HGNC/OMIM classes), and one gene-to-phenotype
    association per DisorderGeneAssociation, made through an anonymous
    variant locus of the gene.

    Associations whose type cannot be mapped by ``self._map_rel_id``
    are logged and skipped.

    :param limit: outside of test mode, stop once more than this many
        Disorder elements have been read; None means no limit
    :return:

    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    line_counter = 0
    geno = Genotype(g)
    model = Model(g)

    myfile = '/'.join((self.rawdir, self.files['disease-gene']['file']))

    # external reference sources that become equivalent classes,
    # with the curie prefix used for each; the others are skipped for now
    eq_prefixes = {
        'Ensembl': 'ENSEMBL:',
        'HGNC': 'HGNC:',
        'OMIM': 'OMIM:',
    }

    # PYLINT complains iterparse deprecated,
    # but as of py 3.4 only the optional & unsupplied parse arg is.
    for _event, elem in ET.iterparse(myfile):
        if elem.tag != 'Disorder':
            continue
        # count the disorders, so that the limit below can take effect
        line_counter += 1

        # get the element name and id, ignores element name
        # id = elem.get('id') # some internal identifier
        disorder_num = elem.find('OrphaNumber').text
        disorder_id = 'Orphanet:'+str(disorder_num)

        if self.testMode and \
                disorder_id not in \
                config.get_config()['test_ids']['disease']:
            elem.clear()  # release the skipped element as well
            continue

        disorder_label = elem.find('Name').text

        # make a hash of internal gene id to type for later lookup
        gene_iid_to_type = {}
        gene_list = elem.find('GeneList')
        for gene in gene_list.findall('Gene'):
            gene_iid = gene.get('id')
            gene_type = gene.find('GeneType').get('id')
            gene_iid_to_type[gene_iid] = gene_type

        # assuming that these are in the ontology
        model.addClassToGraph(disorder_id, disorder_label)

        assoc_list = elem.find('DisorderGeneAssociationList')
        for a in assoc_list.findall('DisorderGeneAssociation'):
            gene_iid = a.find('.//Gene').get('id')
            gene_name = a.find('.//Gene/Name').text
            gene_symbol = a.find('.//Gene/Symbol').text
            gene_num = a.find('./Gene/OrphaNumber').text
            gene_id = 'Orphanet:'+str(gene_num)
            gene_type_id = \
                self._map_gene_type_id(gene_iid_to_type[gene_iid])
            model.addClassToGraph(
                gene_id, gene_symbol, gene_type_id, gene_name)
            syn_list = a.find('./Gene/SynonymList')
            if int(syn_list.get('count')) > 0:
                for s in syn_list.findall('./Synonym'):
                    model.addSynonym(gene_id, s.text)

            dgtype = a.find('DisorderGeneAssociationType').get('id')
            rel_id = self._map_rel_id(dgtype)
            dg_label = \
                a.find('./DisorderGeneAssociationType/Name').text
            if rel_id is None:
                logger.warning(
                    "Cannot map association type (%s) to RO "
                    "for association (%s | %s). Skipping.",
                    dg_label, disorder_label, gene_symbol)
                continue

            alt_locus_id = '_:'+gene_num+'-'+disorder_num+'VL'
            alt_label = \
                ' '.join(('some variant of', gene_symbol.strip(),
                          'that is a', dg_label.lower(),
                          disorder_label))

            model.addIndividualToGraph(alt_locus_id, alt_label,
                                       geno.genoparts['variant_locus'])
            geno.addAffectedLocus(alt_locus_id, gene_id)
            model.addBlankNodeAnnotation(alt_locus_id)

            # consider typing the gain/loss-of-function variants like:
            # http://sequenceontology.org/browser/current_svn/term/SO:0002054
            # http://sequenceontology.org/browser/current_svn/term/SO:0002053

            # use "assessed" status to issue an evidence code
            # FIXME I think that these codes are sub-optimal
            status_code = \
                a.find('DisorderGeneAssociationStatus').get('id')
            # imported automatically asserted information
            # used in automatic assertion
            eco_id = 'ECO:0000323'
            # Assessed
            # TODO are these internal ids stable between releases?
            if status_code == '17991':
                # imported manually asserted information
                # used in automatic assertion
                eco_id = 'ECO:0000322'
            # Non-traceable author statement ECO_0000034
            # imported information in automatic assertion ECO_0000313

            assoc = G2PAssoc(g, self.name, alt_locus_id,
                             disorder_id, rel_id)
            assoc.add_evidence(eco_id)
            assoc.add_association_to_graph()

            rlist = a.find('./Gene/ExternalReferenceList')
            for r in rlist.findall('ExternalReference'):
                # decide per reference, so that an unsupported source
                # does not re-add the id of the reference before it
                prefix = eq_prefixes.get(r.find('Source').text)
                if prefix is None:
                    continue  # skip the others for now
                eqid = prefix + r.find('Reference').text
                model.addClassToGraph(eqid, None)
                model.addEquivalentClass(gene_id, eqid)

        elem.clear()  # empty the element

        # same convention as the other parsers:
        # the limit only applies outside of test mode
        if not self.testMode and \
                limit is not None and line_counter > limit:
            return

    return
class Genotype():
    """
    These methods provide convenient methods to add items related to
    a genotype and its parts to a supplied graph.
    They follow the patterns set out in
    GENO https://github.com/monarch-initiative/GENO-ontology.
    For specific sequence features,
    we use the GenomicFeature class to create them.

    """

    # special genotype parts mapped to their
    # GENO and SO classes that we explicitly reference here
    genoparts = {
        'intrinsic_genotype': 'GENO:0000000',
        'extrinsic_genotype': 'GENO:0000524',
        'effective_genotype': 'GENO:0000525',
        'sex_qualified_genotype': 'GENO:0000645',
        'male_genotype': 'GENO:0000646',
        'female_genotype': 'GENO:0000647',
        'genomic_background': 'GENO:0000611',
        'unspecified_genomic_background': 'GENO:0000649',
        'genomic_variation_complement': 'GENO:0000009',
        'karyotype_variation_complement': 'GENO:0000644',
        'variant_single_locus_complement': 'GENO:0000030',
        'variant_locus': 'GENO:0000002',
        'reference_locus': 'GENO:0000036',
        'allele': 'GENO:0000512',
        'gene': 'SO:0000704',
        'QTL': 'SO:0000771',
        'transgene': 'SO:0000902',  # not really used any more
        'transgenic_insertion': 'SO:0001218',
        'pseudogene': 'SO:0000336',
        'cytogenetic marker': 'SO:0000341',
        'sequence_feature': 'SO:0000110',
        'sequence_alteration': 'SO:0001059',
        'insertion': 'SO:0000667',
        'deletion': 'SO:0000159',
        'substitution': 'SO:1000002',
        'duplication': 'SO:1000035',
        'translocation': 'SO:0000199',
        'inversion': 'SO:1000036',
        'tandem_duplication': 'SO:1000173',
        'point_mutation': 'SO:1000008',
        'population': 'PCO:0000001',  # population
        'family': 'PCO:0000020',  # family
        'wildtype': 'GENO:0000511',
        'reagent_targeted_gene': 'GENO:0000504',
        'targeted_gene_subregion': 'GENO:0000534',
        'targeted_gene_complement': 'GENO:0000527',
        'biological_region': 'SO:0001411',
        'missense_variant': 'SO:0001583',
        'transcript': 'SO:0000233',
        'polypeptide': 'SO:0000104',
        'cDNA': 'SO:0000756',
        'sequence_variant_causing_loss_of_function_of_polypeptide':
            'SO:1000118',
        'sequence_variant_causing_gain_of_function_of_polypeptide':
            'SO:1000125',
        'sequence_variant_causing_inactive_catalytic_site':
            'SO:1000120',
        'sequence_variant_affecting_polypeptide_function':
            'SO:1000117',
        'regulatory_transgene_feature': 'GENO:0000637',
        'coding_transgene_feature': 'GENO:0000638',
        'protein_coding_gene': 'SO:0001217',
        'ncRNA_gene': 'SO:0001263',
        'RNAi_reagent': 'SO:0000337',
        'heritable_phenotypic_marker': 'SO:0001500'
    }

    object_properties = {
        'is_mutant_of': 'GENO:0000440',
        'derives_from': 'RO:0001000',
        'has_alternate_part': 'GENO:0000382',
        'has_reference_part': 'GENO:0000385',
        'has_sex_agnostic_genotype_part': 'GENO:0000650',
        'in_taxon': 'RO:0002162',
        'has_zygosity': 'GENO:0000608',
        # is_seq_var_inst_of links an alternate locus (instance)
        # to a gene (class)
        'is_sequence_variant_instance_of': 'GENO:0000408',
        'targets_instance_of': 'GENO:0000414',
        'is_reference_instance_of': 'GENO:0000610',
        'has_part': 'BFO:0000051',
        # use has_member_with_allelotype when relating populations
        'has_member_with_allelotype': 'GENO:0000225',
        'is_allelotype_of': 'GENO:0000206',
        'has_genotype': 'GENO:0000222',
        'has_phenotype': 'RO:0002200',
        'has_gene_product': 'RO:0002205',
        'translates_to': 'RO:0002513',
        'is_targeted_expression_variant_of': 'GENO:0000443',
        'is_transgene_variant_of': 'GENO:0000444',
        'has_variant_part': 'GENO:0000382',
        # targeted_by isa between a (reagent-targeted gene) and a morpholino
        'targeted_by': 'GENO:0000634',
        # FIXME should derives_sequence_from_gene just be subsequence of?
        'derives_sequence_from_gene': 'GENO:0000639',
        'has_affected_locus': 'GENO:0000418'
    }

    annotation_properties = {
        # TODO change properties with
        # https://github.com/monarch-initiative/GENO-ontology/issues/21
        # FIXME
        # reference_nucleotide, reference_amino_acid, altered_nucleotide
        # results_in_amino_acid_change are FIXME Made up terms
        'reference_nucleotide': 'GENO:reference_nucleotide',
        'reference_amino_acid': 'GENO:reference_amino_acid',
        'altered_nucleotide': 'GENO:altered_nucleotide',
        'results_in_amino_acid_change': 'GENO:results_in_amino_acid_change'
    }

    zygosity = {
        'homoplasmic': 'GENO:0000602',
        'heterozygous': 'GENO:0000135',
        'indeterminate': 'GENO:0000137',
        'heteroplasmic': 'GENO:0000603',
        'hemizygous-y': 'GENO:0000604',
        'hemizygous-x': 'GENO:0000605',
        'homozygous': 'GENO:0000136',
        'hemizygous': 'GENO:0000606',
        'complex_heterozygous': 'GENO:0000402',
        'simple_heterozygous': 'GENO:0000458'
    }

    # object and annotation properties merged into a single lookup
    properties = object_properties.copy()
    properties.update(annotation_properties)

    def __init__(self, graph):
        """
        :param graph: the Graph that all of the methods write to
        :raises ValueError: if graph is not a Graph instance
        """
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            raise ValueError("{} is not a graph".format(graph))
        self.model = Model(self.graph)

        return

    def addGenotype(
            self, genotype_id, genotype_label,
            genotype_type=None, genotype_description=None):
        """
        Add the genotype as an individual.
        If a genotype_type is not supplied,
        we will default to 'intrinsic_genotype'
        :param genotype_id:
        :param genotype_label:
        :param genotype_type:
        :param genotype_description:
        :return:

        """
        if genotype_type is None:
            genotype_type = self.genoparts['intrinsic_genotype']

        self.model.addIndividualToGraph(
            genotype_id, genotype_label, genotype_type,
            genotype_description)

        return

    def addAllele(
            self, allele_id, allele_label,
            allele_type=None, allele_description=None):
        """
        Make an allele object (added as an individual).
        If no allele_type is added, it will default to a geno:allele
        :param allele_id: curie for allele (required)
        :param allele_label: label for allele (required)
        :param allele_type: id for an allele type (optional,
        recommended SO or GENO class)
        :param allele_description: a free-text description of the allele
        :return:

        """
        # TODO should we accept a list of allele types?
        if allele_type is None:
            allele_type = self.genoparts['allele']  # TODO is this a good idea?
        self.model.addIndividualToGraph(
            allele_id, allele_label, allele_type, allele_description)

        return

    def addGene(
            self, gene_id, gene_label,
            gene_type=None, gene_description=None):
        """
        Add the gene as a class; the type defaults to 'gene' (SO:0000704).
        :param gene_id:
        :param gene_label:
        :param gene_type:
        :param gene_description:
        :return:

        """
        if gene_type is None:
            gene_type = self.genoparts['gene']
        # genes are classes
        self.model.addClassToGraph(
            gene_id, gene_label, gene_type, gene_description)

        return

    def addConstruct(self, construct_id, construct_label,
                     construct_type=None, construct_description=None):
        """
        Add the construct as an individual.
        No default type is applied when construct_type is None.
        :param construct_id:
        :param construct_label:
        :param construct_type:
        :param construct_description:
        :return:

        """
        # TODO add base type for construct
        # if (constrcut_type is None):
        #    constrcut_type=self.construct_base_type
        self.model.addIndividualToGraph(construct_id, construct_label,
                                        construct_type,
                                        construct_description)

        return

    def addDerivesFrom(self, child_id, parent_id):
        """
        We add a derives_from relationship between the child and parent id.
        Examples of uses include between:
        an allele and a construct or strain here,
        a cell line and its parent genotype.
        Adding the parent and child to the graph
        should happen outside of this function call
        to ensure graph integrity.
        :param child_id:
        :param parent_id:
        :return:

        """
        self.graph.addTriple(
            child_id, self.properties['derives_from'], parent_id)

        return

    def addSequenceDerivesFrom(self, child_id, parent_id):
        """
        Add a derives_sequence_from_gene relationship
        between the child and the parent.
        :param child_id:
        :param parent_id:
        :return:

        """
        self.graph.addTriple(
            child_id, self.properties['derives_sequence_from_gene'],
            parent_id)

        return

    def addAlleleOfGene(self, allele_id, gene_id, rel_id=None):
        """
        We make the assumption here that if the relationship is not provided,
        it is a GENO:is_sequence_variant_instance_of.

        Here, the allele should be a variant_locus, not a sequence alteration.
        :param allele_id:
        :param gene_id:
        :param rel_id:
        :return:

        """
        if rel_id is None:
            rel_id = self.properties['is_sequence_variant_instance_of']
        self.graph.addTriple(allele_id, rel_id, gene_id)
        return

    def addAffectedLocus(self, allele_id, gene_id, rel_id=None):
        """
        We make the assumption here that if the relationship is not provided,
        it is a GENO:has_affected_locus.

        Here, the allele should be a variant_locus, not a sequence alteration.
        :param allele_id:
        :param gene_id:
        :param rel_id:
        :return:

        """
        if rel_id is None:
            rel_id = self.properties['has_affected_locus']
        self.graph.addTriple(allele_id, rel_id, gene_id)
        return

    def addGeneProduct(
            self, sequence_id, product_id,
            product_label=None, product_type=None):
        """
        Add gene/variant/allele has_gene_product relationship
        Can be used to either describe a gene to transcript relationship
        or gene to protein.
        The product itself is only added (as an individual)
        when both its label and its type are supplied.
        :param sequence_id:
        :param product_id:
        :param product_label:
        :param product_type:
        :return:

        """
        if product_label is not None and product_type is not None:
            self.model.addIndividualToGraph(
                product_id, product_label, product_type)
        self.graph.addTriple(
            sequence_id, self.properties['has_gene_product'], product_id)

        return

    def addPolypeptide(
            self, polypeptide_id, polypeptide_label=None,
            transcript_id=None, polypeptide_type=None, ):
        """
        Add the polypeptide as an individual (default type 'polypeptide'),
        and, when a transcript is given,
        the transcript translates_to polypeptide relationship.
        :param polypeptide_id:
        :param polypeptide_label:
        :param polypeptide_type:
        :param transcript_id:
        :return:

        """
        if polypeptide_type is None:
            polypeptide_type = self.genoparts['polypeptide']
        self.model.addIndividualToGraph(
            polypeptide_id, polypeptide_label, polypeptide_type)
        if transcript_id is not None:
            self.graph.addTriple(
                transcript_id, self.properties['translates_to'],
                polypeptide_id)

        return

    def addPartsToVSLC(
            self, vslc_id, allele1_id, allele2_id, zygosity_id=None,
            allele1_rel=None, allele2_rel=None):
        """
        Here we add the parts to the VSLC.  While traditionally alleles
        (reference or variant loci) are added,
        you can add any node (such as sequence_alterations for unlocated
        variations) to a vslc if they are known to be paired.  However,
        if a sequence_alteration's loci is unknown, it probably should be
        added directly to the GVC.

        If no zygosity is supplied, it is 'homozygous' when both allele ids
        are equal, and 'heterozygous' otherwise.
        :param vslc_id:
        :param allele1_id:
        :param allele2_id:
        :param zygosity_id:
        :param allele1_rel:
        :param allele2_rel:
        :return:

        """
        # vslc has parts allele1/allele2
        if allele1_id is not None:
            self.addParts(allele1_id, vslc_id, allele1_rel)
        if allele2_id is not None and allele2_id.strip() != '':
            self.addParts(allele2_id, vslc_id, allele2_rel)

        # figure out zygosity if it's not supplied
        if zygosity_id is None:
            if allele1_id == allele2_id:
                zygosity_id = self.zygosity['homozygous']
            else:
                zygosity_id = self.zygosity['heterozygous']

        self.graph.addTriple(
            vslc_id, self.properties['has_zygosity'], zygosity_id)

        return

    def addVSLCtoParent(self, vslc_id, parent_id):
        """
        The VSLC can either be added to a genotype or to a GVC.
        The vslc is added as a part of the parent.
        :param vslc_id:
        :param parent_id:
        :return:

        """
        self.addParts(
            vslc_id, parent_id, self.properties['has_alternate_part'])

        return

    def addParts(self, part_id, parent_id, part_relationship=None):
        """
        This will add a has_part (or subproperty) relationship between
        a parent_id and the supplied part.
        By default the relationship will be BFO:has_part,
        but any relationship could be given here.
        :param part_id:
        :param parent_id:
        :param part_relationship:
        :return:

        """
        if part_relationship is None:
            part_relationship = self.properties['has_part']

        self.graph.addTriple(parent_id, part_relationship, part_id)

        return

    def addSequenceAlteration(
            self, sa_id, sa_label, sa_type=None, sa_description=None):
        """
        Add the sequence alteration as an individual;
        the type defaults to 'sequence_alteration'.
        :param sa_id:
        :param sa_label:
        :param sa_type:
        :param sa_description:
        :return:

        """
        if sa_type is None:
            sa_type = self.genoparts['sequence_alteration']
        self.model.addIndividualToGraph(
            sa_id, sa_label, sa_type, sa_description)

        return

    def addSequenceAlterationToVariantLocus(self, sa_id, vl_id):
        """
        Add the sequence alteration as an alternate part
        of the variant locus.
        :param sa_id:
        :param vl_id:
        :return:

        """
        self.addParts(sa_id, vl_id, self.properties['has_alternate_part'])

        return

    def addGenomicBackground(
            self, background_id, background_label,
            background_type=None, background_description=None):
        """
        Add the genomic background as an individual;
        the type defaults to 'genomic_background'.
        :param background_id:
        :param background_label:
        :param background_type:
        :param background_description:
        :return:

        """
        if background_type is None:
            background_type = self.genoparts['genomic_background']
        self.model.addIndividualToGraph(
            background_id, background_label, background_type,
            background_description)

        return

    def addGenomicBackgroundToGenotype(
            self, background_id, genotype_id, background_type=None):
        """
        Type the background (default 'genomic_background') and add it
        as the reference part of the genotype.
        :param background_id:
        :param genotype_id:
        :param background_type:
        :return:

        """
        if background_type is None:
            background_type = self.genoparts['genomic_background']
        self.model.addType(background_id, background_type)
        self.addParts(background_id, genotype_id,
                      self.object_properties['has_reference_part'])

        return

    def addTaxon(self, taxon_id, genopart_id):
        """
        The supplied geno part will have the specified taxon added with
        RO:in_taxon relation.
        Generally the taxon is associated with a genomic_background,
        but could be added to any genotype part (including a gene,
        regulatory element, or sequence alteration).
        :param taxon_id:
        :param genopart_id:

        :return:

        """
        self.graph.addTriple(
            genopart_id, self.properties['in_taxon'], taxon_id)

        return

    def addGeneTargetingReagentToGenotype(self, reagent_id, genotype_id):
        """
        Add the reagent as a variant part of the genotype.
        :param reagent_id:
        :param genotype_id:
        :return:

        """
        # for example, add a morphant reagent thingy to the genotype,
        # assuming it's a extrinsic_genotype
        self.graph.addTriple(
            genotype_id, self.properties['has_variant_part'], reagent_id)

        return

    def addGeneTargetingReagent(
            self, reagent_id, reagent_label, reagent_type, gene_id,
            description=None):
        """
        Here, a gene-targeting reagent is added as an individual,
        together with a targets_instance_of relationship
        to the supplied gene.
        :param reagent_id:
        :param reagent_label:
        :param reagent_type:
        :param gene_id:
        :param description:

        :return:

        """
        # TODO add default type to reagent_type
        self.model.addIndividualToGraph(
            reagent_id, reagent_label, reagent_type, description)

        self.graph.addTriple(
            reagent_id, self.object_properties['targets_instance_of'],
            gene_id)

        return

    def addReagentTargetedGene(
            self, reagent_id, gene_id, targeted_gene_id=None,
            targeted_gene_label=None, description=None):
        """
        This will create the instance of a gene that is targeted by a molecular
        reagent (such as a morpholino or rnai).
        If an instance id is not supplied,
        we will create it from the gene and reagent ids;
        it is of the type GENO:reagent_targeted_gene.

        Triples added here:
        <targeted_gene_id> a GENO:reagent_targeted_gene
        rdf:label targeted_gene_label
        dc:description description
        <targeted_gene_id> GENO:is_targeted_expression_variant_of <gene_id>
        <targeted_gene_id> GENO:targeted_by <reagent_id>

        :param reagent_id:
        :param gene_id:
        :param targeted_gene_id:
        :param targeted_gene_label:
        :param description:
        :return:

        """
        # akin to a variant locus
        if targeted_gene_id is None:
            # NOTE(review): the other generated anonymous id in this class
            # is prefixed with '_:'; this one only gets '_' and has its
            # colons removed -- confirm that this is intended.
            targeted_gene_id = '_' + gene_id + '-' + reagent_id
            targeted_gene_id = targeted_gene_id.replace(":", "")
        self.model.addIndividualToGraph(
            targeted_gene_id, targeted_gene_label,
            self.genoparts['reagent_targeted_gene'], description)

        if gene_id is not None:
            self.graph.addTriple(
                targeted_gene_id,
                self.object_properties['is_targeted_expression_variant_of'],
                gene_id)

        self.graph.addTriple(
            targeted_gene_id, self.properties['targeted_by'], reagent_id)

        return

    def addTargetedGeneSubregion(
            self, tgs_id, tgs_label, tgs_type=None, tgs_description=None):
        """
        Add the targeted gene subregion as an individual;
        the type defaults to 'targeted_gene_subregion'.
        :param tgs_id:
        :param tgs_label:
        :param tgs_type:
        :param tgs_description:
        :return:

        """
        if tgs_type is None:
            tgs_type = self.genoparts['targeted_gene_subregion']

        self.model.addIndividualToGraph(
            tgs_id, tgs_label, tgs_type, tgs_description)

        return

    def addMemberOfPopulation(self, member_id, population_id):
        """
        Add a has_member_with_allelotype relationship
        from the population to the member.
        :param member_id:
        :param population_id:
        :return:

        """
        self.graph.addTriple(
            population_id, self.properties['has_member_with_allelotype'],
            member_id)

        return

    def addTargetedGeneComplement(
            self, tgc_id, tgc_label, tgc_type=None, tgc_description=None):
        """
        Add the targeted gene complement as an individual;
        the type defaults to 'targeted_gene_complement'.
        :param tgc_id:
        :param tgc_label:
        :param tgc_type:
        :param tgc_description:
        :return:

        """
        if tgc_type is None:
            tgc_type = self.genoparts['targeted_gene_complement']
        self.model.addIndividualToGraph(
            tgc_id, tgc_label, tgc_type, tgc_description)

        return

    def addGenome(self, taxon_id, taxon_label=None):
        """
        Add the genome of the taxon as a class labelled
        '<taxon_label> genome'; the taxon id is used when no label is given.
        :param taxon_id:
        :param taxon_label:
        :return:

        """
        if taxon_label is None:
            taxon_label = taxon_id
        genome_label = taxon_label+' genome'
        genome_id = self.makeGenomeID(taxon_id)
        self.model.addClassToGraph(
            genome_id, genome_label, Feature.types['genome'])

        return

    def addReferenceGenome(self, build_id, build_label, taxon_id):
        """
        Add the build as a reference genome individual that is also typed
        with the taxon's genome class, and relate it to the taxon.
        :param build_id:
        :param build_label:
        :param taxon_id:
        :return:

        """
        genome_id = self.makeGenomeID(taxon_id)
        self.model.addIndividualToGraph(
            build_id, build_label, Feature.types['reference_genome'])
        self.model.addType(build_id, genome_id)
        self.addTaxon(taxon_id, build_id)

        return

    def makeGenomeID(self, taxon_id):
        """
        Build the genome id for a taxon: everything up to and including
        the last colon is replaced by ':' and 'genome' is appended,
        e.g. 'NCBITaxon:9606' gives ':9606genome'.
        :param taxon_id:
        :return: the genome id

        """
        # scrub off the taxon prefix.  put it in base space
        # TODO: revisit as BNODE?
        genome_id = re.sub(r'.*\:', ':', taxon_id) + 'genome'

        return genome_id

    def addChromosome(
            self, chr, tax_id, tax_label=None, build_id=None,
            build_label=None):
        """
        Add the chromosome as a class at the taxon level, and relate the
        taxon's genome to the taxon.
        If a build is included, the build-specific chromosome is added as
        an instance of that chromosome class, and it becomes a member of
        the build.

        :param chr: the chromosome number/name
        :param tax_id:
        :param tax_label:
        :param build_id:
        :param build_label: defaults to the build_id
        :return:

        """
        # Family needs the graph to write to,
        # as in addChromosomeInstance
        family = Family(self.graph)
        # first, make the chromosome class, at the taxon level
        chr_id = makeChromID(str(chr), tax_id)
        if tax_label is not None:
            chr_label = makeChromLabel(chr, tax_label)
        else:
            chr_label = makeChromLabel(chr)
        genome_id = self.makeGenomeID(tax_id)
        self.model.addClassToGraph(
            chr_id, chr_label, Feature.types['chromosome'])
        self.addTaxon(tax_id, genome_id)  # add the taxon to the genome

        if build_id is not None:
            # the build-specific chromosome
            chrinbuild_id = makeChromID(chr, build_id)
            if build_label is None:
                build_label = build_id
            chrinbuild_label = makeChromLabel(chr, build_label)
            # add the build-specific chromosome as an instance of the chr class

            self.model.addIndividualToGraph(
                chrinbuild_id, chrinbuild_label, chr_id)

            # add the build-specific chromosome
            # as a member of the build (both ways)
            family.addMember(build_id, chrinbuild_id)
            family.addMemberOf(chrinbuild_id, build_id)

        return

    def addChromosomeClass(self, chrom_num, taxon_id, taxon_label):
        """
        Add the generic (taxon-level) chromosome class.
        :param chrom_num:
        :param taxon_id: a leading 'NCBITaxon:' is removed before
            the chromosome id is made
        :param taxon_label:
        :return:

        """
        taxon = re.sub('NCBITaxon:', '', taxon_id)
        # the chrom class (generic) id
        chrom_class_id = makeChromID(chrom_num, taxon, 'CHR')
        chrom_class_label = makeChromLabel(chrom_num, taxon_label)
        self.model.addClassToGraph(
            chrom_class_id, chrom_class_label, Feature.types['chromosome'])

        return

    def addChromosomeInstance(
            self, chr_num, reference_id, reference_label, chr_type=None):
        """
        Add the supplied chromosome as an instance within the given reference
        :param chr_num:
        :param reference_id: for example, a build id like UCSC:hg19
        :param reference_label:
        :param chr_type: this is the class that this is an instance of.
        typically a genome-specific chr

        :return:

        """
        family = Family(self.graph)
        chr_id = makeChromID(str(chr_num), reference_id, 'MONARCH')
        chr_label = makeChromLabel(str(chr_num), reference_label)

        self.model.addIndividualToGraph(
            chr_id, chr_label, Feature.types['chromosome'])
        if chr_type is not None:
            self.model.addType(chr_id, chr_type)

        # add the build-specific chromosome
        # as a member of the build  (both ways)
        family.addMember(reference_id, chr_id)
        family.addMemberOf(chr_id, reference_id)

        return

    def make_variant_locus_label(self, gene_label, allele_label):
        """
        Make a variant locus label of the form gene<allele>.
        A missing (None) gene or allele label is treated as empty.
        :param gene_label:
        :param allele_label:
        :return: the label

        """
        if gene_label is None:
            gene_label = ''
        # make_vslc_label only rejects the case where every label is
        # missing, so a single missing allele label must not fail here
        if allele_label is None:
            allele_label = ''

        label = gene_label.strip()+'<' + allele_label.strip() + '>'

        return label

    def make_vslc_label(self, gene_label, allele1_label, allele2_label):
        """
        Make a Variant Single Locus Complement (VSLC) in monarch-style.
        :param gene_label:
        :param allele1_label:
        :param allele2_label:
        :return: the label 'gene<allele1>/gene<allele2>'
            (nothing follows the slash when allele2_label is None),
            or None when all three labels are None

        """
        if gene_label is None and \
                allele1_label is None and allele2_label is None:
            logger.error("Not enough info to make vslc label")
            return None

        top = self.make_variant_locus_label(gene_label, allele1_label)
        bottom = ''
        if allele2_label is not None:
            bottom = self.make_variant_locus_label(
                gene_label, allele2_label)

        vslc_label = '/'.join((top, bottom))

        return vslc_label

    def make_experimental_model_with_genotype(
            self, genotype_id, genotype_label, taxon_id, taxon_label):
        """
        Add an anonymous individual of the given taxon that has the given
        genotype. Its id is made from the taxon and genotype ids, and its
        label from the genotype and taxon labels.
        :param genotype_id:
        :param genotype_label:
        :param taxon_id:
        :param taxon_label:
        :return: the id of the added individual

        """
        animal_id = '-'.join((taxon_id, 'with', genotype_id))
        animal_id = re.sub(r':', '', animal_id)
        animal_id = '_:'+animal_id

        animal_label = ' '.join((genotype_label, taxon_label))
        self.model.addIndividualToGraph(animal_id, animal_label, taxon_id)
        self.graph.addTriple(
            animal_id, Genotype.object_properties['has_genotype'],
            genotype_id)
        return animal_id
def _process_omim2gene(self, limit=None):
    """
    Map OMIM ids to KEGG gene ids, branching on the link_type column.

    * ``equivalent`` links: both ids are added as genes; they are made
      equivalent unless the OMIM id is a disease.
    * ``reverse`` links: a variant locus of the KEGG gene is created
      and associated to the OMIM id as a marker.
    * ``original`` links are logged and skipped; any other link type is
      logged as a warning.

    Triples created:
    <kegg_gene_id> is a Gene
    <omim_gene_id> is a Gene
    <kegg_gene_id>> hasXref <omim_gene_id>

    <assoc_id> has subject <omim_disease_id>
    <assoc_id> has object <kegg_gene_id>

    :param limit: stop after more than this many rows (ignored in test mode)
    :return: None
    """
    logger.info("Processing OMIM to KEGG gene")
    g = self.testgraph if self.testMode else self.graph
    model = Model(g)
    geno = Genotype(g)
    raw = '/'.join((self.rawdir, self.files['omim2gene']['file']))

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for line_counter, row in enumerate(filereader, start=1):
            (kegg_gene_id, omim_id, link_type) = row

            # in test mode only the listed (un-prefixed) gene ids are kept
            if self.testMode and \
                    kegg_gene_id not in self.test_ids['genes']:
                continue

            kegg_gene_id = 'KEGG-' + kegg_gene_id.strip()
            omim_id = re.sub(r'omim', 'OMIM', omim_id)

            if link_type == 'equivalent':
                # these are genes!
                # so add them as a class then make equivalence
                model.addClassToGraph(omim_id, None)
                geno.addGene(kegg_gene_id, None)
                if not DipperUtil.is_omim_disease(omim_id):
                    model.addEquivalentClass(kegg_gene_id, omim_id)
            elif link_type == 'reverse':
                # make an association between an OMIM ID & the KEGG gene ID
                # we do this with omim ids because
                # they are more atomic than KEGG ids
                alt_locus_id = self._make_variant_locus_id(
                    kegg_gene_id, omim_id)
                model.addIndividualToGraph(
                    alt_locus_id,
                    self.label_hash[alt_locus_id],
                    geno.genoparts['variant_locus'])
                geno.addAffectedLocus(alt_locus_id, kegg_gene_id)
                model.addBlankNodeAnnotation(alt_locus_id)

                # Add the disease to gene relationship.
                marker_assoc = G2PAssoc(
                    g, self.name, alt_locus_id, omim_id,
                    model.object_properties['is_marker_for'])
                marker_assoc.add_association_to_graph()
            elif link_type == 'original':
                # these are sometimes a gene, and sometimes a disease
                logger.info(
                    'Unable to handle original link for %s-%s',
                    kegg_gene_id, omim_id)
            else:
                # don't know what these are
                logger.warning(
                    'Unhandled link type for %s-%s: %s',
                    kegg_gene_id, omim_id, link_type)

            if not self.testMode and \
                    limit is not None and line_counter > limit:
                break

    logger.info("Done with OMIM to KEGG gene")
    return
def _process_qtls_genetic_location(
        self, raw, txid, common_name, limit=None):
    """
    Process a tab-delimited QTL genetic-location (cM) file.

    For each row this adds the QTL as a feature located (fuzzily) on a
    chromosome instance of the species' genetic map, an optional dbSNP
    peak marker, an optional NCBI gene, the trait class, the publication
    and a QTL-to-trait association (plus one for the dbSNP id, if any).

    :param raw: path of the file to read
    :param txid: taxon number (without the ``NCBITaxon:`` prefix)
    :param common_name: species common name; used to pick the curie from
        ``self.files`` and to mint ids/labels
    :param limit: stop after more than this many rows (ignored in test mode)
    :return: None
    """

    def pvalue_to_score(text):
        """Return the p-value column as a float, or None if unparsable."""
        scr = re.sub(r'<', '', text)
        scr = re.sub(r',', '.', scr)  # international notation
        # str.isnumeric() is False for anything containing '.', so it
        # rejected every decimal p-value; let float() be the judge.
        try:
            return float(scr)
        except ValueError:
            return None

    aql_curie = self.files[common_name + '_cm']['curie']

    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    line_counter = 0
    geno = Genotype(graph)
    model = Model(graph)
    eco_id = self.globaltt['quantitative trait analysis evidence']
    taxon_curie = 'NCBITaxon:' + txid

    LOG.info("Processing genetic location for %s from %s", taxon_curie, raw)
    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            (qtl_id,
             qtl_symbol,
             trait_name,
             assotype,
             empty,
             chromosome,
             position_cm,
             range_cm,
             flankmark_a2,
             flankmark_a1,
             peak_mark,
             flankmark_b1,
             flankmark_b2,
             exp_id,
             model_id,
             test_base,
             sig_level,
             lod_score,
             ls_mean,
             p_values,
             f_statistics,
             variance,
             bayes_value,
             likelihood_ratio,
             trait_id,
             dom_effect,
             add_effect,
             pubmed_id,
             gene_id,
             gene_id_src,
             gene_id_type,
             empty2) = row

            if self.test_mode and int(qtl_id) not in self.test_ids:
                continue

            qtl_id = common_name + 'QTL:' + qtl_id.strip()
            trait_id = ':'.join((aql_curie, trait_id.strip()))

            # Add QTL to graph
            feature = Feature(
                graph, qtl_id, qtl_symbol, self.globaltt['QTL'])
            feature.addTaxonToFeature(taxon_curie)

            # deal with the chromosome
            chrom_id = makeChromID(chromosome, taxon_curie, 'CHR')

            # add a version of the chromosome which is defined as
            # the genetic map
            build_id = 'MONARCH:'+common_name.strip()+'-linkage'
            build_label = common_name+' genetic map'
            geno.addReferenceGenome(build_id, build_label, taxon_curie)
            chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH')
            geno.addChromosomeInstance(
                chromosome, build_id, build_label, chrom_id)

            start = stop = None
            # range_cm sometimes ends in "(Mb)"  (i.e pig 2016 Nov)
            range_cm = re.split(r'\(', range_cm)[0]

            if re.search(r'[0-9].*-.*[0-9]', range_cm):
                range_parts = re.split(r'-', range_cm)
                # check for poorly formed ranges
                if len(range_parts) == 2 and\
                        range_parts[0] != '' and range_parts[1] != '':
                    (start, stop) = [
                        int(float(x.strip())) for x in range_parts]
                else:
                    LOG.info(
                        "A cM range we can't handle for QTL %s: %s",
                        qtl_id, range_cm)
            elif position_cm != '':
                match = re.match(r'([0-9]*\.[0-9]*)', position_cm)
                if match is not None:
                    position_cm = match.group()
                    # a lone '.' matches the pattern but is not a number
                    try:
                        start = stop = int(float(position_cm))
                    except ValueError:
                        LOG.info(
                            "A cM position we can't handle for QTL %s: %s",
                            qtl_id, position_cm)

            # FIXME remove converion to int for start/stop
            # when schema can handle floats add in the genetic location
            # based on the range
            feature.addFeatureStartLocation(
                start, chrom_in_build_id, None,
                [self.globaltt['FuzzyPosition']])
            feature.addFeatureEndLocation(
                stop, chrom_in_build_id, None,
                [self.globaltt['FuzzyPosition']])
            feature.addFeatureToGraph()

            # sometimes there's a peak marker, like a rsid.
            # we want to add that as a variant of the gene,
            # and xref it to the qtl.
            dbsnp_id = None
            if peak_mark != '' and peak_mark != '.' and \
                    re.match(r'rs', peak_mark.strip()):
                dbsnp_id = 'dbSNP:'+peak_mark.strip()

                model.addIndividualToGraph(
                    dbsnp_id, None, self.globaltt['sequence_alteration'])
                model.addXref(qtl_id, dbsnp_id)

            gene_id = gene_id.replace('uncharacterized ', '').strip()
            if gene_id != '' and gene_id != '.'\
                    and re.fullmatch(r'[^ ]*', gene_id) is not None:

                # we assume if no src is provided and gene_id is an integer,
                # then it is an NCBI gene ... (okay, lets crank that back a notch)
                if gene_id_src == '' and gene_id.isdigit() and \
                        gene_id in self.gene_info:
                    gene_id_src = 'NCBIgene'
                elif gene_id_src == '' and gene_id.isdigit():
                    LOG.warning(
                        'Cold & Prickely saying %s is a NCBI gene for %s',
                        gene_id, common_name)
                    gene_id_src = 'NCBIgene'
                elif gene_id_src == '':
                    LOG.error(
                        ' "%s" is a NOT NCBI gene for %s',
                        gene_id, common_name)
                    gene_id_src = None

                if gene_id_src == 'NCBIgene':
                    gene_id = 'NCBIGene:' + gene_id
                    # we will expect that these will get labels elsewhere
                    geno.addGene(gene_id, None)
                    # FIXME what is the right relationship here?
                    geno.addAffectedLocus(qtl_id, gene_id)

                    if dbsnp_id is not None:
                        # add the rsid as a seq alt of the gene_id
                        vl_id = '_:' + re.sub(
                            r':', '', gene_id) + '-' + peak_mark.strip()
                        geno.addSequenceAlterationToVariantLocus(
                            dbsnp_id, vl_id)
                        geno.addAffectedLocus(vl_id, gene_id)

            # add the trait
            model.addClassToGraph(trait_id, trait_name)

            # Add publication
            # pub_id is reset for every row so that a row without a
            # publication neither crashes nor inherits the previous row's.
            pub_id = None
            reference = None
            if re.match(r'ISU.*', pubmed_id):
                pub_id = 'AQTLPub:'+pubmed_id.strip()
                reference = Reference(graph, pub_id)
            elif pubmed_id != '':
                pub_id = 'PMID:' + pubmed_id.strip()
                reference = Reference(
                    graph, pub_id, self.globaltt['journal article'])

            if reference is not None:
                reference.addRefToGraph()

            score = pvalue_to_score(p_values) if p_values != '' else None

            # the QTL is always associated to the trait;
            # the dbSNP peak marker is too, when there is one
            # TODO add exp_id as evidence
            # TODO add LOD score?
            subjects = [qtl_id]
            if dbsnp_id is not None:
                subjects.append(dbsnp_id)
            for subject_id in subjects:
                assoc = G2PAssoc(
                    graph, self.name, subject_id, trait_id,
                    self.globaltt['is marker for'])
                assoc.add_evidence(eco_id)
                if pub_id is not None:
                    assoc.add_source(pub_id)
                if score is not None:
                    assoc.set_score(score)  # todo add score type
                assoc.add_association_to_graph()

            if not self.test_mode and \
                    limit is not None and line_counter > limit:
                break

    LOG.info("Done with QTL genetic info")
    return
def _process_data(self, src_key, limit=None):
    """
    This function will process the data files from Coriell.
    We make the assumption that any alleles listed are variants
    (alternates to w.t.)

    Triples: (examples)

    :NIGMSrepository a CLO_0000008 #repository
    label : NIGMS Human Genetic Cell Repository
    foaf:page
     https://catalog.coriell.org/0/sections/collections/NIGMS/?SsId=8

    line_id a CL_0000057,  #fibroblast line
        derives_from patient_id
        part_of :NIGMSrepository
        RO:model_of OMIM:disease_id

    patient id a foaf:person,
        label: "fibroblast from patient 12345 with disease X"
        member_of family_id  #what is the right thing here?
        SIO:race EFO:caucasian  #subclass of EFO:0001799
        in_taxon NCBITaxon:9606
        dc:description Literal(remark)
        RO:has_phenotype OMIM:disease_id
        GENO:has_genotype genotype_id

    family_id a owl:NamedIndividual
        foaf:page
         "https://catalog.coriell.org/0/Sections/BrowseCatalog/FamilyTypeSubDetail.aspx?PgId=402&fam=2104&coll=GM"

    genotype_id a intrinsic_genotype
        GENO:has_alternate_part allelic_variant_id
        we don't necessarily know much about the genotype,
        other than the allelic variant. also there's the sex here

    pub_id mentions cell_line_id

    :param src_key: key into ``self.files`` naming the file and its
        expected (lower-cased) column list
    :param limit: stop after more than this many rows (ignored in test mode)
    :return: None
    :raises AssertionError: when the file header differs from the
        expected columns
    """
    raw = '/'.join((self.rawdir, self.files[src_key]['file']))

    LOG.info("Processing Data from %s", raw)

    if self.test_mode:  # set the graph to build
        graph = self.testgraph
    else:
        graph = self.graph

    family = Family(graph)
    model = Model(graph)

    line_counter = 1
    geno = Genotype(graph)
    diputil = DipperUtil()
    col = self.files[src_key]['columns']
    # affords access with
    # x = row[col.index('x')].strip()

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter=',', quotechar=r'"')
        # we can keep a close watch on changing file formats
        # (an empty file has no header; treat it as a header mismatch)
        fileheader = [c.lower() for c in next(filereader, None) or []]
        if col != fileheader:  # assert
            LOG.error('Expected %s to have columns: %s', raw, col)
            LOG.error('But Found %s to have columns: %s', raw, fileheader)
            raise AssertionError('Incomming data headers have changed.')

        for row in filereader:
            line_counter += 1
            if len(row) != len(col):
                LOG.warning(
                    'Expected %i values but find %i in row %i',
                    len(col), len(row), line_counter)
                continue

            # (catalog_id, description, omim_number, sample_type,
            # cell_line_available, dna_in_stock, dna_ref, gender, age,
            # race, ethnicity, affected, karyotype, relprob, mutation,
            # gene, family_id, collection, url, cat_remark, pubmed_ids,
            # family_member, variant_id, dbsnp_id, species) = row

            # example:
            # GM00003,HURLER SYNDROME,607014,Fibroblast,Yes,No,
            #       ,Female,26 YR,Caucasian,,,,
            # parent,,,39,NIGMS Human Genetic Cell Repository,
            # http://ccr.coriell.org/Sections/Search/Sample_Detail.aspx?Ref=GM00003,
            # 46;XX; clinically normal mother of a child with Hurler syndrome;
            #       proband not in Repository,,
            # 2,,18343,Homo sapiens

            catalog_id = row[col.index('catalog_id')].strip()

            if self.test_mode and catalog_id not in self.test_lines:
                # skip rows not in our test lines, when in test mode
                continue

            # ###########    BUILD REQUIRED VARIABLES    ###########

            # Make the cell line ID
            cell_line_id = 'Coriell:' + catalog_id
            # Map the cell/sample type
            cell_type = self.resolve(row[col.index('sample_type')].strip())
            # on fail cell_type = self.globaltt['cell'] ?

            # Make a cell line label
            collection = row[col.index('collection')].strip()
            line_label = collection.partition(' ')[0] + '-' + catalog_id

            # Map the repository/collection
            repository = self.localtt[collection]

            # patients are uniquely identified by one of:
            # dbsnp id (which is == an individual haplotype)
            # family id + family member (if present) OR
            # probands are usually family member zero
            # cell line id
            # since some patients have >1 cell line derived from them,
            # we must make sure that the genotype is attached to
            # the patient, and can be inferred to the cell line
            # examples of repeated patients are:
            #   famid=1159, member=1; fam=152,member=1

            # Make the patient ID

            # make an anonymous patient
            patient_id = '_:person'
            fam_id = row[col.index('fam')].strip()
            fammember = row[col.index('fammember')].strip()
            if fam_id != '':
                patient_id = '-'.join((patient_id, fam_id, fammember))
            else:
                # make an anonymous patient
                patient_id = '-'.join((patient_id, catalog_id))

            # properties of the individual patients:  sex, family id,
            # member/relproband, description descriptions are
            # really long and ugly SCREAMING text, so need to clean up
            # the control cases are so odd with this labeling scheme;
            # but we'll deal with it as-is for now.
            description = row[col.index('description')].strip()
            short_desc = (description.split(';')[0]).capitalize()

            gender = row[col.index('gender')].strip().lower()
            affected = row[col.index('affected')].strip()
            relprob = row[col.index('relprob')].strip()

            if affected == '':
                affected = 'unspecified'
            elif affected in self.localtt:
                affected = self.localtt[affected]
            else:
                LOG.warning(
                    'Novel Affected status  %s at row: %i of %s',
                    affected, line_counter, raw)
            patient_label = ' '.join((affected, gender, relprob))
            if relprob == 'proband':
                patient_label = ' '.join(
                    (patient_label.strip(), 'with', short_desc))
            else:
                patient_label = ' '.join(
                    (patient_label.strip(), 'of proband with', short_desc))

            # #############    BUILD THE CELL LINE    #############

            # Adding the cell line as a typed individual.
            cell_line_reagent_id = self.globaltt['cell line']

            model.addIndividualToGraph(
                cell_line_id, line_label, cell_line_reagent_id)

            # add the equivalent id == dna_ref
            dna_ref = row[col.index('dna_ref')].strip()
            if dna_ref != '' and dna_ref != catalog_id:
                equiv_cell_line = 'Coriell:' + dna_ref
                # some of the equivalent ids are not defined
                # in the source data; so add them
                model.addIndividualToGraph(
                    equiv_cell_line, None, cell_line_reagent_id)
                model.addSameIndividual(cell_line_id, equiv_cell_line)

            # Cell line derives from patient
            geno.addDerivesFrom(cell_line_id, patient_id)
            geno.addDerivesFrom(cell_line_id, cell_type)

            # Cell line a member of repository
            family.addMember(repository, cell_line_id)

            cat_remark = row[col.index('cat_remark')].strip()

            if cat_remark != '':
                model.addDescription(cell_line_id, cat_remark)

            # Cell age_at_sampling
            # TODO add the age nodes when modeled properly in #78
            # (it is not yet clear how to connect an Age node
            #  to the cell line; we need to ask @mbrush)

            # #############    BUILD THE PATIENT    #############

            # Add the patient ID as an individual.
            model.addPerson(patient_id, patient_label)
            # TODO map relationship to proband as a class
            # (what ontology?)

            # Add race of patient
            # FIXME: Adjust for subcategories based on ethnicity field
            # EDIT: There are 743 different entries for ethnicity...
            # Too many to map?
            # Add ethnicity as literal in addition to the mapped race?
            # Adjust the ethnicity txt (if using)
            # to initial capitalization to remove ALLCAPS

            # TODO race should go into the individual's background
            # and abstracted out to the Genotype class punting for now.

            # #############    BUILD THE FAMILY    #############

            # Add triples for family_id, if present.
            if fam_id != '':
                family_comp_id = 'CoriellFamily:' + fam_id

                family_label = ' '.join(
                    ('Family of proband with', short_desc))

                # Add the family ID as a named individual
                model.addIndividualToGraph(
                    family_comp_id, family_label, self.globaltt['family'])

                # Add the patient as a member of the family
                family.addMemberOf(patient_id, family_comp_id)

            # #############    BUILD THE GENOTYPE   #############

            # the important things to pay attention to here are:
            # karyotype = chr rearrangements  (somatic?)
            # mutation = protein-level mutation as a label,
            # often from omim
            # gene = gene symbol - TODO get id
            # variant_id = omim variant ids (; delimited)
            # dbsnp_id = snp individual ids = full genotype?

            # note GM00633 is a good example of chromosomal variation
            # - do we have enough to capture this?
            # GM00325 has both abnormal karyotype and variation

            # make an assumption that if the taxon is blank,
            # that it is human!
            species = row[col.index('species')].strip()
            if species == '':
                species = 'Homo sapiens'
            taxon = self.resolve(species)

            # if there's a dbSNP id,
            # this is actually the individual's genotype
            genotype_id = None
            genotype_label = None

            dbsnp_id = row[col.index('dbsnp_id')].strip()
            if dbsnp_id != '':
                genotype_id = 'dbSNPIndividual:' + dbsnp_id

            # per-row state: reset here so nothing leaks in from a
            # previous row (or is unbound on the first one)
            omim_map = {}
            gvc_id = None
            gvc_label = None
            varl = None

            # some of the karyotypes are encoded
            # with terrible hidden codes. remove them here
            # i've seen a <98> character
            karyotype = row[col.index('karyotype')].strip()
            karyotype = diputil.remove_control_characters(karyotype)
            karyotype_id = None
            if karyotype.strip() != '':
                karyotype_id = '_:' + re.sub(
                    'MONARCH:', '', self.make_id(karyotype))
                # add karyotype as karyotype_variation_complement
                model.addIndividualToGraph(
                    karyotype_id, karyotype,
                    self.globaltt['karyotype_variation_complement'])
                # TODO break down the karyotype into parts
                # and map into GENO. depends on #77

                # place the karyotype in a location(s).
                karyo_chrs = self._get_affected_chromosomes_from_karyotype(
                    karyotype)
                for chrom in karyo_chrs:
                    chr_id = makeChromID(chrom, taxon, 'CHR')
                    # add an anonymous sequence feature,
                    # each located on chr
                    karyotype_feature_id = '-'.join((karyotype_id, chrom))
                    karyotype_feature_label = \
                        'some karyotype alteration on chr' + str(chrom)
                    feat = Feature(
                        graph, karyotype_feature_id,
                        karyotype_feature_label,
                        self.globaltt['sequence_alteration'])
                    feat.addFeatureStartLocation(None, chr_id)
                    feat.addFeatureToGraph()
                    geno.addParts(
                        karyotype_feature_id, karyotype_id,
                        self.globaltt['has_variant_part'])

            gene = row[col.index('gene')].strip()
            mutation = row[col.index('mutation')].strip()
            if gene != '':
                varl = gene + '(' + mutation + ')'

            # fix the variant_id so it's always in the same order
            variant_id = row[col.index('variant_id')].strip()
            vids = variant_id.split(';')
            variant_id = ';'.join(sorted(set(vids)))

            if karyotype.strip() != '' and not self._is_normal_karyotype(
                    karyotype):

                gvc_id = karyotype_id
                if variant_id != '':
                    gvc_id = '_:' + variant_id.replace(';', '-') + '-' \
                        + re.sub(r'\w*:', '', karyotype_id)
                if mutation.strip() != '' and varl is not None:
                    gvc_label = '; '.join((varl, karyotype))
                else:
                    gvc_label = karyotype
            elif variant_id.strip() != '':
                gvc_id = '_:' + variant_id.replace(';', '-')
                gvc_label = varl
            else:
                # wildtype?
                pass

            # add the karyotype to the gvc.
            # use reference if normal karyotype
            karyo_rel = self.globaltt['has_variant_part']
            if self._is_normal_karyotype(karyotype):
                karyo_rel = self.globaltt['has_reference_part']
            if karyotype_id is not None \
                    and not self._is_normal_karyotype(karyotype) \
                    and gvc_id is not None and karyotype_id != gvc_id:
                geno.addParts(karyotype_id, gvc_id, karyo_rel)

            if variant_id.strip() != '':
                # split the variants & add them as part of the genotype
                # we don't necessarily know their zygosity,
                # just that they are part of the genotype variant ids
                # are from OMIM, so prefix as such we assume that the
                # sequence alts will be defined in OMIM not here
                # TODO sort the variant_id list, if the omim prefix is
                # the same, then assume it's the locus make a hashmap
                # of the omim id to variant id list;
                # then build the genotype hashmap is also useful for
                # removing the "genes" from the list of "phenotypes"

                # will hold gene/locus id to variant list
                omim_map = {}

                for var in variant_id.split(';'):
                    # handle omim-style and odd var ids
                    # like 610661.p.R401X
                    mch = re.match(r'(\d+)\.+(.*)', var.strip())
                    if mch is None:
                        # nothing to file this id under; skipping it
                        # avoids re-adding the previous variant (or a
                        # KeyError when it is the first in the list)
                        LOG.warning(
                            'Unrecognized variant id "%s" at row: %i of %s',
                            var, line_counter, raw)
                        continue
                    (locus_num, var_num) = mch.groups()
                    omim_map.setdefault(locus_num, []).append(var_num)

                for omim in omim_map:
                    # gene_id = 'OMIM:' + omim  # TODO unused
                    vslc_id = '_:' + '-'.join(
                        [omim + '.' + a for a in omim_map.get(omim)])
                    vslc_label = varl
                    # we don't really know the zygosity of
                    # the alleles at all.
                    # so the vslcs are just a pot of them
                    model.addIndividualToGraph(
                        vslc_id, vslc_label,
                        self.globaltt['variant single locus complement'])
                    for var in omim_map.get(omim):
                        # this is actually a sequence alt
                        allele1_id = 'OMIM:' + omim + '.' + var
                        geno.addSequenceAlteration(allele1_id, None)

                        # assume that the sa -> var_loc -> gene
                        # is taken care of in OMIM
                        geno.addPartsToVSLC(
                            vslc_id, allele1_id, None,
                            self.globaltt['indeterminate'],
                            self.globaltt['has_variant_part'])

                    if vslc_id != gvc_id:
                        geno.addVSLCtoParent(vslc_id, gvc_id)

            if affected == 'unaffected':
                # let's just say that this person is wildtype
                model.addType(patient_id, self.globaltt['wildtype'])
            elif genotype_id is None:
                # make an anonymous genotype id (aka blank node)
                genotype_id = '_:geno' + catalog_id.strip()

            # add the gvc
            if gvc_id is not None:
                model.addIndividualToGraph(
                    gvc_id, gvc_label,
                    self.globaltt['genomic_variation_complement'])

                # add the gvc to the genotype
                if genotype_id is not None:
                    if affected == 'unaffected':
                        rel = self.globaltt['has_reference_part']
                    else:
                        rel = self.globaltt['has_variant_part']
                    geno.addParts(gvc_id, genotype_id, rel)

                if karyotype_id is not None \
                        and self._is_normal_karyotype(karyotype):
                    if gvc_label is not None and gvc_label != '':
                        genotype_label = '; '.join((gvc_label, karyotype))
                    elif karyotype is not None:
                        genotype_label = karyotype
                    if genotype_id is None:
                        genotype_id = karyotype_id
                    else:
                        geno.addParts(
                            karyotype_id, genotype_id,
                            self.globaltt['has_reference_part'])
                else:
                    genotype_label = gvc_label
                # use the catalog id as the background
                # (gvc_label is None when the row names no gene)
                genotype_label = ' '.join(filter(
                    None, (genotype_label, '[' + catalog_id.strip() + ']')))

            if genotype_id is not None and gvc_id is not None:
                # only add the genotype if it has some parts
                geno.addGenotype(
                    genotype_id, genotype_label,
                    self.globaltt['intrinsic_genotype'])
                geno.addTaxon(taxon, genotype_id)
                # add that the patient has the genotype
                # TODO check if the genotype belongs to
                # the cell line or to the patient
                graph.addTriple(
                    patient_id, self.globaltt['has_genotype'], genotype_id)
            else:
                geno.addTaxon(taxon, patient_id)

            # TODO: Add sex/gender  (as part of the karyotype?)
            # = row[col.index('')].strip()
            # #############    DEAL WITH THE DISEASES   #############
            omim_num = row[col.index('omim_num')].strip()

            # we associate the disease to the patient
            if affected == 'affected' and omim_num != '':
                for disease in omim_num.split(';'):
                    if disease is not None and disease != '':
                        # if the omim number is in omim_map,
                        # then it is a gene not a pheno

                        # TEC - another place to use the mimTitle omim
                        # classifier omia & genereviews are using

                        if disease not in omim_map:
                            disease_id = 'OMIM:' + disease.strip()
                            # assume the label is taken care of in OMIM
                            model.addClassToGraph(disease_id, None)

                            # add the association:
                            #   the patient has the disease
                            assoc = G2PAssoc(
                                graph, self.name, patient_id, disease_id)
                            assoc.add_association_to_graph()

                            # this line is a model of this disease
                            # TODO abstract out model into
                            # it's own association class?
                            graph.addTriple(
                                cell_line_id,
                                self.globaltt['is model of'],
                                disease_id)
                        else:
                            LOG.info(
                                'drop gene %s from disease list', disease)

            # #############    ADD PUBLICATIONS   #############
            pubmed_ids = row[col.index('pubmed_ids')].strip()
            if pubmed_ids != '':
                for pmid in pubmed_ids.split(';'):
                    pubmed_id = 'PMID:' + pmid.strip()
                    ref = Reference(graph, pubmed_id)
                    ref.setType(self.globaltt['journal article'])
                    ref.addRefToGraph()
                    graph.addTriple(
                        pubmed_id, self.globaltt['mentions'], cell_line_id)

            if not self.test_mode and (
                    limit is not None and line_counter > limit):
                break
    return
def _process_nlx_157874_1_view(self, raw, limit=None):
    """
    This table contains the Elements of Morphology data that has been
    screen-scraped into DISCO.
    Note that foaf:depiction is inverse of foaf:depicts relationship.

    Since it is bad form to have two definitions,
    we concatenate the two into one string.

    Triples:
        <eom id> a owl:Class
            rdf:label Literal(eom label)
            OIO:hasRelatedSynonym Literal(synonym list)
            IAO:definition Literal(objective_def. subjective def)
            foaf:depiction Literal(small_image_url),
                           Literal(large_image_url)
            foaf:page Literal(page_url)
            rdfs:comment Literal(long commented text)

    :param raw: path of the tab-delimited file to read
    :param limit: stop after more than this many rows
    :return: None
    """

    def as_sentence(text):
        """Strip the text and make sure non-empty text ends in a period."""
        # Strip *before* testing for the period: testing the unstripped
        # text turned "foo. " into "foo..", and the old regex test also
        # failed on multi-line text and on a lone ".".
        text = text.strip()
        if text != '' and not text.endswith('.'):
            text += '.'
        return text

    model = Model(self.graph)
    line_counter = 0
    with open(raw, 'r') as f1:
        f1.readline()  # read the header row; skip
        filereader = csv.reader(f1, delimiter='\t', quotechar='\"')
        for line in filereader:
            line_counter += 1
            (morphology_term_id, morphology_term_num,
             morphology_term_label, morphology_term_url,
             terminology_category_label, terminology_category_url,
             subcategory, objective_definition, subjective_definition,
             comments, synonyms, replaces, small_figure_url,
             large_figure_url, e_uid, v_uid, v_uuid,
             v_last_modified, v_status, v_lastmodified_epoch) = line

            # note:
            # e_uid v_uuid v_last_modified terminology_category_url
            # subcategory v_uid morphology_term_num
            # terminology_category_label hp_label notes
            # are currently unused.

            # Add morphology term to graph as a class
            # with label, type, and description.
            model.addClassToGraph(morphology_term_id, morphology_term_label)

            # Assemble the description text:
            # objective definition first, then the subjective one
            definition = ' '.join((
                as_sentence(objective_definition),
                as_sentence(subjective_definition))).strip()

            model.addDefinition(morphology_term_id, definition)

            # <term id> FOAF:depicted_by literal url
            # <url> type foaf:depiction

            # do we want both images?
            # morphology_term_id has depiction small_figure_url
            if small_figure_url != '':
                model.addDepiction(morphology_term_id, small_figure_url)

            # morphology_term_id has depiction large_figure_url
            if large_figure_url != '':
                model.addDepiction(morphology_term_id, large_figure_url)

            # morphology_term_id has comment comments
            if comments != '':
                model.addComment(morphology_term_id, comments.strip())

            if synonyms != '':
                for s in synonyms.split(';'):
                    model.addSynonym(
                        morphology_term_id, s.strip(),
                        model.annotation_properties['hasExactSynonym'])

            # morphology_term_id hasRelatedSynonym replaces (; delimited)
            if replaces != '' and replaces != synonyms:
                for s in replaces.split(';'):
                    model.addSynonym(
                        morphology_term_id, s.strip(),
                        model.annotation_properties['hasRelatedSynonym'])

            # morphology_term_id has page morphology_term_url
            reference = Reference(self.graph)
            reference.addPage(morphology_term_id, morphology_term_url)

            if limit is not None and line_counter > limit:
                break
    return
def _process_genes(self, taxid, limit=None):
    """
    Add the Ensembl genes of one taxon, with their cross references and
    gene products, to the graph.

    Each gene becomes a class. An Entrez id is attached as an xref for
    human (taxid '9606') and as an equivalent class otherwise; an HGNC
    id (human only) is an equivalent class. Ensembl peptides and, when
    a peptide is present, UniProt entries are added as gene products.

    :param taxid: NCBI taxon number as a string; also the key into
        ``self.files``
    :param limit: stop once the reader's line number exceeds this
        (ignored in test mode)
    :return: None
    """
    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)
    geno = Genotype(graph)

    raw = '/'.join((self.rawdir, self.files[taxid]['file']))

    attributes = self.columns['bmq_attributes']
    col = list(attributes)
    # only the human file carries the hgnc_id column
    if taxid != '9606' and 'hgnc_id' in col:
        col.remove('hgnc_id')
    col_exp = [
        self.columns['bmq_headers'][self.columns['bmq_attributes'].index(x)]
        for x in col
    ]

    LOG.info("Processing Ensembl genes for NCBITaxon:%s", taxid)
    with open(raw, 'r', encoding="utf8") as csvfile:
        reader = csv.reader(csvfile, delimiter='\t')
        header = next(reader)
        # the header is checked, but a mismatch does not stop the run
        if not self.check_fileheader(col_exp, header):
            pass

        for record in reader:

            def field(name, record=record):
                """Return the raw value of the named column."""
                return record[col.index(name)]

            ensembl_gene_id = field('ensembl_gene_id')
            external_gene_name = field('external_gene_name')
            description = field('description').strip()
            gene_biotype = field('gene_biotype').strip()
            entrezgene = field('entrezgene_id').strip()
            ensembl_peptide_id = field('ensembl_peptide_id').strip()
            uniprotswissprot = field('uniprotswissprot').strip()

            # in the case of human genes, we also get the hgnc id
            if taxid == '9606' and 'hgnc_id' in col:
                hgnc_curie = field('hgnc_id').strip()
            else:
                hgnc_curie = None

            if self.test_mode and entrezgene != '' and \
                    entrezgene not in self.gene_ids:
                continue

            gene_id = 'ENSEMBL:' + ensembl_gene_id
            entrez_curie = 'NCBIGene:{}'.format(entrezgene)

            gene_type_id = self.resolve(
                gene_biotype, mandatory=False,
                default=self.globaltt['polypeptide'])

            model.addClassToGraph(
                gene_id, external_gene_name, gene_type_id,
                description if description != '' else None,
                class_category=blv.terms['Gene'])

            if entrezgene != '':
                if taxid == '9606':
                    # Use HGNC for eq in human data
                    model.addXref(
                        gene_id, entrez_curie,
                        xref_category=blv.terms['Gene'])
                else:
                    model.addEquivalentClass(
                        gene_id, entrez_curie,
                        object_category=blv.terms['Gene'])

            if hgnc_curie:
                model.addEquivalentClass(
                    gene_id, hgnc_curie,
                    object_category=blv.terms['Gene'])
            geno.addTaxon('NCBITaxon:' + taxid, gene_id)

            if ensembl_peptide_id:
                peptide_curie = 'ENSEMBL:{}'.format(ensembl_peptide_id)
                model.addIndividualToGraph(
                    peptide_curie, None, gene_type_id)
                geno.addGeneProduct(gene_id, peptide_curie)
                if uniprotswissprot != '':
                    uniprot_curie = 'UniProtKB:{}'.format(uniprotswissprot)
                    model.addIndividualToGraph(
                        uniprot_curie, None, gene_type_id)
                    geno.addGeneProduct(gene_id, uniprot_curie)
                    model.addXref(peptide_curie, uniprot_curie)

            if not self.test_mode and \
                    limit is not None and reader.line_num > limit:
                break
def _get_gene_history(self, limit):
    """
    Loops through the gene_history file and adds the old gene ids
    as deprecated classes, where the new gene id is the replacement for it.
    The old gene symbol is added as a synonym to the gene.

    Rows are skipped when either gene id is ``-``, when (in test mode)
    the gene is not in ``self.gene_ids``, or when (outside test mode)
    the taxon is not in ``self.tax_ids``.

    :param limit: stop after more than this many kept rows
        (ignored in test mode)
    :return: None
    """
    src_key = 'gene_history'
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)
    LOG.info("Processing Gene records")
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files[src_key]['file']))
    LOG.info("FILE: %s", myfile)
    col = self.files[src_key]['columns']
    with gzip.open(myfile, 'rb') as tsv:
        row = tsv.readline().decode().strip().split('\t')
        row[0] = row[0][1:]  # strip comment
        # the header is checked, but a mismatch does not stop the run
        if not self.check_fileheader(col, row):
            pass

        for line in tsv:
            text = line.decode().strip()
            # skip comments; blank lines are skipped too, since indexing
            # into their empty first field raised an IndexError
            if text == '' or text.startswith('#'):
                continue
            row = text.split('\t')

            tax_num = row[col.index('tax_id')].strip()
            gene_num = row[col.index('GeneID')].strip()
            discontinued_num = row[col.index('Discontinued_GeneID')].strip()
            discontinued_symbol = row[col.index('Discontinued_Symbol')].strip()
            # discontinued_date = row[col.index('Discontinue_Date')]

            # set filter=None in init if you don't want to have a filter
            # (the id_filter based filtering is currently disabled)

            if gene_num == '-' or discontinued_num == '-':
                continue

            if self.test_mode and gene_num not in self.gene_ids:
                continue

            if not self.test_mode and tax_num not in self.tax_ids:
                continue

            line_counter += 1
            gene_id = ':'.join(('NCBIGene', gene_num))
            discontinued_gene_id = ':'.join(('NCBIGene', discontinued_num))

            # add the two genes
            if self.class_or_indiv.get(gene_id) == 'C':
                model.addClassToGraph(gene_id, None)
                model.addClassToGraph(
                    discontinued_gene_id, discontinued_symbol)

                # add the new gene id to replace the old gene id
                model.addDeprecatedClass(discontinued_gene_id, [gene_id])
            else:
                model.addIndividualToGraph(gene_id, None)
                model.addIndividualToGraph(
                    discontinued_gene_id, discontinued_symbol)
                model.addDeprecatedIndividual(
                    discontinued_gene_id, [gene_id])

            # also add the old symbol as a synonym of the new gene
            model.addSynonym(gene_id, discontinued_symbol)

            if not self.test_mode and (
                    limit is not None and line_counter > limit):
                break
def _process_diseasegene(self, limit):
    """
    Parse the disease-gene XML file and, for every ``Disorder`` element,
    add the disorder class, its associated genes (external references as
    equivalent classes, synonyms as synonyms) and one gene-to-disorder
    association per ``DisorderGeneAssociation``.

    :param limit: stop once more than this many disorders have been
        processed (not applied in test mode); None means no limit
    :return: None
    """
    graph = self.testgraph if self.test_mode else self.graph
    line_counter = 0
    model = Model(graph)

    myfile = '/'.join((self.rawdir, self.files['disease-gene']['file']))

    for _event, elem in ET.iterparse(myfile):
        if elem.tag != 'Disorder':
            continue

        # get the element name and id, ignore element name
        # elem.get('id') is some internal identifier and is not used
        disorder_num = elem.find('OrphaNumber').text
        disorder_id = 'ORPHA:' + str(disorder_num)

        if self.test_mode and \
                disorder_id not in self.all_test_ids['disease']:
            elem.clear()  # release the skipped element as well
            continue

        line_counter += 1
        disorder_label = elem.find('Name').text

        # assuming that these are in the ontology (...any particular one?)
        model.addClassToGraph(disorder_id, disorder_label)
        assoc_list = elem.find('DisorderGeneAssociationList')
        expected_genes = assoc_list.get('count')
        LOG.info(
            'Expecting %s genes associated with disorder %s.',
            expected_genes, disorder_id)
        processed_genes = 0
        for assoc in assoc_list.findall('DisorderGeneAssociation'):
            processed_genes += 1
            gene = assoc.find('Gene')

            # get gene's curie  HGNC or Ensembl ...
            orpha_gene_num = gene.find('OrphaNumber').text
            gene_curie = 'ORPHA:' + orpha_gene_num
            gene_set = {'ORPHA': orpha_gene_num}
            for gene_ref in gene.findall(
                    './ExternalReferenceList/ExternalReference'):
                gene_set[gene_ref.find('Source').text] = \
                    gene_ref.find('Reference').text

            # set priority (clique leader if available)
            # but default to ORPHA
            for src_pfx in ('HGNC', 'Ensembl', 'SwissProt'):
                if src_pfx in gene_set:
                    # gene_set is keyed by the source's own prefix, so the
                    # lookup must happen before the prefix is translated
                    local_id = gene_set.pop(src_pfx)
                    pfx = src_pfx
                    if pfx in self.localtt:
                        pfx = self.localtt[pfx]
                    gene_curie = pfx + ':' + local_id
                    model.addClassToGraph(gene_curie, None)
                    break

            # TEC have reservations w.r.t aggregator links
            # being gene classes
            for prefix, local_id in gene_set.items():
                if prefix in self.localtt:
                    prefix = self.localtt[prefix]
                dbxref = prefix + ':' + local_id
                if gene_curie != dbxref:
                    model.addClassToGraph(dbxref, None)
                    model.addEquivalentClass(gene_curie, dbxref)

            # TEC. would prefer this not happen here. let HGNC handle it
            # except there are some w/o explicit external links ...
            syn_list = gene.find('./SynonymList')
            if syn_list is not None and int(syn_list.get('count')) > 0:
                for syn in syn_list.findall('./Synonym'):
                    model.addSynonym(gene_curie, syn.text)

            dg_label = assoc.find(
                './DisorderGeneAssociationType/Name').text

            # use dg association status to issue an evidence code
            # FIXME I think that these codes are sub-optimal
            eco_id = self.resolve(
                assoc.find('DisorderGeneAssociationStatus/Name').text)
            rel_id = self.resolve(dg_label)

            # write to the same graph the model writes to
            g2p_assoc = G2PAssoc(
                graph, self.name, gene_curie, disorder_id, rel_id)
            g2p_assoc.add_evidence(eco_id)
            g2p_assoc.add_association_to_graph()

        elem.clear()  # empty the element
        if int(expected_genes) != processed_genes:
            LOG.warning(
                '%s expected %s associated genes but we processed %i',
                disorder_id, expected_genes, processed_genes)

        if not self.test_mode and limit is not None and \
                line_counter > limit:
            return

    return
def _transform_entry(self, e, graph):
    """
    Add a single OMIM entry to the graph.

    A 'removed' entry is only added as a deprecated class.  Any other entry
    gets its class, labels, synonyms and definition; if a gene map exists,
    a feature with chromosome/cytogenetic location is added; a 'moved'
    entry is deprecated in favour of its replacement id(s); finally the
    phenotypic series, mapped ids, mapped genes, publications and
    allelic variants helpers are run on the entry.

    :param e: dict holding the record under the key 'entry'
    :param graph: graph the triples are written to
    :return: None
    """
    g = graph
    model = Model(g)
    geno = Genotype(graph)

    tax_num = '9606'
    tax_id = 'NCBITaxon:9606'
    tax_label = 'Human'
    build_num = "GRCh38"
    build_id = "NCBIGenome:"+build_num

    entry = e['entry']

    # get the numbers, labels, and descriptions
    omimnum = entry['mimNumber']
    titles = entry['titles']
    label = titles['preferredTitle']

    other_labels = []
    if 'alternativeTitles' in titles:
        other_labels += self._get_alt_labels(titles['alternativeTitles'])
    if 'includedTitles' in titles:
        other_labels += self._get_alt_labels(titles['includedTitles'])

    # add synonyms of alternate labels
    # preferredTitle": "PFEIFFER SYNDROME",
    # "alternativeTitles":
    #   "ACROCEPHALOSYNDACTYLY, TYPE V; ACS5;;\nACS V;;\nNOACK SYNDROME",
    # "includedTitles":
    #   "CRANIOFACIAL-SKELETAL-DERMATOLOGIC DYSPLASIA, INCLUDED"

    # remove the abbreviation (comes after the ;) from the preferredTitle,
    # and add it as a synonym
    abbrev = None
    label_parts = re.split(r';', label)
    if len(label_parts) > 1:
        abbrev = label_parts[1].strip()
    newlabel = self._cleanup_label(label)

    description = self._get_description(entry)
    omimid = 'OMIM:'+str(omimnum)

    if entry['status'] == 'removed':
        model.addDeprecatedClass(omimid)
    else:
        marker_type = Genotype.genoparts['heritable_phenotypic_marker']
        gene_type = Genotype.genoparts['gene']

        omimtype = self._get_omimtype(entry)
        nodelabel = newlabel
        # this uses our cleaned-up label
        if omimtype == marker_type:
            if abbrev is not None:
                nodelabel = abbrev
            # in this special case,
            # make it a disease by not declaring it as a gene/marker
            model.addClassToGraph(omimid, nodelabel, None, newlabel)
        elif omimtype == gene_type:
            if abbrev is not None:
                nodelabel = abbrev
            model.addClassToGraph(omimid, nodelabel, omimtype, newlabel)
        else:
            model.addClassToGraph(omimid, newlabel, omimtype)

        # add the original screaming-caps OMIM label as a synonym
        model.addSynonym(omimid, label)

        # add the alternate labels and includes as synonyms
        for alt_label in other_labels:
            model.addSynonym(omimid, alt_label, 'OIO:hasRelatedSynonym')

        # for OMIM, we're adding the description as a definition
        model.addDefinition(omimid, description)
        if abbrev is not None:
            model.addSynonym(omimid, abbrev, 'OIO:hasRelatedSynonym')

        # if this is a genetic locus (but not sequenced)
        #   then add the chrom loc info
        # but add it to the ncbi gene identifier,
        # not to the omim id (we reserve the omim id to be the phenotype)
        feature_id = None
        feature_label = None
        if 'geneMapExists' in entry and entry['geneMapExists']:
            genemap = entry['geneMap']
            is_gene = False

            if omimtype == marker_type:
                # get the ncbigene ids
                ncbifeature = self._get_mapped_gene_ids(entry, g)
                if len(ncbifeature) == 1:
                    feature_id = 'NCBIGene:'+str(ncbifeature[0])
                    # add this feature as a cause for the omim disease
                    # TODO SHOULD I EVEN DO THIS HERE?
                    assoc = G2PAssoc(g, self.name, feature_id, omimid)
                    assoc.add_association_to_graph()
                elif len(ncbifeature) > 1:
                    logger.info(
                        "Its ambiguous when %s maps to >1 gene id: %s",
                        omimid, str(ncbifeature))
                else:  # no ncbi feature, make an anonymous one
                    feature_id = self._make_anonymous_feature(str(omimnum))
                    feature_label = abbrev
            elif omimtype == gene_type:
                feature_id = omimid
                is_gene = True
            else:
                # 158900 falls into this category
                feature_id = self._make_anonymous_feature(str(omimnum))
                if abbrev is not None:
                    feature_label = abbrev
                omimtype = marker_type

            if feature_id is not None:
                if 'comments' in genemap:
                    # add a comment to this feature
                    comment = genemap['comments']
                    if comment.strip() != '':
                        model.addDescription(feature_id, comment)
                if 'cytoLocation' in genemap:
                    cytoloc = genemap['cytoLocation']
                    # parse the cytoloc.
                    # add this omim thing as
                    # a subsequence of the cytofeature
                    # 18p11.3-p11.2
                    # FIXME
                    # add the other end of the range,
                    # but not sure how to do that
                    # not sure if saying subsequence of feature
                    # is the right relationship

                    f = Feature(g, feature_id, feature_label, omimtype)
                    if 'chromosomeSymbol' in genemap:
                        chrom_num = str(genemap['chromosomeSymbol'])
                        chrom = makeChromID(chrom_num, tax_num, 'CHR')
                        geno.addChromosomeClass(
                            chrom_num, tax_id, tax_label)

                        # add the positional information, if available
                        # (-1 stands for "coordinate not given")
                        fstart = fend = -1
                        if 'chromosomeLocationStart' in genemap:
                            fstart = genemap['chromosomeLocationStart']
                        if 'chromosomeLocationEnd' in genemap:
                            fend = genemap['chromosomeLocationEnd']
                        if fstart >= 0:
                            # make the build-specific chromosome
                            chrom_in_build = makeChromID(
                                chrom_num, build_num, 'MONARCH')
                            # then, add the chromosome instance
                            # (from the given build)
                            geno.addChromosomeInstance(
                                chrom_num, build_id, build_num, chrom)
                            if omimtype == marker_type:
                                postypes = [Feature.types['FuzzyPosition']]
                            else:
                                postypes = None
                            # NOTE that no strand information
                            # is available in the API
                            f.addFeatureStartLocation(
                                fstart, chrom_in_build, None, postypes)
                            if fend >= 0:
                                f.addFeatureEndLocation(
                                    fend, chrom_in_build, None, postypes)
                                # only meaningful when an end was given;
                                # otherwise fend is the -1 placeholder
                                if fstart > fend:
                                    logger.info(
                                        "start>end (%d>%d) for %s",
                                        fstart, fend, omimid)
                    # add the cytogenic location too
                    # for now, just take the first one
                    cytoloc = cytoloc.split('-')[0]
                    loc = makeChromID(cytoloc, tax_num, 'CHR')
                    model.addClassToGraph(loc, None)
                    f.addSubsequenceOfFeature(loc)
                    f.addFeatureToGraph(True, None, is_gene)
            # end adding causative genes/features

        # check if moved, if so,
        # make it deprecated and
        # replaced consider class to the other thing(s)
        # some entries have been moved to multiple other entries and
        # use the joining raw word "and"
        # 612479 is movedto:  "603075 and 603029"  OR
        # others use a comma-delimited list, like:
        # 610402 is movedto: "609122,300870"
        if entry['status'] == 'moved':
            moved_to = str(entry['movedTo'])
            if re.search(r'and', moved_to):
                # split the movedTo entry on 'and'
                newids = re.split(r'and', moved_to)
            else:
                # split on the comma; a single id yields a list of one
                newids = moved_to.split(',')
            # cleanup whitespace and add OMIM prefix to numeric portion
            fixedids = ['OMIM:'+i.strip() for i in newids]
            model.addDeprecatedClass(omimid, fixedids)

        self._get_phenotypicseries_parents(entry, g)
        self._get_mappedids(entry, g)
        self._get_mapped_gene_ids(entry, g)

        self._get_pubs(entry, g)

        self._get_process_allelic_variants(entry, g)  # temp gag

    return
def _process_qtls_genetic_location(self, raw, src_key, txid, common_name,
                                   limit=None):
    """
    Process a tab-delimited QTL file with genetic (cM) locations.

    For each well-formed row this adds the QTL as a feature on a
    species-specific "genetic map" build, an optional dbSNP peak marker,
    an optional NCBI gene as affected locus, the trait class, the
    publication, and QTL->trait (and marker->trait) associations.

    :param raw: path of the file to read
    :param src_key: key into ``self.files`` giving 'curie' and 'columns'
    :param txid: taxon number, appended to 'NCBITaxon:'
    :param common_name: species common name used in ids and labels
    :param limit: stop once the reader has passed this line number
        (not applied in test mode); None means no limit
    :return: None
    """
    aql_curie = self.files[src_key]['curie']
    common_name = common_name.strip()
    graph = self.testgraph if self.test_mode else self.graph
    geno = Genotype(graph)
    model = Model(graph)
    eco_id = self.globaltt['quantitative trait analysis evidence']
    taxon_curie = 'NCBITaxon:' + txid
    fuzzy = self.globaltt['FuzzyPosition']

    LOG.info("Processing genetic location for %s from %s", taxon_curie, raw)

    # no header in these files, so no header checking
    col = self.files[src_key]['columns']
    # positions of the columns that are actually used; resolved once.
    # (assotype, flank markers, exp/model ids, LOD and other statistics,
    # dom/add effects and gene_id_type are present but unused)
    idx = {
        name: col.index(name) for name in (
            'qtl_id', 'qtl_symbol', 'trait_name', 'chromosome',
            'position_cm', 'range_cm', 'peak_mark', 'p_values',
            'trait_id', 'pubmed_id', 'gene_id', 'gene_id_src')}

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in reader:
            # validate against the same column list used for indexing
            if len(row) != len(col):
                LOG.warning(
                    "Problem parsing %s line %i containing: \n%s\n"
                    "got %i cols but expected %i",
                    raw, reader.line_num, row, len(row), len(col))
                continue

            qtl_id = row[idx['qtl_id']].strip()
            qtl_symbol = row[idx['qtl_symbol']].strip()
            trait_name = row[idx['trait_name']].strip()
            chromosome = row[idx['chromosome']].strip()
            position_cm = row[idx['position_cm']].strip()
            range_cm = row[idx['range_cm']].strip()
            peak_mark = row[idx['peak_mark']].strip()
            p_values = row[idx['p_values']].strip()
            trait_id = row[idx['trait_id']].strip()
            pubmed_id = row[idx['pubmed_id']].strip()
            gene_id = row[idx['gene_id']].strip()
            gene_id_src = row[idx['gene_id_src']].strip()

            if self.test_mode and int(qtl_id) not in self.test_ids:
                continue

            qtl_id = common_name + 'QTL:' + qtl_id.strip()
            trait_id = ':'.join((aql_curie, trait_id.strip()))

            # Add QTL to graph
            feature = Feature(
                graph, qtl_id, qtl_symbol, self.globaltt['QTL'])
            feature.addTaxonToFeature(taxon_curie)

            # deal with the chromosome
            chrom_id = makeChromID(chromosome, taxon_curie, 'CHR')

            # add a version of the chromosome which is defined as
            # the genetic map
            build_id = 'MONARCH:' + common_name + '-linkage'
            build_label = common_name + ' genetic map'
            geno.addReferenceGenome(build_id, build_label, taxon_curie)
            chrom_in_build_id = makeChromID(
                chromosome, build_id, 'MONARCH')
            geno.addChromosomeInstance(
                chromosome, build_id, build_label, chrom_id)

            start = stop = None
            # range_cm sometimes ends in "(Mb)"  (i.e pig 2016 Nov)
            range_cm = re.split(r'\(', range_cm)[0]

            if re.search(r'[0-9].*-.*[0-9]', range_cm):
                range_parts = re.split(r'-', range_cm)
                # check for poorly formed ranges
                if len(range_parts) == 2 and \
                        range_parts[0] != '' and range_parts[1] != '':
                    (start, stop) = [
                        int(float(x.strip())) for x in range_parts]
                else:
                    LOG.info(
                        "A cM range we can't handle for QTL %s: %s",
                        qtl_id, range_cm)
            elif position_cm != '':
                match = re.match(r'([0-9]*\.[0-9]*)', position_cm)
                if match is not None:
                    position_cm = match.group()
                    start = stop = int(float(position_cm))

            # FIXME remove conversion to int for start/stop
            # when schema can handle floats add in the genetic location
            # based on the range
            feature.addFeatureStartLocation(
                start, chrom_in_build_id, None, [fuzzy])
            feature.addFeatureEndLocation(
                stop, chrom_in_build_id, None, [fuzzy])
            feature.addFeatureToGraph()

            # sometimes there's a peak marker, like a rsid.
            # we want to add that as a variant of the gene,
            # and xref it to the qtl.
            dbsnp_id = None
            if peak_mark != '' and peak_mark != '.' and \
                    re.match(r'rs', peak_mark.strip()):
                dbsnp_id = 'dbSNP:' + peak_mark.strip()

                model.addIndividualToGraph(
                    dbsnp_id, None, self.globaltt['sequence_alteration'])
                model.addXref(
                    qtl_id, dbsnp_id,
                    xref_category=blv.terms['SequenceVariant'])

            gene_id = gene_id.replace('uncharacterized ', '').strip()
            gene_id = gene_id.strip(',')  # for "100157483," in pig_QTLdata.txt
            if gene_id != '' and gene_id != '.' \
                    and re.fullmatch(r'[^ ]*', gene_id) is not None:

                # we assume if no src is provided and gene_id is an integer,
                # then it is an NCBI gene ...
                # (okay, lets crank that back a notch)
                if gene_id_src == '' and gene_id.isdigit() and \
                        gene_id in self.gene_info:
                    gene_id_src = 'NCBIgene'
                elif gene_id_src == '' and gene_id.isdigit():
                    LOG.warning(
                        'Cold & Prickely saying %s is a NCBI gene for %s',
                        gene_id, common_name)
                    gene_id_src = 'NCBIgene'
                elif gene_id_src == '':
                    LOG.error(
                        ' "%s" is a NOT NCBI gene for %s',
                        gene_id, common_name)
                    gene_id_src = None

                if gene_id_src == 'NCBIgene':
                    gene_id = 'NCBIGene:' + gene_id
                    # we will expect that these will get labels elsewhere
                    geno.addGene(gene_id, None)
                    # FIXME what is the right relationship here?
                    geno.addAffectedLocus(qtl_id, gene_id)

                    if dbsnp_id is not None:
                        # add the rsid as a seq alt of the gene_id
                        vl_id = '_:' + re.sub(
                            r':', '', gene_id) + '-' + peak_mark.strip()
                        geno.addSequenceAlterationToVariantLocus(
                            dbsnp_id, vl_id)
                        geno.addAffectedLocus(vl_id, gene_id)

            # add the trait
            model.addClassToGraph(
                trait_id, trait_name,
                class_category=blv.terms['PhenotypicFeature'])

            # Add publication
            # pub_id must be reset per row: a row without a publication
            # must not reuse the previous row's id
            pub_id = None
            reference = None
            if re.match(r'ISU.*', pubmed_id):
                pub_id = 'AQTLPub:' + pubmed_id.strip()
                reference = Reference(graph, pub_id)
            elif pubmed_id != '':
                pub_id = 'PMID:' + pubmed_id.strip()
                reference = Reference(
                    graph, pub_id, self.globaltt['journal article'])

            if reference is not None:
                reference.addRefToGraph()

            # parse the p-value once; it is shared by both associations.
            # values look like "<0.05" or use a decimal comma
            score = None
            if p_values != '':
                scr = re.sub(r'<', '', p_values)
                scr = re.sub(r',', '.', scr).strip()  # international notation
                if re.fullmatch(
                        r'[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?', scr):
                    score = float(scr)

            # make the association to the QTL
            assoc = G2PAssoc(
                graph, self.name, qtl_id, trait_id,
                self.globaltt['is marker for'])
            assoc.add_evidence(eco_id)
            if pub_id is not None:
                assoc.add_source(pub_id)
            # TODO add exp_id as evidence
            if score is not None:
                assoc.set_score(score)
            # todo add score type
            # TODO add LOD score?
            assoc.add_association_to_graph()

            # make the association to the dbsnp_id, if found
            if dbsnp_id is not None:
                assoc = G2PAssoc(
                    graph, self.name, dbsnp_id, trait_id,
                    self.globaltt['is marker for'])
                assoc.add_evidence(eco_id)
                if pub_id is not None:
                    assoc.add_source(pub_id)
                # TODO add exp_id
                if score is not None:
                    assoc.set_score(score)
                # todo add score type
                # TODO add LOD score?
                assoc.add_association_to_graph()

            # off by one - the following actually gives us
            # (limit + 1) records
            if not self.test_mode and limit is not None and \
                    reader.line_num > limit:
                break

    LOG.info("Done with QTL genetic info")
    return
class Decipher(Source):
    """
    Deprecated - please see the EBIGene2Phen class, which parses the same
    file but fetches it from EBI which has clearer terms for redistribution,
    while Decipher has restrictive terms due to containing patient data in
    password protected datasets.

    The Decipher group curates and assembles the Development Disorder
    Genotype Phenotype Database (DDG2P) which is a curated list of genes
    reported to be associated with developmental disorders, compiled by
    clinicians as part of the DDD study to facilitate clinical feedback of
    likely causal variants.

    Beware that the redistribution of this data is a bit unclear from the
    [license](https://decipher.sanger.ac.uk/legal).
    If you intend to distribute this data, be sure to have the appropriate
    licenses in place.
    """

    files = {
        'annot': {
            'file': 'ddg2p.zip',
            'url': 'https://decipher.sanger.ac.uk/files/ddd/ddg2p.zip'}
    }

    def __init__(self, graph_type, are_bnodes_skolemized):
        super().__init__(
            graph_type,
            are_bnodes_skolemized,
            'decipher',
            ingest_title='Development Disorder Genotype Phenotype Database',
            ingest_url='https://decipher.sanger.ac.uk/',
            license_url='https://decipher.sanger.ac.uk/legal',
            data_rights='https://decipher.sanger.ac.uk/datasharing',
            # file_handle=None
        )

        if 'disease' not in self.all_test_ids:
            LOG.warning("not configured with disease test ids.")
            self.test_ids = []
        else:
            self.test_ids = self.all_test_ids['disease']

        self.geno = Genotype(self.graph)
        self.model = Model(self.graph)

        return

    def fetch(self, is_dl_forced=False):
        """
        Fetch this source's files and the HGNC files it depends on.

        :param is_dl_forced: passed through to both fetches
        :return: None
        """
        self.get_files(is_dl_forced)

        # since there's a dependency on HGNC files; fetch those too
        # NOTE(review): HGNC is built without arguments here although this
        # class's own constructor takes two - confirm HGNC's signature
        hgnc = HGNC()
        hgnc.fetch(is_dl_forced)

        return

    def parse(self, limit=None):
        """
        Parse the DDG2P annotations into the graph.

        :param limit: row limit handed to the annotation processor;
            None means no limit
        :return: None
        """
        if limit is not None:
            LOG.info("Only parsing first %s rows", limit)

        LOG.info("Parsing files...")

        if self.test_only:
            self.test_mode = True
            self.graph = self.testgraph

        # both helpers must write to whichever graph was just selected
        self.geno = Genotype(self.graph)
        self.model = Model(self.graph)

        # rare disease-phenotype associations
        self._process_ddg2p_annotations(limit)

        LOG.info("Finished parsing.")

        return

    def _process_ddg2p_annotations(self, limit):
        """
        The ddg2p annotations associate a gene symbol to an omim disease,
        along with some HPO ids and pubs. The gene symbols come from gencode,
        which in turn come from HGNC official gene symbols.  Therefore,
        we use the HGNC source class to get the id/symbol mapping for use in
        our annotations here.

        According to http://www.gencodegenes.org/faq.html,
        "Gene names are usually HGNC or MGI-approved gene symbols mapped
        to the GENCODE genes by the Ensembl xref pipeline. Sometimes,
        when there is no official gene symbol, the Havana clone-based
        name is used."

        The kind of variation that is linked to a disease is indicated
        (LOF, GOF, CNV, etc) in the source data.
        Here, we create an anonymous variant of the specified gene of
        the indicated type (mapped to the sequence ontology (SO)).

        :param limit: stop once more than this many rows have been read
            (not applied in test mode); None means no limit
        :return: None
        """
        line_counter = 0
        graph = self.graph

        # in order for this to work, we need to map the HGNC id-symbol;
        hgnc = HGNC()
        hgnc_symbol_id_map = hgnc.get_symbol_id_map()

        zip_path = '/'.join((self.rawdir, self.files['annot']['file']))

        # use the ddg2p.txt file
        fname = 'ddg2p.txt'

        unmapped_omim_counter = 0
        unmapped_gene_count = 0
        # context managers close the archive even if a row fails to parse
        with ZipFile(zip_path, 'r') as myzip, \
                myzip.open(fname, 'r') as raw_handle:
            reader = csv.reader(
                io.TextIOWrapper(raw_handle),
                delimiter='\t', quotechar='\"')
            for row in reader:
                line_counter += 1
                # skip blank lines and comments
                if not row or row[0].startswith('#'):
                    continue

                # mode, category, disease, ddg2p_id and hpo_codes
                # are read but not used yet (see the TODOs below)
                (gencode_gene_name, mode, category, consequence, disease,
                 omim, ddg2p_id, pubmed_ids, hpo_codes) = row

                hgnc_id = hgnc_symbol_id_map.get(gencode_gene_name.strip())
                if hgnc_id is None:
                    LOG.error(
                        "Couldn't map the gene symbol %s to HGNC.",
                        gencode_gene_name)
                    unmapped_gene_count += 1
                    continue

                # add the gene
                self.model.addClassToGraph(hgnc_id, gencode_gene_name)

                # TODO make VSLC with the variation
                #   to associate with the disorder
                # TODO use the Inheritance and Mutation consequence
                #   to classify the VSLCs
                allele_id = self.make_allele_by_consequence(
                    consequence, hgnc_id, gencode_gene_name)

                if omim.strip() != '':
                    omim_id = 'OMIM:'+str(omim.strip())
                    # assume this is declared elsewhere in ontology
                    self.model.addClassToGraph(omim_id, None)

                    # TODO choose the relationship from `category`
                    # (Confirmed/Probable/Possible/Not DD gene);
                    # 'Not DD gene' would need a negative annotation
                    assoc = G2PAssoc(
                        graph, self.name, allele_id, omim_id)
                    for pub in re.split(r';', pubmed_ids):
                        pub = pub.strip()
                        if pub != '':
                            pmid = 'PMID:' + str(pub)
                            ref = Reference(
                                graph, pmid,
                                self.globaltt['journal article'])
                            ref.addRefToGraph()
                            assoc.add_source(pmid)
                    assoc.add_association_to_graph()
                else:
                    # these are unmapped to a disease id.
                    # note that some match OMIM disease labels
                    # but the identifiers are just not included.
                    # TODO consider mapping to OMIM or DOIDs in other ways
                    LOG.warning(
                        "No omim id on line %d\n%s",
                        line_counter, str(row))
                    unmapped_omim_counter += 1

                # TODO hpo phenotypes
                # since the DDG2P file is not documented,
                # I don't know what the HPO annotations are actually about
                # are they about the gene?  the omim disease?
                # something else?
                # So, we wont create associations until this is clarified

                if not self.test_mode and limit is not None \
                        and line_counter > limit:
                    break

        LOG.warning(
            "gene-disorder associations with no omim id: %d",
            unmapped_omim_counter)
        LOG.warning("unmapped gene count: %d", unmapped_gene_count)

        return

    def make_allele_by_consequence(self, consequence, gene_id, gene_symbol):
        """
        Given a "consequence" label that describes a variation type,
        create an anonymous variant of the specified gene as an instance of
        that consequence type.

        :param consequence: variation type label from the source data
        :param gene_id: curie of the gene the allele belongs to
        :param gene_symbol: gene symbol, used in the allele label
        :return: allele_id (a blank-node id starting with '_:')
        """
        # Loss of function : Nonsense, frame-shifting indel,
        #   essential splice site mutation, whole gene deletion or any other
        #   mutation where functional analysis demonstrates clear reduction
        #   or loss of function
        # All missense/in frame : Where all the mutations described in the
        #   data source are either missense or in frame deletions and there
        #   is no evidence favoring either loss-of-function, activating or
        #   dominant negative effect
        # Dominant negative : Mutation within one allele of a gene that
        #   creates a significantly greater deleterious effect on gene
        #   product function than a monoallelic loss of function mutation
        # Activating : Mutation, usually missense that results in
        #   a constitutive functional activation of the gene product
        # Increased gene dosage : Copy number variation that increases
        #   the functional dosage of the gene
        # Cis-regulatory or promotor mutation : Mutation in cis-regulatory
        #   elements that lies outwith the known transcription unit and
        #   promotor of the controlled gene
        # Uncertain : Where the exact nature of the mutation is unclear or
        #   not recorded

        type_id = self.resolve(consequence, mandatory=False)
        # an unchanged label coming back is treated as "no mapping found"
        if type_id == consequence:
            LOG.warning("Consequence type unmapped: %s", str(consequence))
            type_id = self.globaltt['sequence_variant']

        # make the allele
        allele_id = ''.join((gene_id, type_id))
        allele_id = re.sub(r':', '', allele_id)
        allele_id = '_:'+allele_id  # make this a BNode
        allele_label = ' '.join((consequence, 'allele in', gene_symbol))

        self.model.addIndividualToGraph(allele_id, allele_label, type_id)
        self.geno.addAlleleOfGene(allele_id, gene_id)

        return allele_id
def _add_variant_trait_association(self, variant_id, mapped_trait_uri,
                                   efo_ontology, pubmed_id,
                                   description=None):
    """
    Associate a variant with each trait listed in ``mapped_trait_uri``.

    Every trait is converted to a curie and looked up in the EFO ontology;
    when an EFO trait is found under the disease root (EFO:0000408) or the
    phenotype root (EFO:0000651) it is added as a class with its label.
    A variant-to-trait association citing the publication is then added
    for every listed trait.

    :param variant_id: id of the variant (association subject)
    :param mapped_trait_uri: comma-separated trait URIs; may be blank
    :param efo_ontology: object whose ``query`` method runs the lookups
    :param pubmed_id: PubMed number, appended to 'PMID:'
    :param description: optional description set on each association
    :return: None
    """
    g = self.testgraph if self.testMode else self.graph
    model = Model(g)

    # nothing to do when no trait was mapped
    if mapped_trait_uri.strip() == '':
        return

    # the curie converter does not depend on the trait; build it once
    cu = CurieUtil(curie_map.get())

    # one query shape serves both lookups: {0} is the trait, {1} the root
    query_template = (
        " SELECT ?trait WHERE {{ {0} rdfs:subClassOf+ {1} ."
        " {0} rdfs:label ?trait . }} ")
    # (ontology root, class type given to a trait found beneath it)
    lookups = (
        ('EFO:0000408', 'DOID:4'),
        ('EFO:0000651', 'UPHENO:0001001'))

    # make associations to the EFO terms; there can be >1
    for trait in re.split(r',', mapped_trait_uri):
        trait_id = cu.get_curie(trait.strip())

        for root_id, class_type in lookups:
            hits = list(efo_ontology.query(
                query_template.format(trait_id, root_id)))
            if hits and re.match(r'^EFO', trait_id):
                model.addClassToGraph(trait_id, hits[0][0], class_type)

        pubmed_curie = 'PMID:' + pubmed_id

        ref = Reference(
            g, pubmed_curie, Reference.ref_types['journal_article'])
        ref.addRefToGraph()

        assoc = G2PAssoc(
            g, self.name, variant_id, trait_id,
            model.object_properties['contributes_to'])
        assoc.add_source(pubmed_curie)
        # combinatorial evidence
        # used in automatic assertion
        eco_id = 'ECO:0000213'
        assoc.add_evidence(eco_id)

        if description is not None:
            assoc.set_description(description)

        # FIXME score should get added to provenance/study
        # assoc.set_score(pvalue)
        assoc.add_association_to_graph()
def _get_chrbands(self, limit, taxon, genome_id):
    """
    Read the gzipped chromosome band file for ``taxon`` and add the genome,
    build, chromosomes, scaffolds, bands and their grouping (parent) bands
    to the graph.

    The file is read first into a dict of band definitions so that the
    extent of each grouping band can be computed from its children;
    the dict is then written out as features.

    :param limit: number of lines to read; None reads the whole file
    :param taxon: taxon number; key into ``self.files``
    :param genome_id: id of the genome, passed to ``addGenome``
    :return: None
    """
    if limit is None:
        limit = sys.maxsize  # practical limit anyway
    model = Model(self.graph)
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
    LOG.info("Processing Chr bands from FILE: %s", myfile)
    geno = Genotype(self.graph)
    monochrom = Monochrom(self.graph_type, self.are_bnodes_skized)

    # used to hold band definitions for a chr
    # in order to compute extent of encompasing bands
    mybands = {}

    # build the organism's genome from the taxon
    genome_label = self.files[taxon]['genome_label']
    taxon_id = 'NCBITaxon:' + taxon

    # add the taxon as a class.  adding the class label elsewhere
    model.addClassToGraph(taxon_id, None)
    model.addSynonym(taxon_id, genome_label)

    geno.addGenome(taxon_id, genome_label, genome_id)

    # add the build and the taxon it's in
    build_num = self.files[taxon]['build_num']
    build_id = 'UCSC:' + build_num
    geno.addReferenceGenome(build_id, build_num, taxon_id)

    # NOTE some less-finished genomes have
    # placed and unplaced scaffolds
    # * Placed scaffolds:
    #       the scaffolds have been placed within a chromosome.
    # * Unlocalized scaffolds:
    #   although the chromosome within which the scaffold occurs
    #   is known, the scaffold's position or orientation
    #   is not known.
    # * Unplaced scaffolds:
    #   it is not known which chromosome the scaffold belongs to
    #
    # ex: unlocalized scaffold: chr10_KL568008v1_random
    # ex: unplaced scaffold: chrUn_AABR07022428v1
    # the patterns are the same for every line, so compile them once
    placed_scaffold_pattern = r'(chr(?:\d+|X|Y|Z|W|M))'
    placed_re = re.compile(placed_scaffold_pattern + r'$')
    unlocalized_re = re.compile(placed_scaffold_pattern + r'_(\w+)_random')
    unplaced_re = re.compile(r'chr(Un(?:_\w+)?)')
    stain_re = re.compile(r'g(neg|pos|var)')

    # process the bands
    col = ['scaffold', 'start', 'stop', 'band_num', 'rtype']
    scaffold_col = col.index('scaffold')
    start_col = col.index('start')
    stop_col = col.index('stop')
    band_col = col.index('band_num')
    rtype_col = col.index('rtype')

    with gzip.open(myfile, 'rb') as binreader:
        for line in binreader:
            line_counter += 1
            # every later line would be over the limit too, so stop reading
            if line_counter > limit:
                break
            line = line.decode().strip()
            # skip blank lines and comments
            if not line or line[0] == '#':
                continue

            # chr13	4500000	10000000	p12	stalk
            row = line.split('\t')
            scaffold = row[scaffold_col]
            start = row[start_col]
            stop = row[stop_col]
            band_num = row[band_col].strip()
            rtype = row[rtype_col]

            # find out if the thing is a full on chromosome, or a scaffold
            mch = placed_re.match(scaffold)
            if mch is not None:
                # the chromosome is the first match of the pattern
                chrom_num = mch.group(1)
            else:
                # skip over anything that isn't a placed_scaffold
                # at the class level
                LOG.info("Found non-placed chromosome %s", scaffold)
                chrom_num = None

            m_chr_unloc = unlocalized_re.match(scaffold)
            m_chr_unplaced = unplaced_re.match(scaffold)

            scaffold_num = None
            if mch is not None:
                pass
            elif m_chr_unloc is not None:
                chrom_num = m_chr_unloc.group(1)
                scaffold_num = chrom_num + '_' + m_chr_unloc.group(2)
            elif m_chr_unplaced is not None:
                scaffold_num = m_chr_unplaced.group(1)
            else:
                LOG.error(
                    "There's a chr pattern that we aren't matching: %s",
                    scaffold)

            if chrom_num is not None:
                # the chrom class (generic) id
                chrom_class_id = makeChromID(chrom_num, taxon, 'CHR')

                # first, add the chromosome class (in the taxon)
                geno.addChromosomeClass(
                    chrom_num, taxon_id, genome_label)

                # then, add the chromosome instance (from the given build)
                geno.addChromosomeInstance(
                    chrom_num, build_id, build_num, chrom_class_id)

                # add the chr to the hashmap of coordinates for this build
                # the chromosome coordinate space is itself
                if chrom_num not in mybands:
                    mybands[chrom_num] = {
                        'min': 0,
                        'max': int(stop),
                        'chr': chrom_num,
                        'ref': build_id,
                        'parent': None,
                        'stain': None,
                        'type': self.globaltt['chromosome']
                    }

            if scaffold_num is not None:
                # this will put the coordinates of the scaffold
                # in the scaffold-space and make sure that the scaffold
                # is part of the correct parent.
                # if chrom_num is None,
                # then it will attach it to the genome,
                # just like a reg chrom
                mybands[scaffold_num] = {
                    'min': start,
                    'max': stop,
                    'chr': scaffold_num,
                    'ref': build_id,
                    'parent': chrom_num,
                    'stain': None,
                    'type': self.globaltt['assembly_component'],
                    'synonym': scaffold
                }

            parents = list()
            if band_num is not None and band_num != '':
                band_key = chrom_num + band_num
                # add the specific band
                mybands[band_key] = {
                    'min': start,
                    'max': stop,
                    'chr': chrom_num,
                    'ref': build_id,
                    'parent': None,
                    'stain': None,
                    'type': None
                }

                # add the staining intensity of the band
                if stain_re.match(rtype):
                    mybands[band_key]['stain'] = self.resolve(rtype)

                # get the parent bands, and make them unique
                parents = list(
                    monochrom.make_parent_bands(band_num, set()))
                # alphabetical sort will put them in smallest to biggest,
                # so we reverse
                parents.sort(reverse=True)

                if len(parents) > 0:
                    mybands[band_key]['parent'] = chrom_num + parents[0]

            # loop through the parents and add them to the dict
            # add the parents to the graph, in hierarchical order
            for i, parent in enumerate(parents):
                rti = getChrPartTypeByNotation(parent, self.graph)

                pnum = chrom_num + parent
                sta = int(start)
                sto = int(stop)
                if pnum not in mybands:
                    # add the parental band to the hash
                    mybands[pnum] = {
                        'min': min(sta, sto),
                        'max': max(sta, sto),
                        'chr': chrom_num,
                        'ref': build_id,
                        'parent': None,
                        'stain': None,
                        'type': rti
                    }
                else:
                    # band already in the hash means it's a grouping band
                    # need to update the min/max coords
                    bnd = mybands[pnum]
                    bnd['min'] = min(sta, sto, bnd['min'])
                    bnd['max'] = max(sta, sto, bnd['max'])

                    # also, set the max for the chrom
                    chrom = mybands[chrom_num]
                    chrom['max'] = max(sta, sto, chrom['max'])

                # add the parent relationships to each
                if i < len(parents) - 1:
                    mybands[pnum]['parent'] = chrom_num + parents[i + 1]
                else:
                    # add the last one (p or q usually)
                    # as attached to the chromosome
                    mybands[pnum]['parent'] = chrom_num
    # end looping through file

    # loop through the hash and add the bands to the graph
    for bnd, myband in mybands.items():
        band_class_id = makeChromID(bnd, taxon, 'CHR')
        band_class_label = makeChromLabel(bnd, genome_label)
        band_build_id = makeChromID(bnd, build_num, 'MONARCH')
        band_build_label = makeChromLabel(bnd, build_num)
        # the build-specific chrom
        chrom_in_build_id = makeChromID(
            myband['chr'], build_num, 'MONARCH')
        is_component = \
            myband['type'] == self.globaltt['assembly_component']

        # if it's != part, then add the class
        if not is_component:
            model.addClassToGraph(
                band_class_id, band_class_label, myband['type'])
            bfeature = Feature(
                self.graph, band_build_id, band_build_label,
                band_class_id)
        else:
            bfeature = Feature(
                self.graph, band_build_id, band_build_label,
                myband['type'])
            if 'synonym' in myband:
                model.addSynonym(band_build_id, myband['synonym'])

        if myband['parent'] is None:
            if is_component:
                # since we likely don't know the chr,
                # add it as a part of the build
                geno.addParts(band_build_id, build_id)
        elif is_component:
            # geno.addParts(band_build_id, chrom_in_build_id)
            parent_chrom_in_build = makeChromID(
                myband['parent'], build_num, 'MONARCH')
            bfeature.addSubsequenceOfFeature(parent_chrom_in_build)

        # add the band as a feature
        # (which also instantiates the owl:Individual)
        bfeature.addFeatureStartLocation(myband['min'], chrom_in_build_id)
        bfeature.addFeatureEndLocation(myband['max'], chrom_in_build_id)
        if 'stain' in myband and myband['stain'] is not None:
            bfeature.addFeatureProperty(
                self.globaltt['has_sequence_attribute'], myband['stain'])

        # type the band as a faldo:Region directly (add_region=False)
        # bfeature.setNoBNodes(self.nobnodes)
        # to come when we merge in ZFIN.py
        bfeature.addFeatureToGraph(False)
def process_gaf(self, file, limit, id_map=None, eco_map=None):
    """
    Parse a gzipped, tab-delimited GAF file and add gene-to-GO
    associations (and phenotype associations derived from IMP rows)
    to the graph.

    Rows are skipped when they are comments, have an unexpected number of
    columns, lack a required field, carry a NOT qualifier, or are
    UniProtKB rows without an entry in ``id_map``.

    :param file: path to the gzipped GAF file
    :param limit: stop after this many lines (ignored in test mode);
        None means no limit.  Only checked on rows that are fully processed.
    :param id_map: optional dict of UniProtKB accession -> gene curie
    :param eco_map: optional dict of GO evidence code -> ECO id;
        when it is None or lacks the code, the evidence is logged as unmapped
    :return: None
    """
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph

    model = Model(graph)
    geno = Genotype(graph)
    LOG.info("Processing Gene Associations from %s", file)
    line_counter = 0
    uniprot_hit = 0
    uniprot_miss = 0

    # Helpers used only to mint reagent-targeted gene ids.  They are created
    # up front for the configured taxa and otherwise on first use, so a
    # reagent id in an unexpected file cannot hit an unbound local.
    zfin = None
    wbase = None
    if 7955 in self.tax_ids:
        zfin = ZFIN(self.graph_type, self.are_bnodes_skized)
    if 6239 in self.tax_ids:
        wbase = WormBase(self.graph_type, self.are_bnodes_skized)

    with gzip.open(file, 'rb') as csvfile:
        filereader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""),
            delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            # comments start with exclamation
            if re.match(r'!', ''.join(row)):
                continue

            if len(row) > 17 or len(row) < 15:
                LOG.warning(
                    "Wrong number of columns %i, expected 15 or 17\n%s",
                    len(row), row)
                continue

            # the last two columns are optional; pad so the unpack succeeds
            if 17 > len(row) >= 15:
                row += [""] * (17 - len(row))

            (dbase,
             gene_num,
             gene_symbol,
             qualifier,
             go_id,
             ref,
             eco_symbol,
             with_or_from,
             aspect,
             gene_name,
             gene_synonym,
             object_type,
             taxon,
             date,
             assigned_by,
             annotation_extension,
             gene_product_form_id) = row

            # test for required fields
            required = (
                dbase, gene_num, gene_symbol, go_id, ref, eco_symbol,
                aspect, object_type, taxon, date, assigned_by)
            if '' in required:
                # the row is passed as an argument (not concatenated into the
                # format string) so a '%' in the data cannot break logging
                LOG.error(
                    "Missing required part of annotation on row %d:\n%s",
                    line_counter, '\t'.join(row))
                continue

            # deal with qualifier NOT, contributes_to, colocalizes_with
            if re.search(r'NOT', qualifier):
                continue

            if dbase in self.localtt:
                dbase = self.localtt[dbase]
            uniprotid = None
            gene_id = None
            if dbase == 'UniProtKB':
                if id_map is not None and gene_num in id_map:
                    gene_id = id_map[gene_num]
                    uniprotid = ':'.join((dbase, gene_num))
                    (dbase, gene_num) = gene_id.split(':')
                    uniprot_hit += 1
                else:
                    # UniProt id without a 1:1 mapping to entrez/ensembl
                    uniprot_miss += 1
                    continue
            else:
                gene_num = gene_num.split(':')[-1]  # last
                gene_id = ':'.join((dbase, gene_num))

            if self.test_mode and not (
                    re.match(r'NCBIGene', gene_id) and
                    int(gene_num) in self.test_ids):
                continue

            model.addClassToGraph(gene_id, gene_symbol)
            if gene_name != '':
                model.addDescription(gene_id, gene_name)
            if gene_synonym != '':
                for syn in re.split(r'\|', gene_synonym):
                    model.addSynonym(gene_id, syn.strip())

            if re.search(r'\|', taxon):
                # TODO add annotations with >1 taxon
                LOG.info(
                    ">1 taxon (%s) on line %d.  skipping", taxon, line_counter)
            else:
                tax_id = re.sub(r'taxon:', 'NCBITaxon:', taxon)
                geno.addTaxon(tax_id, gene_id)

            assoc = Assoc(graph, self.name)
            assoc.set_subject(gene_id)
            assoc.set_object(go_id)

            # eco_map defaults to None; treat that like an unmapped code
            if eco_map is not None and eco_symbol in eco_map:
                assoc.add_evidence(eco_map[eco_symbol])
            else:
                LOG.error("Evidence code (%s) not mapped", eco_symbol)

            # normalise each reference once; the list is reused below for
            # the derived phenotype associations
            source_ids = []
            for raw_ref in re.split(r'\|', ref):
                raw_ref = raw_ref.strip()
                if raw_ref == '':
                    continue
                prefix = raw_ref.split(':')[0]  # sidestep 'MGI:MGI:'
                if prefix in self.localtt:
                    prefix = self.localtt[prefix]
                source_id = ':'.join((prefix, raw_ref.split(':')[-1]))
                source_ids.append(source_id)
                refg = Reference(graph, source_id)
                if prefix == 'PMID':
                    ref_type = self.globaltt['journal article']
                    refg.setType(ref_type)
                refg.addRefToGraph()
                assoc.add_source(source_id)

            # TODO add the source of the annotations from assigned by?

            rel = self.resolve(aspect, mandatory=False)
            if rel is not None and aspect == rel:
                # resolve() handed the aspect back unchanged
                # NOTE(review): in this branch the association is never
                # added to the graph, even for contributes_to -- confirm
                if aspect == 'F' and re.search(r'contributes_to', qualifier):
                    assoc.set_relationship(self.globaltt['contributes to'])
                else:
                    LOG.error(
                        "Aspect: %s with qualifier: %s  is not recognized",
                        aspect, qualifier)
            elif rel is not None:
                assoc.set_relationship(rel)
                assoc.add_association_to_graph()
            else:
                LOG.warning("No predicate for association \n%s\n", str(assoc))

            if uniprotid is not None:
                assoc.set_description('Mapped from ' + uniprotid)
            # object_type should be one of:
            # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
            # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
            # If the precise product type is unknown,
            # gene_product should be used

            ###################################################################
            # Derive G2P Associations from IMP annotations
            # in version 2.1 Pipe will indicate 'OR'
            # and Comma will indicate 'AND'.
            # in version 2.0, multiple values are separated by pipes
            # where the pipe has been used to mean 'AND'
            if eco_symbol == 'IMP' and with_or_from != '':
                phenotypeid = go_id + 'PHENOTYPE'
                # create phenotype associations
                for i in re.split(r'\|', with_or_from):
                    if i == '' or re.match(
                            r'(UniProtKB|WBPhenotype|InterPro|HGNC)', i):
                        LOG.warning(
                            "Don't know what having a uniprot id "
                            "in the 'with' column means of %s", uniprotid)
                        continue
                    i = re.sub(r'MGI\:MGI\:', 'MGI:', i)
                    i = re.sub(r'WB:', 'WormBase:', i)

                    # for worms and fish, they might give a RNAi or MORPH
                    # in these cases make a reagent-targeted gene
                    if re.search('MRPHLNO|CRISPR|TALEN', i):
                        if zfin is None:
                            zfin = ZFIN(
                                self.graph_type, self.are_bnodes_skized)
                        targeted_gene_id = zfin.make_targeted_gene_id(
                            gene_id, i)
                        geno.addReagentTargetedGene(
                            i, gene_id, targeted_gene_id)
                        # assoc is deliberately rebound from Assoc to G2PAssoc
                        assoc = G2PAssoc(
                            graph, self.name, targeted_gene_id, phenotypeid)
                    elif re.search(r'WBRNAi', i):
                        if wbase is None:
                            wbase = WormBase(
                                self.graph_type, self.are_bnodes_skized)
                        targeted_gene_id = \
                            wbase.make_reagent_targeted_gene_id(gene_id, i)
                        geno.addReagentTargetedGene(
                            i, gene_id, targeted_gene_id)
                        assoc = G2PAssoc(
                            graph, self.name, targeted_gene_id, phenotypeid)
                    else:
                        assoc = G2PAssoc(graph, self.name, i, phenotypeid)

                    for source_id in source_ids:
                        assoc.add_source(source_id)
                    # experimental phenotypic evidence
                    assoc.add_evidence(
                        self.globaltt['experimental phenotypic evidence'])
                    assoc.add_association_to_graph()
                    # TODO should the G2PAssoc be
                    # the evidence for the GO assoc?

            if not self.test_mode and limit is not None and \
                    line_counter > limit:
                break

    uniprot_tot = (uniprot_hit + uniprot_miss)
    uniprot_per = 0.0
    if uniprot_tot != 0:
        uniprot_per = 100.0 * uniprot_hit / uniprot_tot
    # '%.2f' (not '%f.2') is the two-decimal float conversion
    LOG.info(
        "Uniprot: %.2f%% of %i benifited from the 1/4 day id mapping download",
        uniprot_per, uniprot_tot)
    return
def process_gaf(self, file, limit, id_map=None):
    """
    Parse a gzipped, tab-delimited GAF file and add gene-to-GO
    associations (and phenotype associations derived from IMP rows)
    to the graph.

    Rows are skipped when they are comments, have an unexpected number of
    columns, lack a required field, carry a NOT qualifier, or are
    UniProtKB rows that do not map to exactly one gene in ``id_map``.

    :param file: path to the gzipped GAF file
    :param limit: stop after this many lines (ignored in test mode);
        None means no limit.  Only checked on rows that are fully processed.
    :param id_map: optional dict of UniProtKB accession -> list of gene
        curies; when None every UniProtKB row is skipped
    :return: None
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph

    model = Model(g)
    geno = Genotype(g)
    logger.info("Processing Gene Associations from %s", file)
    line_counter = 0

    # Helpers used only to mint reagent-targeted gene ids.  They are created
    # up front for the configured taxon and otherwise on first use, so a
    # reagent id can never hit an unbound local.
    zfin = None
    wbase = None
    if 7955 in self.tax_ids:
        zfin = ZFIN(self.graph_type, self.are_bnodes_skized)
    elif 6239 in self.tax_ids:
        wbase = WormBase(self.graph_type, self.are_bnodes_skized)

    # aspect code -> relationship; invariant across rows, so built once
    aspect_rel_map = {
        'P': model.object_properties['involved_in'],  # involved in
        'F': model.object_properties['enables'],  # enables
        'C': model.object_properties['part_of']  # part of
    }

    with gzip.open(file, 'rb') as csvfile:
        filereader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""),
            delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            # comments start with exclamation
            if re.match(r'!', ''.join(row)):
                continue

            # guard the 17-way unpack below; the last two columns are
            # optional, so pad short-but-valid rows instead of crashing
            if len(row) > 17 or len(row) < 15:
                logger.warning(
                    "Wrong number of columns %i, expected 15 or 17\n%s",
                    len(row), row)
                continue
            if len(row) < 17:
                row += [""] * (17 - len(row))

            (db, gene_num, gene_symbol, qualifier, go_id, ref, eco_symbol,
             with_or_from, aspect, gene_name, gene_synonym, object_type,
             taxon, date, assigned_by, annotation_extension,
             gene_product_form_id) = row

            # test for required fields
            required = (
                db, gene_num, gene_symbol, go_id, ref, eco_symbol,
                aspect, object_type, taxon, date, assigned_by)
            if '' in required:
                # the row is passed as an argument (not concatenated into the
                # format string) so a '%' in the data cannot break logging
                logger.error(
                    "Missing required part of annotation on row %d:\n%s",
                    line_counter, '\t'.join(row))
                continue

            # deal with qualifier NOT, contributes_to, colocalizes_with
            if re.search(r'NOT', qualifier):
                continue

            db = self.clean_db_prefix(db)
            uniprotid = None
            gene_id = None
            if db == 'UniProtKB':
                # check id_map before dereferencing it: it defaults to None
                if id_map is None:
                    continue
                mapped_ids = id_map.get(gene_num)
                # only an unambiguous 1:1 mapping is usable
                if mapped_ids is None or len(mapped_ids) != 1:
                    continue
                gene_id = mapped_ids[0]
                uniprotid = ':'.join((db, gene_num))
                gene_num = re.sub(r'\w+\:', '', gene_id)
            elif db == 'MGI':
                gene_num = re.sub(r'MGI:', '', gene_num)
                gene_id = ':'.join((db, gene_num))
                gene_id = re.sub(r'MGI\:MGI\:', 'MGI:', gene_id)
            else:
                gene_id = ':'.join((db, gene_num))

            if self.testMode and not (
                    re.match(r'NCBIGene', gene_id) and
                    int(gene_num) in self.test_ids):
                continue

            model.addClassToGraph(gene_id, gene_symbol)
            if gene_name != '':
                model.addDescription(gene_id, gene_name)
            if gene_synonym != '':
                for s in re.split(r'\|', gene_synonym):
                    model.addSynonym(gene_id, s.strip())
            if re.search(r'\|', taxon):
                # TODO add annotations with >1 taxon
                logger.info(
                    ">1 taxon (%s) on line %d.  skipping", taxon, line_counter)
            else:
                tax_id = re.sub(r'taxon:', 'NCBITaxon:', taxon)
                geno.addTaxon(tax_id, gene_id)

            assoc = Assoc(g, self.name)
            assoc.set_subject(gene_id)
            assoc.set_object(go_id)

            eco_id = self.map_go_evidence_code_to_eco(eco_symbol)
            if eco_id is not None:
                assoc.add_evidence(eco_id)

            # normalise each reference once; the list is reused below for
            # the derived phenotype associations
            source_ids = []
            for r in re.split(r'\|', ref):
                r = r.strip()
                if r == '':
                    continue
                prefix = re.split(r':', r)[0]
                r = re.sub(prefix, self.clean_db_prefix(prefix), r)
                r = re.sub(r'MGI\:MGI\:', 'MGI:', r)
                source_ids.append(r)
                reference = Reference(g, r)
                if re.match(r'PMID', r):
                    ref_type = Reference.ref_types['journal_article']
                    reference.setType(ref_type)
                reference.addRefToGraph()
                assoc.add_source(r)

            # TODO add the source of the annotations from assigned by?

            if aspect not in aspect_rel_map:
                logger.error("Aspect not recognized: %s", aspect)
            # NOTE(review): an unrecognized aspect still proceeds with
            # rel=None, as before -- confirm that is intended
            rel = aspect_rel_map.get(aspect)
            if aspect == 'F' and re.search(r'contributes_to', qualifier):
                rel = model.object_properties['contributes_to']
            assoc.set_relationship(rel)
            if uniprotid is not None:
                assoc.set_description('Mapped from ' + uniprotid)
            # object_type should be one of:
            # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
            # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
            # If the precise product type is unknown,
            # gene_product should be used

            assoc.add_association_to_graph()

            # Derive G2P Associations from IMP annotations
            # in version 2.1 Pipe will indicate 'OR'
            # and Comma will indicate 'AND'.
            # in version 2.0, multiple values are separated by pipes
            # where the pipe has been used to mean 'AND'
            if eco_symbol == 'IMP' and with_or_from != '':
                phenotypeid = go_id + 'PHENOTYPE'
                # create phenotype associations
                for i in re.split(r'\|', with_or_from):
                    if i == '' or re.match(
                            r'(UniProtKB|WBPhenotype|InterPro|HGNC)', i):
                        logger.warning(
                            "Don't know what having a uniprot id "
                            "in the 'with' column means of %s", uniprotid)
                        continue
                    i = re.sub(r'MGI\:MGI\:', 'MGI:', i)
                    i = re.sub(r'WB:', 'WormBase:', i)

                    # for worms and fish, they might give a RNAi or MORPH
                    # in these cases make a reagent-targeted gene
                    if re.search('MRPHLNO|CRISPR|TALEN', i):
                        if zfin is None:
                            zfin = ZFIN(
                                self.graph_type, self.are_bnodes_skized)
                        targeted_gene_id = zfin.make_targeted_gene_id(
                            gene_id, i)
                        geno.addReagentTargetedGene(
                            i, gene_id, targeted_gene_id)
                        # assoc is deliberately rebound from Assoc to G2PAssoc
                        assoc = G2PAssoc(
                            g, self.name, targeted_gene_id, phenotypeid)
                    elif re.search(r'WBRNAi', i):
                        if wbase is None:
                            wbase = WormBase(
                                self.graph_type, self.are_bnodes_skized)
                        targeted_gene_id = \
                            wbase.make_reagent_targeted_gene_id(gene_id, i)
                        geno.addReagentTargetedGene(
                            i, gene_id, targeted_gene_id)
                        assoc = G2PAssoc(
                            g, self.name, targeted_gene_id, phenotypeid)
                    else:
                        assoc = G2PAssoc(g, self.name, i, phenotypeid)

                    for source_id in source_ids:
                        assoc.add_source(source_id)
                    # experimental phenotypic evidence
                    assoc.add_evidence("ECO:0000059")
                    assoc.add_association_to_graph()
                    # TODO should the G2PAssoc be
                    # the evidence for the GO assoc?

            if not self.testMode and \
                    limit is not None and line_counter > limit:
                break

    return
class Genotype():
    """
    These methods provide convenient methods to add items related to
    a genotype and its parts to a supplied graph.
    They follow the patterns set out in
    GENO https://github.com/monarch-initiative/GENO-ontology.
    For specific sequence features,
    we use the GenomicFeature class to create them.
    """

    def __init__(self, graph):
        """
        :param graph: the Graph instance all triples are written to
        :raises ValueError: if ``graph`` is not a Graph
        """
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            raise ValueError("{} is not a graph".format(graph))
        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map
        self.gut = GraphUtils(self.curie_map)

    def addGenotype(
            self, genotype_id, genotype_label, genotype_type=None,
            genotype_description=None):
        """
        If a genotype_type is not supplied,
        we will default to 'intrinsic genotype'
        :param genotype_id:
        :param genotype_label:
        :param genotype_type:
        :param genotype_description:
        :return:
        """
        if genotype_type is None:
            genotype_type = self.globaltt['intrinsic genotype']

        self.model.addIndividualToGraph(
            genotype_id, genotype_label, genotype_type, genotype_description)

    def addAllele(
            self, allele_id, allele_label, allele_type=None,
            allele_description=None):
        """
        Make an allele object.
        If no allele_type is added, it will default to a geno:allele
        :param allele_id: curie for allele (required)
        :param allele_label: label for allele (required)
        :param allele_type: id for an allele type
            (optional, recommended SO or GENO class)
        :param allele_description: a free-text description of the allele
        :return:
        """
        # TODO should we accept a list of allele types?
        if allele_type is None:
            allele_type = self.globaltt['allele']  # TODO is this a good idea?
        self.model.addIndividualToGraph(
            allele_id, allele_label, allele_type, allele_description)

    def addGene(
            self, gene_id, gene_label=None, gene_type=None,
            gene_description=None):
        """
        Add a gene as a class; the type defaults to 'gene'.
        """
        if gene_type is None:
            gene_type = self.globaltt['gene']
        self.model.addClassToGraph(
            gene_id, gene_label, gene_type, gene_description)

    def addConstruct(
            self, construct_id, construct_label, construct_type=None,
            construct_description=None, construct_category=None,
            construct_type_category=None):
        """
        Add a construct as an individual.
        :param construct_id:
        :param construct_label:
        :param construct_type:
        :param construct_description:
        :param construct_category: a biolink category CURIE for construct_id
        :param construct_type_category: a biolink category CURIE
            for construct_type
        :return:
        """
        # TODO add base type for construct
        self.model.addIndividualToGraph(
            construct_id, construct_label, construct_type,
            construct_description,
            ind_category=construct_category,
            ind_type_category=construct_type_category)

    def addDerivesFrom(
            self, child_id, parent_id, child_category=None,
            parent_category=None):
        """
        We add a derives_from relationship between the child and parent id.
        Examples of uses include between:
        an allele and a construct or strain here,
        a cell line and its parent genotype.
        Adding the parent and child to the graph
        should happen outside of this function call
        to ensure graph integrity.
        :param child_id:
        :param parent_id:
        :param child_category: passed as the triple's subject category
        :param parent_category: passed as the triple's object category
        :return:
        """
        self.graph.addTriple(
            child_id, self.globaltt['derives_from'], parent_id,
            subject_category=child_category,
            object_category=parent_category)

    def addSequenceDerivesFrom(
            self, child_id, parent_id, child_category=None,
            parent_category=None):
        """
        Add a sequence_derives_from relationship from child to parent.
        :param child_category: passed as the triple's subject category
        :param parent_category: passed as the triple's object category
        """
        self.graph.addTriple(
            child_id, self.globaltt['sequence_derives_from'], parent_id,
            subject_category=child_category,
            object_category=parent_category)

    def addAlleleOfGene(self, allele_id, gene_id, rel_id=None):
        """
        We make the assumption here that if the relationship is not provided,
        it is a GENO:is_allele_of.

        Here, the allele should be a variant_locus, not a sequence alteration.
        :param allele_id:
        :param gene_id:
        :param rel_id:
        :return:
        """
        if rel_id is None:
            rel_id = self.globaltt["is_allele_of"]
        self.graph.addTriple(allele_id, rel_id, gene_id)

    def addAffectedLocus(self, allele_id, gene_id, rel_id=None):
        """
        We make the assumption here that if the relationship is not provided,
        it is a GENO:has_affected_feature.

        Here, the allele should be a variant_locus, not a sequence alteration.
        :param allele_id:
        :param gene_id:
        :param rel_id:
        :return:
        """
        if rel_id is None:
            rel_id = self.globaltt['has_affected_feature']
        self.graph.addTriple(allele_id, rel_id, gene_id)

    def addGeneProduct(
            self, sequence_id, product_id, product_label=None,
            product_type=None, sequence_category=None,
            product_category=None):
        """
        Add gene/variant/allele has_gene_product relationship
        Can be used to either describe a gene to transcript relationship
        or gene to protein.
        The product individual itself is only added when both a label
        and a type are supplied.
        :param sequence_id:
        :param product_id:
        :param product_label:
        :param product_type:
        :param sequence_category: biolink category CURIE for sequence_id
        :param product_category: biolink category CURIE for product_id
        :return:
        """
        if product_label is not None and product_type is not None:
            self.model.addIndividualToGraph(
                product_id, product_label, product_type,
                ind_category=product_category)
        self.graph.addTriple(
            sequence_id, self.globaltt['has gene product'], product_id,
            subject_category=sequence_category,
            object_category=product_category)

    def addPolypeptide(
            self, polypeptide_id, polypeptide_label=None,
            transcript_id=None, polypeptide_type=None):
        """
        Add a polypeptide individual and, when a transcript is given,
        a translates_to triple from the transcript to it.
        :param polypeptide_id:
        :param polypeptide_label:
        :param polypeptide_type: defaults to 'polypeptide'
        :param transcript_id:
        :return:
        """
        if polypeptide_type is None:
            polypeptide_type = self.globaltt['polypeptide']
        self.model.addIndividualToGraph(
            polypeptide_id, polypeptide_label, polypeptide_type)
        if transcript_id is not None:
            self.graph.addTriple(
                transcript_id, self.globaltt['translates_to'], polypeptide_id)

    def addPartsToVSLC(
            self, vslc_id, allele1_id, allele2_id, zygosity_id=None,
            allele1_rel=None, allele2_rel=None):
        """
        Here we add the parts to the VSLC.  While traditionally alleles
        (reference or variant loci) are added, you can add any node
        (such as sequence_alterations for unlocated variations) to a vslc
        if they are known to be paired.  However, if a sequence_alteration's
        loci is unknown, it probably should be added directly to the GVC.
        :param vslc_id:
        :param allele1_id:
        :param allele2_id:
        :param zygosity_id: when None it is derived: homozygous if both
            allele ids are equal, heterozygous otherwise
        :param allele1_rel:
        :param allele2_rel:
        :return:
        """
        # vslc has parts allele1/allele2
        if allele1_id is not None:
            self.addParts(allele1_id, vslc_id, allele1_rel)
        if allele2_id is not None and allele2_id.strip() != '':
            self.addParts(allele2_id, vslc_id, allele2_rel)

        # figure out zygosity if it's not supplied
        if zygosity_id is None:
            if allele1_id == allele2_id:
                zygosity_id = self.globaltt['homozygous']
            else:
                zygosity_id = self.globaltt['heterozygous']

        if zygosity_id is not None:
            self.graph.addTriple(
                vslc_id, self.globaltt['has_zygosity'], zygosity_id)

    def addVSLCtoParent(
            self, vslc_id, parent_id, part_category=None,
            parent_category=None):
        """
        The VSLC can either be added to a genotype or to a GVC.
        The vslc is added as a part of the parent.
        :param vslc_id:
        :param parent_id:
        :param part_category: a biolink category CURIE for part
        :param parent_category: a biolink category CURIE for parent
        :return:
        """
        self.addParts(
            vslc_id, parent_id, self.globaltt['has_variant_part'],
            part_category=part_category, parent_category=parent_category)

    def addParts(
            self, part_id, parent_id, part_relationship=None,
            part_category=None, parent_category=None):
        """
        This will add a has_part (or subproperty) relationship between
        a parent_id and the supplied part.
        By default the relationship will be BFO:has_part,
        but any relationship could be given here.
        :param part_id:
        :param parent_id:
        :param part_relationship:
        :param part_category: a biolink vocab curie for part_id
        :param parent_category: a biolink vocab curie for parent_id
        :raises TypeError: if parent_id or part_id is None
        :return:
        """
        if part_relationship is None:
            part_relationship = self.globaltt['has_part']
        # Fail loudly if parent or child identifiers are None
        if parent_id is None:
            raise TypeError('Attempt to pass None as parent')
        if part_id is None:
            raise TypeError('Attempt to pass None as child')

        self.graph.addTriple(
            parent_id, part_relationship, part_id,
            subject_category=parent_category,
            object_category=part_category)

    def addSequenceAlteration(
            self, sa_id, sa_label, sa_type=None, sa_description=None):
        """
        Add a sequence alteration individual;
        the type defaults to 'sequence_alteration'.
        """
        if sa_type is None:
            sa_type = self.globaltt['sequence_alteration']
        self.model.addIndividualToGraph(
            sa_id, sa_label, sa_type, sa_description)

    def addSequenceAlterationToVariantLocus(self, sa_id, vl_id):
        """
        Attach the sequence alteration to the variant locus
        with has_variant_part.
        """
        self.addParts(sa_id, vl_id, self.globaltt['has_variant_part'])

    def addGenomicBackground(
            self, background_id, background_label, background_type=None,
            background_description=None):
        """
        Add a genomic background individual;
        the type defaults to 'genomic_background'.
        """
        if background_type is None:
            background_type = self.globaltt['genomic_background']
        self.model.addIndividualToGraph(
            background_id, background_label, background_type,
            background_description)

    def addGenomicBackgroundToGenotype(
            self, background_id, genotype_id, background_type=None):
        """
        Type the background (default 'genomic_background') and attach it
        to the genotype with has_reference_part.
        """
        if background_type is None:
            background_type = self.globaltt['genomic_background']
        self.model.addType(background_id, background_type)
        self.addParts(
            background_id, genotype_id, self.globaltt['has_reference_part'])

    def addTaxon(self, taxon_id, genopart_id, genopart_category=None):
        """
        The supplied geno part will have the specified taxon added with
        RO:in_taxon relation.
        Generally the taxon is associated with a genomic_background,
        but could be added to any genotype part (including a gene,
        regulatory element, or sequence alteration).
        :param taxon_id:
        :param genopart_id:
        :param genopart_category: a biolink term for genopart_id;
            forwarded as the triple's subject category
        :return:
        """
        self.graph.addTriple(
            genopart_id, self.globaltt['in taxon'], taxon_id,
            subject_category=genopart_category)

    def addGeneTargetingReagentToGenotype(self, reagent_id, genotype_id):
        """
        Add genotype has_variant_part reagent_id. For example, add a morphant
        reagent thingy to the genotype, assuming it's a extrinsic_genotype
        :param reagent_id
        :param genotype_id
        :return:
        """
        self.graph.addTriple(
            genotype_id, self.globaltt['has_variant_part'], reagent_id)

    def addGeneTargetingReagent(
            self, reagent_id, reagent_label, reagent_type, gene_id,
            description=None, reagent_category=None):
        """
        Here, a gene-targeting reagent is added.
        The actual targets of this reagent should be added separately.
        :param reagent_id:
        :param reagent_label:
        :param reagent_type:
        :param gene_id: object of the targets_gene triple
        :param description:
        :param reagent_category: a biolink category CURIE for reagent_id
        :return:
        """
        # TODO add default type to reagent_type
        self.model.addIndividualToGraph(
            reagent_id, reagent_label, reagent_type, description,
            ind_category=reagent_category)

        self.graph.addTriple(
            reagent_id, self.globaltt['targets_gene'], gene_id)

    def addReagentTargetedGene(
            self, reagent_id, gene_id, targeted_gene_id=None,
            targeted_gene_label=None, description=None,
            reagent_category=None):
        """
        This will create the instance of a gene that is targeted by a molecular
        reagent (such as a morpholino or rnai).
        If an instance id is not supplied, one is built from the gene and
        reagent ids (with the colons removed), and the individual is typed
        as GENO:reagent_targeted_gene.

        <targeted_gene_id> a GENO:reagent_targeted_gene
            rdfs:label targeted_gene_label
            dc:description description
        <targeted_gene_id> is_expression_variant_of <gene_id>
        <targeted_gene_id> is_targeted_by <reagent_id>

        :param reagent_id:
        :param gene_id:
        :param targeted_gene_id:
        :param targeted_gene_label:
        :param description:
        :param reagent_category: a biolink category CURIE; passed as the
            category of the targeted-gene individual
        :return:
        """
        # akin to a variant locus
        # is this some sort of pseudo bnode?
        if targeted_gene_id is None:
            targeted_gene_id = '_' + gene_id + '-' + reagent_id
            targeted_gene_id = targeted_gene_id.replace(":", "")
        self.model.addIndividualToGraph(
            targeted_gene_id, targeted_gene_label,
            self.globaltt['reagent_targeted_gene'], description,
            ind_category=reagent_category)

        if gene_id is not None:
            self.graph.addTriple(
                targeted_gene_id, self.globaltt['is_expression_variant_of'],
                gene_id)

        self.graph.addTriple(
            targeted_gene_id, self.globaltt['is_targeted_by'], reagent_id)

    def addTargetedGeneSubregion(
            self, tgs_id, tgs_label, tgs_type=None, tgs_description=None):
        """
        Add a targeted gene subregion individual;
        the type defaults to 'targeted_gene_subregion'.
        """
        if tgs_type is None:
            tgs_type = self.globaltt['targeted_gene_subregion']
        self.model.addIndividualToGraph(
            tgs_id, tgs_label, tgs_type, tgs_description)

    def addMemberOfPopulation(self, member_id, population_id):
        """
        Add population has_member_with_allelotype member.
        """
        self.graph.addTriple(
            population_id, self.globaltt['has_member_with_allelotype'],
            member_id)

    def addTargetedGeneComplement(
            self, tgc_id, tgc_label, tgc_type=None, tgc_description=None):
        """
        Add a targeted gene complement individual;
        the type defaults to 'targeted_gene_complement'.
        """
        if tgc_type is None:
            tgc_type = self.globaltt['targeted_gene_complement']
        self.model.addIndividualToGraph(
            tgc_id, tgc_label, tgc_type, tgc_description)

    def addGenome(self, taxon_num, taxon_label=None, genome_id=None):
        """
        Add a genome class labelled '<taxon_label> genome'.
        :param taxon_num: taxon number as a string (no prefix)
        :param taxon_label: looked up in the global term->id table when
            None, falling back to taxon_num itself
        :param genome_id: generated with makeGenomeID when None
        """
        ncbitaxon = 'NCBITaxon:' + taxon_num
        if taxon_label is None:
            if ncbitaxon in self.globaltcid:
                taxon_label = self.globaltcid[ncbitaxon]
            else:
                logging.warning(
                    'Add ' + ncbitaxon + ' to global translation table')
                taxon_label = taxon_num
        elif ncbitaxon in self.globaltcid and \
                taxon_label != self.globaltcid[ncbitaxon]:
            # the caller's label disagrees with the translation table
            logging.warning(
                '"' + self.globaltcid[ncbitaxon] +
                '" may need updating from "' + taxon_label +
                '" in global translation table')
            logging.warning(
                '"' + taxon_label + '": " ' + self.globaltcid[ncbitaxon] +
                '"' + ' may need to be added to a local translation table')

        genome_label = taxon_label + ' genome'
        if genome_id is None:
            genome_id = self.makeGenomeID(taxon_num)
        self.model.addClassToGraph(
            genome_id, genome_label, self.globaltt['genome'])

    def addReferenceGenome(self, build_id, build_label, taxon_id):
        """
        Add a genome build as an individual of type 'reference_genome',
        type it with the taxon's genome id, and attach the taxon.
        :param build_id:
        :param build_label:
        :param taxon_id: taxon id; 'NCBITaxon:' is prepended for the taxon
            triple when it starts with a digit
        """
        genome_id = self.makeGenomeID(taxon_id)
        # the biolink category goes in as ind_category; the fourth
        # positional slot of addIndividualToGraph is the description
        self.model.addIndividualToGraph(
            build_id, build_label, self.globaltt['reference_genome'],
            ind_category=blv.terms['GenomeBuild'])
        self.model.addType(
            build_id, genome_id, subject_category=blv.terms['GenomeBuild'])
        if re.match(r'[0-9]+', taxon_id):
            taxon_id = 'NCBITaxon:' + taxon_id
        self.addTaxon(
            taxon_id, build_id, genopart_category=blv.terms['GenomeBuild'])

    @staticmethod
    def makeGenomeID(taxon_id):
        """
        Build the blank-node style genome id: '_:' + taxon_id + 'genome'.
        """
        # scrub off the taxon prefix.  put it in base space
        # TODO: revisit as yet another BNODE?
        # should never be called if a real genome iri exists
        # should create the opaque bode and label together
        genome_id = '_:' + taxon_id + 'genome'
        return genome_id

    def addChromosome(
            self, chrom, tax_id, tax_label=None, build_id=None,
            build_label=None):
        """
        if it's just the chromosome, add it as an instance of a SO:chromosome,
        and add it to the genome. If a build is included,
        punn the chromosome as a subclass of SO:chromsome, and make the
        build-specific chromosome an instance of the supplied chr.
        The chr then becomes part of the build or genome.
        """
        family = Family(self.graph)
        # first, make the chromosome class, at the taxon level
        chr_id = makeChromID(str(chrom), tax_id)
        if tax_label is not None:
            chr_label = makeChromLabel(chrom, tax_label)
        else:
            chr_label = makeChromLabel(chrom)
        genome_id = self.makeGenomeID(tax_id)
        self.model.addClassToGraph(
            chr_id, chr_label, self.globaltt['chromosome'])
        self.addTaxon(tax_id, genome_id)  # add the taxon to the genome

        if build_id is not None:
            # the build-specific chromosome
            chrinbuild_id = makeChromID(chrom, build_id)
            if build_label is None:
                build_label = build_id
            chrinbuild_label = makeChromLabel(chrom, build_label)
            # add the build-specific chromosome as an instance of the chr class
            self.model.addIndividualToGraph(
                chrinbuild_id, chrinbuild_label, chr_id)

            # add the build-specific chromosome
            # as a member of the build (both ways)
            family.addMember(
                build_id, chrinbuild_id,
                group_category=blv.terms['GenomeBuild'])
            family.addMemberOf(
                chrinbuild_id, build_id,
                group_category=blv.terms['GenomeBuild'])

    def addChromosomeClass(self, chrom_num, taxon_id, taxon_label):
        """
        Add the generic (taxon-level) chromosome class.
        """
        taxon = re.sub('NCBITaxon:', '', taxon_id)
        # the chrom class (generic) id
        chrom_class_id = makeChromID(chrom_num, taxon, 'CHR')
        chrom_class_label = makeChromLabel(chrom_num, taxon_label)
        self.model.addClassToGraph(
            chrom_class_id, chrom_class_label, self.globaltt['chromosome'])

    def addChromosomeInstance(
            self, chr_num, reference_id, reference_label, chr_type=None):
        """
        Add the supplied chromosome as an instance within the given reference
        :param chr_num:
        :param reference_id: for example, a build id like UCSC:hg19
        :param reference_label:
        :param chr_type: this is the class that this is an instance of.
            typically a genome-specific chr
        :return:
        """
        family = Family(self.graph)
        chr_id = makeChromID(str(chr_num), reference_id, 'MONARCH')
        chr_label = makeChromLabel(str(chr_num), reference_label)

        self.model.addIndividualToGraph(
            chr_id, chr_label, self.globaltt['chromosome'])
        if chr_type is not None:
            self.model.addType(chr_id, chr_type)

        # add the build-specific chromosome
        # as a member of the build  (both ways)
        family.addMember(
            reference_id, chr_id, group_category=blv.terms['GenomeBuild'])
        family.addMemberOf(chr_id, reference_id)

    @staticmethod
    def make_variant_locus_label(gene_label, allele_label):
        """
        Return 'gene<allele>' with both parts stripped;
        a None gene label is treated as empty.
        """
        if gene_label is None:
            gene_label = ''
        label = gene_label.strip() + '<' + allele_label.strip() + '>'
        return label

    def make_vslc_label(self, gene_label, allele1_label, allele2_label):
        """
        Make a Variant Single Locus Complement (VSLC) in monarch-style.
        :param gene_label:
        :param allele1_label:
        :param allele2_label: when None the bottom half is left empty
        :return: 'top/bottom' label, or None when all three labels are None
        """
        if gene_label is None and allele1_label is None \
                and allele2_label is None:
            LOG.error("Not enough info to make vslc label")
            return None

        top = self.make_variant_locus_label(gene_label, allele1_label)
        bottom = ''
        if allele2_label is not None:
            bottom = self.make_variant_locus_label(gene_label, allele2_label)

        vslc_label = '/'.join((top, bottom))
        return vslc_label

    def make_experimental_model_with_genotype(
            self, genotype_id, genotype_label, taxon_id, taxon_label):
        """
        Add an individual of the taxon that has the given genotype and
        return its generated blank-node id.
        """
        animal_id = '-'.join((taxon_id, 'with', genotype_id))
        animal_id = animal_id.replace(':', '')  # bnode
        animal_id = ':'.join(('_', self.gut.digest_id(animal_id)))

        animal_label = ' '.join((genotype_label, taxon_label))
        self.model.addIndividualToGraph(animal_id, animal_label, taxon_id)
        self.graph.addTriple(
            animal_id, self.globaltt['has_genotype'], genotype_id)
        return animal_id
def _process_disease2gene(self, row): """ Here, we process the disease-to-gene associations. Note that we ONLY process direct associations (not inferred through chemicals). Furthermore, we also ONLY process "marker/mechanism" associations. We preferentially utilize OMIM identifiers over MESH identifiers for disease/phenotype. Therefore, if a single OMIM id is listed under the "omim_ids" list, we will choose this over any MeSH id that might be listed as the disease_id. If multiple OMIM ids are listed in the omim_ids column, we toss this for now. (Mostly, we are not sure what to do with this information.) We also pull in the MeSH labels here (but not OMIM) to ensure that we have them (as they may not be brought in separately). :param row: :return: """ # if self.test_mode: # graph = self.testgraph # else: # graph = self.graph # self._check_list_len(row, 9) # geno = Genotype(graph) # gu = GraphUtils(curie_map.get()) model = Model(self.graph) (gene_symbol, gene_id, disease_name, disease_id, direct_evidence, inference_chemical_name, inference_score, omim_ids, pubmed_ids) = row # we only want the direct associations; skipping inferred for now if direct_evidence == '' or direct_evidence != 'marker/mechanism': return # scrub some of the associations... 
# it seems odd to link human genes to the following "diseases" diseases_to_scrub = [ 'MESH:D004283', # dog diseases 'MESH:D004195', # disease models, animal 'MESH:D030342', # genetic diseases, inborn 'MESH:D040181', # genetic dieases, x-linked 'MESH:D020022'] # genetic predisposition to a disease if disease_id in diseases_to_scrub: LOG.info( "Skipping association between NCBIGene:%s and %s", str(gene_id), disease_id) return intersect = list( set(['OMIM:' + str(i) for i in omim_ids.split('|')] + [disease_id]) & set(self.test_diseaseids)) if self.test_mode and ( int(gene_id) not in self.test_geneids or len(intersect) < 1): return # there are three kinds of direct evidence: # (marker/mechanism | marker/mechanism|therapeutic | therapeutic) # we are only using the "marker/mechanism" for now # TODO what does it mean for a gene to be therapeutic for disease? # a therapeutic target? gene_id = 'NCBIGene:' + gene_id preferred_disease_id = disease_id if omim_ids is not None and omim_ids != '': omim_id_list = re.split(r'\|', omim_ids) # If there is only one OMIM ID for the Disease ID # or in the omim_ids list, # use the OMIM ID preferentially over any MeSH ID. if re.match(r'OMIM:.*', disease_id): if len(omim_id_list) > 1: # the disease ID is an OMIM ID and # there is more than one OMIM entry in omim_ids. # Currently no entries satisfy this condition pass elif disease_id != ('OMIM:' + omim_ids): # the disease ID is an OMIM ID and # there is only one non-equiv OMIM entry in omim_ids # we preferentially use the disease_id here LOG.warning( "There may be alternate identifier for %s: %s", disease_id, omim_ids) # TODO: What should be done with the alternate disease IDs? else: if len(omim_id_list) == 1: # the disease ID is not an OMIM ID # and there is only one OMIM entry in omim_ids. preferred_disease_id = 'OMIM:' + omim_ids elif len(omim_id_list) > 1: # This is when the disease ID is not an OMIM ID and # there is more than one OMIM entry in omim_ids. 
pass model.addClassToGraph(gene_id, None) # not sure if MESH is getting added separately. # adding labels here for good measure dlabel = None if re.match(r'MESH', preferred_disease_id): dlabel = disease_name model.addClassToGraph(preferred_disease_id, dlabel) # Add the disease to gene relationship. rel_id = self.resolve(direct_evidence) refs = self._process_pubmed_ids(pubmed_ids) self._make_association(gene_id, preferred_disease_id, rel_id, refs) return
def add_orthologs_by_gene_group(self, graph, gene_ids):
    """
    Add orthology associations for the given genes to ``graph``.

    This will get orthologies between human and other vertebrate genomes
    based on the gene_group annotation pipeline from NCBI.
    More information can be learned here:
    http://www.ncbi.nlm.nih.gov/news/03-13-2014-gene-provides-orthologs-regions/
    The method for associations is described in
    [PMCID:3882889](http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3882889/)
    == [PMID:24063302](http://www.ncbi.nlm.nih.gov/pubmed/24063302/).
    Because these are only between human and vertebrate genomes,
    they will certainly miss out on very distant orthologies,
    and should not be considered complete.

    We do not run this within the NCBI parser itself;
    rather it is a convenience function for others parsers to call.

    The gzipped, tab-separated file configured under
    ``self.files['gene_group']`` is read in full and hashed both ways
    (group -> members, member -> groups) before any association is made.
    Only rows whose relationship column is exactly ``Ortholog`` are used.

    NOTE(review): a group contains its lead gene and every member,
    so a queried gene is also associated with itself - confirm whether
    callers want that.

    :param graph: graph the classes, taxa and associations are written to
    :param gene_ids: Gene ids to fetch the orthology for; a leading
        ``NCBIGene:`` prefix is removed before the lookup
    :return: None
    """
    src_key = 'gene_group'
    LOG.info("getting gene groups")
    src_file = '/'.join((self.rawdir, self.files[src_key]['file']))
    found_counter = 0

    # because many of the orthologous groups are grouped by human gene,
    # we need to do this by generating two-way hash
    #   group_id => orthologs
    #   ortholog id => group
    # this will be the fastest approach, though not memory-efficient.
    geno = Genotype(graph)
    model = Model(graph)
    group_to_orthology = {}
    gene_to_group = {}
    gene_to_taxon = {}
    col = self.files[src_key]['columns']

    with gzip.open(src_file, 'rb') as tsv:
        row = tsv.readline().decode().strip().split('\t')
        row[0] = row[0][1:]  # strip octothorp
        # A header mismatch is tolerated on purpose: the result of the
        # check is not acted upon here.
        if not self.check_fileheader(col, row):
            pass

        # Resolve the column positions once instead of once per row.
        tax_a_idx = col.index('tax_id')
        gene_a_idx = col.index('GeneID')
        rel_idx = col.index('relationship')
        tax_b_idx = col.index('Other_tax_id')
        gene_b_idx = col.index('Other_GeneID')

        for row in tsv:
            row = row.decode().strip().split('\t')
            tax_a = row[tax_a_idx]
            gene_a = row[gene_a_idx]
            rel = row[rel_idx]
            tax_b = row[tax_b_idx]
            gene_b = row[gene_b_idx]

            if rel != 'Ortholog':
                continue

            # gene_a is the group lead and doubles as the group id
            group_to_orthology.setdefault(gene_a, set()).add(gene_b)
            gene_to_group.setdefault(gene_b, set()).add(gene_a)

            gene_to_taxon[gene_a] = tax_a
            gene_to_taxon[gene_b] = tax_b

            # also add the group lead as a member of the group,
            # in BOTH directions: without the reverse entry a query for
            # the lead gene itself would never find its own group.
            group_to_orthology[gene_a].add(gene_a)
            gene_to_group.setdefault(gene_a, set()).add(gene_a)
        # end loop through gene_group file

    LOG.debug("Finished hashing gene groups")
    LOG.debug("Making orthology associations")
    for gid in gene_ids:
        gene_num = re.sub(r'NCBIGene:', '', gid)
        # genes without any group simply contribute nothing
        for group_num in gene_to_group.get(gene_num, ()):
            for orth in group_to_orthology.get(group_num, ()):
                oid = 'NCBIGene:' + str(orth)
                model.addClassToGraph(oid, None, self.globaltt['gene'])
                otaxid = 'NCBITaxon:' + str(gene_to_taxon[orth])
                geno.addTaxon(otaxid, oid)
                assoc = OrthologyAssoc(graph, self.name, gid, oid)
                assoc.add_source('PMID:24063302')
                assoc.add_association_to_graph()
                # todo get gene label for orthologs -
                # this could get expensive
                found_counter += 1
    # finish loop through annotated genes

    LOG.info(
        "Made %d orthology relationships for %d genes",
        found_counter, len(gene_ids))
def _get_variants(self, limit):
    """
    Currently loops through the variant_summary file.

    For every non-comment row of the gzipped, tab-separated file
    configured under ``self.files['variant_summary']`` this adds to the
    graph (``self.testgraph`` in test mode, ``self.graph`` otherwise):
    the reference genome build, the chromosome (or cytogenetic band),
    the variant itself as a feature, its HGVS synonyms, dbSNP / dbVar /
    RCV / OMIM / HGMD identifiers, a variant locus tied to the gene,
    and one association per listed phenotype.

    Columns documented for the file (four more trail them in the rows:
    reference allele, alternate allele, categories and
    chromosome accession):

    AlleleID              integer value as stored in the AlleleID field
                          in ClinVar (//Measure/@ID in the XML)
    Type                  character, the type of variation
    Name                  character, the preferred name for the variation
    GeneID                integer, GeneID in NCBI's Gene database
    GeneSymbol            character, comma-separated list of GeneIDs
                          overlapping the variation
    ClinicalSignificance  character, comma-separated list of values of
                          clinical significance reported for this
                          variation; for the mapping between the terms
                          listed here and the integers in the .VCF
                          files, see
                          http://www.ncbi.nlm.nih.gov/clinvar/docs/clinsig/
    RS# (dbSNP)           integer, rs# in dbSNP
    nsv (dbVar)           character, the NSV identifier for the region
                          in dbVar
    RCVaccession          character, list of RCV accessions that report
                          this variant
    TestedInGTR           character, Y/N for Yes/No if there is a test
                          registered as specific to this variation in
                          the NIH Genetic Testing Registry (GTR)
    PhenotypeIDs          character, list of db names and identifiers
                          for phenotype(s) reported for this variant
    Origin                character, list of all allelic origins for
                          this variation
    Assembly              character, name of the assembly on which
                          locations are based
    Chromosome            character, chromosomal location
    Start                 integer, starting location,
                          in pter->qter orientation
    Stop                  integer, end location,
                          in pter->qter orientation
    Cytogenetic           character, ISCN band
    ReviewStatus          character, highest review status for reporting
                          this measure. For the key to the terms, and
                          their relationship to the star graphics
                          ClinVar displays on its web pages, see
                          http://www.ncbi.nlm.nih.gov/clinvar/docs/variation_report/#interpretation
    HGVS(c.)              character, RefSeq cDNA-based HGVS expression
    HGVS(p.)              character, RefSeq protein-based HGVS expression
    NumberSubmitters      integer, number of submissions with this
                          variant
    LastEvaluated         datetime, the latest time any submitter
                          reported clinical significance
    Guidelines            character, ACMG only right now, for the
                          reporting of incidental variation in a Gene
                          (NOTE: if ACMG, not a specific to the allele
                          but to the Gene)
    OtherIDs              character, list of other identifiers or
                          sources of information about this variant
    VariantID             integer, the value used to build the URL for
                          the current default report, e.g.
                          http://www.ncbi.nlm.nih.gov/clinvar/variation/1756/

    :param limit: outside test mode, stop once more than this many rows
        have been processed; ``None`` means no limit
    :return: None
    :raises ValueError: when a row does not have the expected number of
        columns (raised by the tuple unpacking, after the error is logged)
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph

    model = Model(g)
    geno = Genotype(g)

    # add the taxon and the genome
    tax_num = '9606'  # HARDCODE
    tax_id = 'NCBITaxon:'+tax_num
    tax_label = 'Human'
    model.addClassToGraph(tax_id, None)
    geno.addGenome(tax_id, tax_label)  # label gets added elsewhere

    # not unzipping the file
    logger.info("Processing Variant records")
    line_counter = 0
    expected_numcols = 29
    myfile = '/'.join((self.rawdir, self.files['variant_summary']['file']))
    with gzip.open(myfile, 'rb') as variant_file:
        for line in variant_file:
            # skip comments
            line = line.decode().strip()
            if re.match(r'^#', line):
                continue

            # a crude check that there's an expected number of cols.
            # if not, error out because something changed:
            # the unpacking below raises ValueError on a mismatch.
            row = line.split('\t')
            num_cols = len(row)
            if num_cols != expected_numcols:
                logger.error(
                    "Unexpected number of columns in raw file " +
                    "(%d actual vs %d expected)",
                    num_cols, expected_numcols)

            (allele_num, allele_type, allele_name, gene_num, gene_symbol,
             clinical_significance, dbsnp_num, dbvar_num, rcv_nums,
             tested_in_gtr, phenotype_ids, origin, assembly, chrom, start,
             stop, cytogenetic_loc, review_status, hgvs_c, hgvs_p,
             number_of_submitters, last_eval, guidelines, other_ids,
             variant_num, reference_allele, alternate_allele, categories,
             ChromosomeAccession) = row

            line_counter += 1

            pheno_list = []
            if phenotype_ids != '-':
                # trim any leading/trailing semicolons/commas
                phenotype_ids = re.sub(r'^[;,]', '', phenotype_ids)
                phenotype_ids = re.sub(r'[;,]$', '', phenotype_ids)
                pheno_list = re.split(r'[,;]', phenotype_ids)

            if self.testMode:
                # get intersection of test disease ids
                # and these phenotype_ids
                intersect = list(
                    set([str(i) for i in self.disease_ids]) &
                    set(pheno_list))
                if int(gene_num) not in self.gene_ids and \
                        int(variant_num) not in self.variant_ids and \
                        len(intersect) < 1:
                    continue

            # TODO may need to switch on assembly to create correct
            # assembly/build identifiers
            build_id = ':'.join(('NCBIGenome', assembly))

            # make the reference genome build
            geno.addReferenceGenome(build_id, assembly, tax_id)

            allele_type_id = self._map_type_of_allele(allele_type)

            # Both are reset for every row so that a row without a
            # chromosome can never pick up the previous row's value.
            bandinbuild_id = None
            chrinbuild_id = None
            if str(chrom) == '':
                # check cytogenic location
                cyto_loc = str(cytogenetic_loc)
                if cyto_loc.strip() != '':
                    # use cytogenic location to get the apx location
                    # oddly, they still put an assembly number even when
                    # there's no numeric location
                    if not re.search(r'-', cyto_loc):
                        # There is no '-' here, so the band is the whole
                        # string; pass it as a string rather than the
                        # one-element list re.split would return.
                        band_id = makeChromID(cyto_loc, tax_num, 'CHR')
                        geno.addChromosomeInstance(
                            cytogenetic_loc, build_id, assembly, band_id)
                        bandinbuild_id = makeChromID(
                            cyto_loc, assembly, 'MONARCH')
                    else:
                        # can't deal with ranges yet
                        pass
            else:
                # add the human chromosome class to the graph,
                # and add the build-specific version of it
                chr_id = makeChromID(str(chrom), tax_num, 'CHR')
                geno.addChromosomeClass(str(chrom), tax_id, tax_label)
                geno.addChromosomeInstance(
                    str(chrom), build_id, assembly, chr_id)
                chrinbuild_id = makeChromID(str(chrom), assembly, 'MONARCH')

            seqalt_id = ':'.join(('ClinVarVariant', variant_num))
            gene_id = None

            # they use -1 to indicate unknown gene
            if str(gene_num) != '-1' and str(gene_num) != 'more than 10':
                if re.match(r'^Gene:', gene_num):
                    # NOTE(review): this rewrites gene_num but leaves
                    # gene_id as None, so such rows are reported as
                    # having no gene - confirm that is intended.
                    gene_num = "NCBI" + gene_num
                else:
                    gene_id = ':'.join(('NCBIGene', str(gene_num)))

            # FIXME there are some "variants" that are actually haplotypes
            # probably will get taken care of when we switch to processing
            # the xml for example, variant_num = 38562
            # but there's no way to tell if it's a haplotype
            # in the csv data so the dbsnp or dbvar
            # should probably be primary,
            # and the variant num be the vslc,
            # with each of the dbsnps being added to it

            # TODO clinical significance needs to be mapped to
            # a list of terms

            # first, make the variant:
            # (the graph is the first argument, as for every other Feature)
            feature = Feature(g, seqalt_id, allele_name, allele_type_id)

            # coordinates are only meaningful relative to a chromosome
            if chrinbuild_id is not None:
                if start != '-' and start.strip() != '':
                    feature.addFeatureStartLocation(start, chrinbuild_id)
                if stop != '-' and stop.strip() != '':
                    feature.addFeatureEndLocation(stop, chrinbuild_id)

            feature.addFeatureToGraph()
            feature.addTaxonToFeature(tax_id)
            # make the ClinVarVariant the clique leader
            model.makeLeader(seqalt_id)

            if bandinbuild_id is not None:
                feature.addSubsequenceOfFeature(bandinbuild_id)

            # CHECK - this makes the assumption that there is
            # only one affected chromosome per variant what happens with
            # chromosomal rearrangement variants?
            # shouldn't both chromosomes be here?

            # add the hgvs as synonyms
            if hgvs_c != '-' and hgvs_c.strip() != '':
                model.addSynonym(seqalt_id, hgvs_c)
            if hgvs_p != '-' and hgvs_p.strip() != '':
                model.addSynonym(seqalt_id, hgvs_p)

            # add the dbsnp and dbvar ids as equivalent
            if dbsnp_num != '-' and int(dbsnp_num) != -1:
                dbsnp_id = 'dbSNP:rs'+str(dbsnp_num)
                model.addIndividualToGraph(dbsnp_id, None)
                model.addSameIndividual(seqalt_id, dbsnp_id)
            if dbvar_num != '-':
                dbvar_id = 'dbVar:'+dbvar_num
                model.addIndividualToGraph(dbvar_id, None)
                model.addSameIndividual(seqalt_id, dbvar_id)

            # TODO - not sure if this is right... add as xref?
            # the rcv is like the combo of the phenotype with the variant
            if rcv_nums != '-':
                for rcv_num in re.split(r';', rcv_nums):
                    rcv_id = 'ClinVar:' + rcv_num
                    model.addIndividualToGraph(rcv_id, None)
                    model.addXref(seqalt_id, rcv_id)

            if gene_id is not None:
                # add the gene
                model.addClassToGraph(gene_id, gene_symbol)
                # make a variant locus
                vl_id = '_'+gene_num+'-'+variant_num
                if self.nobnodes:
                    vl_id = ':'+vl_id
                vl_label = allele_name
                model.addIndividualToGraph(
                    vl_id, vl_label, geno.genoparts['variant_locus'])
                geno.addSequenceAlterationToVariantLocus(seqalt_id, vl_id)
                geno.addAlleleOfGene(vl_id, gene_id)
            else:
                # some basic reporting
                # The parenthesised symbol is captured so that the
                # first branch can actually fire and report it.
                gmatch = re.search(r'\((\w+)\)', allele_name)
                if gmatch is not None and len(gmatch.groups()) > 0:
                    logger.info(
                        "Gene found in allele label, but no id provided: %s",
                        gmatch.group(1))
                elif re.match(r'more than 10', gene_symbol):
                    logger.info(
                        "More than 10 genes found; "
                        "need to process XML to fetch (variant=%d)",
                        int(variant_num))
                else:
                    logger.info(
                        "No gene listed for variant %d", int(variant_num))

            # parse the list of "phenotypes" which are diseases.
            # add them as an association
            # ;GeneReviews:NBK1440,MedGen:C0392514,OMIM:235200,SNOMED CT:35400008;MedGen:C3280096,OMIM:614193;MedGen:CN034317,OMIM:612635;MedGen:CN169374
            # the list is both semicolon delimited and comma delimited,
            # but i don't know why! some are bad, like:
            # Orphanet:ORPHA ORPHA319705,SNOMED CT:49049000
            if phenotype_ids != '-':
                for phenotype in pheno_list:
                    m = re.match(
                        r"(Orphanet:ORPHA(?:\s*ORPHA)?)", phenotype)
                    if m is not None and len(m.groups()) > 0:
                        phenotype = re.sub(
                            m.group(1), 'Orphanet:', phenotype.strip())
                    elif re.match(r'ORPHA:\d+', phenotype):
                        phenotype = re.sub(
                            r'^ORPHA', 'Orphanet', phenotype.strip())
                    elif re.match(r'Human Phenotype Ontology', phenotype):
                        # NOTE(review): only the label is removed, so any
                        # separator that followed it stays in front of
                        # the id - confirm the resulting form.
                        phenotype = re.sub(
                            r'^Human Phenotype Ontology', '',
                            phenotype.strip())
                    elif re.match(r'SNOMED CT:\s?', phenotype):
                        phenotype = re.sub(
                            r'SNOMED CT:\s?', 'SNOMED:', phenotype.strip())
                    elif re.match(r'^Gene:', phenotype):
                        continue

                    assoc = G2PAssoc(
                        g, self.name, seqalt_id, phenotype.strip())
                    assoc.add_association_to_graph()

            if other_ids != '-':
                # process the "other ids" ex:
                # CFTR2:F508del,HGMD:CD890142,OMIM Allelic Variant:602421.0001
                # TODO make more xrefs
                for xrefid in other_ids.split(','):
                    prefix = xrefid.split(':')[0].strip()
                    if prefix == 'OMIM Allelic Variant':
                        xrefid = 'OMIM:'+xrefid.split(':')[1]
                    elif prefix != 'HGMD':
                        # Everything else is skipped for now: the dbVar
                        # id already handled above, prefixes containing
                        # whitespace, and the remaining clean prefixes
                        # (which could become xrefs later).
                        continue
                    model.addIndividualToGraph(xrefid, None)
                    model.addSameIndividual(seqalt_id, xrefid)

            if not self.testMode and limit is not None \
                    and line_counter > limit:
                break

    logger.info("Finished parsing variants")
    return
def _process_genes(self, limit=None):
    """
    Read the HGNC gene table and add every approved gene to the graph.

    The tab-separated file configured under ``self.files['genes']`` is
    read with the column layout in its ``columns`` entry. Per row:

    * rows whose status is not ``Approved`` are recorded in
      ``self.withdrawn`` (hgnc id -> symbol) and skipped;
    * in test mode, rows with an entrez id outside ``self.gene_ids``
      are skipped;
    * symbols ending in ``@`` are skipped;
    * the gene class is added when its locus type resolves to a term,
      and it is made the clique leader;
    * NCBIGene, ENSEMBL and (gene-typed) OMIM ids become equivalent
      classes, the human taxon is attached, and each PubMed id is
      linked to the gene with ``is_about``;
    * the chromosome, or the band when one can be parsed from the
      location, is added as the feature the gene is a subsequence of.

    Only the columns read below are used; the remaining columns of the
    file are ignored. The column list can be regenerated with:
        head -1 hgnc_complete_set.txt.1 | tr '\\t' '\\n' |
        sed "s/\\(.*\\)/\\1 = row[col.index(\\'\\1\\')]/g"

    :param limit: outside test mode, stop once the reader has passed
        this line number; ``None`` means no limit
    :return: None
    """
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph

    src_key = 'genes'
    geno = Genotype(graph)
    model = Model(graph)
    raw = '/'.join((self.rawdir, self.files[src_key]['file']))
    col = self.files[src_key]['columns']
    LOG.info("Processing HGNC genes")

    # A chromosome name must be followed by an arm or by the end of the
    # location. Inside a character class '$' is a literal dollar sign,
    # so the end-of-string case has to be its own alternative.
    chr_pattern = re.compile(r'(\d+|X|Y|Z|W|MT)(?:[pq]|$)')
    band_pattern = re.compile(r'([pq][A-H\d]?\d?(?:\.\d+)?)')

    with open(raw, 'r', encoding="utf8") as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        row = next(reader)
        # A header mismatch is tolerated on purpose: the result of the
        # check is not acted upon here.
        if not self.check_fileheader(col, row):
            pass

        # Resolve the column positions once instead of once per row.
        hgnc_id_idx = col.index('hgnc_id')
        symbol_idx = col.index('symbol')
        name_idx = col.index('name')
        locus_type_idx = col.index('locus_type')
        status_idx = col.index('status')
        location_idx = col.index('location')
        entrez_id_idx = col.index('entrez_id')
        ensembl_gene_id_idx = col.index('ensembl_gene_id')
        pubmed_id_idx = col.index('pubmed_id')
        omim_id_idx = col.index('omim_id')

        for row in reader:
            hgnc_id = row[hgnc_id_idx].strip()
            symbol = row[symbol_idx].strip()
            name = row[name_idx].strip()
            locus_type = row[locus_type_idx].strip()
            # 41622 Approved & 1752 Entry Withdrawn
            status = row[status_idx].strip()
            location = row[location_idx].strip()
            entrez_id = row[entrez_id_idx].strip()
            ensembl_gene_id = row[ensembl_gene_id_idx].strip()
            pubmed_ids = row[pubmed_id_idx].strip()  # pipe separated!
            omim_ids = row[omim_id_idx].strip()  # pipe separated!

            if status != 'Approved':
                self.withdrawn[hgnc_id] = symbol
                continue

            if (self.test_mode and entrez_id != '' and
                    entrez_id not in self.gene_ids):
                continue

            if name == '':
                name = None

            if locus_type == 'withdrawn':
                model.addDeprecatedClass(hgnc_id)
            elif symbol.endswith('@'):
                # 10) region (HOX), RNA cluster, gene (PCDH)
                # (endswith also copes with an empty symbol)
                continue
            else:
                gene_type_id = self.resolve(locus_type, mandatory=False)
                # an unresolved locus type comes back unchanged
                if gene_type_id != locus_type:
                    model.addClassToGraph(
                        hgnc_id, symbol, gene_type_id, name)
                model.makeLeader(hgnc_id)

            if entrez_id != '':
                model.addEquivalentClass(hgnc_id, 'NCBIGene:' + entrez_id)

            if ensembl_gene_id != '':
                model.addEquivalentClass(
                    hgnc_id, 'ENSEMBL:' + ensembl_gene_id)

            for omim_id in omim_ids.split('|'):
                if omim_id in self.omim_replaced:
                    repl = self.omim_replaced[omim_id]
                    LOG.warning('%s is replaced with %s', omim_id, repl)
                    for omim in repl:
                        # .get(): a replacement id of unknown type is
                        # simply not a gene, not an error
                        if self.omim_type.get(omim) == self.globaltt['gene']:
                            omim_id = omim

                if omim_id in self.omim_type and \
                        self.omim_type[omim_id] == self.globaltt['gene']:
                    model.addEquivalentClass(hgnc_id, 'OMIM:' + omim_id)

            geno.addTaxon(self.hs_txid, hgnc_id)

            # add pubs as "is about"
            for pubmed_id in pubmed_ids.split('|'):
                if pubmed_id == '':
                    # an empty column splits to [''];
                    # do not emit a bare 'PMID:' node
                    continue
                graph.addTriple(
                    'PMID:' + pubmed_id, self.globaltt['is_about'], hgnc_id)

            # add chr location
            # sometimes two are listed, like: 10p11.2 or 17q25
            # -- there are only 2 of these FRA10A and MPFD
            # sometimes listed like "1 not on reference assembly"
            # sometimes listed like 10q24.1-q24.3
            # sometimes like 11q11 alternate reference locus
            band = chrom = None
            chr_match = chr_pattern.match(location)
            if chr_match is not None and chr_match.groups():
                chrom = chr_match.group(1)
                chrom_id = makeChromID(chrom, self.hs_txid, 'CHR')
                band_match = band_pattern.search(location)
                feat = Feature(graph, hgnc_id, None, None)
                if band_match is not None and band_match.groups():
                    band = band_match.group(1)
                    band = chrom + band
                    # add the chr band as the parent to this gene
                    # as a feature but assume that the band is created
                    # as a class with properties elsewhere in Monochrom
                    band_id = makeChromID(band, self.hs_txid, 'CHR')
                    model.addClassToGraph(band_id, None)
                    feat.addSubsequenceOfFeature(band_id)
                else:
                    model.addClassToGraph(chrom_id, None)
                    feat.addSubsequenceOfFeature(chrom_id)

            if not self.test_mode and limit is not None and \
                    reader.line_num > limit:
                break