def add_gene_family_to_graph(self, family_id): """ Make an association between a group of genes and some grouping class. We make the assumption that the genes in the association are part of the supplied family_id, and that the genes have already been declared as classes elsewhere. The family_id is added as an individual of type DATA:gene_family. Triples: <family_id> a EDAM-DATA:gene_family <family_id> RO:has_member <gene1> <family_id> RO:has_member <gene2> :param family_id: :param g: the graph to modify :return: """ family = Family(self.graph) gene_family = self.globaltt['gene_family'] # make the assumption that the genes # have already been added as classes previously self.model.addIndividualToGraph(family_id, None, gene_family) # add each gene to the family family.addMember(family_id, self.sub) family.addMember(family_id, self.obj) return
def addChromosomeInstance( self, chr_num, reference_id, reference_label, chr_type=None): """ Add the supplied chromosome as an instance within the given reference :param chr_num: :param reference_id: for example, a build id like UCSC:hg19 :param reference_label: :param chr_type: this is the class that this is an instance of. typically a genome-specific chr :return: """ family = Family(self.graph) chr_id = makeChromID(str(chr_num), reference_id, 'MONARCH') chr_label = makeChromLabel(str(chr_num), reference_label) self.model.addIndividualToGraph( chr_id, chr_label, Feature.types['chromosome']) if chr_type is not None: self.model.addType(chr_id, chr_type) # add the build-specific chromosome # as a member of the build (both ways) family.addMember(reference_id, chr_id) family.addMemberOf(chr_id, reference_id) return
def addChromosomeInstance(self, chr_num, reference_id, reference_label, chr_type=None): """ Add the supplied chromosome as an instance within the given reference :param chr_num: :param reference_id: for example, a build id like UCSC:hg19 :param reference_label: :param chr_type: this is the class that this is an instance of. typically a genome-specific chr :return: """ family = Family(self.graph) chr_id = makeChromID(str(chr_num), reference_id, 'MONARCH') chr_label = makeChromLabel(str(chr_num), reference_label) self.model.addIndividualToGraph(chr_id, chr_label, self.globaltt['chromosome']) if chr_type is not None: self.model.addType(chr_id, chr_type) # add the build-specific chromosome # as a member of the build (both ways) family.addMember(reference_id, chr_id) family.addMemberOf(chr_id, reference_id) # usage dependent, todo: ommit return
def addChromosome( self, chr, tax_id, tax_label=None, build_id=None, build_label=None): """ if it's just the chromosome, add it as an instance of a SO:chromosome, and add it to the genome. If a build is included, punn the chromosome as a subclass of SO:chromsome, and make the build-specific chromosome an instance of the supplied chr. The chr then becomes part of the build or genome. """ family = Family() # first, make the chromosome class, at the taxon level chr_id = makeChromID(str(chr), tax_id) if tax_label is not None: chr_label = makeChromLabel(chr, tax_label) else: chr_label = makeChromLabel(chr) genome_id = self.makeGenomeID(tax_id) self.model.addClassToGraph( chr_id, chr_label, Feature.types['chromosome']) self.addTaxon(tax_id, genome_id) # add the taxon to the genome if build_id is not None: # the build-specific chromosome chrinbuild_id = makeChromID(chr, build_id) if build_label is None: build_label = build_id chrinbuild_label = makeChromLabel(chr, build_label) # add the build-specific chromosome as an instance of the chr class self.model.addIndividualToGraph( chrinbuild_id, chrinbuild_label, chr_id) # add the build-specific chromosome # as a member of the build (both ways) family.addMember(build_id, chrinbuild_id) family.addMemberOf(chrinbuild_id, build_id) return
def addChromosome(self, chrom, tax_id, tax_label=None, build_id=None, build_label=None): """ if it's just the chromosome, add it as an instance of a SO:chromosome, and add it to the genome. If a build is included, punn the chromosome as a subclass of SO:chromsome, and make the build-specific chromosome an instance of the supplied chr. The chr then becomes part of the build or genome. """ family = Family(self.graph) # first, make the chromosome class, at the taxon level chr_id = makeChromID(str(chrom), tax_id) if tax_label is not None: chr_label = makeChromLabel(chrom, tax_label) else: chr_label = makeChromLabel(chrom) genome_id = self.makeGenomeID(tax_id) self.model.addClassToGraph(chr_id, chr_label, self.globaltt['chromosome']) self.addTaxon(tax_id, genome_id) # add the taxon to the genome if build_id is not None: # the build-specific chromosome chrinbuild_id = makeChromID(chrom, build_id) if build_label is None: build_label = build_id chrinbuild_label = makeChromLabel(chrom, build_label) # add the build-specific chromosome as an instance of the chr class self.model.addIndividualToGraph(chrinbuild_id, chrinbuild_label, chr_id) # add the build-specific chromosome # as a member of the build (both ways) family.addMember(build_id, chrinbuild_id, group_category=blv.terms['GenomeBuild']) family.addMemberOf(chrinbuild_id, build_id, group_category=blv.terms['GenomeBuild'])
def _process_data(self, source, limit=None): """ This function will process the data files from Coriell. We make the assumption that any alleles listed are variants (alternates to w.t.) Triples: (examples) :NIGMSrepository a CLO_0000008 #repository label : NIGMS Human Genetic Cell Repository foaf:page https://catalog.coriell.org/0/sections/collections/NIGMS/?SsId=8 line_id a CL_0000057, #fibroblast line derives_from patient_id part_of :NIGMSrepository RO:model_of OMIM:disease_id patient id a foaf:person, label: "fibroblast from patient 12345 with disease X" member_of family_id #what is the right thing here? SIO:race EFO:caucasian #subclass of EFO:0001799 in_taxon NCBITaxon:9606 dc:description Literal(remark) RO:has_phenotype OMIM:disease_id GENO:has_genotype genotype_id family_id a owl:NamedIndividual foaf:page "https://catalog.coriell.org/0/Sections/BrowseCatalog/FamilyTypeSubDetail.aspx?PgId=402&fam=2104&coll=GM" genotype_id a intrinsic_genotype GENO:has_alternate_part allelic_variant_id we don't necessarily know much about the genotype, other than the allelic variant. also there's the sex here pub_id mentions cell_line_id :param raw: :param limit: :return: """ raw = '/'.join((self.rawdir, self.files[source]['file'])) LOG.info("Processing Data from %s", raw) if self.testMode: # set the graph to build graph = self.testgraph else: graph = self.graph family = Family(graph) model = Model(graph) line_counter = 1 geno = Genotype(graph) diputil = DipperUtil() col = self.files[source]['columns'] # affords access with # x = row[col.index('x')].strip() with open(raw, 'r', encoding="iso-8859-1") as csvfile: filereader = csv.reader(csvfile, delimiter=',', quotechar=r'"') # we can keep a close watch on changing file formats fileheader = next(filereader, None) fileheader = [c.lower() for c in fileheader] if col != fileheader: # assert LOG.error('Expected %s to have columns: %s', raw, col) LOG.error('But Found %s to have columns: %s', raw, fileheader) raise AssertionError('Incomming data headers have changed.') for row in filereader: line_counter += 1 if len(row) != len(col): LOG.warning('Expected %i values but find %i in row %i', len(col), len(row), line_counter) continue # (catalog_id, description, omim_number, sample_type, # cell_line_available, dna_in_stock, dna_ref, gender, age, # race, ethnicity, affected, karyotype, relprob, mutation, # gene, family_id, collection, url, cat_remark, pubmed_ids, # family_member, variant_id, dbsnp_id, species) = row # example: # GM00003,HURLER SYNDROME,607014,Fibroblast,Yes,No, # ,Female,26 YR,Caucasian,,,, # parent,,,39,NIGMS Human Genetic Cell Repository, # http://ccr.coriell.org/Sections/Search/Sample_Detail.aspx?Ref=GM00003, # 46;XX; clinically normal mother of a child with Hurler syndrome; # proband not in Repository,, # 2,,18343,H**o sapiens catalog_id = row[col.index('catalog_id')].strip() if self.testMode and catalog_id not in self.test_lines: # skip rows not in our test lines, when in test mode continue # ########### BUILD REQUIRED VARIABLES ########### # Make the cell line ID cell_line_id = 'Coriell:' + catalog_id # Map the cell/sample type cell_type = self.resolve(row[col.index('sample_type')].strip()) # on fail cell_type = self.globaltt['cell'] ? # Make a cell line label collection = row[col.index('collection')].strip() line_label = collection.partition(' ')[0] + '-' + catalog_id # Map the repository/collection repository = self.localtt[collection] # patients are uniquely identified by one of: # dbsnp id (which is == an individual haplotype) # family id + family member (if present) OR # probands are usually family member zero # cell line id # since some patients have >1 cell line derived from them, # we must make sure that the genotype is attached to # the patient, and can be inferred to the cell line # examples of repeated patients are: # famid=1159, member=1; fam=152,member=1 # Make the patient ID # make an anonymous patient patient_id = '_:person' fam_id = row[col.index('fam')].strip() fammember = row[col.index('fammember')].strip() if fam_id != '': patient_id = '-'.join((patient_id, fam_id, fammember)) else: # make an anonymous patient patient_id = '-'.join((patient_id, catalog_id)) # properties of the individual patients: sex, family id, # member/relproband, description descriptions are # really long and ugly SCREAMING text, so need to clean up # the control cases are so odd with this labeling scheme; # but we'll deal with it as-is for now. description = row[col.index('description')].strip() short_desc = (description.split(';')[0]).capitalize() gender = row[col.index('gender')].strip().lower() affected = row[col.index('affected')].strip() relprob = row[col.index('relprob')].strip() if affected == '': affected = 'unspecified' elif affected in self.localtt: affected = self.localtt[affected] else: LOG.warning('Novel Affected status %s at row: %i of %s', affected, line_counter, raw) patient_label = ' '.join((affected, gender, relprob)) if relprob == 'proband': patient_label = ' '.join( (patient_label.strip(), 'with', short_desc)) else: patient_label = ' '.join( (patient_label.strip(), 'of proband with', short_desc)) # ############# BUILD THE CELL LINE ############# # Adding the cell line as a typed individual. cell_line_reagent_id = self.globaltt['cell line'] model.addIndividualToGraph(cell_line_id, line_label, cell_line_reagent_id) # add the equivalent id == dna_ref dna_ref = row[col.index('dna_ref')].strip() if dna_ref != '' and dna_ref != catalog_id: equiv_cell_line = 'Coriell:' + dna_ref # some of the equivalent ids are not defined # in the source data; so add them model.addIndividualToGraph(equiv_cell_line, None, cell_line_reagent_id) model.addSameIndividual(cell_line_id, equiv_cell_line) # Cell line derives from patient geno.addDerivesFrom(cell_line_id, patient_id) geno.addDerivesFrom(cell_line_id, cell_type) # Cell line a member of repository family.addMember(repository, cell_line_id) cat_remark = row[col.index('cat_remark')].strip() if cat_remark != '': model.addDescription(cell_line_id, cat_remark) # Cell age_at_sampling # TODO add the age nodes when modeled properly in #78 # if (age != ''): # this would give a BNode that is an instance of Age. # but i don't know how to connect # the age node to the cell line? we need to ask @mbrush # age_id = '_'+re.sub('\s+','_',age) # gu.addIndividualToGraph( # graph,age_id,age,self.globaltt['age']) # gu.addTriple( # graph,age_id,self.globaltt['has measurement value'],age, # True) # ############# BUILD THE PATIENT ############# # Add the patient ID as an individual. model.addPerson(patient_id, patient_label) # TODO map relationship to proband as a class # (what ontology?) # Add race of patient # FIXME: Adjust for subcategories based on ethnicity field # EDIT: There are 743 different entries for ethnicity... # Too many to map? # Add ethnicity as literal in addition to the mapped race? # Adjust the ethnicity txt (if using) # to initial capitalization to remove ALLCAPS # TODO race should go into the individual's background # and abstracted out to the Genotype class punting for now. # if race != '': # mapped_race = self.resolve(race) # if mapped_race is not None: # gu.addTriple( # g,patient_id,self.globaltt['race'], mapped_race) # model.addSubClass( # mapped_race,self.globaltt['ethnic_group']) # ############# BUILD THE FAMILY ############# # Add triples for family_id, if present. if fam_id != '': family_comp_id = 'CoriellFamily:' + fam_id family_label = ' '.join( ('Family of proband with', short_desc)) # Add the family ID as a named individual model.addIndividualToGraph(family_comp_id, family_label, self.globaltt['family']) # Add the patient as a member of the family family.addMemberOf(patient_id, family_comp_id) # ############# BUILD THE GENOTYPE ############# # the important things to pay attention to here are: # karyotype = chr rearrangements (somatic?) # mutation = protein-level mutation as a label, # often from omim # gene = gene symbol - TODO get id # variant_id = omim variant ids (; delimited) # dbsnp_id = snp individual ids = full genotype? # note GM00633 is a good example of chromosomal variation # - do we have enough to capture this? # GM00325 has both abnormal karyotype and variation # make an assumption that if the taxon is blank, # that it is human! species = row[col.index('species')].strip() if species is None or species == '': species = 'H**o sapiens' taxon = self.resolve(species) # if there's a dbSNP id, # this is actually the individual's genotype genotype_id = None genotype_label = None dbsnp_id = row[col.index('dbsnp_id')].strip() if dbsnp_id != '': genotype_id = 'dbSNPIndividual:' + dbsnp_id omim_map = {} gvc_id = None # some of the karyotypes are encoded # with terrible hidden codes. remove them here # i've seen a <98> character karyotype = row[col.index('karyotype')].strip() karyotype = diputil.remove_control_characters(karyotype) karyotype_id = None if karyotype.strip() != '': karyotype_id = '_:' + re.sub('MONARCH:', '', self.make_id(karyotype)) # add karyotype as karyotype_variation_complement model.addIndividualToGraph( karyotype_id, karyotype, self.globaltt['karyotype_variation_complement']) # TODO break down the karyotype into parts # and map into GENO. depends on #77 # place the karyotype in a location(s). karyo_chrs = self._get_affected_chromosomes_from_karyotype( karyotype) for chrom in karyo_chrs: chr_id = makeChromID(chrom, taxon, 'CHR') # add an anonymous sequence feature, # each located on chr karyotype_feature_id = '-'.join((karyotype_id, chrom)) karyotype_feature_label = \ 'some karyotype alteration on chr' + str(chrom) feat = Feature(graph, karyotype_feature_id, karyotype_feature_label, self.globaltt['sequence_alteration']) feat.addFeatureStartLocation(None, chr_id) feat.addFeatureToGraph() geno.addParts(karyotype_feature_id, karyotype_id, self.globaltt['has_variant_part']) gene = row[col.index('gene')].strip() mutation = row[col.index('mutation')].strip() if gene != '': vl = gene + '(' + mutation + ')' # fix the variant_id so it's always in the same order variant_id = row[col.index('variant_id')].strip() vids = variant_id.split(';') variant_id = ';'.join(sorted(list(set(vids)))) if karyotype.strip() != '' and not self._is_normal_karyotype( karyotype): gvc_id = karyotype_id if variant_id != '': gvc_id = '_:' + variant_id.replace(';', '-') + '-' \ + re.sub(r'\w*:', '', karyotype_id) if mutation.strip() != '': gvc_label = '; '.join((vl, karyotype)) else: gvc_label = karyotype elif variant_id.strip() != '': gvc_id = '_:' + variant_id.replace(';', '-') gvc_label = vl else: # wildtype? pass # add the karyotype to the gvc. # use reference if normal karyotype karyo_rel = self.globaltt['has_variant_part'] if self._is_normal_karyotype(karyotype): karyo_rel = self.globaltt['has_reference_part'] if karyotype_id is not None \ and not self._is_normal_karyotype(karyotype) \ and gvc_id is not None and karyotype_id != gvc_id: geno.addParts(karyotype_id, gvc_id, karyo_rel) if variant_id.strip() != '': # split the variants & add them as part of the genotype # we don't necessarily know their zygosity, # just that they are part of the genotype variant ids # are from OMIM, so prefix as such we assume that the # sequence alts will be defined in OMIM not here # TODO sort the variant_id list, if the omim prefix is # the same, then assume it's the locus make a hashmap # of the omim id to variant id list; # then build the genotype hashmap is also useful for # removing the "genes" from the list of "phenotypes" # will hold gene/locus id to variant list omim_map = {} locus_num = None for var in variant_id.split(';'): # handle omim-style and odd var ids # like 610661.p.R401X mch = re.match(r'(\d+)\.+(.*)', var.strip()) if mch is not None and len(mch.groups()) == 2: (locus_num, var_num) = mch.groups() if locus_num is not None and locus_num not in omim_map: omim_map[locus_num] = [var_num] else: omim_map[locus_num] += [var_num] for omim in omim_map: # gene_id = 'OMIM:' + omim # TODO unused vslc_id = '_:' + '-'.join( [omim + '.' + a for a in omim_map.get(omim)]) vslc_label = vl # we don't really know the zygosity of # the alleles at all. # so the vslcs are just a pot of them model.addIndividualToGraph( vslc_id, vslc_label, self.globaltt['variant single locus complement']) for var in omim_map.get(omim): # this is actually a sequence alt allele1_id = 'OMIM:' + omim + '.' + var geno.addSequenceAlteration(allele1_id, None) # assume that the sa -> var_loc -> gene # is taken care of in OMIM geno.addPartsToVSLC( vslc_id, allele1_id, None, self.globaltt['indeterminate'], self.globaltt['has_variant_part']) if vslc_id != gvc_id: geno.addVSLCtoParent(vslc_id, gvc_id) if affected == 'unaffected': # let's just say that this person is wildtype model.addType(patient_id, self.globaltt['wildtype']) elif genotype_id is None: # make an anonymous genotype id (aka blank node) genotype_id = '_:geno' + catalog_id.strip() # add the gvc if gvc_id is not None: model.addIndividualToGraph( gvc_id, gvc_label, self.globaltt['genomic_variation_complement']) # add the gvc to the genotype if genotype_id is not None: if affected == 'unaffected': rel = self.globaltt['has_reference_part'] else: rel = self.globaltt['has_variant_part'] geno.addParts(gvc_id, genotype_id, rel) if karyotype_id is not None \ and self._is_normal_karyotype(karyotype): if gvc_label is not None and gvc_label != '': genotype_label = '; '.join((gvc_label, karyotype)) elif karyotype is not None: genotype_label = karyotype if genotype_id is None: genotype_id = karyotype_id else: geno.addParts(karyotype_id, genotype_id, self.globaltt['has_reference_part']) else: genotype_label = gvc_label # use the catalog id as the background genotype_label += ' [' + catalog_id.strip() + ']' if genotype_id is not None and gvc_id is not None: # only add the genotype if it has some parts geno.addGenotype(genotype_id, genotype_label, self.globaltt['intrinsic_genotype']) geno.addTaxon(taxon, genotype_id) # add that the patient has the genotype # TODO check if the genotype belongs to # the cell line or to the patient graph.addTriple(patient_id, self.globaltt['has_genotype'], genotype_id) else: geno.addTaxon(taxon, patient_id) # TODO: Add sex/gender (as part of the karyotype?) # = row[col.index('')].strip() # ############# DEAL WITH THE DISEASES ############# omim_num = row[col.index('omim_num')].strip() # we associate the disease to the patient if affected == 'affected' and omim_num != '': for d in omim_num.split(';'): if d is not None and d != '': # if the omim number is in omim_map, # then it is a gene not a pheno # TEC - another place to use the mimTitle omim # classifier omia & genereviews are using if d not in omim_map: disease_id = 'OMIM:' + d.strip() # assume the label is taken care of in OMIM model.addClassToGraph(disease_id, None) # add the association: # the patient has the disease assoc = G2PAssoc(graph, self.name, patient_id, disease_id) assoc.add_association_to_graph() # this line is a model of this disease # TODO abstract out model into # it's own association class? graph.addTriple(cell_line_id, self.globaltt['is model of'], disease_id) else: LOG.info('drop gene %s from disease list', d) # ############# ADD PUBLICATIONS ############# pubmed_ids = row[col.index('pubmed_ids')].strip() if pubmed_ids != '': for s in pubmed_ids.split(';'): pubmed_id = 'PMID:' + s.strip() ref = Reference(graph, pubmed_id) ref.setType(self.globaltt['journal article']) ref.addRefToGraph() graph.addTriple(pubmed_id, self.globaltt['mentions'], cell_line_id) if not self.testMode and (limit is not None and line_counter > limit): break return
def _process_genes(self, limit=None): """ This method processes the KEGG gene IDs. The label for the gene is pulled as the first symbol in the list of gene symbols; the rest are added as synonyms. The long-form of the gene name is added as a definition. This is hardcoded to just processes human genes. Triples created: <gene_id> is a SO:gene <gene_id> rdfs:label <gene_name> :param limit: :return: """ LOG.info("Processing genes") if self.test_mode: graph = self.testgraph else: graph = self.graph model = Model(graph) line_counter = 0 family = Family(graph) geno = Genotype(graph) raw = '/'.join((self.rawdir, self.files['hsa_genes']['file'])) with open(raw, 'r', encoding="iso-8859-1") as csvfile: filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"') for row in filereader: line_counter += 1 (gene_id, gene_name) = row gene_id = 'KEGG-'+gene_id.strip() # the gene listing has a bunch of labels # that are delimited, as: # DST, BP240, BPA, BPAG1, CATX-15, CATX15, D6S1101, DMH, DT, # EBSB2, HSAN6, MACF2; dystonin; K10382 dystonin # it looks like the list is semicolon delimited # (symbol, name, gene_class) # where the symbol is a comma-delimited list # here, we split them up. # we will take the first abbreviation and make it the symbol # then take the rest as synonyms gene_stuff = re.split('r;', gene_name) symbollist = re.split(r',', gene_stuff[0]) first_symbol = symbollist[0].strip() if gene_id not in self.label_hash: self.label_hash[gene_id] = first_symbol if self.test_mode and gene_id not in self.test_ids['genes']: continue # Add the gene as a class. geno.addGene(gene_id, first_symbol) # add the long name as the description if len(gene_stuff) > 1: description = gene_stuff[1].strip() model.addDefinition(gene_id, description) # add the rest of the symbols as synonyms for i in enumerate(symbollist, start=1): model.addSynonym(gene_id, i[1].strip()) if len(gene_stuff) > 2: ko_part = gene_stuff[2] ko_match = re.search(r'K\d+', ko_part) if ko_match is not None and len(ko_match.groups()) == 1: ko = 'KEGG-ko:'+ko_match.group(1) family.addMemberOf(gene_id, ko) if not self.test_mode and limit is not None and line_counter > limit: break LOG.info("Done with genes") return
def _process_data(self, src_key, limit=None): """ This function will process the data files from Coriell. We make the assumption that any alleles listed are variants (alternates to w.t.) Triples: (examples) :NIGMSrepository a CLO_0000008 #repository label : NIGMS Human Genetic Cell Repository foaf:page https://catalog.coriell.org/0/sections/collections/NIGMS/?SsId=8 line_id a CL_0000057, #fibroblast line derives_from patient_id part_of :NIGMSrepository RO:model_of OMIM:disease_id patient id a foaf:person, label: "fibroblast from patient 12345 with disease X" member_of family_id #what is the right thing here? SIO:race EFO:caucasian #subclass of EFO:0001799 in_taxon NCBITaxon:9606 dc:description Literal(remark) RO:has_phenotype OMIM:disease_id GENO:has_genotype genotype_id family_id a owl:NamedIndividual foaf:page "https://catalog.coriell.org/0/Sections/BrowseCatalog/FamilyTypeSubDetail.aspx?PgId=402&fam=2104&coll=GM" genotype_id a intrinsic_genotype GENO:has_alternate_part allelic_variant_id we don't necessarily know much about the genotype, other than the allelic variant. also there's the sex here pub_id mentions cell_line_id :param raw: :param limit: :return: """ raw = '/'.join((self.rawdir, self.files[src_key]['file'])) LOG.info("Processing Data from %s", raw) if self.test_mode: # set the graph to build graph = self.testgraph else: graph = self.graph family = Family(graph) model = Model(graph) line_counter = 1 geno = Genotype(graph) diputil = DipperUtil() col = self.files[src_key]['columns'] # affords access with # x = row[col.index('x')].strip() with open(raw, 'r', encoding="iso-8859-1") as csvfile: filereader = csv.reader(csvfile, delimiter=',', quotechar=r'"') # we can keep a close watch on changing file formats fileheader = next(filereader, None) fileheader = [c.lower() for c in fileheader] if col != fileheader: # assert LOG.error('Expected %s to have columns: %s', raw, col) LOG.error('But Found %s to have columns: %s', raw, fileheader) raise AssertionError('Incomming data headers have changed.') for row in filereader: line_counter += 1 if len(row) != len(col): LOG.warning( 'Expected %i values but find %i in row %i', len(col), len(row), line_counter) continue # (catalog_id, description, omim_number, sample_type, # cell_line_available, dna_in_stock, dna_ref, gender, age, # race, ethnicity, affected, karyotype, relprob, mutation, # gene, family_id, collection, url, cat_remark, pubmed_ids, # family_member, variant_id, dbsnp_id, species) = row # example: # GM00003,HURLER SYNDROME,607014,Fibroblast,Yes,No, # ,Female,26 YR,Caucasian,,,, # parent,,,39,NIGMS Human Genetic Cell Repository, # http://ccr.coriell.org/Sections/Search/Sample_Detail.aspx?Ref=GM00003, # 46;XX; clinically normal mother of a child with Hurler syndrome; # proband not in Repository,, # 2,,18343,H**o sapiens catalog_id = row[col.index('catalog_id')].strip() if self.test_mode and catalog_id not in self.test_lines: # skip rows not in our test lines, when in test mode continue # ########### BUILD REQUIRED VARIABLES ########### # Make the cell line ID cell_line_id = 'Coriell:' + catalog_id # Map the cell/sample type cell_type = self.resolve(row[col.index('sample_type')].strip()) # on fail cell_type = self.globaltt['cell'] ? # Make a cell line label collection = row[col.index('collection')].strip() line_label = collection.partition(' ')[0] + '-' + catalog_id # Map the repository/collection repository = self.localtt[collection] # patients are uniquely identified by one of: # dbsnp id (which is == an individual haplotype) # family id + family member (if present) OR # probands are usually family member zero # cell line id # since some patients have >1 cell line derived from them, # we must make sure that the genotype is attached to # the patient, and can be inferred to the cell line # examples of repeated patients are: # famid=1159, member=1; fam=152,member=1 # Make the patient ID # make an anonymous patient patient_id = '_:person' fam_id = row[col.index('fam')].strip() fammember = row[col.index('fammember')].strip() if fam_id != '': patient_id = '-'.join((patient_id, fam_id, fammember)) else: # make an anonymous patient patient_id = '-'.join((patient_id, catalog_id)) # properties of the individual patients: sex, family id, # member/relproband, description descriptions are # really long and ugly SCREAMING text, so need to clean up # the control cases are so odd with this labeling scheme; # but we'll deal with it as-is for now. description = row[col.index('description')].strip() short_desc = (description.split(';')[0]).capitalize() gender = row[col.index('gender')].strip().lower() affected = row[col.index('affected')].strip() relprob = row[col.index('relprob')].strip() if affected == '': affected = 'unspecified' elif affected in self.localtt: affected = self.localtt[affected] else: LOG.warning( 'Novel Affected status %s at row: %i of %s', affected, line_counter, raw) patient_label = ' '.join((affected, gender, relprob)) if relprob == 'proband': patient_label = ' '.join(( patient_label.strip(), 'with', short_desc)) else: patient_label = ' '.join(( patient_label.strip(), 'of proband with', short_desc)) # ############# BUILD THE CELL LINE ############# # Adding the cell line as a typed individual. cell_line_reagent_id = self.globaltt['cell line'] model.addIndividualToGraph( cell_line_id, line_label, cell_line_reagent_id) # add the equivalent id == dna_ref dna_ref = row[col.index('dna_ref')].strip() if dna_ref != '' and dna_ref != catalog_id: equiv_cell_line = 'Coriell:' + dna_ref # some of the equivalent ids are not defined # in the source data; so add them model.addIndividualToGraph( equiv_cell_line, None, cell_line_reagent_id) model.addSameIndividual(cell_line_id, equiv_cell_line) # Cell line derives from patient geno.addDerivesFrom(cell_line_id, patient_id) geno.addDerivesFrom(cell_line_id, cell_type) # Cell line a member of repository family.addMember(repository, cell_line_id) cat_remark = row[col.index('cat_remark')].strip() if cat_remark != '': model.addDescription(cell_line_id, cat_remark) # Cell age_at_sampling # TODO add the age nodes when modeled properly in #78 # if (age != ''): # this would give a BNode that is an instance of Age. # but i don't know how to connect # the age node to the cell line? we need to ask @mbrush # age_id = '_'+re.sub('\s+','_',age) # gu.addIndividualToGraph( # graph,age_id,age,self.globaltt['age']) # gu.addTriple( # graph,age_id,self.globaltt['has measurement value'],age, # True) # ############# BUILD THE PATIENT ############# # Add the patient ID as an individual. model.addPerson(patient_id, patient_label) # TODO map relationship to proband as a class # (what ontology?) # Add race of patient # FIXME: Adjust for subcategories based on ethnicity field # EDIT: There are 743 different entries for ethnicity... # Too many to map? # Add ethnicity as literal in addition to the mapped race? # Adjust the ethnicity txt (if using) # to initial capitalization to remove ALLCAPS # TODO race should go into the individual's background # and abstracted out to the Genotype class punting for now. # if race != '': # mapped_race = self.resolve(race) # if mapped_race is not None: # gu.addTriple( # g,patient_id,self.globaltt['race'], mapped_race) # model.addSubClass( # mapped_race,self.globaltt['ethnic_group']) # ############# BUILD THE FAMILY ############# # Add triples for family_id, if present. if fam_id != '': family_comp_id = 'CoriellFamily:' + fam_id family_label = ' '.join(('Family of proband with', short_desc)) # Add the family ID as a named individual model.addIndividualToGraph( family_comp_id, family_label, self.globaltt['family']) # Add the patient as a member of the family family.addMemberOf(patient_id, family_comp_id) # ############# BUILD THE GENOTYPE ############# # the important things to pay attention to here are: # karyotype = chr rearrangements (somatic?) # mutation = protein-level mutation as a label, # often from omim # gene = gene symbol - TODO get id # variant_id = omim variant ids (; delimited) # dbsnp_id = snp individual ids = full genotype? # note GM00633 is a good example of chromosomal variation # - do we have enough to capture this? # GM00325 has both abnormal karyotype and variation # make an assumption that if the taxon is blank, # that it is human! species = row[col.index('species')].strip() if species is None or species == '': species = 'H**o sapiens' taxon = self.resolve(species) # if there's a dbSNP id, # this is actually the individual's genotype genotype_id = None genotype_label = None dbsnp_id = row[col.index('dbsnp_id')].strip() if dbsnp_id != '': genotype_id = 'dbSNPIndividual:' + dbsnp_id omim_map = {} gvc_id = None # some of the karyotypes are encoded # with terrible hidden codes. remove them here # i've seen a <98> character karyotype = row[col.index('karyotype')].strip() karyotype = diputil.remove_control_characters(karyotype) karyotype_id = None if karyotype.strip() != '': karyotype_id = '_:'+re.sub( 'MONARCH:', '', self.make_id(karyotype)) # add karyotype as karyotype_variation_complement model.addIndividualToGraph( karyotype_id, karyotype, self.globaltt['karyotype_variation_complement']) # TODO break down the karyotype into parts # and map into GENO. depends on #77 # place the karyotype in a location(s). karyo_chrs = self._get_affected_chromosomes_from_karyotype( karyotype) for chrom in karyo_chrs: chr_id = makeChromID(chrom, taxon, 'CHR') # add an anonymous sequence feature, # each located on chr karyotype_feature_id = '-'.join((karyotype_id, chrom)) karyotype_feature_label = \ 'some karyotype alteration on chr' + str(chrom) feat = Feature( graph, karyotype_feature_id, karyotype_feature_label, self.globaltt['sequence_alteration']) feat.addFeatureStartLocation(None, chr_id) feat.addFeatureToGraph() geno.addParts( karyotype_feature_id, karyotype_id, self.globaltt['has_variant_part']) gene = row[col.index('gene')].strip() mutation = row[col.index('mutation')].strip() if gene != '': varl = gene + '(' + mutation + ')' # fix the variant_id so it's always in the same order variant_id = row[col.index('variant_id')].strip() vids = variant_id.split(';') variant_id = ';'.join(sorted(list(set(vids)))) if karyotype.strip() != '' and not self._is_normal_karyotype( karyotype): gvc_id = karyotype_id if variant_id != '': gvc_id = '_:' + variant_id.replace(';', '-') + '-' \ + re.sub(r'\w*:', '', karyotype_id) if mutation.strip() != '': gvc_label = '; '.join((varl, karyotype)) else: gvc_label = karyotype elif variant_id.strip() != '': gvc_id = '_:' + variant_id.replace(';', '-') gvc_label = varl else: # wildtype? pass # add the karyotype to the gvc. # use reference if normal karyotype karyo_rel = self.globaltt['has_variant_part'] if self._is_normal_karyotype(karyotype): karyo_rel = self.globaltt['has_reference_part'] if karyotype_id is not None \ and not self._is_normal_karyotype(karyotype) \ and gvc_id is not None and karyotype_id != gvc_id: geno.addParts(karyotype_id, gvc_id, karyo_rel) if variant_id.strip() != '': # split the variants & add them as part of the genotype # we don't necessarily know their zygosity, # just that they are part of the genotype variant ids # are from OMIM, so prefix as such we assume that the # sequence alts will be defined in OMIM not here # TODO sort the variant_id list, if the omim prefix is # the same, then assume it's the locus make a hashmap # of the omim id to variant id list; # then build the genotype hashmap is also useful for # removing the "genes" from the list of "phenotypes" # will hold gene/locus id to variant list omim_map = {} locus_num = None for var in variant_id.split(';'): # handle omim-style and odd var ids # like 610661.p.R401X mch = re.match(r'(\d+)\.+(.*)', var.strip()) if mch is not None and len(mch.groups()) == 2: (locus_num, var_num) = mch.groups() if locus_num is not None and locus_num not in omim_map: omim_map[locus_num] = [var_num] else: omim_map[locus_num] += [var_num] for omim in omim_map: # gene_id = 'OMIM:' + omim # TODO unused vslc_id = '_:' + '-'.join( [omim + '.' + a for a in omim_map.get(omim)]) vslc_label = varl # we don't really know the zygosity of # the alleles at all. # so the vslcs are just a pot of them model.addIndividualToGraph( vslc_id, vslc_label, self.globaltt['variant single locus complement']) for var in omim_map.get(omim): # this is actually a sequence alt allele1_id = 'OMIM:' + omim + '.' + var geno.addSequenceAlteration(allele1_id, None) # assume that the sa -> var_loc -> gene # is taken care of in OMIM geno.addPartsToVSLC( vslc_id, allele1_id, None, self.globaltt['indeterminate'], self.globaltt['has_variant_part']) if vslc_id != gvc_id: geno.addVSLCtoParent(vslc_id, gvc_id) if affected == 'unaffected': # let's just say that this person is wildtype model.addType(patient_id, self.globaltt['wildtype']) elif genotype_id is None: # make an anonymous genotype id (aka blank node) genotype_id = '_:geno' + catalog_id.strip() # add the gvc if gvc_id is not None: model.addIndividualToGraph( gvc_id, gvc_label, self.globaltt['genomic_variation_complement']) # add the gvc to the genotype if genotype_id is not None: if affected == 'unaffected': rel = self.globaltt['has_reference_part'] else: rel = self.globaltt['has_variant_part'] geno.addParts(gvc_id, genotype_id, rel) if karyotype_id is not None \ and self._is_normal_karyotype(karyotype): if gvc_label is not None and gvc_label != '': genotype_label = '; '.join((gvc_label, karyotype)) elif karyotype is not None: genotype_label = karyotype if genotype_id is None: genotype_id = karyotype_id else: geno.addParts( karyotype_id, genotype_id, self.globaltt['has_reference_part']) else: genotype_label = gvc_label # use the catalog id as the background genotype_label += ' ['+catalog_id.strip()+']' if genotype_id is not None and gvc_id is not None: # only add the genotype if it has some parts geno.addGenotype( genotype_id, genotype_label, self.globaltt['intrinsic_genotype']) geno.addTaxon(taxon, genotype_id) # add that the patient has the genotype # TODO check if the genotype belongs to # the cell line or to the patient graph.addTriple( patient_id, self.globaltt['has_genotype'], genotype_id) else: geno.addTaxon(taxon, patient_id) # TODO: Add sex/gender (as part of the karyotype?) # = row[col.index('')].strip() # ############# DEAL WITH THE DISEASES ############# omim_num = row[col.index('omim_num')].strip() # we associate the disease to the patient if affected == 'affected' and omim_num != '': for disease in omim_num.split(';'): if disease is not None and disease != '': # if the omim number is in omim_map, # then it is a gene not a pheno # TEC - another place to use the mimTitle omim # classifier omia & genereviews are using if disease not in omim_map: disease_id = 'OMIM:' + disease.strip() # assume the label is taken care of in OMIM model.addClassToGraph(disease_id, None) # add the association: # the patient has the disease assoc = G2PAssoc( graph, self.name, patient_id, disease_id) assoc.add_association_to_graph() # this line is a model of this disease # TODO abstract out model into # it's own association class? graph.addTriple( cell_line_id, self.globaltt['is model of'], disease_id) else: LOG.info('drop gene %s from disease list', disease) # ############# ADD PUBLICATIONS ############# pubmed_ids = row[col.index('pubmed_ids')].strip() if pubmed_ids != '': for pmid in pubmed_ids.split(';'): pubmed_id = 'PMID:' + pmid.strip() ref = Reference(graph, pubmed_id) ref.setType(self.globaltt['journal article']) ref.addRefToGraph() graph.addTriple( pubmed_id, self.globaltt['mentions'], cell_line_id) if not self.test_mode and ( limit is not None and line_counter > limit): break return
def _process_genes(self, limit=None): """ This method processes the KEGG gene IDs. The label for the gene is pulled as the first symbol in the list of gene symbols; the rest are added as synonyms. The long-form of the gene name is added as a definition. This is hardcoded to just processes human genes. Triples created: <gene_id> is a SO:gene <gene_id> rdfs:label <gene_name> :param limit: :return: """ LOG.info("Processing genes") if self.test_mode: graph = self.testgraph else: graph = self.graph model = Model(graph) family = Family(graph) geno = Genotype(graph) raw = '/'.join((self.rawdir, self.files['hsa_genes']['file'])) with open(raw, 'r', encoding="iso-8859-1") as csvfile: reader = csv.reader(csvfile, delimiter='\t', quotechar='\"') for row in reader: (gene_id, gene_name) = row gene_id = 'KEGG-'+gene_id.strip() # the gene listing has a bunch of labels # that are delimited, as: # DST, BP240, BPA, BPAG1, CATX-15, CATX15, D6S1101, DMH, DT, # EBSB2, HSAN6, MACF2; dystonin; K10382 dystonin # it looks like the list is semicolon delimited # (symbol, name, gene_class) # where the symbol is a comma-delimited list # here, we split them up. # we will take the first abbreviation and make it the symbol # then take the rest as synonyms gene_stuff = re.split('r;', gene_name) symbollist = re.split(r',', gene_stuff[0]) first_symbol = symbollist[0].strip() if gene_id not in self.label_hash: self.label_hash[gene_id] = first_symbol if self.test_mode and gene_id not in self.test_ids['genes']: continue # Add the gene as a class. geno.addGene(gene_id, first_symbol) # add the long name as the description if len(gene_stuff) > 1: description = gene_stuff[1].strip() model.addDefinition(gene_id, description) # add the rest of the symbols as synonyms for i in enumerate(symbollist, start=1): model.addSynonym(gene_id, i[1].strip()) if len(gene_stuff) > 2: ko_part = gene_stuff[2] ko_match = re.search(r'K\d+', ko_part) if ko_match is not None and len(ko_match.groups()) == 1: ko = 'KEGG-ko:'+ko_match.group(1) family.addMemberOf(gene_id, ko) if not self.test_mode and limit is not None and reader.line_num > limit: break LOG.info("Done with genes")
def _get_chrbands(self, limit, taxon): """ For the given taxon, it will fetch the chr band file. We will not deal with the coordinate information with this parser. Here, we only are concerned with building the partonomy. :param limit: :return: """ model = Model(self.graph) family = Family(self.graph) line_counter = 0 myfile = '/'.join((self.rawdir, self.files[taxon]['file'])) logger.info("Processing Chr bands from FILE: %s", myfile) geno = Genotype(self.graph) # build the organism's genome from the taxon genome_label = self.files[taxon]['genome_label'] taxon_id = 'NCBITaxon:' + taxon # add the taxon as a class. adding the class label elsewhere model.addClassToGraph(taxon_id, None) model.addSynonym(taxon_id, genome_label) genome_id = geno.makeGenomeID(taxon_id) geno.addGenome(taxon_id, genome_label) model.addOWLPropertyClassRestriction( genome_id, Genotype.object_properties['in_taxon'], taxon_id) with gzip.open(myfile, 'rb') as f: for line in f: # skip comments line = line.decode().strip() if re.match(r'^#', line): continue # chr13 4500000 10000000 p12 stalk (chrom, start, stop, band, rtype) = line.split('\t') line_counter += 1 # NOTE # some less-finished genomes have placed and unplaced scaffolds # * Placed scaffolds: # Scaffold has an oriented location within a chromosome. # * Unlocalized scaffolds: # scaffold 's chromosome is known, # scaffold's position, orientation or both is not known. # *Unplaced scaffolds: # it is not known which chromosome the scaffold belongs to. # find out if the thing is a full on chromosome, or a scaffold: # ex: unlocalized scaffold: chr10_KL568008v1_random # ex: unplaced scaffold: chrUn_AABR07022428v1 placed_scaffold_pattern = r'chr(\d+|X|Y|Z|W|MT|M)' # TODO unused # unlocalized_scaffold_pattern = \ # placed_scaffold_pattern + r'_(\w+)_random' # unplaced_scaffold_pattern = r'chrUn_(\w+)' m = re.match(placed_scaffold_pattern + r'$', chrom) if m is not None and len(m.groups()) == 1: # the chromosome is the first match of the pattern # ch = m.group(1) # TODO unused pass else: # let's skip over anything that isn't a placed_scaffold # at the class level logger.info("Skipping non-placed chromosome %s", chrom) continue # the chrom class, taxon as the reference cclassid = makeChromID(chrom, taxon, 'CHR') # add the chromosome as a class geno.addChromosomeClass(chrom, taxon_id, genome_label) model.addOWLPropertyClassRestriction( cclassid, family.object_properties['member_of'], genome_id) # add the band(region) as a class maplocclass_id = cclassid + band maplocclass_label = makeChromLabel(chrom + band, genome_label) if band is not None and band.strip() != '': region_type_id = self.map_type_of_region(rtype) model.addClassToGraph(maplocclass_id, maplocclass_label, region_type_id) else: region_type_id = Feature.types['chromosome'] # add the staining intensity of the band if re.match(r'g(neg|pos|var)', rtype): if region_type_id in [ Feature.types['chromosome_band'], Feature.types['chromosome_subband'] ]: stain_type = Feature.types.get(rtype) if stain_type is not None: model.addOWLPropertyClassRestriction( maplocclass_id, Feature.properties['has_staining_intensity'], Feature.types.get(rtype)) else: # usually happens if it's a chromosome because # they don't actually have banding info logger.info("feature type %s != chr band", region_type_id) else: logger.warning('staining type not found: %s', rtype) # get the parent bands, and make them unique parents = list(self.make_parent_bands(band, set())) # alphabetical sort will put them in smallest to biggest parents.sort(reverse=True) # print("PARENTS of",maplocclass_id,"=",parents) # add the parents to the graph, in hierarchical order # TODO this is somewhat inefficient due to # re-adding upper-level nodes when iterating over the file # TODO PYLINT Consider using enumerate # instead of iterating with range and len for i in range(len(parents)): pclassid = cclassid + parents[i] # class chr parts pclass_label = \ makeChromLabel(chrom+parents[i], genome_label) rti = getChrPartTypeByNotation(parents[i]) model.addClassToGraph(pclassid, pclass_label, rti) # for canonical chromosomes, # then the subbands are subsequences of the full band # add the subsequence stuff as restrictions if i < len(parents) - 1: pid = cclassid + parents[i + 1] # the instance model.addOWLPropertyClassRestriction( pclassid, Feature.object_properties['is_subsequence_of'], pid) model.addOWLPropertyClassRestriction( pid, Feature.object_properties['has_subsequence'], pclassid) else: # add the last one (p or q usually) # as attached to the chromosome model.addOWLPropertyClassRestriction( pclassid, Feature.object_properties['is_subsequence_of'], cclassid) model.addOWLPropertyClassRestriction( cclassid, Feature.object_properties['has_subsequence'], pclassid) # connect the band here to the first one in the parent list if len(parents) > 0: model.addOWLPropertyClassRestriction( maplocclass_id, Feature.object_properties['is_subsequence_of'], cclassid + parents[0]) model.addOWLPropertyClassRestriction( cclassid + parents[0], Feature.object_properties['has_subsequence'], maplocclass_id) if limit is not None and line_counter > limit: break # TODO figure out the staining intensities for the encompassing bands return