def _process_data(self, raw, limit=None):
    """
    Read the gzip-compressed IMPC CSV at ``raw`` and add one set of
    genotype/phenotype triples per data row to the graph.

    For each row this builds, in order: the gene/allele/sequence
    alteration, the colony (ES cell clone) and its genotype of unknown
    zygosity, the sex-agnostic genotype with its VSLC and genomic
    background, the sex-qualified genotype, and finally the
    genotype-to-phenotype association with its provenance and a free-text
    description.

    :param raw: path to the gzip-compressed CSV file
    :param limit: optional; outside of test mode, stop once the csv
        reader's line number exceeds this value
    :return: None
    """
    LOG.info("Processing Data from %s", raw)

    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)
    geno = Genotype(graph)

    # Add the taxon as a class
    taxon_id = self.globaltt['Mus musculus']
    model.addClassToGraph(taxon_id, None)

    col = self.files['all']['columns']
    # Resolve the column positions once instead of calling col.index()
    # for every field of every row.
    used_columns = (
        'marker_accession_id', 'marker_symbol', 'phenotyping_center',
        'colony_id', 'sex', 'zygosity', 'allele_accession_id',
        'allele_symbol', 'strain_accession_id', 'strain_name',
        'project_fullname', 'pipeline_name', 'pipeline_stable_id',
        'procedure_stable_id', 'procedure_name', 'parameter_stable_id',
        'parameter_name', 'mp_term_id', 'mp_term_name', 'p_value',
        'percentage_change', 'effect_size', 'statistical_method',
        'resource_name',
    )
    position = {name: col.index(name) for name in used_columns}

    with gzip.open(raw, 'rt') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='\"')
        row = next(reader)  # presumed header
        if not self.check_fileheader(col, row):
            # keep going (best effort), but do not hide the mismatch
            LOG.warning(
                "Header of %s does not match the expected columns", raw)

        for row in reader:
            rec = {name: row[i].strip() for name, i in position.items()}
            marker_accession_id = rec['marker_accession_id']
            marker_symbol = rec['marker_symbol']
            phenotyping_center = rec['phenotyping_center']
            colony_raw = rec['colony_id']
            sex = rec['sex']
            zygosity = rec['zygosity']
            allele_accession_id = rec['allele_accession_id']
            allele_symbol = rec['allele_symbol']
            strain_accession_id = rec['strain_accession_id']
            strain_name = rec['strain_name']
            project_fullname = rec['project_fullname']
            pipeline_name = rec['pipeline_name']
            pipeline_stable_id = rec['pipeline_stable_id']
            procedure_stable_id = rec['procedure_stable_id']
            procedure_name = rec['procedure_name']
            parameter_stable_id = rec['parameter_stable_id']
            parameter_name = rec['parameter_name']
            mp_term_id = rec['mp_term_id']
            mp_term_name = rec['mp_term_name']
            p_value = rec['p_value']
            percentage_change = rec['percentage_change']
            effect_size = rec['effect_size']
            statistical_method = rec['statistical_method']
            resource_name = rec['resource_name']

            if self.test_mode and marker_accession_id not in self.gene_ids:
                continue

            # ##### cleanup some of the identifiers ######
            zygosity_id = self.resolve(zygosity)
            if zygosity_id == zygosity:
                LOG.warning(
                    "Zygosity '%s' unmapped. detting to indeterminate",
                    zygosity)
                zygosity_id = self.globaltt['indeterminate']

            # colony ids sometimes have <> in them, spaces,
            # or other non-alphanumerics and break our system;
            # replace these with underscores
            colony_id = '_:' + re.sub(r'\W+', '_', colony_raw)

            if not re.match(r'MGI', allele_accession_id):
                allele_accession_id = '_:IMPC-' + re.sub(
                    r':', '', allele_accession_id)

            if re.search(r'EUROCURATE', strain_accession_id):
                # the eurocurate links don't resolve at IMPC
                # TODO blank nodes do not maintain identifiers
                strain_accession_id = '_:' + strain_accession_id
            elif not re.match(r'MGI', strain_accession_id):
                LOG.info(
                    "Found a strange strain accession...%s",
                    strain_accession_id)
                strain_accession_id = 'IMPC:' + strain_accession_id

            ######################
            # first, add the marker and variant to the graph as with MGI,
            # the allele is the variant locus. IF the marker is not known,
            # we will call it a sequence alteration. otherwise,
            # we will create a BNode for the sequence alteration.

            # extract out what's within the <> to get the symbol;
            # fall back to the whole allele symbol when there is no <...>
            bracketed = re.match(r'.*<(.*)>', allele_symbol)
            if bracketed is not None:
                sequence_alteration_name = bracketed.group(1)
            else:
                sequence_alteration_name = allele_symbol

            if marker_accession_id == '':
                LOG.warning("Marker unspecified on row %d", reader.line_num)
                marker_accession_id = None

            if marker_accession_id is not None:
                variant_locus_id = allele_accession_id
                variant_locus_name = allele_symbol
                variant_locus_type = self.globaltt['variant_locus']
                geno.addGene(
                    marker_accession_id, marker_symbol,
                    self.globaltt['gene'])
                geno.addAllele(
                    variant_locus_id, variant_locus_name,
                    variant_locus_type, None)
                geno.addAlleleOfGene(variant_locus_id, marker_accession_id)

                # TAG bnode
                sequence_alteration_id = '_:seqalt' + re.sub(
                    r':', '', allele_accession_id)
                geno.addSequenceAlterationToVariantLocus(
                    sequence_alteration_id, variant_locus_id)
            else:
                sequence_alteration_id = allele_accession_id

            # IMPC contains targeted mutations with either gene traps,
            # knockouts, insertion/intragenic deletions.
            # but I don't really know what the SeqAlt is here,
            # so I don't add it.
            geno.addSequenceAlteration(
                sequence_alteration_id, sequence_alteration_name)

            # #############    BUILD THE COLONY    #############
            # First, let's describe the colony that the animals come from
            # The Colony ID refers to the ES cell clone
            #   used to generate a mouse strain.
            # Terry sez: we use this clone ID to track
            #   ES cell -> mouse strain -> mouse phenotyping.
            # The same ES clone maybe used at multiple centers,
            # so we have to concatenate the two to have a unique ID.
            # some useful reading about generating mice from ES cells:
            # http://ki.mit.edu/sbc/escell/services/details

            # here, we'll make a genotype
            # that derives from an ES cell with a given allele.
            # the strain is not really attached to the colony.

            # the colony/clone is reflective of the allele,
            # with unknown zygosity
            stem_cell_class = self.globaltt['embryonic stem cell line']
            model.addIndividualToGraph(
                colony_id, colony_raw, stem_cell_class)

            # vslc of the colony has unknown zygosity
            # note that we will define the allele
            # (and it's relationship to the marker, etc.) later
            # FIXME is it really necessary to create this vslc
            # when we always know it's unknown zygosity?
            vslc_colony = '_:' + re.sub(
                r':', '',
                allele_accession_id + self.globaltt['indeterminate'])
            vslc_colony_label = allele_symbol + '/<?>'
            # for ease of reading, we make the colony genotype variables.
            # in the future, it might be desired to keep the vslcs
            colony_genotype_id = vslc_colony
            colony_genotype_label = vslc_colony_label
            geno.addGenotype(colony_genotype_id, colony_genotype_label)
            geno.addParts(
                allele_accession_id, colony_genotype_id,
                self.globaltt['has_variant_part'])
            geno.addPartsToVSLC(
                vslc_colony, allele_accession_id, None,
                self.globaltt['indeterminate'],
                self.globaltt['has_variant_part'])
            graph.addTriple(
                colony_id, self.globaltt['has_genotype'],
                colony_genotype_id)

            # ##########    BUILD THE ANNOTATED GENOTYPE    ##########
            # now, we'll build the genotype of the individual that derives
            # from the colony/clone genotype that is attached to
            # phenotype = colony_id + strain + zygosity + sex
            # (and is derived from a colony)

            # this is a sex-agnostic genotype
            genotype_id = self.make_id(
                (colony_id + phenotyping_center + zygosity +
                 strain_accession_id))
            geno.addSequenceDerivesFrom(genotype_id, colony_id)

            # build the VSLC of the sex-agnostic genotype
            # based on the zygosity
            allele1_id = allele_accession_id
            allele2_id = allele2_rel = None
            allele1_label = allele_symbol
            allele2_label = '<?>'
            # Making VSLC labels from the various parts,
            # can change later if desired.
            if zygosity == 'heterozygote':
                allele2_label = re.sub(r'<.*', '<+>', allele1_label)
                allele2_id = None
            elif zygosity == 'homozygote':
                allele2_label = allele1_label
                allele2_id = allele1_id
                allele2_rel = self.globaltt['has_variant_part']
            elif zygosity == 'hemizygote':
                allele2_label = re.sub(r'<.*', '<0>', allele1_label)
                allele2_id = None
            elif zygosity == 'not_applicable':
                allele2_label = re.sub(r'<.*', '<?>', allele1_label)
                allele2_id = None
            else:
                # Skip only this row; a single odd value must not
                # silently drop every remaining row of the file.
                LOG.warning("found unknown zygosity %s", zygosity)
                continue
            vslc_name = '/'.join((allele1_label, allele2_label))

            # Add the VSLC.  The marker may be None (unspecified), so
            # only join the parts that are present.
            vslc_id = '-'.join(
                part for part in
                (marker_accession_id, allele_accession_id, zygosity)
                if part is not None)
            vslc_id = re.sub(r':', '', vslc_id)
            vslc_id = '_:' + vslc_id
            model.addIndividualToGraph(
                vslc_id, vslc_name,
                self.globaltt['variant single locus complement'])
            geno.addPartsToVSLC(
                vslc_id, allele1_id, allele2_id, zygosity_id,
                self.globaltt['has_variant_part'], allele2_rel)

            # add vslc to genotype
            geno.addVSLCtoParent(vslc_id, genotype_id)

            # note that the vslc is also the gvc
            model.addType(
                vslc_id, self.globaltt['genomic_variation_complement'])

            # Add the genomic background
            # create the genomic background id and name
            if strain_accession_id != '':
                genomic_background_id = strain_accession_id
            else:
                genomic_background_id = None

            genotype_name = vslc_name
            if genomic_background_id is not None:
                geno.addGenotype(
                    genomic_background_id, strain_name,
                    self.globaltt['genomic_background'])

                # make a phenotyping-center-specific strain
                # to use as the background
                pheno_center_strain_label = '-'.join(
                    (strain_name, phenotyping_center, colony_raw))
                pheno_center_strain_id = '-'.join((
                    re.sub(r':', '', genomic_background_id),
                    re.sub(r'\s', '_', phenotyping_center),
                    re.sub(r'\W+', '', colony_raw)))
                if not re.match(r'^_', pheno_center_strain_id):
                    # Tag bnode
                    pheno_center_strain_id = '_:' + pheno_center_strain_id

                geno.addGenotype(
                    pheno_center_strain_id, pheno_center_strain_label,
                    self.globaltt['genomic_background'])
                geno.addSequenceDerivesFrom(
                    pheno_center_strain_id, genomic_background_id)

                # Making genotype labels from the various parts,
                # can change later if desired.
                # since the genotype is reflective of the place
                # it got made, should put that in to disambiguate
                genotype_name = \
                    genotype_name + ' [' + pheno_center_strain_label + ']'
                geno.addGenomicBackgroundToGenotype(
                    pheno_center_strain_id, genotype_id)
                geno.addTaxon(taxon_id, pheno_center_strain_id)

            # this is redundant, but i'll keep in in for now
            geno.addSequenceDerivesFrom(genotype_id, colony_id)
            geno.addGenotype(genotype_id, genotype_name)

            # Make the sex-qualified genotype,
            # which is what the phenotype is associated with
            sex_qualified_genotype_id = self.make_id(
                (colony_id + phenotyping_center + zygosity +
                 strain_accession_id + sex))
            sex_qualified_genotype_label = genotype_name + ' (' + sex + ')'

            sq_type_id = self.resolve(sex, False)
            if sq_type_id == sex:
                sq_type_id = self.globaltt['intrinsic_genotype']
                LOG.warning(
                    "Unknown sex qualifier %s, adding as intrinsic_genotype",
                    sex)

            geno.addGenotype(
                sex_qualified_genotype_id, sex_qualified_genotype_label,
                sq_type_id)
            geno.addParts(
                genotype_id, sex_qualified_genotype_id,
                self.globaltt['has_variant_part'])

            if genomic_background_id is not None and \
                    genomic_background_id != '':
                # Add the taxon to the genomic_background_id
                geno.addTaxon(taxon_id, genomic_background_id)
            else:
                # add it as the genomic background
                geno.addTaxon(taxon_id, genotype_id)

            # #############    BUILD THE G2P ASSOC    #############
            # from an old email dated July 23 2014:
            # Phenotypes associations are made to
            # imits colony_id+center+zygosity+gender

            # sometimes phenotype ids are missing.  (about 711 early 2020)
            if mp_term_id is None or mp_term_id == '':
                LOG.warning(
                    "No phenotype id specified for row %d", reader.line_num)
                continue

            # hard coded ECO code
            eco_id = self.globaltt['mutant phenotype evidence']

            # the association comes as a result of a g2p from
            # a procedure in a pipeline at a center and parameter tested
            assoc = G2PAssoc(
                graph, self.name, sex_qualified_genotype_id, mp_term_id)
            assoc.add_evidence(eco_id)
            # assoc.set_score(float(p_value))

            # TODO add evidence instance using
            # pipeline_stable_id +
            # procedure_stable_id +
            # parameter_stable_id

            assoc.add_association_to_graph()
            assoc_id = assoc.get_association_id()

            model._addSexSpecificity(assoc_id, self.resolve(sex))

            # add a free-text description; when either number cannot be
            # parsed, both are reported verbatim
            try:
                effect_text = str(round(float(effect_size), 5))
                p_text = "{:.4e}".format(float(p_value))
            except ValueError:
                effect_text = str(effect_size)
                p_text = "{0}".format(p_value)
            description = ' '.join((
                mp_term_name, 'phenotype determined by', phenotyping_center,
                'in an', procedure_name, 'assay where', parameter_name,
                'was measured with an effect_size of', effect_text,
                '(p =', p_text, ').'))

            study_bnode = self._add_study_provenance(
                phenotyping_center, colony_raw, project_fullname,
                pipeline_name, pipeline_stable_id, procedure_stable_id,
                procedure_name, parameter_stable_id, parameter_name,
                statistical_method, resource_name)

            evidence_line_bnode = self._add_evidence(
                assoc_id, eco_id, p_value, percentage_change, effect_size,
                study_bnode)

            self._add_assertion_provenance(assoc_id, evidence_line_bnode)

            model.addDescription(evidence_line_bnode, description)

            # resource_id = resource_name
            # assoc.addSource(graph, assoc_id, resource_id)

            if not self.test_mode and limit is not None and \
                    reader.line_num > limit:
                break
def process_allele_phenotype(self, limit=None):
    """
    This file compactly lists variant to phenotype associations,
    such that in a single row, there may be >1 variant listed per
    phenotype and paper.  This indicates that each variant is
    individually associated with the given phenotype,
    as listed in 1+ papers.
    (Not that the combination of variants is producing the phenotype.)

    Rows starting with '!' are treated as header lines, and rows
    flagged 'NOT' are skipped.  Rows that end up without any allele
    after the reference/with columns are un-swapped are reported and
    skipped.

    :param limit: optional; outside of test mode, stop once more than
        this many data rows have been read
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['allele_pheno']['file']))

    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph

    # gu = GraphUtils(curie_map.get())  # TODO unused

    logger.info("Processing Allele phenotype associations")

    line_counter = 0
    geno = Genotype(g)
    with open(raw, 'r') as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            if ''.join(row).startswith('!'):  # header
                continue
            line_counter += 1
            (db, gene_num, gene_symbol, is_not, phenotype_id, ref,
             eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
             gene_class, taxon, date, assigned_by, blank, blank2) = row

            if self.testMode and gene_num not in self.test_ids['gene']:
                continue

            # TODO add NOT phenotypes
            if is_not == 'NOT':
                continue

            eco_id = None
            if eco_symbol == 'IMP':
                eco_id = 'ECO:0000015'
            elif eco_symbol.strip() != '':
                logger.warning(
                    "Encountered an ECO code we don't have: %s", eco_symbol)

            # according to the GOA spec, persons are not allowed to be
            # in the reference column, therefore the variant and
            # persons are swapped between the reference and with column.
            # we unswitch them here.
            temp_var = temp_ref = None
            if re.search(r'WBVar|WBRNAi', ref):
                temp_var = ref
            # move the paper from the with column into the ref
            if re.search(r'WBPerson', with_or_from):
                temp_ref = with_or_from
            if temp_var is not None or temp_ref is not None:
                with_or_from = temp_var
                ref = temp_ref

            # After the swap with_or_from may be None (only the person
            # half matched) or blank; splitting always yields at least
            # one element, so test for real allele tokens instead.
            allele_list = [
                a.strip() for a in (with_or_from or '').split('|')
                if a.strip() != '']
            if not allele_list:
                logger.error(
                    "Missing alleles from phenotype assoc at line %d",
                    line_counter)
                continue

            if ref is not None and ref != '':
                ref = re.sub(r'(WB:|WB_REF:)', 'WormBase:', ref)

            gene_id = 'WormBase:' + gene_num
            for a in allele_list:
                allele_num = re.sub(r'WB:', '', a)
                allele_id = 'WormBase:' + allele_num

                if re.search(r'WBRNAi', allele_id):
                    # make the reagent-targeted gene,
                    # & annotate that instead of the RNAi item directly
                    rnai_num = re.sub(r'WormBase:', '', allele_id)
                    rnai_id = allele_id
                    rtg_id = self.make_reagent_targeted_gene_id(
                        gene_num, rnai_num, self.nobnodes)
                    geno.addReagentTargetedGene(
                        rnai_id, 'WormBase:' + gene_num, rtg_id)
                    geno.addGeneTargetingReagent(
                        rnai_id, None, geno.genoparts['RNAi_reagent'],
                        gene_id)
                    allele_id = rtg_id
                elif re.search(r'WBVar', allele_id):
                    # this may become deprecated by using wormmine
                    # make the allele to gene relationship
                    # the WBVars are really sequence alterations
                    # the public name will come from elsewhere
                    geno.addSequenceAlteration(allele_id, None)
                    vl_id = '_' + '-'.join((gene_num, allele_num))
                    if self.nobnodes:
                        vl_id = ':' + vl_id
                    geno.addSequenceAlterationToVariantLocus(
                        allele_id, vl_id)
                    geno.addAlleleOfGene(vl_id, gene_id)
                else:
                    logger.warning(
                        "Some kind of allele I don't recognize: %s",
                        allele_num)
                    continue

                assoc = G2PAssoc(self.name, allele_id, phenotype_id)

                if eco_id is not None:
                    assoc.add_evidence(eco_id)

                if ref is not None and ref != '':
                    r = Reference(ref)
                    if re.search(r'Person', ref):
                        r.setType(r.ref_types['person'])
                        # also add
                        # inferred from background scientific knowledge
                        assoc.add_evidence('ECO:0000001')
                    r.addRefToGraph(g)
                    assoc.add_source(ref)

                assoc.add_association_to_graph(g)

            # finish looping through all alleles

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break
def _process_data(self, source, limit=None):
    """
    This function will process the data files from Coriell.
    We make the assumption that any alleles listed are variants
    (alternates to w.t.)

    Triples: (examples)

    :NIGMSrepository
        a CLO_0000008 #repository
        label : NIGMS Human Genetic Cell Repository
        foaf:page
         https://catalog.coriell.org/0/sections/collections/NIGMS/?SsId=8

    line_id
        a CL_0000057,  #fibroblast line
        derives_from patient_id
        part_of :NIGMSrepository
        RO:model_of OMIM:disease_id

    patient id
        a foaf:person,
        label: "fibroblast from patient 12345 with disease X"
        member_of family_id  #what is the right thing here?
        SIO:race EFO:caucasian  #subclass of EFO:0001799
        in_taxon NCBITaxon:9606
        dc:description Literal(remark)
        RO:has_phenotype OMIM:disease_id
        GENO:has_genotype genotype_id

    family_id
        a owl:NamedIndividual
        foaf:page
         "https://catalog.coriell.org/0/Sections/BrowseCatalog/FamilyTypeSubDetail.aspx?PgId=402&fam=2104&coll=GM"

    genotype_id
        a intrinsic_genotype
        GENO:has_alternate_part allelic_variant_id
        we don't necessarily know much about the genotype,
        other than the allelic variant. also there's the sex here

    pub_id mentions cell_line_id

    :param source: key into self.files naming the file (and its
        expected columns) to process
    :param limit: optional; outside of test mode, stop once the line
        counter exceeds this value
    :return: None
    :raises AssertionError: when the file header does not match the
        expected columns (an empty file counts as a mismatch)
    """
    raw = '/'.join((self.rawdir, self.files[source]['file']))

    LOG.info("Processing Data from %s", raw)

    if self.testMode:      # set the graph to build
        graph = self.testgraph
    else:
        graph = self.graph

    family = Family(graph)
    model = Model(graph)

    line_counter = 1
    geno = Genotype(graph)
    diputil = DipperUtil()
    col = self.files[source]['columns']
    # affords access with
    # x = row[col.index('x')].strip()

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter=',', quotechar=r'"')
        # we can keep a close watch on changing file formats
        header = next(filereader, None)
        # an empty file has no header; treat it as a header mismatch
        fileheader = [c.lower() for c in header] if header is not None \
            else []
        if col != fileheader:  # assert
            LOG.error('Expected  %s to have columns: %s', raw, col)
            LOG.error('But Found %s to have columns: %s', raw, fileheader)
            raise AssertionError('Incomming data headers have changed.')

        for row in filereader:
            line_counter += 1
            if len(row) != len(col):
                LOG.warning(
                    'Expected %i values but find %i in  row %i',
                    len(col), len(row), line_counter)
                continue

            # example:
            # GM00003,HURLER SYNDROME,607014,Fibroblast,Yes,No,
            #       ,Female,26 YR,Caucasian,,,,
            # parent,,,39,NIGMS Human Genetic Cell Repository,
            # http://ccr.coriell.org/Sections/Search/Sample_Detail.aspx?Ref=GM00003,
            # 46;XX; clinically normal mother of a child with Hurler
            # syndrome; proband not in Repository,,
            # 2,,18343,Homo sapiens

            catalog_id = row[col.index('catalog_id')].strip()

            if self.testMode and catalog_id not in self.test_lines:
                # skip rows not in our test lines, when in test mode
                continue

            # ###########    BUILD REQUIRED VARIABLES    ###########

            # Make the cell line ID
            cell_line_id = 'Coriell:' + catalog_id
            # Map the cell/sample type
            cell_type = self.resolve(row[col.index('sample_type')].strip())
            # on fail cell_type = self.globaltt['cell'] ?

            # Make a cell line label
            collection = row[col.index('collection')].strip()
            line_label = collection.partition(' ')[0] + '-' + catalog_id

            # Map the repository/collection
            repository = self.localtt[collection]

            # patients are uniquely identified by one of:
            # dbsnp id (which is == an individual haplotype)
            # family id + family member (if present) OR
            # probands are usually family member zero
            # cell line id
            # since some patients have >1 cell line derived from them,
            # we must make sure that the genotype is attached to
            # the patient, and can be inferred to the cell line
            # examples of repeated patients are:
            #   famid=1159, member=1; fam=152,member=1

            # Make the patient ID
            patient_id = '_:person'
            fam_id = row[col.index('fam')].strip()
            fammember = row[col.index('fammember')].strip()
            if fam_id != '':
                patient_id = '-'.join((patient_id, fam_id, fammember))
            else:
                # make an anonymous patient
                patient_id = '-'.join((patient_id, catalog_id))

            # properties of the individual patients:  sex, family id,
            # member/relproband, description descriptions are
            # really long and ugly SCREAMING text, so need to clean up
            # the control cases are so odd with this labeling scheme;
            # but we'll deal with it as-is for now.
            description = row[col.index('description')].strip()
            short_desc = (description.split(';')[0]).capitalize()

            gender = row[col.index('gender')].strip().lower()
            affected = row[col.index('affected')].strip()
            relprob = row[col.index('relprob')].strip()

            if affected == '':
                affected = 'unspecified'
            elif affected in self.localtt:
                affected = self.localtt[affected]
            else:
                LOG.warning(
                    'Novel Affected status  %s at row: %i of %s',
                    affected, line_counter, raw)
            patient_label = ' '.join((affected, gender, relprob))
            if relprob == 'proband':
                patient_label = ' '.join(
                    (patient_label.strip(), 'with', short_desc))
            else:
                patient_label = ' '.join(
                    (patient_label.strip(), 'of proband with', short_desc))

            # #############    BUILD THE CELL LINE    #############

            # Adding the cell line as a typed individual.
            cell_line_reagent_id = self.globaltt['cell line']

            model.addIndividualToGraph(
                cell_line_id, line_label, cell_line_reagent_id)

            # add the equivalent id == dna_ref
            dna_ref = row[col.index('dna_ref')].strip()
            if dna_ref != '' and dna_ref != catalog_id:
                equiv_cell_line = 'Coriell:' + dna_ref
                # some of the equivalent ids are not defined
                # in the source data; so add them
                model.addIndividualToGraph(
                    equiv_cell_line, None, cell_line_reagent_id)
                model.addSameIndividual(cell_line_id, equiv_cell_line)

            # Cell line derives from patient
            geno.addDerivesFrom(cell_line_id, patient_id)
            geno.addDerivesFrom(cell_line_id, cell_type)

            # Cell line a member of repository
            family.addMember(repository, cell_line_id)

            cat_remark = row[col.index('cat_remark')].strip()

            if cat_remark != '':
                model.addDescription(cell_line_id, cat_remark)

            # Cell age_at_sampling
            # TODO add the age nodes when modeled properly in #78
            # if (age != ''):
            #    this would give a BNode that is an instance of Age.
            #    but i don't know how to connect
            #    the age node to the cell line? we need to ask @mbrush
            #    age_id = '_'+re.sub('\s+','_',age)
            #    gu.addIndividualToGraph(
            #       graph,age_id,age,self.globaltt['age'])
            #    gu.addTriple(
            #       graph,age_id,self.globaltt['has measurement value'],age,
            #       True)

            # #############    BUILD THE PATIENT    #############

            # Add the patient ID as an individual.
            model.addPerson(patient_id, patient_label)
            # TODO map relationship to proband as a class
            # (what ontology?)

            # Add race of patient
            # FIXME: Adjust for subcategories based on ethnicity field
            # EDIT: There are 743 different entries for ethnicity...
            # Too many to map?
            # Add ethnicity as literal in addition to the mapped race?
            # Adjust the ethnicity txt (if using)
            # to initial capitalization to remove ALLCAPS

            # TODO race should go into the individual's background
            # and abstracted out to the Genotype class punting for now.
            # if race != '':
            #    mapped_race = self.resolve(race)
            #    if mapped_race is not None:
            #        gu.addTriple(
            #           g,patient_id,self.globaltt['race'], mapped_race)
            #        model.addSubClass(
            #           mapped_race,self.globaltt['ethnic_group'])

            # #############    BUILD THE FAMILY    #############

            # Add triples for family_id, if present.
            if fam_id != '':
                family_comp_id = 'CoriellFamily:' + fam_id

                family_label = ' '.join(
                    ('Family of proband with', short_desc))

                # Add the family ID as a named individual
                model.addIndividualToGraph(
                    family_comp_id, family_label, self.globaltt['family'])

                # Add the patient as a member of the family
                family.addMemberOf(patient_id, family_comp_id)

            # #############    BUILD THE GENOTYPE   #############

            # the important things to pay attention to here are:
            # karyotype = chr rearrangements  (somatic?)
            # mutation = protein-level mutation as a label,
            # often from omim
            # gene = gene symbol - TODO get id
            # variant_id = omim variant ids (; delimited)
            # dbsnp_id = snp individual ids = full genotype?

            # note GM00633 is a good example of chromosomal variation
            # - do we have enough to capture this?
            # GM00325 has both abnormal karyotype and variation

            # make an assumption that if the taxon is blank,
            # that it is human!
            species = row[col.index('species')].strip()
            if species == '':
                species = 'Homo sapiens'
            taxon = self.resolve(species)

            # if there's a dbSNP id,
            # this is actually the individual's genotype
            genotype_id = None
            genotype_label = None

            dbsnp_id = row[col.index('dbsnp_id')].strip()
            if dbsnp_id != '':
                genotype_id = 'dbSNPIndividual:' + dbsnp_id

            # Reset the per-row label state so nothing leaks in from a
            # previous row (and nothing is referenced before assignment).
            omim_map = {}
            gvc_id = None
            gvc_label = None
            vl = None

            # some of the karyotypes are encoded
            # with terrible hidden codes. remove them here
            # i've seen a <98> character
            karyotype = row[col.index('karyotype')].strip()
            karyotype = diputil.remove_control_characters(karyotype)
            # NOTE(review): assumes _is_normal_karyotype is a pure
            # predicate, so evaluating it once per row is equivalent
            normal_karyotype = self._is_normal_karyotype(karyotype)
            karyotype_id = None
            if karyotype.strip() != '':
                karyotype_id = '_:' + re.sub(
                    'MONARCH:', '', self.make_id(karyotype))
                # add karyotype as karyotype_variation_complement
                model.addIndividualToGraph(
                    karyotype_id, karyotype,
                    self.globaltt['karyotype_variation_complement'])
                # TODO break down the karyotype into parts
                # and map into GENO. depends on #77

                # place the karyotype in a location(s).
                karyo_chrs = self._get_affected_chromosomes_from_karyotype(
                    karyotype)
                for chrom in karyo_chrs:
                    chr_id = makeChromID(chrom, taxon, 'CHR')
                    # add an anonymous sequence feature,
                    # each located on chr
                    karyotype_feature_id = '-'.join((karyotype_id, chrom))
                    karyotype_feature_label = \
                        'some karyotype alteration on chr' + str(chrom)
                    feat = Feature(
                        graph, karyotype_feature_id,
                        karyotype_feature_label,
                        self.globaltt['sequence_alteration'])
                    feat.addFeatureStartLocation(None, chr_id)
                    feat.addFeatureToGraph()
                    geno.addParts(
                        karyotype_feature_id, karyotype_id,
                        self.globaltt['has_variant_part'])

            gene = row[col.index('gene')].strip()
            mutation = row[col.index('mutation')].strip()
            if gene != '':
                vl = gene + '(' + mutation + ')'

            # fix the variant_id so it's always in the same order
            variant_id = row[col.index('variant_id')].strip()
            vids = variant_id.split(';')
            variant_id = ';'.join(sorted(set(vids)))

            if karyotype.strip() != '' and not normal_karyotype:
                gvc_id = karyotype_id
                if variant_id != '':
                    gvc_id = '_:' + variant_id.replace(';', '-') + '-' \
                        + re.sub(r'\w*:', '', karyotype_id)
                if mutation.strip() != '':
                    # vl is None when no gene symbol was given
                    gvc_label = '; '.join(
                        part for part in (vl, karyotype) if part)
                else:
                    gvc_label = karyotype
            elif variant_id.strip() != '':
                gvc_id = '_:' + variant_id.replace(';', '-')
                gvc_label = vl
            else:
                # wildtype?
                pass

            # add the karyotype to the gvc.
            # use reference if normal karyotype
            karyo_rel = self.globaltt['has_variant_part']
            if normal_karyotype:
                karyo_rel = self.globaltt['has_reference_part']
            if karyotype_id is not None \
                    and not normal_karyotype \
                    and gvc_id is not None and karyotype_id != gvc_id:
                geno.addParts(karyotype_id, gvc_id, karyo_rel)

            if variant_id.strip() != '':
                # split the variants & add them as part of the genotype
                # we don't necessarily know their zygosity,
                # just that they are part of the genotype variant ids
                # are from OMIM, so prefix as such we assume that the
                # sequence alts will be defined in OMIM not here
                # TODO sort the variant_id list, if the omim prefix is
                # the same, then assume it's the locus make a hashmap
                # of the omim id to variant id list;
                # then build the genotype hashmap is also useful for
                # removing the "genes" from the list of "phenotypes"

                # will hold gene/locus id to variant list
                for var in variant_id.split(';'):
                    # handle omim-style and odd var ids
                    # like 610661.p.R401X
                    mch = re.match(r'(\d+)\.+(.*)', var.strip())
                    if mch is None:
                        # do not recycle the previous locus/variant
                        LOG.warning(
                            'Unparsable variant id %s at row: %i of %s',
                            var, line_counter, raw)
                        continue
                    (locus_num, var_num) = mch.groups()
                    omim_map.setdefault(locus_num, []).append(var_num)

                for omim, omim_vars in omim_map.items():
                    # gene_id = 'OMIM:' + omim  # TODO unused
                    vslc_id = '_:' + '-'.join(
                        [omim + '.' + a for a in omim_vars])
                    vslc_label = vl
                    # we don't really know the zygosity of
                    # the alleles at all.
                    # so the vslcs are just a pot of them
                    model.addIndividualToGraph(
                        vslc_id, vslc_label,
                        self.globaltt['variant single locus complement'])
                    for var in omim_vars:
                        # this is actually a sequence alt
                        allele1_id = 'OMIM:' + omim + '.' + var
                        geno.addSequenceAlteration(allele1_id, None)

                        # assume that the sa -> var_loc -> gene
                        # is taken care of in OMIM
                        geno.addPartsToVSLC(
                            vslc_id, allele1_id, None,
                            self.globaltt['indeterminate'],
                            self.globaltt['has_variant_part'])

                    if vslc_id != gvc_id:
                        geno.addVSLCtoParent(vslc_id, gvc_id)

            if affected == 'unaffected':
                # let's just say that this person is wildtype
                model.addType(patient_id, self.globaltt['wildtype'])
            elif genotype_id is None:
                # make an anonymous genotype id (aka blank node)
                genotype_id = '_:geno' + catalog_id.strip()

            # add the gvc
            if gvc_id is not None:
                model.addIndividualToGraph(
                    gvc_id, gvc_label,
                    self.globaltt['genomic_variation_complement'])

                # add the gvc to the genotype
                if genotype_id is not None:
                    if affected == 'unaffected':
                        rel = self.globaltt['has_reference_part']
                    else:
                        rel = self.globaltt['has_variant_part']
                    geno.addParts(gvc_id, genotype_id, rel)

                if karyotype_id is not None and normal_karyotype:
                    if gvc_label is not None and gvc_label != '':
                        genotype_label = '; '.join((gvc_label, karyotype))
                    elif karyotype is not None:
                        genotype_label = karyotype
                    if genotype_id is None:
                        genotype_id = karyotype_id
                    else:
                        geno.addParts(
                            karyotype_id, genotype_id,
                            self.globaltt['has_reference_part'])
                else:
                    genotype_label = gvc_label
                # use the catalog id as the background; the label so
                # far may be missing, so only join what is present
                genotype_label = ' '.join(
                    part for part in
                    (genotype_label, '[' + catalog_id.strip() + ']')
                    if part)

            if genotype_id is not None and gvc_id is not None:
                # only add the genotype if it has some parts
                geno.addGenotype(
                    genotype_id, genotype_label,
                    self.globaltt['intrinsic_genotype'])
                geno.addTaxon(taxon, genotype_id)
                # add that the patient has the genotype
                # TODO check if the genotype belongs to
                # the cell line or to the patient
                graph.addTriple(
                    patient_id, self.globaltt['has_genotype'], genotype_id)
            else:
                geno.addTaxon(taxon, patient_id)

            # TODO: Add sex/gender  (as part of the karyotype?)
            # = row[col.index('')].strip()
            # #############    DEAL WITH THE DISEASES   #############
            omim_num = row[col.index('omim_num')].strip()

            # we associate the disease to the patient
            if affected == 'affected' and omim_num != '':
                for d in omim_num.split(';'):
                    # compare the trimmed id, since the keys of
                    # omim_map carry no surrounding whitespace
                    d = d.strip()
                    if d == '':
                        continue
                    # if the omim number is in omim_map,
                    # then it is a gene not a pheno

                    # TEC - another place to use the mimTitle omim
                    # classifier omia & genereviews are using
                    if d not in omim_map:
                        disease_id = 'OMIM:' + d
                        # assume the label is taken care of in OMIM
                        model.addClassToGraph(disease_id, None)

                        # add the association:
                        #   the patient has the disease
                        assoc = G2PAssoc(
                            graph, self.name, patient_id, disease_id)
                        assoc.add_association_to_graph()

                        # this line is a model of this disease
                        # TODO abstract out model into
                        # it's own association class?
                        graph.addTriple(
                            cell_line_id,
                            self.globaltt['is model of'],
                            disease_id)
                    else:
                        LOG.info('drop gene %s from disease list', d)

            # #############    ADD PUBLICATIONS   #############
            pubmed_ids = row[col.index('pubmed_ids')].strip()
            if pubmed_ids != '':
                for s in pubmed_ids.split(';'):
                    pubmed_id = 'PMID:' + s.strip()
                    ref = Reference(graph, pubmed_id)
                    ref.setType(self.globaltt['journal article'])
                    ref.addRefToGraph()
                    graph.addTriple(
                        pubmed_id, self.globaltt['mentions'], cell_line_id)

            if not self.testMode and (
                    limit is not None and line_counter > limit):
                break
    return
def process_allele_phenotype(self, limit=None):
    """
    Add allele-to-phenotype associations from the 'allele_pheno' file.

    This file compactly lists variant to phenotype associations,
    such that in a single row, there may be >1 variant listed per
    phenotype and paper.  This indicates that each variant is
    individually associated with the given phenotype,
    as listed in 1+ papers.
    (Not that the combination of variants is producing the phenotype.)

    Rows whose qualifier is 'NOT' are skipped.  In test mode only genes
    listed in ``self.test_ids['gene']`` are processed and the test graph
    is written to.

    :param limit: when not in test mode, stop once more than this many
        data rows have been read; ``None`` reads the whole file
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['allele_pheno']['file']))

    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph

    logger.info("Processing Allele phenotype associations")
    line_counter = 0
    geno = Genotype(g)
    with open(raw, 'r') as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            if re.match(r'!', ''.join(row)):  # header
                continue
            line_counter += 1
            # every data row must carry exactly these 17 columns
            (db, gene_num, gene_symbol, is_not, phenotype_id, ref,
             eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
             gene_class, taxon, date, assigned_by, blank, blank2) = row

            if self.testMode and gene_num not in self.test_ids['gene']:
                continue

            # TODO add NOT phenotypes
            if is_not == 'NOT':
                continue

            eco_id = None
            if eco_symbol == 'IMP':
                eco_id = 'ECO:0000015'
            elif eco_symbol.strip() != '':
                logger.warning(
                    "Encountered an ECO code we don't have: %s",
                    eco_symbol)

            # according to the GOA spec, persons are not allowed to be
            # in the reference column, therefore the variant and
            # persons are swapped between the reference and with column.
            # we unswitch them here.
            temp_var = temp_ref = None
            if re.search(r'WBVar|WBRNAi', ref):
                temp_var = ref
            # move the paper from the with column into the ref
            if re.search(r'WBPerson', with_or_from):
                temp_ref = with_or_from
            if temp_var is not None or temp_ref is not None:
                with_or_from = temp_var
                ref = temp_ref

            # After the swap with_or_from is None when only a person was
            # found; re.split() would raise on None and never returns an
            # empty list, so test for the missing value explicitly.
            if with_or_from is None or with_or_from.strip() == '':
                logger.error(
                    "Missing alleles from phenotype assoc at line %d",
                    line_counter)
                allele_list = []
            else:
                allele_list = re.split(r'\|', with_or_from)

            for a in allele_list:
                allele_num = re.sub(r'WB:', '', a.strip())
                allele_id = 'WormBase:' + allele_num
                gene_id = 'WormBase:' + gene_num

                if re.search(r'WBRNAi', allele_id):
                    # make the reagent-targeted gene,
                    # & annotate that instead of the RNAi item directly
                    rnai_num = re.sub(r'WormBase:', '', allele_id)
                    rnai_id = allele_id
                    rtg_id = self.make_reagent_targeted_gene_id(
                        gene_num, rnai_num)
                    geno.addReagentTargetedGene(
                        rnai_id, 'WormBase:' + gene_num, rtg_id)
                    geno.addGeneTargetingReagent(
                        rnai_id, None, geno.genoparts['RNAi_reagent'],
                        gene_id)
                    allele_id = rtg_id
                elif re.search(r'WBVar', allele_id):
                    # this may become deprecated by using wormmine
                    # make the allele to gene relationship
                    # the WBVars are really sequence alterations
                    # the public name will come from elsewhere
                    geno.addSequenceAlteration(allele_id, None)
                    vl_id = '_:' + '-'.join((gene_num, allele_num))
                    geno.addSequenceAlterationToVariantLocus(
                        allele_id, vl_id)
                    geno.addAlleleOfGene(vl_id, gene_id)
                else:
                    logger.warning(
                        "Some kind of allele I don't recognize: %s",
                        allele_num)
                    continue

                assoc = G2PAssoc(g, self.name, allele_id, phenotype_id)

                if eco_id is not None:
                    assoc.add_evidence(eco_id)

                if ref is not None and ref != '':
                    # idempotent, so repeating it per allele is harmless
                    ref = re.sub(r'(WB:|WB_REF:)', 'WormBase:', ref)
                    reference = Reference(g, ref)
                    if re.search(r'Person', ref):
                        reference.setType(reference.ref_types['person'])
                        # also add
                        # inferred from background scientific knowledge
                        assoc.add_evidence('ECO:0000001')
                    reference.addRefToGraph()
                    assoc.add_source(ref)

                assoc.add_association_to_graph()

            # finished looping through all alleles of this row
            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

    return
def _parse_patient_variants(self, file):
    """
    Add each patient's intrinsic genotype and its variants to the graph.

    The variant file is turned into a ``{patient: {variant_id: variant}}``
    mapping by ``self._convert_variant_file_to_dict``; every variant is a
    mapping with the keys 'build', 'chromosome', 'position',
    'reference_allele', 'variant_allele', 'genes_of_interest' and 'rs_id'.
    For each patient an intrinsic genotype is created, and each variant is
    added as a sequence alteration that is a variant part of it.

    :param file: file handler, passed to _convert_variant_file_to_dict
    :return: None
    """
    patient_var_map = self._convert_variant_file_to_dict(file)
    gene_coordinate_map = self._parse_gene_coordinates(
        self.map_files['gene_coord_map'])
    rs_map = self._parse_rs_map_file(self.map_files['dbsnp_map'])

    genotype = Genotype(self.graph)
    model = Model(self.graph)

    self._add_variant_gene_relationship(patient_var_map, gene_coordinate_map)

    for patient in patient_var_map:
        patient_curie = 'MONARCH:{0}'.format(patient)
        # make intrinsic genotype for each patient
        intrinsic_geno_bnode = self.make_id(
            "{0}-intrinsic-genotype".format(patient), "_")
        genotype_label = "{0} genotype".format(patient)
        genotype.addGenotype(
            intrinsic_geno_bnode, genotype_label,
            model.globaltt['intrinsic genotype'])

        self.graph.addTriple(
            patient_curie, model.globaltt['has_genotype'],
            intrinsic_geno_bnode)

        for variant_id, variant in patient_var_map[patient].items():
            build = variant['build']
            chromosome = variant['chromosome']
            position = variant['position']
            reference_allele = variant['reference_allele']
            variant_allele = variant['variant_allele']
            genes_of_interest = variant['genes_of_interest']
            rs_id = variant['rs_id']

            variant_bnode = self.make_id("{0}".format(variant_id), "_")

            # Choose the most specific label the available fields allow;
            # every branch assigns variant_label.
            if position and reference_allele and variant_allele:
                variant_label = self._build_variant_label(
                    build, chromosome, position, reference_allele,
                    variant_allele, genes_of_interest)
            elif not position and reference_allele and variant_allele \
                    and len(genes_of_interest) == 1:
                variant_label = self._build_variant_label(
                    build, chromosome, position, reference_allele,
                    variant_allele, genes_of_interest)
            elif position and (not reference_allele or not variant_allele) \
                    and len(genes_of_interest) == 1:
                variant_label = "{0}{1}({2}):g.{3}".format(
                    build, chromosome, genes_of_interest[0], position)
            elif len(genes_of_interest) == 1:
                variant_label = (
                    'variant of interest in {0} gene of patient'
                    ' {1}'.format(genes_of_interest[0], patient))
            else:
                variant_label = \
                    'variant of interest in patient {0}'.format(patient)

            genotype.addSequenceAlteration(variant_bnode, None)

            # check if we have already built the label
            # in _add_variant_gene_relationship()
            labels = self.graph.objects(
                BNode(re.sub(r'^_:', '', variant_bnode, count=1)),
                RDFS['label'])
            if len(list(labels)) == 0:
                model.addLabel(variant_bnode, variant_label)

            # the taxon key was garbled ('H**o sapiens'); the translation
            # table is keyed by the species label
            self.graph.addTriple(
                variant_bnode, self.globaltt['in taxon'],
                self.globaltt['Homo sapiens'])
            self.graph.addTriple(
                intrinsic_geno_bnode, self.globaltt['has_variant_part'],
                variant_bnode)
            if rs_id:
                dbsnp_curie = 'dbSNP:{0}'.format(rs_id)
                model.addSameIndividual(variant_bnode, dbsnp_curie)

    self._add_variant_sameas_relationships(patient_var_map, rs_map)
    return
def process_allele_phenotype(self, limit=None):
    """
    Add allele-to-phenotype associations from the 'allele_pheno' GAF file.

    This file compactly lists variant to phenotype associations,
    such that in a single row, there may be >1 variant listed per
    phenotype and paper.  This indicates that each variant is
    individually associated with the given phenotype,
    as listed in 1+ papers.
    (Not that the combination of variants is producing the phenotype.)

    Columns are located through ``self.files['allele_pheno']['columns']``.
    Rows whose qualifier is 'NOT' are skipped.

    :param limit: stop once the reader's line number exceeds this value;
        ``None`` reads the whole file
    :return: None
    """
    src_key = 'allele_pheno'
    raw = '/'.join((self.rawdir, self.files[src_key]['file']))
    col = self.files[src_key]['columns']

    graph = self.graph
    model = Model(self.graph)

    LOG.info("Processing Allele phenotype associations")
    geno = Genotype(graph)

    # resolve the column positions once rather than once per row
    gene_num_col = col.index('DB Object ID')
    is_not_col = col.index('Qualifier')
    phenotype_id_col = col.index('GO ID')
    ref_col = col.index('DB:Reference (|DB:Reference)')
    eco_symbol_col = col.index('Evidence Code')
    with_or_from_col = col.index('With (or) From')

    with open(raw, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        # an empty file yields no first row; report it like a bad header
        row = next(reader, None)
        if not row or row[0] != '!gaf-version: 2.0':
            LOG.error('Not a vlaid gaf v2.0 formatted file: %s', raw)
            # raise
        for row in reader:
            # skip blank lines and '!' comment lines without indexing
            # into an empty row or an empty first field
            if not row or row[0].startswith('!'):
                continue

            gene_num = row[gene_num_col]
            is_not = row[is_not_col]
            phenotype_id = row[phenotype_id_col]
            ref = row[ref_col].strip()
            eco_symbol = row[eco_symbol_col].strip()
            with_or_from = row[with_or_from_col]

            # TODO add NOT phenotypes
            if is_not == 'NOT':
                continue

            eco_curie = None
            if eco_symbol != '' and eco_symbol in self.gaf_eco:
                eco_curie = self.gaf_eco[eco_symbol]
            else:
                LOG.warning(
                    'Evidence code %s is not found in the (gaf) gaf_eco',
                    eco_symbol)

            # according to the GOA spec, persons are not allowed to be
            # in the reference column, therefore the variant and
            # persons are swapped between the reference and with column.
            # we unswitch them here.
            temp_var = temp_ref = None
            if re.search(r'WBVar|WBRNAi', ref):
                temp_var = ref
            # move the paper from the with column into the ref
            if re.search(r'WBPerson', with_or_from):
                temp_ref = with_or_from
            if temp_var is not None or temp_ref is not None:
                with_or_from = temp_var
                ref = temp_ref

            # After the swap with_or_from is None when only a person was
            # found; re.split() would raise on None and never returns an
            # empty list, so test for the missing value explicitly.
            if with_or_from is None or with_or_from.strip() == '':
                LOG.error(
                    "Missing alleles from phenotype assoc at line %d",
                    reader.line_num)
                allele_list = []
            else:
                allele_list = re.split(r'\|', with_or_from)

            for allele in allele_list:
                allele_num = re.sub(r'WB:', '', allele.strip())
                allele_id = 'WormBase:' + allele_num
                gene_id = 'WormBase:' + gene_num

                if re.search(r'WBRNAi', allele_id):
                    # @kshefchek - the earlier model made a blank-node
                    # reagent-targeted gene and annotated that instead of
                    # the RNAi item.  In favor of simpler modeling the
                    # WormBase:WBRNAi* id itself is now typed as a
                    # reagent_targeted_gene and carries the phenotype.
                    # (It could also be typed as the RNAi reagent, but it
                    # is not clear that this is needed.)
                    model.addIndividualToGraph(
                        allele_id, None,
                        self.globaltt['reagent_targeted_gene'])
                    self.graph.addTriple(
                        allele_id,
                        self.globaltt['is_expression_variant_of'],
                        gene_id)
                elif re.search(r'WBVar', allele_id):
                    # this may become deprecated by using wormmine
                    # the WBVars are really sequence alterations;
                    # the public name will come from elsewhere.
                    # @kshefchek - no blank-node variant locus any more:
                    # treat the variant like an allele of the gene
                    geno.addSequenceAlteration(allele_id, None)
                    geno.addAlleleOfGene(allele_id, gene_id)
                else:
                    LOG.warning(
                        "Some kind of allele I don't recognize: %s",
                        allele_num)
                    continue

                assoc = G2PAssoc(graph, self.name, allele_id, phenotype_id)

                if eco_curie is not None:
                    assoc.add_evidence(eco_curie)

                if ref is not None and ref != '':
                    # idempotent, so repeating it per allele is harmless
                    ref = re.sub(r'(WB:|WB_REF:)', 'WormBase:', ref)
                    reference = Reference(graph, ref)
                    if re.search(r'Person', ref):
                        reference.setType(self.globaltt['person'])
                        assoc.add_evidence(self.globaltt[
                            'inference from background scientific knowledge'])
                    reference.addRefToGraph()
                    assoc.add_source(ref)

                assoc.add_association_to_graph()

            # finished looping through all alleles of this row
            if limit is not None and reader.line_num > limit:
                break
def _process_data(self, raw, limit=None):
    """
    Process one Coriell catalog CSV file and add its triples to the graph.

    We make the assumption that any alleles listed
    are variants (alternates to w.t.)

    Each non-empty row must have exactly 25 columns (see the tuple
    unpacking below).  For every row the cell line, the patient it derives
    from, the optional family, the genotype parts (karyotype, OMIM
    variants), the diseases and the publications are written to the graph.

    Triples: (examples)

    :NIGMSrepository
        a CLO_0000008 #repository
        label : NIGMS Human Genetic Cell Repository
        foaf:page
         https://catalog.coriell.org/0/sections/collections/NIGMS/?SsId=8

    line_id
        a CL_0000057,  #fibroblast line
        derives_from patient_id
        part_of :NIGMSrepository
        RO:model_of OMIM:disease_id

    patient id
        a foaf:person,
        label: "fibroblast from patient 12345 with disease X"
        member_of family_id  #what is the right thing here?
        SIO:race EFO:caucasian  #subclass of EFO:0001799
        in_taxon NCBITaxon:9606
        dc:description Literal(remark)
        RO:has_phenotype OMIM:disease_id
        GENO:has_genotype genotype_id

    family_id
        a owl:NamedIndividual
        foaf:page
         "https://catalog.coriell.org/0/Sections/BrowseCatalog/FamilyTypeSubDetail.aspx?PgId=402&fam=2104&coll=GM"

    genotype_id
        a intrinsic_genotype
        GENO:has_alternate_part allelic_variant_id
        we don't necessarily know much about the genotype,
        other than the allelic variant. also there's the sex here

    pub_id mentions cell_line_id

    :param raw: path of the CSV file to read (opened as iso-8859-1)
    :param limit: when not in test mode, stop once more than this many
        non-empty rows have been read; None reads the whole file
    :return: None
    """
    logger.info("Processing Data from %s", raw)
    gu = GraphUtils(curie_map.get())

    if self.testMode:      # set the graph to build
        g = self.testgraph
    else:
        g = self.graph

    line_counter = 0
    geno = Genotype(g)
    du = DipperUtil()

    gu.loadProperties(g, geno.object_properties, gu.OBJPROP)
    gu.loadAllProperties(g)

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
        next(filereader, None)  # skip the header row
        for row in filereader:
            if not row:
                pass
            else:
                line_counter += 1

                (catalog_id, description, omim_number, sample_type,
                 cell_line_available, dna_in_stock, dna_ref, gender, age,
                 race, ethnicity, affected, karyotype, relprob, mutation,
                 gene, family_id, collection, url, cat_remark, pubmed_ids,
                 family_member, variant_id, dbsnp_id, species) = row

                # example:
                # GM00003,HURLER SYNDROME,607014,Fibroblast,Yes,No,,Female,26 YR,Caucasian,,,,
                # parent,,,39,NIGMS Human Genetic Cell Repository,
                # http://ccr.coriell.org/Sections/Search/Sample_Detail.aspx?Ref=GM00003,
                # 46;XX; clinically normal mother of a child with Hurler syndrome; proband not in Repository,,
                # 2,,18343,Homo sapiens

                if self.testMode and catalog_id not in self.test_lines:
                    # skip rows not in our test lines, when in test mode
                    continue

                # ###########    BUILD REQUIRED VARIABLES    ###########

                # Make the cell line ID
                cell_line_id = 'Coriell:'+catalog_id.strip()

                # Map the cell/sample type
                cell_type = self._map_cell_type(sample_type)

                # Make a cell line label
                # (first word of the collection name + catalog id)
                line_label = \
                    collection.partition(' ')[0]+'-'+catalog_id.strip()

                # Map the repository/collection
                repository = self._map_collection(collection)

                # patients are uniquely identified by one of:
                # dbsnp id (which is == an individual haplotype)
                # family id + family member (if present) OR
                # probands are usually family member zero
                # cell line id
                # since some patients have >1 cell line derived from them,
                # we must make sure that the genotype is attached to
                # the patient, and can be inferred to the cell line
                # examples of repeated patients are:
                #   famid=1159, member=1; fam=152,member=1

                # Make the patient ID

                # make an anonymous patient
                patient_id = '_person'
                if self.nobnodes:
                    patient_id = ':'+patient_id
                if family_id != '':
                    patient_id = \
                        '-'.join((patient_id, family_id, family_member))
                else:
                    # make an anonymous patient
                    patient_id = '-'.join((patient_id, catalog_id.strip()))

                # properties of the individual patients:  sex, family id,
                # member/relproband, description descriptions are
                # really long and ugly SCREAMING text, so need to clean up
                # the control cases are so odd with this labeling scheme;
                # but we'll deal with it as-is for now.
                short_desc = (description.split(';')[0]).capitalize()
                # normalise the Yes/No flag; any other value is kept as-is
                if affected == 'Yes':
                    affected = 'affected'
                elif affected == 'No':
                    affected = 'unaffected'
                gender = gender.lower()
                patient_label = ' '.join((affected, gender, relprob))
                if relprob == 'proband':
                    patient_label = \
                        ' '.join(
                            (patient_label.strip(), 'with', short_desc))
                else:
                    patient_label = \
                        ' '.join(
                            (patient_label.strip(), 'of proband with',
                             short_desc))

                # #############    BUILD THE CELL LINE    #############

                # Adding the cell line as a typed individual.
                cell_line_reagent_id = 'CLO:0000031'

                gu.addIndividualToGraph(
                    g, cell_line_id, line_label, cell_line_reagent_id)

                # add the equivalent id == dna_ref
                if dna_ref != '' and dna_ref != catalog_id:
                    equiv_cell_line = 'Coriell:'+dna_ref
                    # some of the equivalent ids are not defined
                    # in the source data; so add them
                    gu.addIndividualToGraph(
                        g, equiv_cell_line, None, cell_line_reagent_id)
                    gu.addSameIndividual(g, cell_line_id, equiv_cell_line)

                # Cell line derives from patient
                geno.addDerivesFrom(cell_line_id, patient_id)
                geno.addDerivesFrom(cell_line_id, cell_type)

                # Cell line a member of repository
                gu.addMember(g, repository, cell_line_id)

                if cat_remark != '':
                    gu.addDescription(g, cell_line_id, cat_remark)

                # Cell age_at_sampling
                # TODO add the age nodes when modeled properly in #78
                # if (age != ''):
                    # this would give a BNode that is an instance of Age.
                    # but i don't know how to connect
                    # the age node to the cell line? we need to ask @mbrush
                    # age_id = '_'+re.sub('\s+','_',age)
                    # gu.addIndividualToGraph(
                    #   g,age_id,age,self.terms['age'])
                    # gu.addTriple(
                    #   g,age_id,self.properties['has_measurement'],age,
                    #   True)

                # #############    BUILD THE PATIENT    #############

                # Add the patient ID as an individual.
                gu.addPerson(g, patient_id, patient_label)
                # TODO map relationship to proband as a class
                # (what ontology?)

                # Add race of patient
                # FIXME: Adjust for subcategories based on ethnicity field
                # EDIT: There are 743 different entries for ethnicity...
                # Too many to map?
                # Add ethnicity as literal in addition to the mapped race?
                # Adjust the ethnicity txt (if using)
                # to initial capitalization to remove ALLCAPS

                # TODO race should go into the individual's background
                # and abstracted out to the Genotype class punting for now.
                # if race != '':
                #    mapped_race = self._map_race(race)
                #    if mapped_race is not None:
                #        gu.addTriple(
                #           g,patient_id,self.terms['race'],mapped_race)
                #        gu.addSubclass(
                #           g,self.terms['ethnic_group'],mapped_race)

                # #############    BUILD THE FAMILY    #############

                # Add triples for family_id, if present.
                if family_id != '':
                    family_comp_id = 'CoriellFamily:'+family_id

                    family_label = \
                        ' '.join(('Family of proband with', short_desc))

                    # Add the family ID as a named individual
                    gu.addIndividualToGraph(
                        g, family_comp_id, family_label,
                        geno.genoparts['family'])

                    # Add the patient as a member of the family
                    gu.addMemberOf(g, patient_id, family_comp_id)

                # #############    BUILD THE GENOTYPE   #############

                # the important things to pay attention to here are:
                # karyotype = chr rearrangements  (somatic?)
                # mutation = protein-level mutation as a label,
                # often from omim
                # gene = gene symbol - TODO get id
                # variant_id = omim variant ids (; delimited)
                # dbsnp_id = snp individual ids = full genotype?

                # note GM00633 is a good example of chromosomal variation
                # - do we have enough to capture this?
                # GM00325 has both abnormal karyotype and variation

                # make an assumption that if the taxon is blank,
                # that it is human!
                # NOTE(review): the literal below looks garbled (presumably
                # meant to be the human species label) - confirm against
                # what _map_species expects.
                if species is None or species == '':
                    species = 'H**o sapiens'
                taxon = self._map_species(species)

                # if there's a dbSNP id,
                # this is actually the individual's genotype
                genotype_id = None
                genotype_label = None
                if dbsnp_id != '':
                    genotype_id = 'dbSNPIndividual:'+dbsnp_id.strip()

                omim_map = {}
                gvc_id = None

                # some of the karyotypes are encoded
                # with terrible hidden codes. remove them here
                # i've seen a <98> character
                karyotype = du.remove_control_characters(karyotype)
                karyotype_id = None
                if karyotype.strip() != '':
                    karyotype_id = \
                        '_'+re.sub('MONARCH:', '', self.make_id(karyotype))
                    if self.nobnodes:
                        karyotype_id = ':'+karyotype_id
                    # add karyotype as karyotype_variation_complement
                    gu.addIndividualToGraph(
                        g, karyotype_id, karyotype,
                        geno.genoparts['karyotype_variation_complement'])
                    # TODO break down the karyotype into parts
                    # and map into GENO. depends on #77

                    # place the karyotype in a location(s).
                    karyo_chrs = \
                        self._get_affected_chromosomes_from_karyotype(
                            karyotype)
                    for c in karyo_chrs:
                        chr_id = makeChromID(c, taxon, 'CHR')
                        # add an anonymous sequence feature,
                        # each located on chr
                        karyotype_feature_id = '-'.join((karyotype_id, c))
                        karyotype_feature_label = \
                            'some karyotype alteration on chr'+str(c)
                        f = Feature(
                            karyotype_feature_id, karyotype_feature_label,
                            geno.genoparts['sequence_alteration'])
                        f.addFeatureStartLocation(None, chr_id)
                        f.addFeatureToGraph(g)
                        f.loadAllProperties(g)
                        geno.addParts(
                            karyotype_feature_id, karyotype_id,
                            geno.object_properties['has_alternate_part'])

                # NOTE(review): vl is only bound when gene is non-empty, but
                # it is read below whenever variant_id or mutation is set;
                # a row without a gene would reuse the previous row's value
                # or fail on the first row - confirm the data guarantees it.
                if gene != '':
                    vl = gene+'('+mutation+')'

                # fix the variant_id so it's always in the same order
                vids = variant_id.split(';')
                variant_id = ';'.join(sorted(list(set(vids))))

                if karyotype.strip() != '' \
                        and not self._is_normal_karyotype(karyotype):
                    mutation = mutation.strip()
                    gvc_id = karyotype_id
                    if variant_id != '':
                        gvc_id = '_' + variant_id.replace(';', '-') + '-' \
                            + re.sub(r'\w*:', '', karyotype_id)
                    if mutation.strip() != '':
                        gvc_label = '; '.join((vl, karyotype))
                    else:
                        gvc_label = karyotype
                elif variant_id.strip() != '':
                    gvc_id = '_' + variant_id.replace(';', '-')
                    gvc_label = vl
                else:
                    # wildtype?
                    pass

                if gvc_id is not None and gvc_id != karyotype_id \
                        and self.nobnodes:
                    gvc_id = ':'+gvc_id

                # add the karyotype to the gvc.
                # use reference if normal karyotype
                karyo_rel = geno.object_properties['has_alternate_part']
                if self._is_normal_karyotype(karyotype):
                    karyo_rel = \
                        geno.object_properties['has_reference_part']
                if karyotype_id is not None \
                        and not self._is_normal_karyotype(karyotype) \
                        and gvc_id is not None and karyotype_id != gvc_id:
                    geno.addParts(karyotype_id, gvc_id, karyo_rel)

                if variant_id.strip() != '':
                    # split the variants & add them as part of the genotype
                    # we don't necessarily know their zygosity,
                    # just that they are part of the genotype variant ids
                    # are from OMIM, so prefix as such we assume that the
                    # sequence alts will be defined in OMIM not here
                    # TODO sort the variant_id list, if the omim prefix is
                    # the same, then assume it's the locus make a hashmap
                    # of the omim id to variant id list;
                    # then build the genotype hashmap is also useful for
                    # removing the "genes" from the list of "phenotypes"

                    # will hold gene/locus id to variant list
                    omim_map = {}

                    locus_num = None
                    for v in variant_id.split(';'):
                        # handle omim-style and odd var ids
                        # like 610661.p.R401X
                        m = re.match(r'(\d+)\.+(.*)', v.strip())
                        if m is not None and len(m.groups()) == 2:
                            (locus_num, var_num) = m.groups()

                        # NOTE(review): when v does not match, locus_num
                        # and var_num keep their previous values (or
                        # locus_num is still None, which would make the
                        # else branch fail) - confirm this is intended.
                        if locus_num is not None \
                                and locus_num not in omim_map:
                            omim_map[locus_num] = [var_num]
                        else:
                            omim_map[locus_num] += [var_num]

                    for o in omim_map:
                        # gene_id = 'OMIM:' + o  # TODO unused
                        vslc_id = \
                            '_' + '-'.join(
                                [o + '.' + a for a in omim_map.get(o)])
                        if self.nobnodes:
                            vslc_id = ':'+vslc_id
                        vslc_label = vl
                        # we don't really know the zygosity of
                        # the alleles at all.
                        # so the vslcs are just a pot of them
                        gu.addIndividualToGraph(
                            g, vslc_id, vslc_label,
                            geno.genoparts[
                                'variant_single_locus_complement'])
                        for v in omim_map.get(o):
                            # this is actually a sequence alt
                            allele1_id = 'OMIM:'+o+'.'+v
                            geno.addSequenceAlteration(allele1_id, None)

                            # assume that the sa -> var_loc -> gene
                            # is taken care of in OMIM
                            geno.addPartsToVSLC(
                                vslc_id, allele1_id, None,
                                geno.zygosity['indeterminate'],
                                geno.object_properties[
                                    'has_alternate_part'])

                        if vslc_id != gvc_id:
                            geno.addVSLCtoParent(vslc_id, gvc_id)

                if affected == 'unaffected':
                    # let's just say that this person is wildtype
                    gu.addType(g, patient_id, geno.genoparts['wildtype'])
                elif genotype_id is None:
                    # make an anonymous genotype id
                    genotype_id = '_geno'+catalog_id.strip()
                    if self.nobnodes:
                        genotype_id = ':'+genotype_id

                # add the gvc
                if gvc_id is not None:
                    gu.addIndividualToGraph(
                        g, gvc_id, gvc_label,
                        geno.genoparts['genomic_variation_complement'])

                    # add the gvc to the genotype
                    if genotype_id is not None:
                        if affected == 'unaffected':
                            rel = \
                                geno.object_properties[
                                    'has_reference_part']
                        else:
                            rel = \
                                geno.object_properties[
                                    'has_alternate_part']
                        geno.addParts(gvc_id, genotype_id, rel)

                    if karyotype_id is not None \
                            and self._is_normal_karyotype(karyotype):
                        if gvc_label is not None and gvc_label != '':
                            genotype_label = \
                                '; '.join((gvc_label, karyotype))
                        else:
                            genotype_label = karyotype
                        if genotype_id is None:
                            genotype_id = karyotype_id
                        else:
                            geno.addParts(
                                karyotype_id, genotype_id,
                                geno.object_properties[
                                    'has_reference_part'])
                    else:
                        genotype_label = gvc_label
                        # use the catalog id as the background
                    genotype_label += ' ['+catalog_id.strip()+']'

                if genotype_id is not None and gvc_id is not None:
                    # only add the genotype if it has some parts
                    geno.addGenotype(
                        genotype_id, genotype_label,
                        geno.genoparts['intrinsic_genotype'])
                    geno.addTaxon(taxon, genotype_id)
                    # add that the patient has the genotype
                    # TODO check if the genotype belongs to
                    # the cell line or to the patient
                    gu.addTriple(
                        g, patient_id,
                        geno.properties['has_genotype'], genotype_id)
                else:
                    # no genotype was added, so the taxon goes on the patient
                    geno.addTaxon(taxon, patient_id)

                # TODO: Add sex/gender  (as part of the karyotype?)

                # #############    DEAL WITH THE DISEASES   #############

                # we associate the disease to the patient
                if affected == 'affected':
                    if omim_number != '':
                        for d in omim_number.split(';'):
                            if d is not None and d != '':
                                # if the omim number is in omim_map,
                                # then it is a gene not a pheno
                                if d not in omim_map:
                                    disease_id = 'OMIM:'+d.strip()
                                    # assume the label is taken care of
                                    gu.addClassToGraph(g, disease_id, None)

                                    # add the association:
                                    #   the patient has the disease
                                    assoc = G2PAssoc(
                                        self.name, patient_id, disease_id)
                                    assoc.add_association_to_graph(g)

                                    # this line is a model of this disease
                                    # TODO abstract out model into
                                    # it's own association class?
                                    gu.addTriple(
                                        g, cell_line_id,
                                        gu.properties['model_of'],
                                        disease_id)
                                else:
                                    logger.info(
                                        'removing %s from disease list ' +
                                        'since it is a gene', d)

                # #############    ADD PUBLICATIONS   #############

                if pubmed_ids != '':
                    for s in pubmed_ids.split(';'):
                        pubmed_id = 'PMID:'+s.strip()
                        ref = Reference(pubmed_id)
                        ref.setType(Reference.ref_types['journal_article'])
                        ref.addRefToGraph(g)
                        gu.addTriple(
                            g, pubmed_id, gu.properties['mentions'],
                            cell_line_id)

                # the row limit is ignored in test mode
                if not self.testMode \
                        and (limit is not None and line_counter > limit):
                    break

    Assoc(self.name).load_all_properties(g)

    return
def process_allele_phenotype(self, limit=None):
    """
    Add allele-to-phenotype associations from the 'allele_pheno' file.

    This file compactly lists variant to phenotype associations,
    such that in a single row, there may be >1 variant listed per
    phenotype and paper.  This indicates that each variant is
    individually associated with the given phenotype,
    as listed in 1+ papers.
    (Not that the combination of variants is producing the phenotype.)

    Rows whose qualifier is 'NOT' are skipped.  Evidence codes are
    translated with ``self.resolve``.

    :param limit: stop once more than this many data rows have been
        read; ``None`` reads the whole file
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['allele_pheno']['file']))

    graph = self.graph
    model = Model(self.graph)

    LOG.info("Processing Allele phenotype associations")
    line_counter = 0
    geno = Genotype(graph)
    with open(raw, 'r') as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            if re.match(r'!', ''.join(row)):  # header
                continue
            line_counter += 1
            # every data row must carry exactly these 17 columns
            (db, gene_num, gene_symbol, is_not, phenotype_id, ref,
             eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
             gene_class, taxon, date, assigned_by, blank, blank2) = row

            # TODO add NOT phenotypes
            if is_not == 'NOT':
                continue

            eco_symbol = eco_symbol.strip()
            eco_id = None
            if eco_symbol != '':
                eco_id = self.resolve(eco_symbol)

            # according to the GOA spec, persons are not allowed to be
            # in the reference column, therefore the variant and
            # persons are swapped between the reference and with column.
            # we unswitch them here.
            temp_var = temp_ref = None
            if re.search(r'WBVar|WBRNAi', ref):
                temp_var = ref
            # move the paper from the with column into the ref
            if re.search(r'WBPerson', with_or_from):
                temp_ref = with_or_from
            if temp_var is not None or temp_ref is not None:
                with_or_from = temp_var
                ref = temp_ref

            # After the swap with_or_from is None when only a person was
            # found; re.split() would raise on None and never returns an
            # empty list, so test for the missing value explicitly.
            if with_or_from is None or with_or_from.strip() == '':
                LOG.error(
                    "Missing alleles from phenotype assoc at line %d",
                    line_counter)
                allele_list = []
            else:
                allele_list = re.split(r'\|', with_or_from)

            for allele in allele_list:
                allele_num = re.sub(r'WB:', '', allele.strip())
                allele_id = 'WormBase:' + allele_num
                gene_id = 'WormBase:' + gene_num

                if re.search(r'WBRNAi', allele_id):
                    # @kshefchek - the earlier model made a blank-node
                    # reagent-targeted gene and annotated that instead of
                    # the RNAi item.  In favor of simpler modeling the
                    # WormBase:WBRNAi* id itself is now typed as a
                    # reagent_targeted_gene and carries the phenotype.
                    # (It could also be typed as the RNAi reagent, but it
                    # is not clear that this is needed.)
                    model.addIndividualToGraph(
                        allele_id, None,
                        self.globaltt['reagent_targeted_gene'])
                    self.graph.addTriple(
                        allele_id,
                        self.globaltt['is_expression_variant_of'],
                        gene_id)
                elif re.search(r'WBVar', allele_id):
                    # this may become deprecated by using wormmine
                    # the WBVars are really sequence alterations;
                    # the public name will come from elsewhere.
                    # @kshefchek - no blank-node variant locus any more:
                    # treat the variant like an allele of the gene
                    geno.addSequenceAlteration(allele_id, None)
                    geno.addAlleleOfGene(allele_id, gene_id)
                else:
                    LOG.warning(
                        "Some kind of allele I don't recognize: %s",
                        allele_num)
                    continue

                assoc = G2PAssoc(graph, self.name, allele_id, phenotype_id)

                if eco_id is not None:
                    assoc.add_evidence(eco_id)

                if ref is not None and ref != '':
                    # idempotent, so repeating it per allele is harmless
                    ref = re.sub(r'(WB:|WB_REF:)', 'WormBase:', ref)
                    reference = Reference(graph, ref)
                    if re.search(r'Person', ref):
                        reference.setType(self.globaltt['person'])
                        assoc.add_evidence(
                            self.globaltt[
                                'inference from background scientific knowledge'
                            ])
                    reference.addRefToGraph()
                    assoc.add_source(ref)

                assoc.add_association_to_graph()

            # finished looping through all alleles of this row
            if limit is not None and line_counter > limit:
                break

    return
def _parse_patient_variants(self, file):
    """
    Add each patient's intrinsic genotype and its variants to the graph.

    The variant file is turned into a ``{patient: {variant_id: variant}}``
    mapping by ``self._convert_variant_file_to_dict``; every variant is a
    mapping with the keys 'build', 'chromosome', 'position',
    'reference_allele', 'variant_allele', 'genes_of_interest' and 'rs_id'.
    For each patient an intrinsic genotype is created, and each variant is
    added as a sequence alteration that is a variant part of it.

    :param file: file handler, passed to _convert_variant_file_to_dict
    :return: None
    """
    patient_var_map = self._convert_variant_file_to_dict(file)
    gene_coordinate_map = self._parse_gene_coordinates(
        self.map_files['gene_coord_map'])
    rs_map = self._parse_rs_map_file(self.map_files['dbsnp_map'])

    genotype = Genotype(self.graph)
    model = Model(self.graph)

    self._add_variant_gene_relationship(patient_var_map, gene_coordinate_map)

    for patient in patient_var_map:
        patient_curie = ':{0}'.format(patient)
        # make intrinsic genotype for each patient
        intrinsic_geno_bnode = self.make_id(
            "{0}-intrinsic-genotype".format(patient), "_")
        genotype_label = "{0} genotype".format(patient)
        genotype.addGenotype(
            intrinsic_geno_bnode, genotype_label,
            model.globaltt['intrinsic_genotype'])

        self.graph.addTriple(
            patient_curie, model.globaltt['has_genotype'],
            intrinsic_geno_bnode)

        for variant_id, variant in patient_var_map[patient].items():
            build = variant['build']
            chromosome = variant['chromosome']
            position = variant['position']
            reference_allele = variant['reference_allele']
            variant_allele = variant['variant_allele']
            genes_of_interest = variant['genes_of_interest']
            rs_id = variant['rs_id']

            variant_bnode = self.make_id("{0}".format(variant_id), "_")

            # Choose the most specific label the available fields allow;
            # every branch assigns variant_label.
            if position and reference_allele and variant_allele:
                variant_label = self._build_variant_label(
                    build, chromosome, position, reference_allele,
                    variant_allele, genes_of_interest)
            elif not position and reference_allele and variant_allele \
                    and len(genes_of_interest) == 1:
                variant_label = self._build_variant_label(
                    build, chromosome, position, reference_allele,
                    variant_allele, genes_of_interest)
            elif position and (not reference_allele or not variant_allele) \
                    and len(genes_of_interest) == 1:
                variant_label = "{0}{1}({2}):g.{3}".format(
                    build, chromosome, genes_of_interest[0], position)
            elif len(genes_of_interest) == 1:
                variant_label = (
                    'variant of interest in {0} gene of patient'
                    ' {1}'.format(genes_of_interest[0], patient))
            else:
                variant_label = \
                    'variant of interest in patient {0}'.format(patient)

            genotype.addSequenceAlteration(variant_bnode, None)

            # check if we have already built the label
            # in _add_variant_gene_relationship()
            labels = self.graph.objects(
                BNode(re.sub(r'^_:', '', variant_bnode, count=1)),
                RDFS['label'])
            if len(list(labels)) == 0:
                model.addLabel(variant_bnode, variant_label)

            # the taxon key was garbled ('H**o sapiens'); the translation
            # table is keyed by the species label
            self.graph.addTriple(
                variant_bnode, self.globaltt['in taxon'],
                self.globaltt['Homo sapiens'])
            self.graph.addTriple(
                intrinsic_geno_bnode, self.globaltt['has_variant_part'],
                variant_bnode)
            if rs_id:
                dbsnp_curie = 'dbSNP:{0}'.format(rs_id)
                model.addSameIndividual(variant_bnode, dbsnp_curie)

    self._add_variant_sameas_relationships(patient_var_map, rs_map)
    return
def _process_data(self, raw, limit=None):
    """
    Read the gzipped CSV at ``raw`` and add its rows to the graph.

    For every data row this builds the allele / sequence alteration,
    the colony and its genotype, the sex-agnostic and sex-qualified
    genotypes with their genomic background, and finally a
    genotype-to-phenotype association carrying a free-text description.

    In test mode the test graph is used and only rows whose marker is
    listed in ``self.test_ids`` are processed.

    :param raw: path to the gzip-compressed, comma-separated input file;
        the first row is skipped as a header
    :param limit: when not None (and not in test mode) processing stops
        once the row counter exceeds this value
    :return: None
    """
    logger.info("Processing Data from %s", raw)
    gu = GraphUtils(curie_map.get())

    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph

    geno = Genotype(g)
    line_counter = 0
    gu.loadAllProperties(g)
    gu.loadObjectProperties(g, geno.object_properties)

    # Add the taxon as a class
    taxon_id = 'NCBITaxon:10090'  # map to Mus musculus
    gu.addClassToGraph(g, taxon_id, None)

    # with open(raw, 'r', encoding="utf8") as csvfile:
    with gzip.open(raw, 'rt') as csvfile:
        filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
        next(filereader, None)  # skip the header row
        for row in filereader:
            line_counter += 1

            (marker_accession_id, marker_symbol, phenotyping_center,
             colony, sex, zygosity, allele_accession_id, allele_symbol,
             allele_name, strain_accession_id, strain_name, project_name,
             project_fullname, pipeline_name, pipeline_stable_id,
             procedure_stable_id, procedure_name, parameter_stable_id,
             parameter_name, top_level_mp_term_id, top_level_mp_term_name,
             mp_term_id, mp_term_name, p_value, percentage_change,
             effect_size, statistical_method, resource_name) = row

            if self.testMode and marker_accession_id not in self.test_ids:
                continue

            # ##### cleanup some of the identifiers ######
            zygosity_id = self._map_zygosity(zygosity)

            # colony ids sometimes have <> in them, spaces,
            # or other non-alphanumerics and break our system;
            # replace these with underscores
            colony_id = '_' + re.sub(r'\W+', '_', colony)
            if self.nobnodes:
                colony_id = ':' + colony_id

            if not re.match(r'MGI', allele_accession_id):
                allele_accession_id = \
                    '_IMPC-' + re.sub(r':', '', allele_accession_id)
                if self.nobnodes:
                    allele_accession_id = ':' + allele_accession_id

            if re.search(r'EUROCURATE', strain_accession_id):
                # the eurocurate links don't resolve at IMPC
                strain_accession_id = '_' + strain_accession_id
                if self.nobnodes:
                    strain_accession_id = ':' + strain_accession_id
            elif not re.match(r'MGI', strain_accession_id):
                logger.info(
                    "Found a strange strain accession...%s",
                    strain_accession_id)
                strain_accession_id = 'IMPC:' + strain_accession_id

            ######################
            # first, add the marker and variant to the graph as with MGI,
            # the allele is the variant locus.  IF the marker is not known,
            # we will call it a sequence alteration.  otherwise,
            # we will create a BNode for the sequence alteration.
            sequence_alteration_id = variant_locus_id = None
            variant_locus_name = sequence_alteration_name = None

            # extract out what's within the <> to get the symbol
            symbol_match = re.match(r'.*<(.*)>', allele_symbol)
            if symbol_match:
                sequence_alteration_name = symbol_match.group(1)
            else:
                sequence_alteration_name = allele_symbol

            if marker_accession_id == '':
                logger.warning(
                    "Marker unspecified on row %d", line_counter)
                marker_accession_id = None

            if marker_accession_id is not None:
                variant_locus_id = allele_accession_id
                variant_locus_name = allele_symbol
                variant_locus_type = geno.genoparts['variant_locus']
                geno.addGene(marker_accession_id, marker_symbol,
                             geno.genoparts['gene'])
                geno.addAllele(variant_locus_id, variant_locus_name,
                               variant_locus_type, None)
                geno.addAlleleOfGene(variant_locus_id, marker_accession_id)

                sequence_alteration_id = \
                    '_seqalt' + re.sub(r':', '', allele_accession_id)
                if self.nobnodes:
                    sequence_alteration_id = ':' + sequence_alteration_id
                geno.addSequenceAlterationToVariantLocus(
                    sequence_alteration_id, variant_locus_id)
            else:
                sequence_alteration_id = allele_accession_id

            # IMPC contains targeted mutations with either gene traps,
            # knockouts, insertion/intragenic deletions.
            # but I don't really know what the SeqAlt is here,
            # so I don't add it.
            geno.addSequenceAlteration(sequence_alteration_id,
                                       sequence_alteration_name)

            # #############    BUILD THE COLONY    #############
            # First, let's describe the colony that the animals come from
            # The Colony ID refers to the ES cell clone
            #   used to generate a mouse strain.
            # Terry sez: we use this clone ID to track
            #   ES cell -> mouse strain -> mouse phenotyping.
            # The same ES clone maybe used at multiple centers,
            # so we have to concatenate the two to have a unique ID.
            # some useful reading about generating mice from ES cells:
            # http://ki.mit.edu/sbc/escell/services/details

            # here, we'll make a genotype
            # that derives from an ES cell with a given allele.
            # the strain is not really attached to the colony.

            # the colony/clone is reflective of the allele,
            # with unknown zygosity
            stem_cell_class = 'ERO:0002002'
            gu.addIndividualToGraph(g, colony_id, colony, stem_cell_class)

            # vslc of the colony has unknown zygosity
            # note that we will define the allele
            # (and its relationship to the marker, etc.) later
            # FIXME is it really necessary to create this vslc
            # when we always know it's unknown zygosity?
            vslc_colony = \
                '_' + allele_accession_id + geno.zygosity['indeterminate']
            vslc_colony = re.sub(r':', '', vslc_colony)
            if self.nobnodes:
                vslc_colony = ':' + vslc_colony
            vslc_colony_label = allele_symbol + '/<?>'
            # for ease of reading, we make the colony genotype variables.
            # in the future, it might be desired to keep the vslcs
            colony_genotype_id = vslc_colony
            colony_genotype_label = vslc_colony_label
            geno.addGenotype(colony_genotype_id, colony_genotype_label)
            geno.addParts(allele_accession_id, colony_genotype_id,
                          geno.object_properties['has_alternate_part'])
            geno.addPartsToVSLC(
                vslc_colony, allele_accession_id, None,
                geno.zygosity['indeterminate'],
                geno.object_properties['has_alternate_part'])
            gu.addTriple(
                g, colony_id,
                geno.object_properties['has_genotype'],
                colony_genotype_id)

            # ##########    BUILD THE ANNOTATED GENOTYPE    ##########
            # now, we'll build the genotype of the individual that derives
            # from the colony/clone genotype that is attached to
            # phenotype = colony_id + strain + zygosity + sex
            # (and is derived from a colony)

            # this is a sex-agnostic genotype
            genotype_id = \
                self.make_id(
                    (colony_id + phenotyping_center + zygosity +
                     strain_accession_id))
            geno.addSequenceDerivesFrom(genotype_id, colony_id)

            # build the VSLC of the sex-agnostic genotype
            # based on the zygosity
            allele1_id = allele_accession_id
            allele2_id = allele2_rel = None
            allele1_label = allele_symbol
            allele2_label = '<?>'
            # Making VSLC labels from the various parts,
            # can change later if desired.
            if zygosity == 'heterozygote':
                allele2_label = re.sub(r'<.*', '<+>', allele1_label)
                allele2_id = None
            elif zygosity == 'homozygote':
                allele2_label = allele1_label
                allele2_id = allele1_id
                allele2_rel = geno.object_properties['has_alternate_part']
            elif zygosity == 'hemizygote':
                allele2_label = re.sub(r'<.*', '<0>', allele1_label)
                allele2_id = None
            elif zygosity == 'not_applicable':
                allele2_label = re.sub(r'<.*', '<?>', allele1_label)
                allele2_id = None
            else:
                logger.warning("found unknown zygosity %s", zygosity)
                # NOTE(review): this abandons every remaining row of the
                # file, not just this one -- confirm that `continue`
                # is not what was intended.
                break
            vslc_name = '/'.join((allele1_label, allele2_label))

            # Add the VSLC
            # The marker may have been reset to None above; leave it out
            # of the id rather than letting str.join raise a TypeError.
            vslc_parts = [
                part for part in
                (marker_accession_id, allele_accession_id, zygosity)
                if part is not None]
            vslc_id = '_' + '-'.join(vslc_parts)
            vslc_id = re.sub(r':', '', vslc_id)
            if self.nobnodes:
                vslc_id = ':' + vslc_id
            gu.addIndividualToGraph(
                g, vslc_id, vslc_name,
                geno.genoparts['variant_single_locus_complement'])
            geno.addPartsToVSLC(
                vslc_id, allele1_id, allele2_id, zygosity_id,
                geno.object_properties['has_alternate_part'],
                allele2_rel)

            # add vslc to genotype
            geno.addVSLCtoParent(vslc_id, genotype_id)

            # note that the vslc is also the gvc
            gu.addType(
                g, vslc_id,
                Genotype.genoparts['genomic_variation_complement'])

            # Add the genomic background
            # create the genomic background id and name
            if strain_accession_id != '':
                genomic_background_id = strain_accession_id
            else:
                genomic_background_id = None

            genotype_name = vslc_name
            if genomic_background_id is not None:
                geno.addGenotype(
                    genomic_background_id, strain_name,
                    geno.genoparts['genomic_background'])

                # make a phenotyping-center-specific strain
                # to use as the background
                pheno_center_strain_label = \
                    strain_name + '/' + phenotyping_center
                pheno_center_strain_id = \
                    '-'.join((re.sub(r':', '', genomic_background_id),
                              re.sub(r'\s', '_', phenotyping_center)))
                if not re.match(r'^_', pheno_center_strain_id):
                    pheno_center_strain_id = '_' + pheno_center_strain_id
                if self.nobnodes:
                    pheno_center_strain_id = ':' + pheno_center_strain_id

                geno.addGenotype(pheno_center_strain_id,
                                 pheno_center_strain_label,
                                 geno.genoparts['genomic_background'])
                geno.addSequenceDerivesFrom(pheno_center_strain_id,
                                            genomic_background_id)

                # Making genotype labels from the various parts,
                # can change later if desired.
                # since the genotype is reflective of the place
                # it got made, should put that in to disambiguate
                genotype_name = \
                    genotype_name + ' [' + pheno_center_strain_label + ']'
                geno.addGenomicBackgroundToGenotype(
                    pheno_center_strain_id, genotype_id)
                # taxon first, then the genotype it applies to, matching
                # the other addTaxon calls in this method
                geno.addTaxon(taxon_id, pheno_center_strain_id)
            # this is redundant, but i'll keep it in for now
            geno.addSequenceDerivesFrom(genotype_id, colony_id)
            genotype_name += '[' + colony + ']'
            geno.addGenotype(genotype_id, genotype_name)

            # Make the sex-qualified genotype,
            # which is what the phenotype is associated with
            sex_qualified_genotype_id = \
                self.make_id(
                    (colony_id + phenotyping_center + zygosity +
                     strain_accession_id + sex))
            sex_qualified_genotype_label = genotype_name + ' (' + sex + ')'
            if sex == 'male':
                sq_type_id = geno.genoparts['male_genotype']
            elif sex == 'female':
                sq_type_id = geno.genoparts['female_genotype']
            else:
                sq_type_id = geno.genoparts['sex_qualified_genotype']

            geno.addGenotype(
                sex_qualified_genotype_id,
                sex_qualified_genotype_label, sq_type_id)
            geno.addParts(
                genotype_id, sex_qualified_genotype_id,
                geno.object_properties['has_alternate_part'])

            if genomic_background_id is not None and \
                    genomic_background_id != '':
                # Add the taxon to the genomic_background_id
                geno.addTaxon(taxon_id, genomic_background_id)
            else:
                # add it as the genomic background
                geno.addTaxon(taxon_id, genotype_id)

            # #############    BUILD THE G2P ASSOC    #############
            # from an old email dated July 23 2014:
            # Phenotypes associations are made to
            # imits colony_id+center+zygosity+gender

            phenotype_id = mp_term_id

            # it seems that sometimes phenotype ids are missing.
            # indicate here
            if phenotype_id is None or phenotype_id == '':
                logger.warning(
                    "No phenotype id specified for row %d: %s",
                    line_counter, str(row))
                continue
            # experimental_phenotypic_evidence This was used in ZFIN
            eco_id = "ECO:0000059"

            # the association comes as a result of a g2p from
            # a procedure in a pipeline at a center and parameter tested

            assoc = G2PAssoc(self.name, sex_qualified_genotype_id,
                             phenotype_id)
            assoc.add_evidence(eco_id)
            # assoc.set_score(float(p_value))

            # TODO add evidence instance using
            # pipeline_stable_id +
            # procedure_stable_id +
            # parameter_stable_id

            assoc.add_association_to_graph(g)
            assoc_id = assoc.get_association_id()

            # add a free-text description
            # Blank or non-numeric statistics must not abort the run;
            # fall back to the raw cell text for both values.
            try:
                effect_size_text = str(round(float(effect_size), 5))
                p_value_text = "{:.4e}".format(float(p_value))
            except ValueError:
                effect_size_text = str(effect_size)
                p_value_text = "{0}".format(p_value)
            description = \
                ' '.join((mp_term_name, 'phenotype determined by',
                          phenotyping_center, 'in an',
                          procedure_name, 'assay where',
                          parameter_name.strip(),
                          'was measured with an effect_size of',
                          effect_size_text,
                          '(p =', p_value_text, ').'))

            gu.addDescription(g, assoc_id, description)

            # TODO add provenance information
            # resource_id = resource_name
            # assoc.addSource(g, assoc_id, resource_id)

            if not self.testMode and \
                    limit is not None and line_counter > limit:
                break

    gu.loadProperties(g, G2PAssoc.object_properties, gu.OBJPROP)
    gu.loadProperties(g, G2PAssoc.annotation_properties, gu.ANNOTPROP)
    gu.loadProperties(g, G2PAssoc.datatype_properties, gu.DATAPROP)

    return
def _process_data(self, raw, limit=None):
    """
    Read the gzipped CSV at ``raw`` and add its rows to the graph.

    For every data row this builds the allele / sequence alteration,
    the colony and its genotype, the sex-agnostic and sex-qualified
    genotypes with their genomic background, a genotype-to-phenotype
    association, and the study / evidence / assertion provenance for
    that association.  The free-text description is attached to the
    evidence line node.

    In test mode the test graph is used and only rows whose marker is
    listed in ``self.test_ids`` are processed.

    :param raw: path to the gzip-compressed, comma-separated input file;
        the first row is skipped as a header
    :param limit: when not None (and not in test mode) processing stops
        once the row counter exceeds this value
    :return: None
    """
    logger.info("Processing Data from %s", raw)

    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    model = Model(g)
    geno = Genotype(g)
    line_counter = 0

    impc_map = self.open_and_parse_yaml(self.map_files['impc_map'])
    impress_map = json.loads(
        self.fetch_from_url(
            self.map_files['impress_map']).read().decode('utf-8'))

    # Add the taxon as a class
    taxon_id = 'NCBITaxon:10090'  # map to Mus musculus
    model.addClassToGraph(taxon_id, None)

    # with open(raw, 'r', encoding="utf8") as csvfile:
    with gzip.open(raw, 'rt') as csvfile:
        filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
        next(filereader, None)  # skip the header row
        for row in filereader:
            line_counter += 1

            (marker_accession_id, marker_symbol, phenotyping_center,
             colony, sex, zygosity, allele_accession_id, allele_symbol,
             allele_name, strain_accession_id, strain_name, project_name,
             project_fullname, pipeline_name, pipeline_stable_id,
             procedure_stable_id, procedure_name, parameter_stable_id,
             parameter_name, top_level_mp_term_id, top_level_mp_term_name,
             mp_term_id, mp_term_name, p_value, percentage_change,
             effect_size, statistical_method, resource_name) = row

            if self.testMode and marker_accession_id not in self.test_ids:
                continue

            # ##### cleanup some of the identifiers ######
            zygosity_id = self._map_zygosity(zygosity)

            # colony ids sometimes have <> in them, spaces,
            # or other non-alphanumerics and break our system;
            # replace these with underscores
            colony_id = '_:' + re.sub(r'\W+', '_', colony)

            if not re.match(r'MGI', allele_accession_id):
                allele_accession_id = \
                    '_:IMPC-' + re.sub(r':', '', allele_accession_id)

            if re.search(r'EUROCURATE', strain_accession_id):
                # the eurocurate links don't resolve at IMPC
                strain_accession_id = '_:' + strain_accession_id
            elif not re.match(r'MGI', strain_accession_id):
                logger.info("Found a strange strain accession...%s",
                            strain_accession_id)
                strain_accession_id = 'IMPC:' + strain_accession_id

            ######################
            # first, add the marker and variant to the graph as with MGI,
            # the allele is the variant locus.  IF the marker is not known,
            # we will call it a sequence alteration.  otherwise,
            # we will create a BNode for the sequence alteration.
            sequence_alteration_id = variant_locus_id = None
            variant_locus_name = sequence_alteration_name = None

            # extract out what's within the <> to get the symbol
            symbol_match = re.match(r'.*<(.*)>', allele_symbol)
            if symbol_match:
                sequence_alteration_name = symbol_match.group(1)
            else:
                sequence_alteration_name = allele_symbol

            if marker_accession_id == '':
                logger.warning("Marker unspecified on row %d", line_counter)
                marker_accession_id = None

            if marker_accession_id is not None:
                variant_locus_id = allele_accession_id
                variant_locus_name = allele_symbol
                variant_locus_type = geno.genoparts['variant_locus']
                geno.addGene(marker_accession_id, marker_symbol,
                             geno.genoparts['gene'])
                geno.addAllele(variant_locus_id, variant_locus_name,
                               variant_locus_type, None)
                geno.addAlleleOfGene(variant_locus_id, marker_accession_id)

                sequence_alteration_id = \
                    '_:seqalt' + re.sub(r':', '', allele_accession_id)
                geno.addSequenceAlterationToVariantLocus(
                    sequence_alteration_id, variant_locus_id)
            else:
                sequence_alteration_id = allele_accession_id

            # IMPC contains targeted mutations with either gene traps,
            # knockouts, insertion/intragenic deletions.
            # but I don't really know what the SeqAlt is here,
            # so I don't add it.
            geno.addSequenceAlteration(sequence_alteration_id,
                                       sequence_alteration_name)

            # #############    BUILD THE COLONY    #############
            # First, let's describe the colony that the animals come from
            # The Colony ID refers to the ES cell clone
            #   used to generate a mouse strain.
            # Terry sez: we use this clone ID to track
            #   ES cell -> mouse strain -> mouse phenotyping.
            # The same ES clone maybe used at multiple centers,
            # so we have to concatenate the two to have a unique ID.
            # some useful reading about generating mice from ES cells:
            # http://ki.mit.edu/sbc/escell/services/details

            # here, we'll make a genotype
            # that derives from an ES cell with a given allele.
            # the strain is not really attached to the colony.

            # the colony/clone is reflective of the allele,
            # with unknown zygosity
            stem_cell_class = 'ERO:0002002'
            model.addIndividualToGraph(colony_id, colony, stem_cell_class)

            # vslc of the colony has unknown zygosity
            # note that we will define the allele
            # (and its relationship to the marker, etc.) later
            # FIXME is it really necessary to create this vslc
            # when we always know it's unknown zygosity?
            vslc_colony = \
                '_:' + re.sub(
                    r':', '',
                    allele_accession_id + geno.zygosity['indeterminate'])
            vslc_colony_label = allele_symbol + '/<?>'
            # for ease of reading, we make the colony genotype variables.
            # in the future, it might be desired to keep the vslcs
            colony_genotype_id = vslc_colony
            colony_genotype_label = vslc_colony_label
            geno.addGenotype(colony_genotype_id, colony_genotype_label)
            geno.addParts(allele_accession_id, colony_genotype_id,
                          geno.object_properties['has_alternate_part'])
            geno.addPartsToVSLC(
                vslc_colony, allele_accession_id, None,
                geno.zygosity['indeterminate'],
                geno.object_properties['has_alternate_part'])
            g.addTriple(colony_id,
                        geno.object_properties['has_genotype'],
                        colony_genotype_id)

            # ##########    BUILD THE ANNOTATED GENOTYPE    ##########
            # now, we'll build the genotype of the individual that derives
            # from the colony/clone genotype that is attached to
            # phenotype = colony_id + strain + zygosity + sex
            # (and is derived from a colony)

            # this is a sex-agnostic genotype
            genotype_id = \
                self.make_id(
                    (colony_id + phenotyping_center + zygosity +
                     strain_accession_id))
            geno.addSequenceDerivesFrom(genotype_id, colony_id)

            # build the VSLC of the sex-agnostic genotype
            # based on the zygosity
            allele1_id = allele_accession_id
            allele2_id = allele2_rel = None
            allele1_label = allele_symbol
            allele2_label = '<?>'
            # Making VSLC labels from the various parts,
            # can change later if desired.
            if zygosity == 'heterozygote':
                allele2_label = re.sub(r'<.*', '<+>', allele1_label)
                allele2_id = None
            elif zygosity == 'homozygote':
                allele2_label = allele1_label
                allele2_id = allele1_id
                allele2_rel = geno.object_properties['has_alternate_part']
            elif zygosity == 'hemizygote':
                allele2_label = re.sub(r'<.*', '<0>', allele1_label)
                allele2_id = None
            elif zygosity == 'not_applicable':
                allele2_label = re.sub(r'<.*', '<?>', allele1_label)
                allele2_id = None
            else:
                logger.warning("found unknown zygosity %s", zygosity)
                # NOTE(review): this abandons every remaining row of the
                # file, not just this one -- confirm that `continue`
                # is not what was intended.
                break
            vslc_name = '/'.join((allele1_label, allele2_label))

            # Add the VSLC
            # The marker may have been reset to None above; leave it out
            # of the id rather than letting str.join raise a TypeError.
            vslc_parts = [
                part for part in
                (marker_accession_id, allele_accession_id, zygosity)
                if part is not None]
            vslc_id = '-'.join(vslc_parts)
            vslc_id = re.sub(r':', '', vslc_id)
            vslc_id = '_:' + vslc_id
            model.addIndividualToGraph(
                vslc_id, vslc_name,
                geno.genoparts['variant_single_locus_complement'])
            geno.addPartsToVSLC(
                vslc_id, allele1_id, allele2_id, zygosity_id,
                geno.object_properties['has_alternate_part'],
                allele2_rel)

            # add vslc to genotype
            geno.addVSLCtoParent(vslc_id, genotype_id)

            # note that the vslc is also the gvc
            model.addType(
                vslc_id,
                Genotype.genoparts['genomic_variation_complement'])

            # Add the genomic background
            # create the genomic background id and name
            if strain_accession_id != '':
                genomic_background_id = strain_accession_id
            else:
                genomic_background_id = None

            genotype_name = vslc_name
            if genomic_background_id is not None:
                geno.addGenotype(genomic_background_id, strain_name,
                                 geno.genoparts['genomic_background'])

                # make a phenotyping-center-specific strain
                # to use as the background
                pheno_center_strain_label = \
                    strain_name + '-' + phenotyping_center + '-' + colony
                pheno_center_strain_id = \
                    '-'.join((re.sub(r':', '', genomic_background_id),
                              re.sub(r'\s', '_', phenotyping_center),
                              re.sub(r'\W+', '', colony)))
                # Every blank node in this method is tagged '_:'.
                # Stripping the colons above turns an already tagged
                # background id ('_:X') into '_X'; drop that leftover
                # underscore so the id is re-tagged like all the others.
                if pheno_center_strain_id.startswith('_'):
                    pheno_center_strain_id = pheno_center_strain_id[1:]
                pheno_center_strain_id = '_:' + pheno_center_strain_id

                geno.addGenotype(pheno_center_strain_id,
                                 pheno_center_strain_label,
                                 geno.genoparts['genomic_background'])
                geno.addSequenceDerivesFrom(pheno_center_strain_id,
                                            genomic_background_id)

                # Making genotype labels from the various parts,
                # can change later if desired.
                # since the genotype is reflective of the place
                # it got made, should put that in to disambiguate
                genotype_name = \
                    genotype_name + ' [' + pheno_center_strain_label + ']'
                geno.addGenomicBackgroundToGenotype(
                    pheno_center_strain_id, genotype_id)
                geno.addTaxon(taxon_id, pheno_center_strain_id)
            # this is redundant, but i'll keep it in for now
            geno.addSequenceDerivesFrom(genotype_id, colony_id)
            geno.addGenotype(genotype_id, genotype_name)

            # Make the sex-qualified genotype,
            # which is what the phenotype is associated with
            sex_qualified_genotype_id = \
                self.make_id(
                    (colony_id + phenotyping_center + zygosity +
                     strain_accession_id + sex))
            sex_qualified_genotype_label = genotype_name + ' (' + sex + ')'
            if sex == 'male':
                sq_type_id = geno.genoparts['male_genotype']
            elif sex == 'female':
                sq_type_id = geno.genoparts['female_genotype']
            else:
                sq_type_id = geno.genoparts['sex_qualified_genotype']

            geno.addGenotype(sex_qualified_genotype_id,
                             sex_qualified_genotype_label, sq_type_id)
            geno.addParts(genotype_id, sex_qualified_genotype_id,
                          geno.object_properties['has_alternate_part'])

            if genomic_background_id is not None and \
                    genomic_background_id != '':
                # Add the taxon to the genomic_background_id
                geno.addTaxon(taxon_id, genomic_background_id)
            else:
                # add it as the genomic background
                geno.addTaxon(taxon_id, genotype_id)

            # #############    BUILD THE G2P ASSOC    #############
            # from an old email dated July 23 2014:
            # Phenotypes associations are made to
            # imits colony_id+center+zygosity+gender

            phenotype_id = mp_term_id

            # it seems that sometimes phenotype ids are missing.
            # indicate here
            if phenotype_id is None or phenotype_id == '':
                logger.warning("No phenotype id specified for row %d: %s",
                               line_counter, str(row))
                continue
            # hard coded ECO code
            eco_id = "ECO:0000015"

            # the association comes as a result of a g2p from
            # a procedure in a pipeline at a center and parameter tested

            assoc = G2PAssoc(g, self.name, sex_qualified_genotype_id,
                             phenotype_id)
            assoc.add_evidence(eco_id)
            # assoc.set_score(float(p_value))

            # TODO add evidence instance using
            # pipeline_stable_id +
            # procedure_stable_id +
            # parameter_stable_id

            assoc.add_association_to_graph()
            assoc_id = assoc.get_association_id()

            # add a free-text description
            # Only the numeric formatting can raise; when either value
            # is not a number both fall back to the raw cell text.
            try:
                effect_size_text = str(round(float(effect_size), 5))
                p_value_text = "{:.4e}".format(float(p_value))
            except ValueError:
                effect_size_text = str(effect_size)
                p_value_text = "{0}".format(p_value)
            description = \
                ' '.join((mp_term_name, 'phenotype determined by',
                          phenotyping_center, 'in an',
                          procedure_name, 'assay where',
                          parameter_name.strip(),
                          'was measured with an effect_size of',
                          effect_size_text,
                          '(p =', p_value_text, ').'))

            study_bnode = \
                self._add_study_provenance(
                    impc_map, impress_map, phenotyping_center, colony,
                    project_fullname, pipeline_name, pipeline_stable_id,
                    procedure_stable_id, procedure_name,
                    parameter_stable_id, parameter_name,
                    statistical_method, resource_name)

            evidence_line_bnode = \
                self._add_evidence(
                    assoc_id, eco_id, impc_map, p_value,
                    percentage_change, effect_size, study_bnode)

            self._add_assertion_provenance(assoc_id,
                                           evidence_line_bnode, impc_map)

            model.addDescription(evidence_line_bnode, description)

            # resource_id = resource_name
            # assoc.addSource(g, assoc_id, resource_id)

            if not self.testMode and \
                    limit is not None and line_counter > limit:
                break

    return