def _add_snp_to_graph(
        self, snp_id, snp_label, chrom_num, chrom_pos, context,
        risk_allele_frequency=None):
    """
    Add one SNP as a Feature to the active graph.

    The SNP is typed with ``self.globaltt['SNP']``, tagged with the human
    taxon, positioned on its chromosome when a position is supplied, and
    given one extra type per resolvable term found in ``context``.

    :param snp_id: identifier of the SNP node
    :param snp_label: label of the SNP (surrounding whitespace is stripped)
    :param chrom_num: chromosome name; '' when unknown
    :param chrom_pos: position on the chromosome; '' when unknown
    :param context: ';'-separated list of variant context terms
    :param risk_allele_frequency: optional frequency; None, '' and 'NR'
        are treated as "not reported" and produce no description
    :return: None
    """
    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)

    has_position = chrom_num != '' and chrom_pos != ''

    # Remember which SNP ids were seen at each genomic location.
    if has_position:
        location = self._make_location_curie(chrom_num, chrom_pos)
        self.id_location_map.setdefault(location, set()).add(snp_id)

    # TODO add the variation (the allele after the '-' in snp_id) to the snp

    # create the chromosome
    chrom_id = makeChromID(
        chrom_num, self.localtt['reference assembly'], 'CHR')

    # add the feature to the graph
    snp_description = None
    if risk_allele_frequency not in (None, '', 'NR'):
        snp_description = \
            str(risk_allele_frequency) + ' [risk allele frequency]'

    feat = Feature(
        graph, snp_id, snp_label.strip(), self.globaltt['SNP'],
        snp_description)
    if has_position:
        # a SNP is a single position: start and end coincide
        feat.addFeatureStartLocation(chrom_pos, chrom_id)
        feat.addFeatureEndLocation(chrom_pos, chrom_id)
    feat.addFeatureToGraph()
    # The key used to read 'H**o sapiens', a garbled form of the label.
    feat.addTaxonToFeature(self.globaltt['Homo sapiens'])

    # TODO consider adding allele frequency as property;
    # but would need background info to do that

    # also want to add other descriptive info about
    # the variant from the context
    for ctx in context.split(';'):
        ctx = ctx.strip()
        cid = self.resolve(ctx, False)
        # presumably resolve() hands the term back unchanged when it has
        # no mapping; only mapped terms become types
        if cid != ctx:
            model.addType(snp_id, cid)
def _add_snp_to_graph(
        self, snp_id, snp_label, chrom_num, chrom_pos, context,
        risk_allele_frequency=None):
    """
    Add one SNP as a Feature to the active graph.

    The SNP is typed as ``Feature.types['SNP']``, tagged with the
    hard-coded human taxon, positioned on its chromosome when a position
    is supplied, and given one extra type per context term that
    ``self._map_variant_type`` can map.

    :param snp_id: identifier of the SNP node
    :param snp_label: label of the SNP (surrounding whitespace is stripped)
    :param chrom_num: chromosome name; '' when unknown
    :param chrom_pos: position on the chromosome; '' when unknown
    :param context: ';'-separated list of variant context terms
    :param risk_allele_frequency: optional frequency; None, '' and 'NR'
        produce no description
    :return: None
    """
    # constants
    human_taxon = 'NCBITaxon:9606'
    assembly = 'GRCh38'

    target_graph = self.testgraph if self.testMode else self.graph
    model = Model(target_graph)

    positioned = chrom_num != '' and chrom_pos != ''

    location = None
    if positioned:
        location = self._make_location_curie(chrom_num, chrom_pos)
        self.id_location_map.setdefault(location, set())

    allele_suffix = re.search(r'-(.*)$', snp_id)
    if allele_suffix is not None \
            and re.match(r'[ATGC]', allele_suffix.group(1)):
        # TODO add variation to snp
        pass

    if location is not None:
        self.id_location_map[location].add(snp_id)

    # create the chromosome
    chromosome_id = makeChromID(chrom_num, assembly, 'CHR')

    # describe the feature with its risk allele frequency, if reported
    description = None
    frequency_reported = (
        risk_allele_frequency is not None
        and risk_allele_frequency != ''
        and risk_allele_frequency != 'NR')
    if frequency_reported:
        description = ''.join(
            (str(risk_allele_frequency), ' [risk allele frequency]'))

    snp_feature = Feature(
        target_graph, snp_id, snp_label.strip(), Feature.types['SNP'],
        description)
    if positioned:
        # a SNP is a single position: start and end coincide
        snp_feature.addFeatureStartLocation(chrom_pos, chromosome_id)
        snp_feature.addFeatureEndLocation(chrom_pos, chromosome_id)
    snp_feature.addFeatureToGraph()
    snp_feature.addTaxonToFeature(human_taxon)

    # TODO consider adding allele frequency as property;
    # but would need background info to do that

    # also want to add other descriptive info about
    # the variant from the context
    for term in re.split(r';', context):
        variant_type = self._map_variant_type(term.strip())
        if variant_type is None:
            continue
        model.addType(snp_id, variant_type)

    return
def _add_snp_to_graph(
        self, snp_id, snp_label, chrom_num, chrom_pos, context,
        risk_allele_frequency=None):
    """
    Add one SNP as a Feature to the active graph.

    The SNP is typed with ``self.globaltt['SNP']``, tagged with the human
    taxon, positioned on its chromosome when a position is supplied, and
    given one extra type per resolvable term found in ``context``.

    :param snp_id: identifier of the SNP node
    :param snp_label: label of the SNP (surrounding whitespace is stripped)
    :param chrom_num: chromosome name; '' when unknown
    :param chrom_pos: position on the chromosome; '' when unknown
    :param context: ';'-separated list of variant context terms
    :param risk_allele_frequency: optional frequency; None, '' and 'NR'
        are treated as "not reported" and produce no description
    :return: None
    """
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)

    has_position = chrom_num != '' and chrom_pos != ''

    # Track every SNP id observed at a given genomic location.
    if has_position:
        location = self._make_location_curie(chrom_num, chrom_pos)
        self.id_location_map.setdefault(location, set()).add(snp_id)

    # TODO add the variation (the allele after the '-' in snp_id) to the snp

    # create the chromosome
    chrom_id = makeChromID(
        chrom_num, self.localtt['reference assembly'], 'CHR')

    # add the feature to the graph
    snp_description = None
    if risk_allele_frequency not in (None, '', 'NR'):
        snp_description = \
            str(risk_allele_frequency) + ' [risk allele frequency]'

    feat = Feature(
        graph, snp_id, snp_label.strip(), self.globaltt['SNP'],
        snp_description)
    if has_position:
        # single-base feature: start and end are the same coordinate
        feat.addFeatureStartLocation(chrom_pos, chrom_id)
        feat.addFeatureEndLocation(chrom_pos, chrom_id)
    feat.addFeatureToGraph()
    # The key used to read 'H**o sapiens', a garbled form of the label.
    feat.addTaxonToFeature(self.globaltt['Homo sapiens'])

    # TODO consider adding allele frequency as property;
    # but would need background info to do that

    # also want to add other descriptive info about
    # the variant from the context
    for ctx in context.split(';'):
        ctx = ctx.strip()
        cid = self.resolve(ctx, False)
        # presumably resolve() hands the term back unchanged when it has
        # no mapping; only mapped terms become types
        if cid != ctx:
            model.addType(snp_id, cid)
def _add_feature_with_coords(
        self, feature_id, start_pos, end_pos, reference, region_id):
    """
    Add an unlabeled, untyped feature with start/end coordinates to
    ``self.graph``.

    :param feature_id: URIRef or Curie - instance of faldo:Position
    :param start_pos: int, starting coordinate
    :param end_pos: int, ending coordinate
    :param reference: URIRef or Curie - reference Node
        (gene, transcript, genome)
    :param region_id: passed through to ``Feature.addFeatureToGraph``
        together with the add-region flag
    :return: None
    """
    located_feature = Feature(feature_id, None, None)
    located_feature.addFeatureStartLocation(start_pos, reference)
    located_feature.addFeatureEndLocation(end_pos, reference)
    # second argument is the add_region flag, always enabled here
    located_feature.addFeatureToGraph(self.graph, True, region_id)
def process_catalog(self, limit=None):
    """
    Parse the tab-delimited GWAS catalog file and load it into the graph.

    For every usable row this adds the publication, the SNP feature (with
    its position when known), variant types from the context column,
    deprecation links for merged SNPs, SNP-to-gene links, and one
    association per mapped trait URI.

    :param limit: outside test mode, stop once more than ``limit``
        non-empty rows have been read; None means no limit
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['catalog']['file']))
    logger.info("Processing Data from %s", raw)
    gu = GraphUtils(curie_map.get())

    if self.testMode:  # set the graph to build
        g = self.testgraph
    else:
        g = self.graph
    line_counter = 0
    geno = Genotype(g)
    gu.loadProperties(g, geno.object_properties, gu.OBJPROP)
    gu.loadAllProperties(g)

    tax_id = 'NCBITaxon:9606'  # hardcode
    genome_version = 'GRCh38'  # hardcode

    # loop invariants: build these once instead of once per row
    cu = CurieUtil(curie_map.get())
    test_gene_ids = None
    if self.testMode:
        test_gene_ids = {str(i) for i in self.test_ids['gene']}

    # build a hashmap of genomic location to identifiers,
    # to try to get the equivalences
    loc_to_id_hash = {}

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        next(filereader, None)  # skip the header row
        for row in filereader:
            if not row:
                continue
            line_counter += 1
            (date_added_to_catalog, pubmed_num, first_author, pub_date,
             journal, link, study_name, disease_or_trait,
             initial_sample_description, replicate_sample_description,
             region, chrom_num, chrom_pos, reported_gene_nums,
             mapped_gene, upstream_gene_num, downstream_gene_num,
             snp_gene_nums, upstream_gene_distance,
             downstream_gene_distance, strongest_snp_risk_allele, snps,
             merged, snp_id_current, context, intergenic_flag,
             risk_allele_frequency, pvalue, pvalue_mlog, pvalue_text,
             or_or_beta, confidence_interval_95,
             platform_with_snps_passing_qc, cnv_flag, mapped_trait,
             mapped_trait_uri) = row

            # skip if no matches found in test set
            if self.testMode and \
                    test_gene_ids.isdisjoint(snp_gene_nums.split(',')):
                continue

            # 06-May-2015 25917933 Zai CC 20-Nov-2014 J Psychiatr Res
            # http://europepmc.org/abstract/MED/25917933
            # A genome-wide association study of suicide severity scores
            # in bipolar disorder.
            # Suicide in bipolar disorder
            # 959 European ancestry individuals NA
            # 10p11.22 10 32704340 C10orf68, CCDC7, ITGB1 CCDC7
            # rs7079041-A rs7079041 0 7079041 intron 0 2E-6 5.698970

            if chrom_num != '' and chrom_pos != '':
                loc = 'chr' + str(chrom_num) + ':' + str(chrom_pos)
                if loc not in loc_to_id_hash:
                    loc_to_id_hash[loc] = set()
            else:
                loc = None

            if ' x ' in strongest_snp_risk_allele \
                    or ',' in strongest_snp_risk_allele:
                # TODO deal with haplotypes
                logger.warning(
                    "We can't deal with haplotypes yet: %s",
                    strongest_snp_risk_allele)
                continue

            risk_allele = strongest_snp_risk_allele.strip()
            # Remove the alteration (everything from the first '-') BEFORE
            # building the id. Doing it afterwards cut chr-style ids such
            # as ':gwas-chr10-106180121-G' down to ':gwas'.
            # TODO add the variation (the removed allele) to the snp
            variant_name = re.sub(r'-.*$', '', risk_allele).strip()

            if strongest_snp_risk_allele.startswith('rs'):
                rs_id = 'dbSNP:' + variant_name
            elif strongest_snp_risk_allele.startswith('kgp'):
                # FIXME this isn't correct
                # http://www.1000genomes.org/faq/what-are-kgp-identifiers
                # for some information
                # They were created by Illumina for their genotyping
                # platform before some variants identified during the
                # pilot phase of the project had been assigned
                # rs numbers.
                rs_id = 'dbSNP:' + variant_name
            elif strongest_snp_risk_allele.startswith('chr'):
                # like: chr10:106180121-G
                rs_id = ':gwas-' + variant_name.replace(':', '-')
            elif risk_allele == '':
                # FIXME still consider adding in the EFO terms
                # for what the study measured?
                continue
            else:
                logger.warning(
                    "There's a snp id i can't manage: %s",
                    strongest_snp_risk_allele)
                continue

            if loc is not None:
                loc_to_id_hash[loc].add(rs_id)

            pubmed_id = 'PMID:' + pubmed_num
            r = Reference(
                pubmed_id, Reference.ref_types['journal_article'])
            r.addRefToGraph(g)

            # create the chromosome
            chrom_id = makeChromID(chrom_num, genome_version, 'CHR')

            # add the feature to the graph
            snp_description = None
            if risk_allele_frequency != '' and \
                    risk_allele_frequency != 'NR':
                snp_description = \
                    str(risk_allele_frequency) + \
                    ' [risk allele frequency]'

            f = Feature(
                rs_id, risk_allele, Feature.types[r'SNP'],
                snp_description)
            if chrom_num != '' and chrom_pos != '':
                # a SNP is a single position: start and end coincide
                f.addFeatureStartLocation(chrom_pos, chrom_id)
                f.addFeatureEndLocation(chrom_pos, chrom_id)
            f.addFeatureToGraph(g)
            f.addTaxonToFeature(g, tax_id)
            # TODO consider adding allele frequency as property;
            # but would need background info to do that

            # also want to add other descriptive info about
            # the variant from the context
            for c in context.split(';'):
                cid = self._map_variant_type(c.strip())
                if cid is not None:
                    gu.addType(g, rs_id, cid)

            # add deprecation information
            # csv yields strings, so the flag must be compared to '1'
            # (comparing to the int 1 was always False)
            current_num = snp_id_current.strip()
            if merged.strip() == '1' and current_num != '':
                # get the current rs_id
                current_rs_id = 'dbSNP:'
                if not current_num.startswith('rs'):
                    current_rs_id += 'rs'
                current_rs_id += current_num
                # register the complete id; the hash values are sets
                if loc is not None:
                    loc_to_id_hash[loc].add(current_rs_id)
                gu.addDeprecatedIndividual(g, rs_id, current_rs_id)
                # TODO check on this
                # should we add the annotations to the current
                # or orig?
                gu.makeLeader(g, current_rs_id)
            else:
                gu.makeLeader(g, rs_id)

            # add the feature as a sequence alteration
            # affecting various genes
            # note that intronic variations don't necessarily list
            # the genes such as for rs10448080  FIXME
            for s in snp_gene_nums.split(','):
                s = s.strip()
                # still have to test for this,
                # because sometimes there's a leading comma
                if s != '':
                    geno.addAlleleOfGene(rs_id, 'NCBIGene:' + s)

            # add the up and downstream genes if they are available;
            # each guard checks the gene number that is actually used,
            # so an empty column can no longer yield a bare 'NCBIGene:'
            if downstream_gene_num != '':
                downstream_gene_id = 'NCBIGene:' + downstream_gene_num
                gu.addTriple(
                    g, rs_id,
                    Feature.object_properties[r'upstream_of_sequence_of'],
                    downstream_gene_id)
            if upstream_gene_num != '':
                upstream_gene_id = 'NCBIGene:' + upstream_gene_num
                gu.addTriple(
                    g, rs_id,
                    Feature.object_properties['downstream_of_sequence_of'],
                    upstream_gene_id)

            # Study description; not attached to anything yet
            # (see the FIXME in the association loop below).
            description = 'A study of ' + disease_or_trait + \
                ' in ' + initial_sample_description
            if replicate_sample_description != '':
                description = ' '.join(
                    (description, 'with', replicate_sample_description))
            if platform_with_snps_passing_qc != '':
                description = ' '.join(
                    (description, 'on platform',
                     platform_with_snps_passing_qc))
            description = ' '.join((description, '(p=' + pvalue + ')'))

            # make associations to the EFO terms; there can be >1
            if mapped_trait_uri.strip() != '':
                for t in mapped_trait_uri.split(','):
                    tid = cu.get_curie(t.strip())
                    assoc = G2PAssoc(
                        self.name, rs_id, tid,
                        gu.object_properties['contributes_to'])
                    assoc.add_source(pubmed_id)
                    # combinatorial evidence
                    # used in automatic assertion
                    eco_id = 'ECO:0000213'
                    assoc.add_evidence(eco_id)
                    # FIXME description and score (pvalue) should get
                    # added to provenance/study
                    assoc.add_association_to_graph(g)

            if not self.testMode and \
                    (limit is not None and line_counter > limit):
                break

    Assoc(self.name).load_all_properties(g)

    # loop through the location hash and report the locations that
    # carry more than one snp id (candidates for equivalence)
    for loc_key, snp_ids in loc_to_id_hash.items():
        if len(snp_ids) > 1:
            logger.info("%s has >1 snp id: %s", loc_key, str(snp_ids))
def _get_variants(self, limit):
    """
    Loop through the gzipped ClinVar variant_summary file and add each
    variant to the graph.

    Per row this adds the reference genome build, the chromosome (or
    cytogenetic band), the variant as a Feature with its location,
    HGVS synonyms, dbSNP/dbVar equivalents, RCV xrefs, a variant locus
    tied to the gene when a gene id is given, one association per listed
    phenotype, and OMIM/HGMD equivalents from the "other ids" column.

    :param limit: outside test mode, stop once more than ``limit`` data
        rows have been read; None means no limit
    :return: None
    :raises ValueError: when a data row does not have the expected
        29 tab-separated columns (after the mismatch is logged)
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph

    model = Model(g)
    geno = Genotype(g)

    # add the taxon and the genome
    tax_num = '9606'  # HARDCODE
    tax_id = 'NCBITaxon:' + tax_num
    tax_label = 'Human'
    model.addClassToGraph(tax_id, None)
    geno.addGenome(tax_id, tax_label)  # label gets added elsewhere

    # only needed for the test-mode filter; build once, not per row
    test_disease_ids = None
    if self.testMode:
        test_disease_ids = {str(i) for i in self.disease_ids}

    # not unzipping the file
    logger.info("Processing Variant records")
    line_counter = 0
    expected_numcols = 29
    myfile = '/'.join((self.rawdir, self.files['variant_summary']['file']))
    with gzip.open(myfile, 'rb') as infile:
        for line in infile:
            line = line.decode().strip()
            # skip blank lines and comments
            if not line or line.startswith('#'):
                continue

            # Columns of the file:
            # AlleleID               integer, //Measure/@ID in the XML
            # Type                   the type of variation
            # Name                   the preferred name for the variation
            # GeneID                 integer, GeneID in NCBI's Gene database
            # GeneSymbol             comma-separated list of GeneIDs
            #                        overlapping the variation
            # ClinicalSignificance   comma-separated list of values, see
            #     http://www.ncbi.nlm.nih.gov/clinvar/docs/clinsig/
            # RS# (dbSNP)            integer, rs# in dbSNP
            # nsv (dbVar)            the NSV identifier for the region
            # RCVaccession           list of RCV accessions for this variant
            # TestedInGTR            Y/N, test registered in the NIH GTR
            # PhenotypeIDs           list of db names and identifiers for
            #                        phenotype(s) reported for this variant
            # Origin                 list of all allelic origins
            # Assembly               assembly on which locations are based
            # Chromosome             chromosomal location
            # Start / Stop           integer, in pter->qter orientation
            # Cytogenetic            ISCN band
            # ReviewStatus           highest review status, see
            #     http://www.ncbi.nlm.nih.gov/clinvar/docs/variation_report/#interpretation
            # HGVS(c.) / HGVS(p.)    RefSeq cDNA / protein HGVS expression
            # NumberSubmitters       number of submissions with this variant
            # LastEvaluated          latest time any submitter reported
            #                        clinical significance
            # Guidelines             ACMG only right now (applies to the
            #                        Gene, not specifically to the allele)
            # OtherIDs               other identifiers or sources
            # VariantID              integer used to build the report URL,
            #     e.g. http://www.ncbi.nlm.nih.gov/clinvar/variation/1756/

            # a crude check that there's an expected number of cols.
            # if not, error out because something changed
            # (the unpacking below raises ValueError).
            cols = line.split('\t')
            if len(cols) != expected_numcols:
                logger.error(
                    "Unexpected number of columns in raw file " +
                    "(%d actual vs %d expected)",
                    len(cols), expected_numcols)

            (allele_num, allele_type, allele_name, gene_num, gene_symbol,
             clinical_significance, dbsnp_num, dbvar_num, rcv_nums,
             tested_in_gtr, phenotype_ids, origin, assembly, chrom, start,
             stop, cytogenetic_loc, review_status, hgvs_c, hgvs_p,
             number_of_submitters, last_eval, guidelines, other_ids,
             variant_num, reference_allele, alternate_allele, categories,
             chromosome_accession) = cols

            line_counter += 1

            pheno_list = []
            if phenotype_ids != '-':
                # trim any leading/trailing semicolons/commas
                phenotype_ids = re.sub(r'^[;,]', '', phenotype_ids)
                phenotype_ids = re.sub(r'[;,]$', '', phenotype_ids)
                pheno_list = re.split(r'[,;]', phenotype_ids)

            if self.testMode:
                # keep the row only if its gene, its variant, or one of
                # its phenotypes is in the test set
                if int(gene_num) not in self.gene_ids and \
                        int(variant_num) not in self.variant_ids and \
                        test_disease_ids.isdisjoint(pheno_list):
                    continue

            # TODO may need to switch on assembly to create correct
            # assembly/build identifiers
            build_id = ':'.join(('NCBIGenome', assembly))

            # make the reference genome build
            geno.addReferenceGenome(build_id, assembly, tax_id)

            allele_type_id = self._map_type_of_allele(allele_type)
            bandinbuild_id = None
            # reset per row so a row without a chromosome can neither
            # raise NameError nor reuse the previous row's chromosome
            chrinbuild_id = None
            if chrom == '':
                # check cytogenic location
                # oddly, they still put an assembly number even when
                # there's no numeric location
                if cytogenetic_loc.strip() != '' \
                        and '-' not in cytogenetic_loc:
                    # use cytogenic location to get the apx location.
                    # Pass the band string itself; splitting it on '-'
                    # handed makeChromID a one-element list.
                    band_id = makeChromID(cytogenetic_loc, tax_num, 'CHR')
                    geno.addChromosomeInstance(
                        cytogenetic_loc, build_id, assembly, band_id)
                    bandinbuild_id = makeChromID(
                        cytogenetic_loc, assembly, 'MONARCH')
                # else: can't deal with ranges yet
            else:
                # add the human chromosome class to the graph,
                # and add the build-specific version of it
                chr_id = makeChromID(chrom, tax_num, 'CHR')
                geno.addChromosomeClass(chrom, tax_id, tax_label)
                geno.addChromosomeInstance(
                    chrom, build_id, assembly, chr_id)
                chrinbuild_id = makeChromID(chrom, assembly, 'MONARCH')

            seqalt_id = ':'.join(('ClinVarVariant', variant_num))
            gene_id = None

            # they use -1 to indicate unknown gene
            if gene_num != '-1' and gene_num != 'more than 10':
                if re.match(r'^Gene:', gene_num):
                    # NOTE(review): this rewrites gene_num but leaves
                    # gene_id unset, so such rows are treated as having
                    # no gene below - confirm whether that is intended
                    gene_num = "NCBI" + gene_num
                else:
                    gene_id = ':'.join(('NCBIGene', gene_num))

            # FIXME there are some "variants" that are actually haplotypes
            # probably will get taken care of when we switch to processing
            # the xml for example, variant_num = 38562
            # but there's no way to tell if it's a haplotype
            # in the csv data so the dbsnp or dbvar
            # should probably be primary,
            # and the variant num be the vslc,
            # with each of the dbsnps being added to it

            # TODO clinical significance needs to be mapped to
            # a list of terms

            # first, make the variant (graph first, like every other
            # Feature/Model call in this method):
            feature = Feature(g, seqalt_id, allele_name, allele_type_id)

            if chrinbuild_id is not None:
                if start != '-' and start.strip() != '':
                    feature.addFeatureStartLocation(start, chrinbuild_id)
                if stop != '-' and stop.strip() != '':
                    feature.addFeatureEndLocation(stop, chrinbuild_id)

            feature.addFeatureToGraph()
            feature.addTaxonToFeature(tax_id)
            # make the ClinVarVariant the clique leader
            model.makeLeader(seqalt_id)

            if bandinbuild_id is not None:
                feature.addSubsequenceOfFeature(bandinbuild_id)

            # CHECK - this makes the assumption that there is
            # only one affected chromosome per variant what happens with
            # chromosomal rearrangement variants?
            # shouldn't both chromosomes be here?

            # add the hgvs as synonyms
            if hgvs_c != '-' and hgvs_c.strip() != '':
                model.addSynonym(seqalt_id, hgvs_c)
            if hgvs_p != '-' and hgvs_p.strip() != '':
                model.addSynonym(seqalt_id, hgvs_p)

            # add the dbsnp and dbvar ids as equivalent
            if dbsnp_num != '-' and int(dbsnp_num) != -1:
                dbsnp_id = 'dbSNP:rs' + str(dbsnp_num)
                model.addIndividualToGraph(dbsnp_id, None)
                model.addSameIndividual(seqalt_id, dbsnp_id)
            if dbvar_num != '-':
                dbvar_id = 'dbVar:' + dbvar_num
                model.addIndividualToGraph(dbvar_id, None)
                model.addSameIndividual(seqalt_id, dbvar_id)

            # TODO - not sure if this is right... add as xref?
            # the rcv is like the combo of the phenotype with the variant
            if rcv_nums != '-':
                for rcv_num in rcv_nums.split(';'):
                    rcv_id = 'ClinVar:' + rcv_num
                    model.addIndividualToGraph(rcv_id, None)
                    model.addXref(seqalt_id, rcv_id)

            if gene_id is not None:
                # add the gene
                model.addClassToGraph(gene_id, gene_symbol)
                # make a variant locus
                vl_id = '_' + gene_num + '-' + variant_num
                if self.nobnodes:
                    vl_id = ':' + vl_id
                vl_label = allele_name
                model.addIndividualToGraph(
                    vl_id, vl_label, geno.genoparts['variant_locus'])
                geno.addSequenceAlterationToVariantLocus(seqalt_id, vl_id)
                geno.addAlleleOfGene(vl_id, gene_id)
            else:
                # some basic reporting
                # capture the symbol inside the parentheses; without the
                # capture group this branch could never be reached
                gmatch = re.search(r'\((\w+)\)', allele_name)
                if gmatch is not None:
                    logger.info(
                        "Gene found in allele label, but no id provided: %s",
                        gmatch.group(1))
                elif re.match(r'more than 10', gene_symbol):
                    logger.info(
                        "More than 10 genes found; "
                        "need to process XML to fetch (variant=%d)",
                        int(variant_num))
                else:
                    logger.info(
                        "No gene listed for variant %d", int(variant_num))

            # parse the list of "phenotypes" which are diseases.
            # add them as an association
            # ;GeneReviews:NBK1440,MedGen:C0392514,OMIM:235200,SNOMED CT:35400008;MedGen:C3280096,OMIM:614193;MedGen:CN034317,OMIM:612635;MedGen:CN169374
            # the list is both semicolon delimited and comma delimited,
            # but i don't know why! some are bad, like:
            # Orphanet:ORPHA ORPHA319705,SNOMED CT:49049000
            # (pheno_list is empty when the column is '-')
            for phenotype in pheno_list:
                m = re.match(
                    r"(Orphanet:ORPHA(?:\s*ORPHA)?)", phenotype)
                if m is not None:
                    phenotype = phenotype.strip().replace(
                        m.group(1), 'Orphanet:')
                elif re.match(r'ORPHA:\d+', phenotype):
                    phenotype = re.sub(
                        r'^ORPHA', 'Orphanet', phenotype.strip())
                elif re.match(r'Human Phenotype Ontology', phenotype):
                    phenotype = re.sub(
                        r'^Human Phenotype Ontology', '',
                        phenotype.strip())
                elif re.match(r'SNOMED CT:\s?', phenotype):
                    phenotype = re.sub(
                        r'SNOMED CT:\s?', 'SNOMED:', phenotype.strip())
                elif re.match(r'^Gene:', phenotype):
                    continue

                assoc = G2PAssoc(
                    g, self.name, seqalt_id, phenotype.strip())
                assoc.add_association_to_graph()

            if other_ids != '-':
                # process the "other ids" ex:
                # CFTR2:F508del,HGMD:CD890142,OMIM Allelic Variant:602421.0001
                # TODO make more xrefs
                for xrefid in other_ids.split(','):
                    prefix = xrefid.split(':')[0].strip()
                    if prefix == 'OMIM Allelic Variant':
                        xrefid = 'OMIM:' + xrefid.split(':')[1]
                        model.addIndividualToGraph(xrefid, None)
                        model.addSameIndividual(seqalt_id, xrefid)
                    elif prefix == 'HGMD':
                        model.addIndividualToGraph(xrefid, None)
                        model.addSameIndividual(seqalt_id, xrefid)
                    # Every other prefix is deliberately ignored for now:
                    # dbVar ids repeat the dbvar column handled above,
                    # prefixes containing whitespace are malformed, and
                    # the remaining clean prefixes are not added as xrefs
                    # yet.

            if not self.testMode and limit is not None \
                    and line_counter > limit:
                break

    logger.info("Finished parsing variants")
def _process_QTLs_genomic_location(
        self, raw, taxon_id, build_id, build_label, limit=None):
    """
    Parse a gzipped, tab-delimited QTL location file and add each QTL,
    its trait association and its genomic location to the graph.

    Per data row this adds the QTL individual and its taxon, the
    publication (when a PUBMED_ID attribute is present), a QTL-to-trait
    association with evidence ECO:0000061 (scored with the P-value when
    it parses as a number), the chromosome instance for the build, and
    the QTL feature with fuzzy start/end positions.

    :param raw: path of the gzipped input file
    :param taxon_id: taxon identifier of the species in this file
    :param build_id: identifier of the genome build
    :param build_label: label of the genome build
    :param limit: outside test mode, stop once more than ``limit`` lines
        have been read; None means no limit
    :return: None
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    model = Model(g)
    line_counter = 0
    geno = Genotype(g)
    # assume that chrs get added to the genome elsewhere

    eco_id = "ECO:0000061"  # Quantitative Trait Analysis Evidence
    found_bad_attrs = False

    logger.info("Processing QTL locations for %s", taxon_id)
    with gzip.open(raw, 'rt', encoding='ISO-8859-1') as tsvfile:
        reader = csv.reader(tsvfile, delimiter="\t")
        for row in reader:
            line_counter += 1
            # skip blank lines and comments
            if not row or row[0].startswith('#'):
                continue

            (chromosome, qtl_source, qtl_type, start_bp, stop_bp, frame,
             strand, score, attr) = row

            # Chr.Z   Animal QTLdb    Production_QTL  33954873  34023581  .  .  .
            # QTL_ID=2242;Name="Spleen percentage";Abbrev="SPLP";PUBMED_ID=17012160;trait_ID=2234;
            # trait="Spleen percentage";breed="leghorn";"FlankMarkers=ADL0022";VTO_name="spleen mass";
            # CMO_name="spleen weight to body weight ratio";Map_Type="Linkage";Model="Mendelian";
            # Test_Base="Chromosome-wise";Significance="Significant";P-value="<0.05";F-Stat="5.52";
            # Variance="2.94";Dominance_Effect="-0.002";Additive_Effect="0.01"

            # make dictionary of attributes
            # keys are:
            # QTL_ID,Name,Abbrev,PUBMED_ID,trait_ID,trait,FlankMarkers,
            # VTO_name,Map_Type,Significance,P-value,Model,
            # Test_Base,Variance, Bayes-value,PTO_name,gene_IDsrc,peak_cM,
            # CMO_name,gene_ID,F-Stat,LOD-score,Additive_Effect,
            # Dominance_Effect,Likelihood_Ratio,LS-means,Breed,
            # trait (duplicate with Name)

            # deal with poorly formed attributes: drop the value-less
            # '"FlankMarkers";' entry (the old substitution pattern did
            # not include the quote and so never matched)
            attr = attr.replace('"FlankMarkers";', '')

            attribute_dict = {}
            for item in attr.replace('"', '').split(';'):
                if '=' not in item:
                    # not a key=value pair: leave it out; an empty item
                    # is just a trailing ';'
                    if item.strip() != '':
                        found_bad_attrs = True
                    continue
                # split on the first '=' only, values may contain '='
                key, value = item.split('=', 1)
                attribute_dict[key] = value

            qtl_num = attribute_dict.get('QTL_ID')
            if self.testMode and int(qtl_num) not in self.test_ids:
                continue

            # make association between QTL and trait
            qtl_id = 'AQTL:' + str(qtl_num)
            model.addIndividualToGraph(qtl_id, None, geno.genoparts['QTL'])
            geno.addTaxon(taxon_id, qtl_id)

            trait_id = 'AQTLTrait:' + attribute_dict.get('trait_ID')

            # if pub is in attributes, add it to the association
            pub_id = None
            if 'PUBMED_ID' in attribute_dict:
                pub_id = attribute_dict['PUBMED_ID']
                if re.match(r'ISU.*', pub_id):
                    pub_id = 'AQTLPub:' + pub_id.strip()
                    reference = Reference(g, pub_id)
                else:
                    pub_id = 'PMID:' + pub_id.strip()
                    reference = Reference(
                        g, pub_id, Reference.ref_types['journal_article'])
                reference.addRefToGraph()

            # Add QTL to graph
            assoc = G2PAssoc(
                g, self.name, qtl_id, trait_id,
                model.object_properties['is_marker_for'])
            assoc.add_evidence(eco_id)
            if pub_id is not None:
                assoc.add_source(pub_id)
            if 'P-value' in attribute_dict:
                # values look like '<0.05'; some use a decimal comma
                pvalue_text = attribute_dict['P-value'] \
                    .replace('<', '').replace(',', '.')
                # str.isnumeric() is False for anything with a '.', so
                # parse the number directly instead
                try:
                    pvalue = float(pvalue_text)
                except ValueError:
                    pvalue = None
                if pvalue is not None:
                    assoc.set_score(pvalue)

            assoc.add_association_to_graph()
            # TODO make association to breed
            # (which means making QTL feature in Breed background)

            # get location of QTL
            chromosome = re.sub(r'Chr\.', '', chromosome)
            chrom_id = makeChromID(chromosome, taxon_id, 'CHR')

            chrom_in_build_id = \
                makeChromID(chromosome, build_id, 'MONARCH')
            geno.addChromosomeInstance(
                chromosome, build_id, build_label, chrom_id)
            qtl_feature = Feature(g, qtl_id, None, geno.genoparts['QTL'])
            if start_bp == '':
                start_bp = None
            qtl_feature.addFeatureStartLocation(
                start_bp, chrom_in_build_id, strand,
                [Feature.types['FuzzyPosition']])
            if stop_bp == '':
                stop_bp = None
            qtl_feature.addFeatureEndLocation(
                stop_bp, chrom_in_build_id, strand,
                [Feature.types['FuzzyPosition']])
            qtl_feature.addTaxonToFeature(taxon_id)
            qtl_feature.addFeatureToGraph()

            if not self.testMode and \
                    limit is not None and line_counter > limit:
                break

    # only warn when a malformed attribute was actually seen
    if found_bad_attrs:
        logger.warning("Bad attribute flags in this file")
    logger.info("Done with QTL genomic mappings for %s", taxon_id)
def _process_data(self, raw, limit=None):
    """
    This function will process the data files from Coriell.
    We make the assumption that any alleles listed are variants
    (alternates to w.t.)

    Triples: (examples)

    :NIGMSrepository
        a CLO_0000008 #repository
        label : NIGMS Human Genetic Cell Repository
        foaf:page
         https://catalog.coriell.org/0/sections/collections/NIGMS/?SsId=8

    line_id
        a CL_0000057,  #fibroblast line
        derives_from patient_id
        part_of :NIGMSrepository
        RO:model_of OMIM:disease_id

    patient id
        a foaf:person,
        label: "fibroblast from patient 12345 with disease X"
        member_of family_id  #what is the right thing here?
        SIO:race EFO:caucasian  #subclass of EFO:0001799
        in_taxon NCBITaxon:9606
        dc:description Literal(remark)
        RO:has_phenotype OMIM:disease_id
        GENO:has_genotype genotype_id

    family_id
        a owl:NamedIndividual
        foaf:page
         "https://catalog.coriell.org/0/Sections/BrowseCatalog/FamilyTypeSubDetail.aspx?PgId=402&fam=2104&coll=GM"

    genotype_id
        a intrinsic_genotype
        GENO:has_alternate_part allelic_variant_id
        we don't necessarily know much about the genotype,
        other than the allelic variant. also there's the sex here

    pub_id mentions cell_line_id

    Each non-empty CSV row is unpacked into 25 named columns and turned
    into a cell line, a patient, an optional family, an optional
    genotype (karyotype and/or OMIM allelic variants), disease
    associations and publication mentions.

    :param raw: path of the Coriell CSV file to read
        (opened as iso-8859-1; the first row is skipped as a header)
    :param limit: when not None and not in test mode, stop after the
        row that pushes the count of non-empty rows past this value
    :return: None

    """
    logger.info("Processing Data from %s", raw)
    gu = GraphUtils(curie_map.get())

    if self.testMode:      # set the graph to build
        g = self.testgraph
    else:
        g = self.graph

    line_counter = 0
    geno = Genotype(g)
    du = DipperUtil()

    gu.loadProperties(g, geno.object_properties, gu.OBJPROP)
    gu.loadAllProperties(g)

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
        next(filereader, None)  # skip the header row
        for row in filereader:
            if not row:
                # blank line: nothing to do, and it is not counted
                continue

            line_counter += 1

            (catalog_id, description, omim_number, sample_type,
             cell_line_available, dna_in_stock, dna_ref, gender, age,
             race, ethnicity, affected, karyotype, relprob, mutation,
             gene, family_id, collection, url, cat_remark, pubmed_ids,
             family_member, variant_id, dbsnp_id, species) = row

            # example:
            # GM00003,HURLER SYNDROME,607014,Fibroblast,Yes,No,,Female,26 YR,Caucasian,,,,
            # parent,,,39,NIGMS Human Genetic Cell Repository,
            # http://ccr.coriell.org/Sections/Search/Sample_Detail.aspx?Ref=GM00003,
            # 46;XX; clinically normal mother of a child with Hurler syndrome; proband not in Repository,,
            # 2,,18343,H**o sapiens

            if self.testMode and catalog_id not in self.test_lines:
                # skip rows not in our test lines, when in test mode
                continue

            # ###########    BUILD REQUIRED VARIABLES    ###########

            # Make the cell line ID
            cell_line_id = 'Coriell:'+catalog_id.strip()

            # Map the cell/sample type
            cell_type = self._map_cell_type(sample_type)

            # Make a cell line label
            line_label = \
                collection.partition(' ')[0]+'-'+catalog_id.strip()

            # Map the repository/collection
            repository = self._map_collection(collection)

            # patients are uniquely identified by one of:
            # dbsnp id (which is == an individual haplotype)
            # family id + family member (if present) OR
            # probands are usually family member zero
            # cell line id
            # since some patients have >1 cell line derived from them,
            # we must make sure that the genotype is attached to
            # the patient, and can be inferred to the cell line
            # examples of repeated patients are:
            #   famid=1159, member=1; fam=152,member=1

            # Make the patient ID

            # make an anonymous patient
            patient_id = '_person'
            if self.nobnodes:
                patient_id = ':'+patient_id
            if family_id != '':
                patient_id = \
                    '-'.join((patient_id, family_id, family_member))
            else:
                # make an anonymous patient
                patient_id = '-'.join((patient_id, catalog_id.strip()))

            # properties of the individual patients:  sex, family id,
            # member/relproband, description descriptions are
            # really long and ugly SCREAMING text, so need to clean up
            # the control cases are so odd with this labeling scheme;
            # but we'll deal with it as-is for now.
            short_desc = (description.split(';')[0]).capitalize()
            if affected == 'Yes':
                affected = 'affected'
            elif affected == 'No':
                affected = 'unaffected'
            gender = gender.lower()
            patient_label = ' '.join((affected, gender, relprob))
            if relprob == 'proband':
                patient_label = \
                    ' '.join(
                        (patient_label.strip(), 'with', short_desc))
            else:
                patient_label = \
                    ' '.join(
                        (patient_label.strip(), 'of proband with',
                         short_desc))

            # #############    BUILD THE CELL LINE    #############

            # Adding the cell line as a typed individual.
            cell_line_reagent_id = 'CLO:0000031'

            gu.addIndividualToGraph(
                g, cell_line_id, line_label, cell_line_reagent_id)

            # add the equivalent id == dna_ref
            if dna_ref != '' and dna_ref != catalog_id:
                equiv_cell_line = 'Coriell:'+dna_ref
                # some of the equivalent ids are not defined
                # in the source data; so add them
                gu.addIndividualToGraph(
                    g, equiv_cell_line, None, cell_line_reagent_id)
                gu.addSameIndividual(g, cell_line_id, equiv_cell_line)

            # Cell line derives from patient
            geno.addDerivesFrom(cell_line_id, patient_id)
            geno.addDerivesFrom(cell_line_id, cell_type)

            # Cell line a member of repository
            gu.addMember(g, repository, cell_line_id)

            if cat_remark != '':
                gu.addDescription(g, cell_line_id, cat_remark)

            # Cell age_at_sampling
            # TODO add the age nodes when modeled properly in #78
            # if (age != ''):
            #    this would give a BNode that is an instance of Age.
            #    but i don't know how to connect
            #    the age node to the cell line? we need to ask @mbrush
            #    age_id = '_'+re.sub('\s+','_',age)
            #    gu.addIndividualToGraph(
            #       g,age_id,age,self.terms['age'])
            #    gu.addTriple(
            #       g,age_id,self.properties['has_measurement'],age,
            #       True)

            # #############    BUILD THE PATIENT    #############

            # Add the patient ID as an individual.
            gu.addPerson(g, patient_id, patient_label)
            # TODO map relationship to proband as a class
            # (what ontology?)

            # Add race of patient
            # FIXME: Adjust for subcategories based on ethnicity field
            # EDIT: There are 743 different entries for ethnicity...
            # Too many to map?
            # Add ethnicity as literal in addition to the mapped race?
            # Adjust the ethnicity txt (if using)
            # to initial capitalization to remove ALLCAPS

            # TODO race should go into the individual's background
            # and abstracted out to the Genotype class punting for now.
            # if race != '':
            #    mapped_race = self._map_race(race)
            #    if mapped_race is not None:
            #        gu.addTriple(
            #           g,patient_id,self.terms['race'],mapped_race)
            #        gu.addSubclass(
            #           g,self.terms['ethnic_group'],mapped_race)

            # #############    BUILD THE FAMILY    #############

            # Add triples for family_id, if present.
            if family_id != '':
                family_comp_id = 'CoriellFamily:'+family_id

                family_label = \
                    ' '.join(('Family of proband with', short_desc))

                # Add the family ID as a named individual
                gu.addIndividualToGraph(
                    g, family_comp_id, family_label,
                    geno.genoparts['family'])

                # Add the patient as a member of the family
                gu.addMemberOf(g, patient_id, family_comp_id)

            # #############    BUILD THE GENOTYPE   #############

            # the important things to pay attention to here are:
            # karyotype = chr rearrangements  (somatic?)
            # mutation = protein-level mutation as a label,
            # often from omim
            # gene = gene symbol - TODO get id
            # variant_id = omim variant ids (; delimited)
            # dbsnp_id = snp individual ids = full genotype?

            # note GM00633 is a good example of chromosomal variation
            # - do we have enough to capture this?
            # GM00325 has both abnormal karyotype and variation

            # make an assumption that if the taxon is blank,
            # that it is human!
            if species is None or species == '':
                species = 'H**o sapiens'
            taxon = self._map_species(species)

            # if there's a dbSNP id,
            # this is actually the individual's genotype
            genotype_id = None
            genotype_label = None
            if dbsnp_id != '':
                genotype_id = 'dbSNPIndividual:'+dbsnp_id.strip()

            omim_map = {}
            gvc_id = None
            # Reset the per-row labels explicitly.  They are only assigned
            # on some paths below; without the reset a row would either
            # hit an unbound name or silently reuse the previous row's text.
            gvc_label = None
            vl = None

            # some of the karyotypes are encoded
            # with terrible hidden codes. remove them here
            # i've seen a <98> character
            karyotype = du.remove_control_characters(karyotype)
            karyotype_id = None
            if karyotype.strip() != '':
                karyotype_id = \
                    '_'+re.sub('MONARCH:', '', self.make_id(karyotype))
                if self.nobnodes:
                    karyotype_id = ':'+karyotype_id
                # add karyotype as karyotype_variation_complement
                gu.addIndividualToGraph(
                    g, karyotype_id, karyotype,
                    geno.genoparts['karyotype_variation_complement'])
                # TODO break down the karyotype into parts
                # and map into GENO. depends on #77

                # place the karyotype in a location(s).
                karyo_chrs = \
                    self._get_affected_chromosomes_from_karyotype(
                        karyotype)
                for c in karyo_chrs:
                    chr_id = makeChromID(c, taxon, 'CHR')
                    # add an anonymous sequence feature,
                    # each located on chr
                    karyotype_feature_id = '-'.join((karyotype_id, c))
                    karyotype_feature_label = \
                        'some karyotype alteration on chr'+str(c)
                    f = Feature(
                        karyotype_feature_id, karyotype_feature_label,
                        geno.genoparts['sequence_alteration'])
                    f.addFeatureStartLocation(None, chr_id)
                    f.addFeatureToGraph(g)
                    f.loadAllProperties(g)
                    geno.addParts(
                        karyotype_feature_id, karyotype_id,
                        geno.object_properties['has_alternate_part'])

            if gene != '':
                vl = gene+'('+mutation+')'

            # fix the variant_id so it's always in the same order
            vids = variant_id.split(';')
            variant_id = ';'.join(sorted(set(vids)))

            if karyotype.strip() != '' \
                    and not self._is_normal_karyotype(karyotype):
                mutation = mutation.strip()
                gvc_id = karyotype_id
                if variant_id != '':
                    gvc_id = '_' + variant_id.replace(';', '-') + '-' \
                        + re.sub(r'\w*:', '', karyotype_id)
                # the gene(mutation) text can only be used when a gene
                # symbol was given on this row
                if mutation != '' and vl is not None:
                    gvc_label = '; '.join((vl, karyotype))
                else:
                    gvc_label = karyotype
            elif variant_id.strip() != '':
                gvc_id = '_' + variant_id.replace(';', '-')
                gvc_label = vl  # None when the row has no gene symbol
            else:
                # wildtype?
                pass

            if gvc_id is not None and gvc_id != karyotype_id \
                    and self.nobnodes:
                gvc_id = ':'+gvc_id

            # add the karyotype to the gvc.
            # use reference if normal karyotype
            karyo_rel = geno.object_properties['has_alternate_part']
            if self._is_normal_karyotype(karyotype):
                karyo_rel = \
                    geno.object_properties['has_reference_part']
            if karyotype_id is not None \
                    and not self._is_normal_karyotype(karyotype) \
                    and gvc_id is not None and karyotype_id != gvc_id:
                geno.addParts(karyotype_id, gvc_id, karyo_rel)

            if variant_id.strip() != '':
                # split the variants & add them as part of the genotype
                # we don't necessarily know their zygosity,
                # just that they are part of the genotype variant ids
                # are from OMIM, so prefix as such we assume that the
                # sequence alts will be defined in OMIM not here
                # TODO sort the variant_id list, if the omim prefix is
                # the same, then assume it's the locus make a hashmap
                # of the omim id to variant id list;
                # then build the genotype hashmap is also useful for
                # removing the "genes" from the list of "phenotypes"

                # will hold gene/locus id to variant list
                omim_map = {}

                for v in variant_id.split(';'):
                    # handle omim-style and odd var ids
                    # like 610661.p.R401X
                    m = re.match(r'(\d+)\.+(.*)', v.strip())
                    if m is None:
                        # Do not fall through with the previous variant's
                        # locus/number (or with nothing bound at all).
                        logger.warning(
                            "Skipping variant id '%s' on %s: "
                            "not in locus.variant form",
                            v, catalog_id)
                        continue
                    (locus_num, var_num) = m.groups()
                    omim_map.setdefault(locus_num, []).append(var_num)

                for o in omim_map:
                    # gene_id = 'OMIM:' + o  # TODO unused
                    vslc_id = \
                        '_' + '-'.join(
                            [o + '.' + a for a in omim_map.get(o)])
                    if self.nobnodes:
                        vslc_id = ':'+vslc_id
                    vslc_label = vl
                    # we don't really know the zygosity of
                    # the alleles at all.
                    # so the vslcs are just a pot of them
                    gu.addIndividualToGraph(
                        g, vslc_id, vslc_label,
                        geno.genoparts[
                            'variant_single_locus_complement'])
                    for v in omim_map.get(o):
                        # this is actually a sequence alt
                        allele1_id = 'OMIM:'+o+'.'+v
                        geno.addSequenceAlteration(allele1_id, None)

                        # assume that the sa -> var_loc -> gene
                        # is taken care of in OMIM
                        geno.addPartsToVSLC(
                            vslc_id, allele1_id, None,
                            geno.zygosity['indeterminate'],
                            geno.object_properties[
                                'has_alternate_part'])

                    if vslc_id != gvc_id:
                        geno.addVSLCtoParent(vslc_id, gvc_id)

            if affected == 'unaffected':
                # let's just say that this person is wildtype
                gu.addType(g, patient_id, geno.genoparts['wildtype'])
            elif genotype_id is None:
                # make an anonymous genotype id
                genotype_id = '_geno'+catalog_id.strip()
                if self.nobnodes:
                    genotype_id = ':'+genotype_id

            # add the gvc
            if gvc_id is not None:
                gu.addIndividualToGraph(
                    g, gvc_id, gvc_label,
                    geno.genoparts['genomic_variation_complement'])

                # add the gvc to the genotype
                if genotype_id is not None:
                    if affected == 'unaffected':
                        rel = \
                            geno.object_properties[
                                'has_reference_part']
                    else:
                        rel = \
                            geno.object_properties[
                                'has_alternate_part']
                    geno.addParts(gvc_id, genotype_id, rel)
                if karyotype_id is not None \
                        and self._is_normal_karyotype(karyotype):
                    if gvc_label is not None and gvc_label != '':
                        genotype_label = \
                            '; '.join((gvc_label, karyotype))
                    else:
                        genotype_label = karyotype
                    if genotype_id is None:
                        genotype_id = karyotype_id
                    else:
                        geno.addParts(
                            karyotype_id, genotype_id,
                            geno.object_properties[
                                'has_reference_part'])
                else:
                    genotype_label = gvc_label
                # use the catalog id as the background
                background = '['+catalog_id.strip()+']'
                if genotype_label is None:
                    # no gene symbol was available to build a label from
                    genotype_label = background
                else:
                    genotype_label += ' '+background

            if genotype_id is not None and gvc_id is not None:
                # only add the genotype if it has some parts
                geno.addGenotype(
                    genotype_id, genotype_label,
                    geno.genoparts['intrinsic_genotype'])
                geno.addTaxon(taxon, genotype_id)
                # add that the patient has the genotype
                # TODO check if the genotype belongs to
                # the cell line or to the patient
                gu.addTriple(
                    g, patient_id,
                    geno.properties['has_genotype'], genotype_id)
            else:
                geno.addTaxon(taxon, patient_id)

            # TODO: Add sex/gender  (as part of the karyotype?)

            # #############    DEAL WITH THE DISEASES   #############

            # we associate the disease to the patient
            if affected == 'affected' and omim_number != '':
                for d in omim_number.split(';'):
                    # strip first so the gene lookup and the emitted id
                    # are based on the same text
                    d = d.strip()
                    if d == '':
                        continue
                    # if the omim number is in omim_map,
                    # then it is a gene not a pheno
                    if d in omim_map:
                        logger.info(
                            'removing %s from disease list ' +
                            'since it is a gene', d)
                        continue
                    disease_id = 'OMIM:'+d
                    # assume the label is taken care of
                    gu.addClassToGraph(g, disease_id, None)

                    # add the association:
                    #   the patient has the disease
                    assoc = G2PAssoc(
                        self.name, patient_id, disease_id)
                    assoc.add_association_to_graph(g)

                    # this line is a model of this disease
                    # TODO abstract out model into
                    # it's own association class?
                    gu.addTriple(
                        g, cell_line_id,
                        gu.properties['model_of'],
                        disease_id)

            # #############    ADD PUBLICATIONS   #############

            if pubmed_ids != '':
                for s in pubmed_ids.split(';'):
                    pubmed_id = 'PMID:'+s.strip()
                    ref = Reference(pubmed_id)
                    ref.setType(Reference.ref_types['journal_article'])
                    ref.addRefToGraph(g)
                    gu.addTriple(
                        g, pubmed_id,
                        gu.properties['mentions'], cell_line_id)

            if not self.testMode \
                    and (limit is not None and line_counter > limit):
                break

    Assoc(self.name).load_all_properties(g)

    return
def _get_variants(self, limit):
    """
    Currently loops through the variant_summary file.

    Reads the gzipped, tab-delimited ClinVar variant summary named by
    ``self.files['variant_summary']['file']`` under ``self.rawdir`` and,
    for every non-comment row, adds the variant as a sequence feature
    together with its genome build, chromosome (or cytogenetic band),
    synonyms, equivalent dbSNP/dbVar/OMIM/HGMD ids, RCV xrefs, gene /
    variant-locus links and variant-to-phenotype associations.

    :param limit: when not None and not in test mode, stop after the
        row that pushes the processed-row count past this value
    :return: None

    """

    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph

    model = Model(g)

    geno = Genotype(g)
    # NOTE(review): this instance is never used below; it is kept only in
    # case constructing a Feature has side effects -- confirm and remove.
    f = Feature(g, None, None, None)

    # add the taxon and the genome
    tax_num = '9606'  # HARDCODE
    tax_id = 'NCBITaxon:' + tax_num
    tax_label = 'Human'
    model.addClassToGraph(tax_id, None)
    geno.addGenome(tax_id, tax_label)  # label gets added elsewhere

    # not unzipping the file
    logger.info("Processing Variant records")
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files['variant_summary']['file']))
    with gzip.open(myfile, 'rb') as infile:
        for line in infile:
            # skip comments
            line = line.decode().strip()
            if re.match(r'^#', line):
                continue

            # AlleleID               integer value as stored in the AlleleID field in ClinVar  (//Measure/@ID in the XML)
            # Type                   character, the type of variation
            # Name                   character, the preferred name for the variation
            # GeneID                 integer, GeneID in NCBI's Gene database
            # GeneSymbol             character, comma-separated list of GeneIDs overlapping the variation
            # ClinicalSignificance   character, comma-separated list of values of clinical significance reported for this variation
            #                          for the mapping between the terms listed here and the integers in the .VCF files, see
            #                          http://www.ncbi.nlm.nih.gov/clinvar/docs/clinsig/
            # RS# (dbSNP)            integer, rs# in dbSNP
            # nsv (dbVar)            character, the NSV identifier for the region in dbVar
            # RCVaccession           character, list of RCV accessions that report this variant
            # TestedInGTR            character, Y/N for Yes/No if there is a test registered as specific to this variation in the NIH Genetic Testing Registry (GTR)
            # PhenotypeIDs           character, list of db names and identifiers for phenotype(s) reported for this variant
            # Origin                 character, list of all allelic origins for this variation
            # Assembly               character, name of the assembly on which locations are based
            # Chromosome             character, chromosomal location
            # Start                  integer, starting location, in pter->qter orientation
            # Stop                   integer, end location, in pter->qter orientation
            # Cytogenetic            character, ISCN band
            # ReviewStatus           character, highest review status for reporting this measure. For the key to the terms,
            #                            and their relationship to the star graphics ClinVar displays on its web pages,
            #                            see http://www.ncbi.nlm.nih.gov/clinvar/docs/variation_report/#interpretation
            # HGVS(c.)               character, RefSeq cDNA-based HGVS expression
            # HGVS(p.)               character, RefSeq protein-based HGVS expression
            # NumberSubmitters       integer, number of submissions with this variant
            # LastEvaluated          datetime, the latest time any submitter reported clinical significance
            # Guidelines             character, ACMG only right now, for the reporting of incidental variation in a Gene
            #                                (NOTE: if ACMG, not a specific to the allele but to the Gene)
            # OtherIDs               character, list of other identifiers or sources of information about this variant
            # VariantID              integer, the value used to build the URL for the current default report,
            #                            e.g. http://www.ncbi.nlm.nih.gov/clinvar/variation/1756/
            #

            # a crude check that there's an expected number of cols.
            # if not, error out because something changed.
            # (the unpacking below raises ValueError on a mismatch)
            cols = line.split('\t')
            num_cols = len(cols)
            expected_numcols = 29
            if num_cols != expected_numcols:
                logger.error(
                    "Unexpected number of columns in raw file " +
                    "(%d actual vs %d expected)",
                    num_cols, expected_numcols)

            (allele_num, allele_type, allele_name, gene_num, gene_symbol,
             clinical_significance, dbsnp_num, dbvar_num, rcv_nums,
             tested_in_gtr, phenotype_ids, origin, assembly, chrom, start,
             stop, cytogenetic_loc, review_status, hgvs_c, hgvs_p,
             number_of_submitters, last_eval, guidelines, other_ids,
             variant_num, reference_allele, alternate_allele, categories,
             chromosome_accession) = cols

            # ###set filter=None in init if you don't want to have a filter
            # if self.filter is not None:
            #    if ((self.filter == 'taxids' and\
            #            (int(tax_num) not in self.tax_ids)) or\
            #            (self.filter == 'geneids' and\
            #            (int(gene_num) not in self.gene_ids))):
            #        continue
            # #### end filter

            line_counter += 1

            pheno_list = []
            if phenotype_ids != '-':
                # trim any leading/trailing semicolons/commas
                phenotype_ids = re.sub(r'^[;,]', '', phenotype_ids)
                phenotype_ids = re.sub(r'[;,]$', '', phenotype_ids)
                pheno_list = re.split(r'[,;]', phenotype_ids)

            if self.testMode:
                # get intersection of test disease ids
                # and these phenotype_ids
                intersect = \
                    {str(i) for i in self.disease_ids} & set(pheno_list)
                if int(gene_num) not in self.gene_ids and\
                        int(variant_num) not in self.variant_ids and\
                        len(intersect) < 1:
                    continue

            # TODO may need to switch on assembly to create correct
            # assembly/build identifiers
            build_id = ':'.join(('NCBIGenome', assembly))

            # make the reference genome build
            geno.addReferenceGenome(build_id, assembly, tax_id)

            allele_type_id = self._map_type_of_allele(allele_type)
            bandinbuild_id = None
            # Reset per row: this is only known when the chromosome column
            # is filled in, and must not leak from an earlier row.
            chrinbuild_id = None
            if str(chrom) == '':
                # check cytogenic location
                if str(cytogenetic_loc).strip() != '':
                    # use cytogenic location to get the apx location
                    # oddly, they still put an assembly number even when
                    # there's no numeric location
                    if not re.search(r'-', str(cytogenetic_loc)):
                        # a single band (no range): hand the band string
                        # itself to makeChromID, as the chromosome calls
                        # below do, rather than the list re.split returns
                        band = str(cytogenetic_loc)
                        band_id = makeChromID(band, tax_num, 'CHR')
                        geno.addChromosomeInstance(
                            cytogenetic_loc, build_id, assembly, band_id)
                        bandinbuild_id = makeChromID(
                            band, assembly, 'MONARCH')
                    else:
                        # can't deal with ranges yet
                        pass
            else:
                # add the human chromosome class to the graph,
                # and add the build-specific version of it
                chr_id = makeChromID(str(chrom), tax_num, 'CHR')
                geno.addChromosomeClass(str(chrom), tax_id, tax_label)
                geno.addChromosomeInstance(
                    str(chrom), build_id, assembly, chr_id)
                chrinbuild_id = makeChromID(str(chrom), assembly, 'MONARCH')

            seqalt_id = ':'.join(('ClinVarVariant', variant_num))
            gene_id = None

            # they use -1 to indicate unknown gene
            if str(gene_num) != '-1' and str(gene_num) != 'more than 10':
                if re.match(r'^Gene:', gene_num):
                    # NOTE(review): gene_id stays None on this path, so the
                    # prefixed value is never used -- confirm the intent.
                    gene_num = "NCBI" + gene_num
                else:
                    gene_id = ':'.join(('NCBIGene', str(gene_num)))

            # FIXME there are some "variants" that are actually haplotypes
            # probably will get taken care of when we switch to processing
            # the xml for example, variant_num = 38562
            # but there's no way to tell if it's a haplotype
            # in the csv data so the dbsnp or dbvar
            # should probably be primary,
            # and the variant num be the vslc,
            # with each of the dbsnps being added to it

            # TODO clinical significance needs to be mapped to
            # a list of terms
            # first, make the variant:
            # (graph goes first, as in every other graph-bound constructor
            # used in this method)
            feat = Feature(g, seqalt_id, allele_name, allele_type_id)

            # positions can only be placed on a known chromosome
            if chrinbuild_id is not None:
                if start != '-' and start.strip() != '':
                    feat.addFeatureStartLocation(start, chrinbuild_id)
                if stop != '-' and stop.strip() != '':
                    feat.addFeatureEndLocation(stop, chrinbuild_id)

            feat.addFeatureToGraph()
            feat.addTaxonToFeature(tax_id)
            # make the ClinVarVariant the clique leader
            model.makeLeader(seqalt_id)

            if bandinbuild_id is not None:
                feat.addSubsequenceOfFeature(bandinbuild_id)

            # CHECK - this makes the assumption that there is
            # only one affected chromosome per variant what happens with
            # chromosomal rearrangement variants?
            # shouldn't both chromosomes be here?

            # add the hgvs as synonyms
            if hgvs_c != '-' and hgvs_c.strip() != '':
                model.addSynonym(seqalt_id, hgvs_c)
            if hgvs_p != '-' and hgvs_p.strip() != '':
                model.addSynonym(seqalt_id, hgvs_p)

            # add the dbsnp and dbvar ids as equivalent
            if dbsnp_num != '-' and int(dbsnp_num) != -1:
                dbsnp_id = 'dbSNP:rs' + str(dbsnp_num)
                model.addIndividualToGraph(dbsnp_id, None)
                model.addSameIndividual(seqalt_id, dbsnp_id)
            if dbvar_num != '-':
                dbvar_id = 'dbVar:' + dbvar_num
                model.addIndividualToGraph(dbvar_id, None)
                model.addSameIndividual(seqalt_id, dbvar_id)

            # TODO - not sure if this is right... add as xref?
            # the rcv is like the combo of the phenotype with the variant
            if rcv_nums != '-':
                for rcv_num in re.split(r';', rcv_nums):
                    rcv_id = 'ClinVar:' + rcv_num
                    model.addIndividualToGraph(rcv_id, None)
                    model.addXref(seqalt_id, rcv_id)

            if gene_id is not None:
                # add the gene
                model.addClassToGraph(gene_id, gene_symbol)
                # make a variant locus
                vl_id = '_' + gene_num + '-' + variant_num
                if self.nobnodes:
                    vl_id = ':' + vl_id
                vl_label = allele_name
                model.addIndividualToGraph(
                    vl_id, vl_label, geno.genoparts['variant_locus'])
                geno.addSequenceAlterationToVariantLocus(seqalt_id, vl_id)
                geno.addAlleleOfGene(vl_id, gene_id)
            else:
                # some basic reporting
                # capture the parenthesised symbol so it can be reported
                gmatch = re.search(r'\((\w+)\)', allele_name)
                if gmatch is not None:
                    logger.info(
                        "Gene found in allele label, but no id provided: %s",
                        gmatch.group(1))
                elif re.match(r'more than 10', gene_symbol):
                    logger.info(
                        "More than 10 genes found; "
                        "need to process XML to fetch (variant=%d)",
                        int(variant_num))
                else:
                    logger.info(
                        "No gene listed for variant %d", int(variant_num))

            # parse the list of "phenotypes" which are diseases.
            # add them as an association
            # ;GeneReviews:NBK1440,MedGen:C0392514,OMIM:235200,SNOMED CT:35400008;MedGen:C3280096,OMIM:614193;MedGen:CN034317,OMIM:612635;MedGen:CN169374
            # the list is both semicolon delimited and comma delimited,
            # but i don't know why! some are bad, like:
            # Orphanet:ORPHA ORPHA319705,SNOMED CT:49049000
            if phenotype_ids != '-':
                for phenotype in pheno_list:
                    m = re.match(
                        r"(Orphanet:ORPHA(?:\s*ORPHA)?)", phenotype)
                    if m is not None and len(m.groups()) > 0:
                        phenotype = re.sub(
                            m.group(1), 'Orphanet:', phenotype.strip())
                    elif re.match(r'ORPHA:\d+', phenotype):
                        phenotype = re.sub(
                            r'^ORPHA', 'Orphanet', phenotype.strip())
                    elif re.match(r'Human Phenotype Ontology', phenotype):
                        phenotype = re.sub(
                            r'^Human Phenotype Ontology', '',
                            phenotype.strip())
                    elif re.match(r'SNOMED CT:\s?', phenotype):
                        phenotype = re.sub(
                            r'SNOMED CT:\s?', 'SNOMED:', phenotype.strip())
                    elif re.match(r'^Gene:', phenotype):
                        # a gene, not a phenotype
                        continue

                    assoc = G2PAssoc(
                        g, self.name, seqalt_id, phenotype.strip())
                    assoc.add_association_to_graph()

            if other_ids != '-':
                id_list = other_ids.split(',')
                # process the "other ids" ex:
                # CFTR2:F508del,HGMD:CD890142,OMIM Allelic Variant:602421.0001
                # TODO make more xrefs
                for xrefid in id_list:
                    prefix = xrefid.split(':')[0].strip()
                    if prefix == 'OMIM Allelic Variant':
                        xrefid = 'OMIM:' + xrefid.split(':')[1]
                        model.addIndividualToGraph(xrefid, None)
                        model.addSameIndividual(seqalt_id, xrefid)
                    elif prefix == 'HGMD':
                        model.addIndividualToGraph(xrefid, None)
                        model.addSameIndividual(seqalt_id, xrefid)
                    # Everything else is deliberately ignored for now:
                    # the dbVar id that repeats dbvar_num, prefixes that
                    # contain whitespace, and other clean prefixes
                    # (xrefs for them are not emitted yet).

            if not self.testMode and limit is not None \
                    and line_counter > limit:
                break

    logger.info("Finished parsing variants")

    return
def _transform_entry(self, e, graph):
    """
    Add one OMIM API entry to ``graph``.

    An entry whose status is 'removed' is only marked as a deprecated
    class.  Any other entry is added as a class with its cleaned-up
    label, synonyms and definition; when a gene map is present a
    feature is placed on its chromosome / cytogenetic band; a 'moved'
    entry is deprecated in favour of its replacement ids; finally the
    phenotypic-series parents, mapped ids, mapped gene ids, publications
    and allelic variants are processed through the sibling helpers.

    :param e: dict holding the record under the key 'entry'; the keys
        read here are 'mimNumber', 'titles', 'status' and, when
        present, 'geneMapExists', 'geneMap' and 'movedTo'
    :param graph: graph the triples are written to
    :return: None

    """
    g = graph
    model = Model(g)
    geno = Genotype(graph)

    tax_num = '9606'
    tax_id = 'NCBITaxon:9606'
    tax_label = 'Human'
    build_num = "GRCh38"
    build_id = "NCBIGenome:"+build_num

    # get the numbers, labels, and descriptions
    omimnum = e['entry']['mimNumber']
    titles = e['entry']['titles']
    label = titles['preferredTitle']

    other_labels = []
    if 'alternativeTitles' in titles:
        other_labels += self._get_alt_labels(titles['alternativeTitles'])
    if 'includedTitles' in titles:
        other_labels += self._get_alt_labels(titles['includedTitles'])

    # add synonyms of alternate labels
    # preferredTitle": "PFEIFFER SYNDROME",
    # "alternativeTitles":
    #   "ACROCEPHALOSYNDACTYLY, TYPE V; ACS5;;\nACS V;;\nNOACK SYNDROME",
    # "includedTitles":
    #   "CRANIOFACIAL-SKELETAL-DERMATOLOGIC DYSPLASIA, INCLUDED"

    # remove the abbreviation (comes after the ;) from the preferredTitle,
    # and add it as a synonym
    abbrev = None
    label_parts = re.split(r';', label)
    if len(label_parts) > 1:
        abbrev = label_parts[1].strip()
    newlabel = self._cleanup_label(label)

    description = self._get_description(e['entry'])
    omimid = 'OMIM:'+str(omimnum)

    if e['entry']['status'] == 'removed':
        model.addDeprecatedClass(omimid)
    else:
        omimtype = self._get_omimtype(e['entry'])
        nodelabel = newlabel
        # this uses our cleaned-up label
        if omimtype == Genotype.genoparts['heritable_phenotypic_marker']:
            if abbrev is not None:
                nodelabel = abbrev
            # in this special case,
            # make it a disease by not declaring it as a gene/marker
            model.addClassToGraph(omimid, nodelabel, None, newlabel)
        elif omimtype == Genotype.genoparts['gene']:
            if abbrev is not None:
                nodelabel = abbrev
            model.addClassToGraph(omimid, nodelabel, omimtype, newlabel)
        else:
            model.addClassToGraph(omimid, newlabel, omimtype)

        # add the original screaming-caps OMIM label as a synonym
        model.addSynonym(omimid, label)

        # add the alternate labels and includes as synonyms
        for other_label in other_labels:
            model.addSynonym(omimid, other_label, 'OIO:hasRelatedSynonym')

        # for OMIM, we're adding the description as a definition
        model.addDefinition(omimid, description)
        if abbrev is not None:
            model.addSynonym(omimid, abbrev, 'OIO:hasRelatedSynonym')

        # if this is a genetic locus (but not sequenced)
        #   then add the chrom loc info
        # but add it to the ncbi gene identifier,
        # not to the omim id (we reserve the omim id to be the phenotype)
        feature_id = None
        feature_label = None
        if 'geneMapExists' in e['entry'] and e['entry']['geneMapExists']:
            genemap = e['entry']['geneMap']
            is_gene = False

            if omimtype == \
                    Genotype.genoparts['heritable_phenotypic_marker']:
                # get the ncbigene ids
                ncbifeature = self._get_mapped_gene_ids(e['entry'], g)
                if len(ncbifeature) == 1:
                    feature_id = 'NCBIGene:'+str(ncbifeature[0])
                    # add this feature as a cause for the omim disease
                    # TODO SHOULD I EVEN DO THIS HERE?
                    assoc = G2PAssoc(g, self.name, feature_id, omimid)
                    assoc.add_association_to_graph()

                elif len(ncbifeature) > 1:
                    logger.info(
                        "Its ambiguous when %s maps to >1 gene id: %s",
                        omimid, str(ncbifeature))
                else:  # no ncbi feature, make an anonymous one
                    feature_id = self._make_anonymous_feature(str(omimnum))
                    feature_label = abbrev

            elif omimtype == Genotype.genoparts['gene']:
                feature_id = omimid
                is_gene = True
            else:
                # 158900 falls into this category
                feature_id = self._make_anonymous_feature(str(omimnum))
                if abbrev is not None:
                    feature_label = abbrev
                omimtype = \
                    Genotype.genoparts[
                        'heritable_phenotypic_marker']

            if feature_id is not None:
                if 'comments' in genemap:
                    # add a comment to this feature
                    comment = genemap['comments']
                    if comment.strip() != '':
                        model.addDescription(feature_id, comment)
                if 'cytoLocation' in genemap:
                    cytoloc = genemap['cytoLocation']
                    # parse the cytoloc.
                    # add this omim thing as
                    # a subsequence of the cytofeature
                    # 18p11.3-p11.2
                    # FIXME
                    # add the other end of the range,
                    # but not sure how to do that
                    # not sure if saying subsequence of feature
                    # is the right relationship

                    f = Feature(g, feature_id, feature_label, omimtype)
                    if 'chromosomeSymbol' in genemap:
                        chrom_num = str(genemap['chromosomeSymbol'])
                        chrom = makeChromID(chrom_num, tax_num, 'CHR')
                        geno.addChromosomeClass(
                            chrom_num, tax_id, tax_label)

                        # add the positional information, if available
                        # (-1 marks "not provided")
                        fstart = fend = -1
                        if 'chromosomeLocationStart' in genemap:
                            fstart = genemap['chromosomeLocationStart']
                        if 'chromosomeLocationEnd' in genemap:
                            fend = genemap['chromosomeLocationEnd']
                        if fstart >= 0:
                            # make the build-specific chromosome
                            chrom_in_build = makeChromID(
                                chrom_num, build_num, 'MONARCH')
                            # then, add the chromosome instance
                            # (from the given build)
                            geno.addChromosomeInstance(
                                chrom_num, build_id, build_num, chrom)
                            if omimtype == \
                                    Genotype.genoparts[
                                        'heritable_phenotypic_marker']:
                                postypes = [Feature.types['FuzzyPosition']]
                            else:
                                postypes = None
                            # NOTE that no strand information
                            # is available in the API
                            f.addFeatureStartLocation(
                                fstart, chrom_in_build, None, postypes)
                            if fend >= 0:
                                f.addFeatureEndLocation(
                                    fend, chrom_in_build, None, postypes)
                                # only meaningful when an end was given;
                                # the -1 sentinel must not be reported
                                if fstart > fend:
                                    logger.info(
                                        "start>end (%d>%d) for %s",
                                        fstart, fend, omimid)
                    # add the cytogenic location too
                    # for now, just take the first one
                    cytoloc = cytoloc.split('-')[0]
                    loc = makeChromID(cytoloc, tax_num, 'CHR')
                    model.addClassToGraph(loc, None)
                    f.addSubsequenceOfFeature(loc)
                    f.addFeatureToGraph(True, None, is_gene)

            # end adding causative genes/features

        # check if moved, if so,
        # make it deprecated and
        # replaced consider class to the other thing(s)
        # some entries have been moved to multiple other entries and
        # use the joining raw word "and"
        # 612479 is movedto:  "603075 and 603029"  OR
        # others use a comma-delimited list, like:
        # 610402 is movedto: "609122,300870"
        if e['entry']['status'] == 'moved':
            moved_to = str(e['entry']['movedTo'])
            # one split covers "and"-joined, comma-joined and single ids
            newids = re.split(r'and|,', moved_to)
            # cleanup whitespace and add OMIM prefix to numeric portion
            fixedids = ['OMIM:'+i.strip() for i in newids]

            model.addDeprecatedClass(omimid, fixedids)

        self._get_phenotypicseries_parents(e['entry'], g)
        self._get_mappedids(e['entry'], g)
        self._get_mapped_gene_ids(e['entry'], g)

        self._get_pubs(e['entry'], g)

        self._get_process_allelic_variants(e['entry'], g)  # temp gag

    return
def _process_qtls_genomic_location(
        self, raw, txid, build_id, build_label, common_name, limit=None):
    """
    Add QTLs and their genomic locations from a gzipped, tab-delimited file.

    Each non-comment row must have nine columns; the ninth is a
    ``;``-separated list of ``key=value`` attributes (``QTL_ID``,
    ``trait_ID``, ``PUBMED_ID``, ``P-value``, ...).

    Triples created, per row:
    * the QTL as an individual of type ``self.globaltt['QTL']`` with its taxon
    * the publication reference, when ``PUBMED_ID`` is present
    * a QTL-to-trait association ("is marker for") with evidence, source
      and, when the P-value parses as a number, a score
    * the chromosome instance in the given build and the QTL feature with
      fuzzy start/end positions on it

    :param raw: path of the gzipped input file
    :param txid: taxon number; appended to ``NCBITaxon:``
    :param build_id: identifier of the genome build
    :param build_label: label of the genome build
    :param common_name: prefix used to mint the QTL curie
        (``<common_name>QTL:<QTL_ID>``)
    :param limit: outside test mode, stop once more than this many lines
        have been read
    :return: None
    """
    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)
    geno = Genotype(graph)
    line_counter = 0
    # assume that chrs get added to the genome elsewhere

    taxon_curie = 'NCBITaxon:' + txid
    eco_id = self.globaltt['quantitative trait analysis evidence']
    LOG.info("Processing QTL locations for %s from %s", taxon_curie, raw)
    with gzip.open(raw, 'rt', encoding='ISO-8859-1') as tsvfile:
        reader = csv.reader(tsvfile, delimiter="\t")
        for row in reader:
            line_counter += 1
            if re.match(r'^#', ' '.join(row)):
                continue

            # Example row:
            # Chr.Z  Animal QTLdb  Production_QTL  33954873  34023581 ...
            # QTL_ID=2242;Name="Spleen percentage";Abbrev="SPLP";
            # PUBMED_ID=17012160;trait_ID=2234;trait="Spleen percentage";
            # breed="leghorn";"FlankMarkers=ADL0022";VTO_name="spleen mass";
            # ...;Significance="Significant";P-value="<0.05";F-Stat="5.52"
            (chromosome, qtl_source, qtl_type, start_bp, stop_bp, frame,
             strand, score, attr) = row

            # make dictionary of attributes
            # keys are:
            # QTL_ID,Name,Abbrev,PUBMED_ID,trait_ID,trait,FlankMarkers,
            # VTO_name,Map_Type,Significance,P-value,Model,
            # Test_Base,Variance, Bayes-value,PTO_name,gene_IDsrc,peak_cM,
            # CMO_name,gene_ID,F-Stat,LOD-score,Additive_Effect,
            # Dominance_Effect,Likelihood_Ratio,LS-means,Breed,
            # trait (duplicate with Name)

            # deal with poorly formed attributes
            if re.search(r'"FlankMarkers";', attr):
                attr = re.sub(r'FlankMarkers;', '', attr)
            attr_items = re.sub(r'"', '', attr).split(";")
            # Keep only well-formed key=value items.  Split on the first '='
            # only, so a value that itself contains '=' cannot break dict().
            # Items are taken in file order, so a repeated key resolves
            # deterministically (last one wins).
            attribute_dict = dict(
                item.split("=", 1) for item in attr_items if '=' in item)

            qtl_num = attribute_dict.get('QTL_ID')
            if self.test_mode and int(qtl_num) not in self.test_ids:
                continue

            # make association between QTL and trait based on taxon
            qtl_id = common_name + 'QTL:' + str(qtl_num)
            model.addIndividualToGraph(qtl_id, None, self.globaltt['QTL'])
            geno.addTaxon(taxon_curie, qtl_id)

            trait_id = 'AQTLTrait:' + attribute_dict.get('trait_ID')

            # if pub is in attributes, add it to the association
            pub_id = None
            if 'PUBMED_ID' in attribute_dict:
                pub_id = attribute_dict.get('PUBMED_ID')
                if re.match(r'ISU.*', pub_id):
                    pub_id = 'AQTLPub:' + pub_id.strip()
                    reference = Reference(graph, pub_id)
                else:
                    pub_id = 'PMID:' + pub_id.strip()
                    reference = Reference(
                        graph, pub_id, self.globaltt['journal article'])
                reference.addRefToGraph()

            # Add QTL to graph
            assoc = G2PAssoc(
                graph, self.name, qtl_id, trait_id,
                self.globaltt['is marker for'])
            assoc.add_evidence(eco_id)
            # NOTE(review): pub_id is None when the row has no PUBMED_ID --
            # confirm that add_source() accepts None.
            assoc.add_source(pub_id)
            if 'P-value' in attribute_dict:
                # strip the "<" of "<0.05" and accept a decimal comma
                scr = attribute_dict['P-value'].replace('<', '')
                scr = scr.replace(',', '.')
                # str.isnumeric() is False for "0.05" (the dot is not a
                # numeric character), so parse the value instead of testing it
                try:
                    p_value = float(scr)
                except ValueError:
                    p_value = None  # not a number; no score to record
                if p_value is not None:
                    assoc.set_score(p_value)

            assoc.add_association_to_graph()
            # TODO make association to breed
            # (which means making QTL feature in Breed background)

            # get location of QTL
            chromosome = re.sub(r'Chr\.', '', chromosome)
            chrom_id = makeChromID(chromosome, taxon_curie, 'CHR')

            chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH')
            geno.addChromosomeInstance(
                chromosome, build_id, build_label, chrom_id)
            qtl_feature = Feature(graph, qtl_id, None, self.globaltt['QTL'])
            if start_bp == '':
                start_bp = None
            qtl_feature.addFeatureStartLocation(
                start_bp, chrom_in_build_id, strand,
                [self.globaltt['FuzzyPosition']])
            if stop_bp == '':
                stop_bp = None
            qtl_feature.addFeatureEndLocation(
                stop_bp, chrom_in_build_id, strand,
                [self.globaltt['FuzzyPosition']])
            qtl_feature.addTaxonToFeature(taxon_curie)
            qtl_feature.addFeatureToGraph()
            if not self.test_mode and limit is not None \
                    and line_counter > limit:
                break

    # LOG.warning("Bad attribute flags in this file")
    LOG.info("Done with QTL genomic mappings for %s", taxon_curie)
    return
def _process_genes(self, limit=None):
    """
    Add HGNC genes from the tab-delimited "genes" file to the graph.

    The first row is the header and is handed to ``self.check_fileheader``.
    For each following row this adds the gene class (or marks it deprecated
    when ``locus_type`` is "withdrawn"), equivalences to NCBIGene, ENSEMBL
    and OMIM gene identifiers, the taxon, "is about" triples for the listed
    PubMed ids, and the chromosome or chromosome band the gene lies on.

    :param limit: outside test mode, stop once the reader's line number
        exceeds this value
    :return: None
    """
    graph = self.testgraph if self.test_mode else self.graph

    geno = Genotype(graph)
    model = Model(graph)

    raw = '/'.join((self.rawdir, self.files['genes']['file']))
    col = self.files['genes']['columns']
    LOG.info("Processing HGNC genes")

    # NOTE(review): inside [...] the '$' is a literal dollar sign, not an
    # end-of-string anchor, so a location without a p/q arm does not match.
    # Confirm that this is intended.
    chr_pattern = re.compile(r'(\d+|X|Y|Z|W|MT)[pq$]')
    band_pattern = re.compile(r'([pq][A-H\d]?\d?(?:\.\d+)?)')

    # Resolve the positions of the columns we read once, not once per row.
    # (The remaining HGNC columns are not used here.)
    hgnc_id_ix = col.index('hgnc_id')
    symbol_ix = col.index('symbol')
    name_ix = col.index('name')
    locus_type_ix = col.index('locus_type')
    location_ix = col.index('location')
    entrez_id_ix = col.index('entrez_id')
    ensembl_gene_id_ix = col.index('ensembl_gene_id')
    pubmed_id_ix = col.index('pubmed_id')
    omim_id_ix = col.index('omim_id')

    with open(raw, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')

        row = next(filereader)
        # the result is not acted upon; the check is kept for whatever it
        # reports itself
        self.check_fileheader(col, row)

        for row in filereader:
            hgnc_id = row[hgnc_id_ix].strip()
            symbol = row[symbol_ix].strip()
            name = row[name_ix].strip()
            locus_type = row[locus_type_ix].strip()
            location = row[location_ix].strip()
            entrez_id = row[entrez_id_ix].strip()
            ensembl_gene_id = row[ensembl_gene_id_ix].strip()
            pubmed_ids = row[pubmed_id_ix].strip()  # pipe separated!
            omim_ids = row[omim_id_ix].strip()  # pipe separated!

            if self.test_mode and entrez_id != '' and \
                    entrez_id not in self.gene_ids:
                continue

            if name == '':
                name = None

            if locus_type == 'withdrawn':
                model.addDeprecatedClass(hgnc_id)
            elif symbol.endswith('@'):
                # 10) region (HOX), RNA cluster, gene (PCDH)
                # endswith() also copes with an empty symbol
                continue
            else:
                gene_type_id = self.resolve(locus_type, mandatory=False)
                # resolve() handing back its input means "no mapping found"
                if gene_type_id != locus_type:
                    model.addClassToGraph(hgnc_id, symbol, gene_type_id, name)
                model.makeLeader(hgnc_id)

            if entrez_id != '':
                model.addEquivalentClass(hgnc_id, 'NCBIGene:' + entrez_id)

            if ensembl_gene_id != '':
                model.addEquivalentClass(hgnc_id, 'ENSEMBL:' + ensembl_gene_id)

            for omim_id in omim_ids.split('|'):
                if omim_id in self.omim_replaced:
                    repl = self.omim_replaced[omim_id]
                    LOG.warning('%s is replaced with %s', omim_id, repl)
                    # keep the last replacement that is typed as a gene
                    for omim in repl:
                        if self.omim_type[omim] == self.globaltt['gene']:
                            omim_id = omim

                if omim_id in self.omim_type and \
                        self.omim_type[omim_id] == self.globaltt['gene']:
                    model.addEquivalentClass(hgnc_id, 'OMIM:' + omim_id)

            geno.addTaxon(self.hs_txid, hgnc_id)

            # add pubs as "is about"
            # ''.split('|') is [''], so skip blanks to avoid a bare "PMID:"
            for pubmed_id in pubmed_ids.split('|'):
                pubmed_id = pubmed_id.strip()
                if pubmed_id == '':
                    continue
                graph.addTriple(
                    'PMID:' + pubmed_id, self.globaltt['is_about'], hgnc_id)

            # add chr location
            # sometimes two are listed, like: 10p11.2 or 17q25
            # -- there are only 2 of these FRA10A and MPFD
            # sometimes listed like "1 not on reference assembly"
            # sometimes listed like 10q24.1-q24.3
            # sometimes like 11q11 alternate reference locus
            chr_match = chr_pattern.match(location)
            if chr_match is not None and chr_match.groups():
                chrom = chr_match.group(1)
                chrom_id = makeChromID(chrom, self.hs_txid, 'CHR')
                band_match = band_pattern.search(location)
                feat = Feature(graph, hgnc_id, None, None)
                if band_match is not None and band_match.groups():
                    band = chrom + band_match.group(1)
                    # add the chr band as the parent to this gene
                    # as a feature but assume that the band is created
                    # as a class with properties elsewhere in Monochrom
                    band_id = makeChromID(band, self.hs_txid, 'CHR')
                    model.addClassToGraph(band_id, None)
                    feat.addSubsequenceOfFeature(band_id)
                else:
                    model.addClassToGraph(chrom_id, None)
                    feat.addSubsequenceOfFeature(chrom_id)

            if not self.test_mode and limit is not None and \
                    filereader.line_num > limit:
                break
def process_feature_loc(self, limit):
    """
    Add WormBase features and their locations from the gzipped
    "feature_loc" file (tab-delimited, GFF-style columns).

    Only a fixed set of feature types is processed.  For each kept row the
    feature identifier is taken from the ``ID`` or ``variation`` attribute
    (or from a ``WBsf`` ``Name``), labels, synonyms and descriptions are
    added, and the feature is placed on its chromosome in the build
    ``WormBase:<self.version_num>``.

    :param limit: stop once the reader's line number exceeds this value;
        ``None`` means no limit
    :return: None
    """
    src_key = 'feature_loc'
    raw = '/'.join((self.rawdir, self.files[src_key]['file']))

    graph = self.graph
    model = Model(graph)
    geno = Genotype(graph)
    LOG.info("Processing: %s", self.files[src_key]['file'])

    # strain -> set of variant ids; collected "for later processing" but
    # not consumed within the visible part of this function
    strain_to_variant_map = {}
    build_num = self.version_num
    build_id = 'WormBase:' + build_num

    col = self.files[src_key]['columns']
    # look the column positions up once rather than for every row
    seqid_ix = col.index('seqid')
    type_ix = col.index('type')
    start_ix = col.index('start')
    end_ix = col.index('end')
    strand_ix = col.index('strand')
    attributes_ix = col.index('attributes')

    # note biological_regions include balancers
    # other options here: promoter, regulatory_region, reagent
    wanted_types = (
        'gene', 'point_mutation', 'deletion', 'RNAi_reagent', 'duplication',
        'enhancer', 'binding_site', 'biological_region',
        'complex_substitution', 'substitution', 'insertion',
        'inverted_repeat')

    # Example rows:
    # I  interpolated_pmap_position  gene  1  559768  .  .  .
    #    ID=gmap:spe-13;gmap=spe-13;status=uncloned;Note=-21.3602 cM (+/- 1.84 cM)
    # I  WormBase  gene  3747  3909  .  -  .
    #    ID=Gene:WBGene00023193;Name=WBGene00023193;
    #    interpolated_map_position=-21.9064;sequence_name=Y74C9A.6;
    #    biotype=snoRNA;Alias=Y74C9A.6
    # I  absolute_pmap_position  gene  4119  10230  .  .  .
    #    ID=gmap:homt-1;gmap=homt-1;status=cloned;Note=-21.8252 cM (+/- 0.00 cM)
    with gzip.open(raw, 'rb') as csvfile:
        reader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""),
            delimiter='\t', quotechar='\"')
        for row in reader:
            if re.match(r'\#', ''.join(row)):
                continue
            chrom = row[seqid_ix]
            feature_type_label = row[type_ix]
            start = row[start_ix]
            end = row[end_ix]
            strand = row[strand_ix]
            attributes = row[attributes_ix]

            if feature_type_label not in wanted_types:
                continue

            attribute_dict = {}
            if attributes != '':
                # str.replace() returns a new string; keep the result
                attributes = attributes.replace('"', '')
                # split on the first '=' only and ignore items without one,
                # so a stray ';' or an '=' inside a value cannot crash dict()
                attribute_dict = dict(
                    atv.split('=', 1)
                    for atv in attributes.split(";") if '=' in atv)

            fid = flabel = desc = None
            if 'ID' in attribute_dict:
                fid = attribute_dict['ID']
                if re.search(r'WB(Gene|Var|sf)', fid):
                    fid = re.sub(r'^\w+:WB', 'WormBase:WB', fid)
                elif re.match(r'(gmap|landmark)', fid):
                    continue
                else:
                    LOG.info('other identifier %s', fid)
                    fid = None
            elif 'variation' in attribute_dict:
                # if it's a variation:
                # variation=WBVar00604246;public_name=gk320600;
                # strain=VC20384;substitution=C/T
                fid = 'WormBase:' + attribute_dict['variation']
                flabel = attribute_dict.get('public_name')
                sub = attribute_dict.get('substitution')
                ins = attribute_dict.get('insertion')
                desc = ''
                if sub is not None:
                    desc = 'substitution=' + sub
                if ins is not None:
                    desc = 'insertion=' + ins

                # keep track of the strains with this variation,
                # for later processing
                strain_list = attribute_dict.get('strain')
                if strain_list is not None:
                    for strn in strain_list.split(','):
                        strain_to_variant_map.setdefault(
                            strn.strip(), set()).add(fid)

            # if feature_type_label == 'RNAi_reagent':
            #     Target=WBRNAi00096030 1 4942
            #     this will tell us where the RNAi is actually binding;
            #     it will be the reagent-targeted-gene that has a position,
            #     (i think)
            # TODO finish the RNAi binding location

            name = attribute_dict.get('Name')
            polymorphism = attribute_dict.get('polymorphism')

            if fid is None:
                if name is not None and re.match(r'WBsf', name):
                    fid = 'WormBase:' + name
                    name = None
                else:
                    continue

            # these really aren't that interesting
            if polymorphism is not None:
                continue

            # plain substring test: the name is data, not a regex pattern
            if name is not None and name not in fid:
                if flabel is None:
                    flabel = name
                else:
                    model.addSynonym(fid, name)

            if desc is not None and desc != '':
                model.addDescription(fid, desc)

            alias = attribute_dict.get('Alias')
            biotype = attribute_dict.get('biotype')
            note = attribute_dict.get('Note')
            other_name = attribute_dict.get('other_name')
            # add each synonym that is present (previously other_name was
            # added for both, so Alias was never recorded)
            for synonym in (alias, other_name):
                if synonym is not None:
                    model.addSynonym(fid, synonym)

            if feature_type_label == 'gene':
                ftype_id = self.resolve(biotype)
            else:
                # so far, they all come with SO label syntax.
                # resolve if need be.
                ftype_id = self.globaltt[feature_type_label]
            chr_id = makeChromID(chrom, build_id, 'CHR')
            geno.addChromosomeInstance(chrom, build_id, build_num)

            feature = Feature(graph, fid, flabel, ftype_id)
            feature.addFeatureStartLocation(start, chr_id, strand)
            # the end location comes from the 'end' column
            # (it used to be written with the start coordinate)
            feature.addFeatureEndLocation(end, chr_id, strand)

            feature_is_class = feature_type_label == 'gene'
            feature.addFeatureToGraph(True, None, feature_is_class)

            if note is not None and note != '':
                model.addDescription(fid, note)

            if limit is not None and reader.line_num > limit:
                break

    # RNAi reagents look like:
    # I  RNAi_primary  RNAi_reagent  4184  10232  .  +  .
    #    Target=WBRNAi00001601 1 6049 +;laboratory=YK;history_name=SA:yk326e10
    # I  RNAi_primary  RNAi_reagent  4223  10147  .  +  .
    #    Target=WBRNAi00033465 1 5925 +;laboratory=SV;history_name=MV_SV:mv_G_YK5052
    # I  RNAi_primary  RNAi_reagent  5693  9391  .  +  .
    #    Target=WBRNAi00066135 1 3699 +;laboratory=CH

    # TODO TF binding sites and network
def _process_qtls_genetic_location(
        self, raw, txid, common_name, limit=None):
    """
    Add QTLs and their genetic-map (cM) locations from a tab-delimited file.

    Every row must have 32 columns.  Triples created, per row:
    * the QTL feature, its taxon and its fuzzy start/end position on the
      chromosome of a "<common_name> genetic map" reference genome
    * an individual for the peak marker when it is an rs-id, cross
      referenced from the QTL
    * the gene (when it can be taken to be an NCBI gene) as affected locus
      of the QTL, plus a variant locus linking the rs-id to that gene
    * the trait class and the publication reference
    * "is marker for" associations QTL -> trait and rs-id -> trait, with
      evidence, source and (when the P-value is numeric) a score

    :param raw: path of the input file
    :param txid: taxon number; appended to ``NCBITaxon:``
    :param common_name: prefix used to mint the QTL curie and to name the
        genetic map
    :param limit: outside test mode, stop once more than this many lines
        have been read
    :return: None
    """
    graph = self.testgraph if self.testMode else self.graph
    line_counter = 0
    geno = Genotype(graph)
    model = Model(graph)
    eco_id = self.globaltt['quantitative trait analysis evidence']

    taxon_curie = 'NCBITaxon:' + txid

    LOG.info("Processing genetic location for %s from %s", taxon_curie, raw)
    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            (qtl_id, qtl_symbol, trait_name, assotype, empty, chromosome,
             position_cm, range_cm, flankmark_a2, flankmark_a1, peak_mark,
             flankmark_b1, flankmark_b2, exp_id, model_id, test_base,
             sig_level, lod_score, ls_mean, p_values, f_statistics,
             variance, bayes_value, likelihood_ratio, trait_id, dom_effect,
             add_effect, pubmed_id, gene_id, gene_id_src, gene_id_type,
             empty2) = row

            if self.testMode and int(qtl_id) not in self.test_ids:
                continue

            qtl_id = common_name + 'QTL:' + qtl_id.strip()
            trait_id = 'AQTLTrait:' + trait_id.strip()

            # Add QTL to graph
            feature = Feature(graph, qtl_id, qtl_symbol, self.globaltt['QTL'])
            feature.addTaxonToFeature(taxon_curie)

            # deal with the chromosome
            chrom_id = makeChromID(chromosome, taxon_curie, 'CHR')

            # add a version of the chromosome which is defined as
            # the genetic map
            build_id = 'MONARCH:'+common_name.strip()+'-linkage'
            build_label = common_name+' genetic map'
            geno.addReferenceGenome(build_id, build_label, taxon_curie)
            chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH')
            geno.addChromosomeInstance(
                chromosome, build_id, build_label, chrom_id)

            start = stop = None
            # range_cm sometimes ends in "(Mb)" (i.e pig 2016 Nov)
            range_cm = re.split(r'\(', range_cm)[0]

            if re.search(r'[0-9].*-.*[0-9]', range_cm):
                range_parts = re.split(r'-', range_cm)

                # check for poorly formed ranges
                if len(range_parts) == 2 and \
                        range_parts[0] != '' and range_parts[1] != '':
                    (start, stop) = [
                        int(float(x.strip())) for x in range_parts]
                else:
                    LOG.info(
                        "A cM range we can't handle for QTL %s: %s",
                        qtl_id, range_cm)
            elif position_cm != '':
                match = re.match(r'([0-9]*\.[0-9]*)', position_cm)
                if match is not None:
                    position_cm = match.group()
                    start = stop = int(float(position_cm))

            # FIXME remove converion to int for start/stop
            # when schema can handle floats add in the genetic location
            # based on the range
            feature.addFeatureStartLocation(
                start, chrom_in_build_id, None,
                [self.globaltt['FuzzyPosition']])
            feature.addFeatureEndLocation(
                stop, chrom_in_build_id, None,
                [self.globaltt['FuzzyPosition']])
            feature.addFeatureToGraph()

            # sometimes there's a peak marker, like a rsid.
            # we want to add that as a variant of the gene,
            # and xref it to the qtl.
            dbsnp_id = None
            if peak_mark != '' and peak_mark != '.' and \
                    re.match(r'rs', peak_mark.strip()):
                dbsnp_id = 'dbSNP:'+peak_mark.strip()

                model.addIndividualToGraph(
                    dbsnp_id, None, self.globaltt['sequence_alteration'])
                model.addXref(qtl_id, dbsnp_id)

            gene_id = gene_id.replace('uncharacterized ', '').strip()
            # only single-token gene ids are usable
            if gene_id != '' and gene_id != '.' and ' ' not in gene_id:

                # we assume if no src is provided and gene_id is an integer,
                # then it is an NCBI gene ... (okay, lets crank that back a notch)
                if gene_id_src == '' and gene_id.isdigit() and \
                        gene_id in self.gene_info:
                    gene_id_src = 'NCBIgene'
                elif gene_id_src == '' and gene_id.isdigit():
                    LOG.warning(
                        'Cold & Prickely saying %s is a NCBI gene for %s',
                        gene_id, common_name)
                    gene_id_src = 'NCBIgene'
                elif gene_id_src == '':
                    LOG.error(
                        ' "%s" is a NOT NCBI gene for %s',
                        gene_id, common_name)
                    gene_id_src = None

                if gene_id_src == 'NCBIgene':
                    gene_id = 'NCBIGene:' + gene_id
                    # we will expect that these will get labels elsewhere
                    geno.addGene(gene_id, None)
                    # FIXME what is the right relationship here?
                    geno.addAffectedLocus(qtl_id, gene_id)

                    if dbsnp_id is not None:
                        # add the rsid as a seq alt of the gene_id
                        vl_id = '_:' + re.sub(
                            r':', '', gene_id) + '-' + peak_mark.strip()
                        geno.addSequenceAlterationToVariantLocus(
                            dbsnp_id, vl_id)
                        geno.addAffectedLocus(vl_id, gene_id)

            # add the trait
            model.addClassToGraph(trait_id, trait_name)

            # Add publication
            # pub_id is reset for every row: without this a row lacking a
            # publication either hit a NameError (first row) or silently
            # reused the publication of the previous row
            pub_id = None
            reference = None
            if re.match(r'ISU.*', pubmed_id):
                pub_id = 'AQTLPub:'+pubmed_id.strip()
                reference = Reference(graph, pub_id)
            elif pubmed_id != '':
                pub_id = 'PMID:' + pubmed_id.strip()
                reference = Reference(
                    graph, pub_id, self.globaltt['journal article'])

            if reference is not None:
                reference.addRefToGraph()

            # Parse the P-value once; it is used for both associations.
            # str.isnumeric() is False for "0.05" (the dot is not a numeric
            # character), so the value is parsed rather than tested.
            score = None
            if p_values != '':
                scr = re.sub(r'<', '', p_values)
                scr = re.sub(r',', '.', scr)  # international notation
                try:
                    score = float(scr)
                except ValueError:
                    score = None  # not a number; no score to record

            # make the association to the QTL
            assoc = G2PAssoc(
                graph, self.name, qtl_id, trait_id,
                self.globaltt['is marker for'])
            assoc.add_evidence(eco_id)
            if pub_id is not None:
                assoc.add_source(pub_id)

            # TODO add exp_id as evidence
            # TODO add LOD score?
            if score is not None:
                assoc.set_score(score)  # todo add score type
            assoc.add_association_to_graph()

            # make the association to the dbsnp_id, if found
            if dbsnp_id is not None:
                assoc = G2PAssoc(
                    graph, self.name, dbsnp_id, trait_id,
                    self.globaltt['is marker for'])
                assoc.add_evidence(eco_id)
                if pub_id is not None:
                    assoc.add_source(pub_id)

                # TODO add exp_id
                # TODO add LOD score?
                if score is not None:
                    assoc.set_score(score)  # todo add score type
                assoc.add_association_to_graph()

            if not self.testMode and limit is not None \
                    and line_counter > limit:
                break

    LOG.info("Done with QTL genetic info")
    return
def _get_chrbands(self, limit, taxon):
    """
    Read the gzipped chromosome-band file for a taxon and add the
    chromosomes, scaffolds, bands and their parent bands to the graph.

    The file is read in full first: every chromosome, scaffold, band and
    computed parent band is collected in a dict with its min/max
    coordinates, parent, stain and type.  Grouping bands get the combined
    extent of the bands below them.  Afterwards each collected entry is
    written to the graph as a build-specific feature.

    :param limit: currently unused
    :param taxon: taxon number; key into ``self.files`` and appended to
        ``NCBITaxon:``
    :return: None
    """
    model = Model(self.graph)
    # TODO PYLINT figure out what limit was for and why it is unused
    myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
    logger.info("Processing Chr bands from FILE: %s", myfile)
    geno = Genotype(self.graph)
    monochrom = Monochrom(self.graph_type, self.are_bnodes_skized)

    # used to hold band definitions for a chr
    # in order to compute extent of encompasing bands
    mybands = {}

    # build the organism's genome from the taxon
    genome_label = self.files[taxon]['genome_label']
    taxon_id = 'NCBITaxon:' + taxon

    # add the taxon as a class.  adding the class label elsewhere
    model.addClassToGraph(taxon_id, None)
    model.addSynonym(taxon_id, genome_label)

    geno.addGenome(taxon_id, genome_label)

    # add the build and the taxon it's in
    build_num = self.files[taxon]['build_num']
    build_id = 'UCSC:' + build_num
    geno.addReferenceGenome(build_id, build_num, taxon_id)

    # NOTE some less-finished genomes have placed and unplaced scaffolds
    # * Placed scaffolds:
    #     the scaffolds have been placed within a chromosome.
    # * Unlocalized scaffolds:
    #     although the chromosome within which the scaffold occurs
    #     is known, the scaffold's position or orientation is not known.
    # * Unplaced scaffolds:
    #     it is not known which chromosome the scaffold belongs to
    #
    # ex: unlocalized scaffold: chr10_KL568008v1_random
    # ex: unplaced scaffold: chrUn_AABR07022428v1
    # The patterns do not depend on the line, so compile them once.
    placed_scaffold_pattern = r'(chr(?:\d+|X|Y|Z|W|M))'
    placed_re = re.compile(placed_scaffold_pattern + r'$')
    unlocalized_re = re.compile(placed_scaffold_pattern + r'_(\w+)_random')
    unplaced_re = re.compile(r'chr(Un(?:_\w+)?)')
    stain_re = re.compile(r'g(neg|pos|var)')

    # process the bands; the with-statement closes the file
    with gzip.open(myfile, 'rb') as f:
        for line in f:
            # skip comments
            line = line.decode().strip()
            if line.startswith('#'):
                continue

            # chr13	4500000	10000000	p12	stalk
            (scaffold, start, stop, band_num, rtype) = line.split('\t')

            # find out if the thing is a full on chromosome, or a scaffold
            scaffold_num = None
            mch = placed_re.match(scaffold)
            if mch is not None:
                # the chromosome is the first match of the pattern
                chrom_num = mch.group(1)
            else:
                # skip over anything that isn't a placed_scaffold
                # at the class level
                logger.info("Found non-placed chromosome %s", scaffold)
                chrom_num = None
                m_chr_unloc = unlocalized_re.match(scaffold)
                m_chr_unplaced = unplaced_re.match(scaffold)
                if m_chr_unloc is not None:
                    chrom_num = m_chr_unloc.group(1)
                    scaffold_num = chrom_num + '_' + m_chr_unloc.group(2)
                elif m_chr_unplaced is not None:
                    scaffold_num = m_chr_unplaced.group(1)
                else:
                    logger.error(
                        "There's a chr pattern that we aren't matching: %s",
                        scaffold)

            if chrom_num is not None:
                # the chrom class (generic) id
                chrom_class_id = makeChromID(chrom_num, taxon, 'CHR')

                # first, add the chromosome class (in the taxon)
                geno.addChromosomeClass(
                    chrom_num, taxon_id, self.files[taxon]['genome_label'])

                # then, add the chromosome instance (from the given build)
                geno.addChromosomeInstance(
                    chrom_num, build_id, build_num, chrom_class_id)

                # add the chr to the hashmap of coordinates for this build
                # the chromosome coordinate space is itself
                if chrom_num not in mybands:
                    mybands[chrom_num] = {
                        'min': 0,
                        'max': int(stop),
                        'chr': chrom_num,
                        'ref': build_id,
                        'parent': None,
                        'stain': None,
                        'type': self.globaltt['chromosome']
                    }

            if scaffold_num is not None:
                # this will put the coordinates of the scaffold
                # in the scaffold-space and make sure that the scaffold
                # is part of the correct parent.
                # if chrom_num is None,
                # then it will attach it to the genome,
                # just like a reg chrom
                mybands[scaffold_num] = {
                    'min': start,
                    'max': stop,
                    'chr': scaffold_num,
                    'ref': build_id,
                    'parent': chrom_num,
                    'stain': None,
                    'type': self.globaltt['assembly_component'],
                    'synonym': scaffold
                }

            # a band can only be keyed when its chromosome is known
            # (chrom_num + band_num would raise a TypeError otherwise)
            if chrom_num is not None and band_num.strip() != '':
                band_key = chrom_num + band_num
                # add the specific band
                mybands[band_key] = {
                    'min': start,
                    'max': stop,
                    'chr': chrom_num,
                    'ref': build_id,
                    'parent': None,
                    'stain': None,
                    'type': None
                }

                # add the staining intensity of the band
                if stain_re.match(rtype):
                    mybands[band_key]['stain'] = self.resolve(rtype)

                # get the parent bands, and make them unique
                parents = list(monochrom.make_parent_bands(band_num, set()))
                # alphabetical sort will put them in smallest to biggest,
                # so we reverse
                parents.sort(reverse=True)

                if parents:
                    mybands[band_key]['parent'] = chrom_num + parents[0]
            else:
                parents = []

            # loop through the parents and add them to the hash
            # add the parents to the graph, in hierarchical order
            for i, parent in enumerate(parents):
                rti = getChrPartTypeByNotation(parent)

                pnum = chrom_num + parent
                sta = int(start)
                sto = int(stop)
                if pnum not in mybands:
                    # add the parental band to the hash
                    mybands[pnum] = {
                        'min': min(sta, sto),
                        'max': max(sta, sto),
                        'chr': chrom_num,
                        'ref': build_id,
                        'parent': None,
                        'stain': None,
                        'type': rti
                    }
                else:
                    # band already in the hash means it's a grouping band
                    # need to update the min/max coords
                    bnd = mybands[pnum]
                    bnd['min'] = min(sta, sto, bnd['min'])
                    bnd['max'] = max(sta, sto, bnd['max'])

                    # also, set the max for the chrom
                    chrom = mybands[chrom_num]
                    chrom['max'] = max(sta, sto, chrom['max'])

                # add the parent relationships to each
                if i < len(parents) - 1:
                    mybands[pnum]['parent'] = chrom_num + parents[i + 1]
                else:
                    # add the last one (p or q usually)
                    # as attached to the chromosome
                    mybands[pnum]['parent'] = chrom_num
    # end looping through file

    # loop through the hash and add the bands to the graph
    for bnd, myband in mybands.items():
        band_class_id = makeChromID(bnd, taxon, 'CHR')
        band_class_label = makeChromLabel(bnd, genome_label)
        band_build_id = makeChromID(bnd, build_num, 'MONARCH')
        band_build_label = makeChromLabel(bnd, build_num)
        # the build-specific chrom
        chrom_in_build_id = makeChromID(myband['chr'], build_num, 'MONARCH')
        is_component = \
            myband['type'] == self.globaltt['assembly_component']

        # if it's != part, then add the class
        if not is_component:
            model.addClassToGraph(
                band_class_id, band_class_label, myband['type'])
            bfeature = Feature(
                self.graph, band_build_id, band_build_label, band_class_id)
        else:
            bfeature = Feature(
                self.graph, band_build_id, band_build_label, myband['type'])
            if 'synonym' in myband:
                model.addSynonym(band_build_id, myband['synonym'])

        if myband['parent'] is None:
            if is_component:
                # since we likely don't know the chr,
                # add it as a part of the build
                geno.addParts(band_build_id, build_id)
        elif is_component:
            # geno.addParts(band_build_id, chrom_in_build_id)
            parent_chrom_in_build = makeChromID(
                myband['parent'], build_num, 'MONARCH')
            bfeature.addSubsequenceOfFeature(parent_chrom_in_build)

        # add the band as a feature
        # (which also instantiates the owl:Individual)
        bfeature.addFeatureStartLocation(myband['min'], chrom_in_build_id)
        bfeature.addFeatureEndLocation(myband['max'], chrom_in_build_id)
        if 'stain' in myband and myband['stain'] is not None:
            bfeature.addFeatureProperty(
                self.globaltt['has_sequence_attribute'], myband['stain'])

        # type the band as a faldo:Region directly (add_region=False)
        # bfeature.setNoBNodes(self.nobnodes)
        # to come when we merge in ZFIN.py
        bfeature.addFeatureToGraph(False)

    return
def _process_genes(self, limit=None):
    """
    Add HGNC genes from the tab-delimited "genes" file to the graph.

    Every row (the header included) must have exactly 49 columns.  For each
    data row this adds the gene class, marks it deprecated or makes it the
    leader depending on ``locus_type``, adds equivalences to NCBIGene,
    ENSEMBL and (single, non-disease) OMIM identifiers, the human taxon,
    "is about" triples for the listed PubMed ids, and the chromosome or
    chromosome band the gene lies on.

    :param limit: outside test mode, stop once more than this many rows
        have been read
    :return: None
    """
    graph = self.testgraph if self.testMode else self.graph

    geno = Genotype(graph)
    model = Model(graph)
    raw = '/'.join((self.rawdir, self.files['genes']['file']))
    logger.info("Processing HGNC genes")

    human = 'NCBITaxon:9606'
    chr_regex = re.compile(r'(\d+|X|Y|Z|W|MT)[pq$]')
    band_regex = re.compile(r'([pq][A-H\d]?\d?(?:\.\d+)?)')

    with open(raw, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        # curl -s ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt | head -1 | tr '\t' '\n' | grep -n .
        for line_counter, row in enumerate(filereader, start=1):
            (hgnc_id, symbol, name, locus_group, locus_type, status,
             location, location_sortable, alias_symbol, alias_name,
             prev_symbol, prev_name, gene_family, gene_family_id,
             date_approved_reserved, date_symbol_changed,
             date_name_changed, date_modified, entrez_id, ensembl_gene_id,
             vega_id, ucsc_id, ena, refseq_accession, ccds_id, uniprot_ids,
             pubmed_id, mgd_id, rgd_id, lsdb, cosmic, omim_id, mirbase,
             homeodb, snornabase, bioparadigms_slc, orphanet,
             pseudogene_org, horde_id, merops, imgt, iuphar,
             kznf_gene_catalog, mamit_trnadb, cd, lncrnadb, enzyme_id,
             intermediate_filament_db, rna_central_ids) = row

            # the first row is the header
            if line_counter <= 1:
                continue

            # in test mode only the configured genes are kept
            if self.testMode and entrez_id != '' \
                    and int(entrez_id) not in self.gene_ids:
                continue

            gene_label = None if name == '' else name
            gene_type_id = self._get_gene_type(locus_type)
            model.addClassToGraph(hgnc_id, symbol, gene_type_id, gene_label)

            if locus_type == 'withdrawn':
                model.addDeprecatedClass(hgnc_id)
            else:
                model.makeLeader(hgnc_id)

            if entrez_id != '':
                model.addEquivalentClass(hgnc_id, 'NCBIGene:' + entrez_id)
            if ensembl_gene_id != '':
                model.addEquivalentClass(
                    hgnc_id, 'ENSEMBL:' + ensembl_gene_id)
            # only a single OMIM id is used, and only if it is not a disease
            if omim_id != '' and "|" not in omim_id:
                omim_curie = 'OMIM:' + omim_id
                if not DipperUtil.is_omim_disease(omim_curie):
                    model.addEquivalentClass(hgnc_id, omim_curie)

            geno.addTaxon(human, hgnc_id)

            # add pubs as "is about"; the ids are pipe separated
            for pmid in pubmed_id.strip().split('|'):
                if pmid != '':
                    graph.addTriple(
                        'PMID:' + pmid.strip(),
                        model.object_properties['is_about'], hgnc_id)

            # add chr location
            # sometimes two are listed, like: 10p11.2 or 17q25
            # -- there are only 2 of these FRA10A and MPFD
            # sometimes listed like "1 not on reference assembly"
            # sometimes listed like 10q24.1-q24.3
            # sometimes like 11q11 alternate reference locus
            chr_found = chr_regex.match(location)
            if chr_found is not None and chr_found.groups():
                chrom = chr_found.group(1)
                chrom_id = makeChromID(chrom, human, 'CHR')
                band_found = band_regex.search(location)
                gene_feature = Feature(graph, hgnc_id, None, None)
                if band_found is not None and band_found.groups():
                    # add the chr band as the parent to this gene
                    # as a feature but assume that the band is created
                    # as a class with properties elsewhere in Monochrom
                    band_id = makeChromID(
                        chrom + band_found.group(1), human, 'CHR')
                    model.addClassToGraph(band_id, None)
                    gene_feature.addSubsequenceOfFeature(band_id)
                else:
                    model.addClassToGraph(chrom_id, None)
                    gene_feature.addSubsequenceOfFeature(chrom_id)

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break
        # end loop through file

    return
def _process_QTLs_genetic_location(self, raw, taxon_id, common_name,
                                   limit=None):
    """
    Process the genetic-map (centimorgan) QTL table of one species.

    For every row this adds: the QTL as a feature placed on the species'
    linkage-map chromosome (start/end from the cM range, else the single
    cM position), an optional dbSNP peak marker cross-referenced to the
    QTL, an optional NCBI gene, the trait class, the publication, and a
    QTL-to-trait (plus peak-marker-to-trait) association carrying the
    evidence code, the publication and the p-value.

    A row without a publication id yields associations without a source,
    and a p-value that is not a number is skipped instead of aborting the
    run.

    :param raw: path of the tab-separated input file (read as iso-8859-1)
    :param taxon_id: taxon curie the QTLs belong to
    :param common_name: species common name; used to build the id and
        label of the linkage-map "build"
    :param limit: outside of test mode, stop once more than this many
        rows have been read; None reads the whole file
    :return: None
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    line_counter = 0
    geno = Genotype(g)
    gu = GraphUtils(curie_map.get())
    eco_id = "ECO:0000061"  # Quantitative Trait Analysis Evidence

    logger.info("Processing genetic location for %s", taxon_id)
    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            (qtl_id, qtl_symbol, trait_name, assotype, empty, chromosome,
             position_cm, range_cm, flankmark_a2, flankmark_a1, peak_mark,
             flankmark_b1, flankmark_b2, exp_id, model, test_base,
             sig_level, lod_score, ls_mean, p_values, f_statistics,
             variance, bayes_value, likelihood_ratio, trait_id, dom_effect,
             add_effect, pubmed_id, gene_id, gene_id_src, gene_id_type,
             empty2) = row

            if self.testMode and int(qtl_id) not in self.test_ids:
                continue

            qtl_id = 'AQTL:'+qtl_id
            trait_id = 'AQTLTrait:'+trait_id

            # Add QTL to graph
            f = Feature(qtl_id, qtl_symbol, geno.genoparts['QTL'])
            f.addTaxonToFeature(g, taxon_id)

            # deal with the chromosome
            chrom_id = makeChromID(chromosome, taxon_id, 'CHR')

            # add a version of the chromosome
            # which is defined as the genetic map
            build_id = 'MONARCH:'+common_name.strip()+'-linkage'
            build_label = common_name+' genetic map'
            geno.addReferenceGenome(build_id, build_label, taxon_id)
            chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH')
            geno.addChromosomeInstance(
                chromosome, build_id, build_label, chrom_id)

            start = stop = None
            if re.search('-', range_cm):
                range_parts = re.split('-', range_cm)
                # check for poorly formed ranges
                if len(range_parts) == 2 and range_parts[0] != '' \
                        and range_parts[1] != '':
                    (start, stop) = [
                        int(float(x.strip())) for x in range_parts]
                else:
                    logger.info(
                        "There's a cM range we can't handle for QTL %s: %s",
                        qtl_id, range_cm)
            elif position_cm != '':
                start = stop = int(float(position_cm))

            # FIXME remove converion to int for start/stop
            # when schema can handle floats
            # add in the genetic location based on the range
            f.addFeatureStartLocation(
                start, chrom_in_build_id, None,
                [Feature.types['FuzzyPosition']])
            f.addFeatureEndLocation(
                stop, chrom_in_build_id, None,
                [Feature.types['FuzzyPosition']])
            f.addFeatureToGraph(g)

            # sometimes there's a peak marker, like a rsid.
            # we want to add that as a variant of the gene,
            # and xref it to the qtl.
            dbsnp_id = None
            if peak_mark != '' and peak_mark != '.' \
                    and re.match('rs', peak_mark.strip()):
                dbsnp_id = 'dbSNP:'+peak_mark.strip()
                gu.addIndividualToGraph(
                    g, dbsnp_id, None, geno.genoparts['sequence_alteration'])
                gu.addXref(g, qtl_id, dbsnp_id)

            if gene_id is not None and gene_id != '' and gene_id != '.':
                # we assume if no src is provided, it's NCBI
                if gene_id_src == 'NCBIgene' or gene_id_src == '':
                    gene_id = 'NCBIGene:'+gene_id.strip()
                    # we will expect that these labels provided elsewhere
                    geno.addGene(gene_id, None)
                    # FIXME what is the right relationship here?
                    geno.addAlleleOfGene(
                        qtl_id, gene_id,
                        geno.object_properties['feature_to_gene_relation'])

                    if dbsnp_id is not None:
                        # add the rsid as a seq alt of the gene_id
                        vl_id = \
                            '_' + re.sub(':', '', gene_id) + '-' + peak_mark
                        if self.nobnodes:
                            vl_id = ':' + vl_id
                        geno.addSequenceAlterationToVariantLocus(
                            dbsnp_id, vl_id)
                        geno.addAlleleOfGene(vl_id, gene_id)

            # add the trait
            gu.addClassToGraph(g, trait_id, trait_name)

            # Add publication.
            # pub_id is reset for every row so that a row without a
            # publication can neither fail on an unbound name nor inherit
            # the publication of the previous row.
            pub_id = None
            r = None
            if re.match('ISU.*', pubmed_id):
                pub_id = 'AQTLPub:'+pubmed_id.strip()
                r = Reference(pub_id)
            elif pubmed_id != '':
                pub_id = 'PMID:'+pubmed_id.strip()
                r = Reference(pub_id, Reference.ref_types['journal_article'])

            if r is not None:
                r.addRefToGraph(g)

            # parse the p-value once; values look like "<0.05"
            score = None
            if p_values != '':
                try:
                    score = float(re.sub('<', '', p_values))
                except ValueError:
                    logger.info(
                        "Unparsable p-value for QTL %s: %s",
                        qtl_id, p_values)

            # make the association to the QTL and,
            # if a peak marker was found, to the dbsnp_id as well
            for subject_id in (qtl_id, dbsnp_id):
                if subject_id is None:
                    continue
                assoc = G2PAssoc(
                    self.name, subject_id, trait_id,
                    gu.object_properties['is_marker_for'])
                assoc.add_evidence(eco_id)
                if pub_id is not None:
                    assoc.add_source(pub_id)

                # create a description from the contents of the file
                # desc = ''
                # assoc.addDescription(g, assoc_id, desc)

                # TODO add exp_id as evidence
                # if exp_id != '':
                #     exp_id = 'AQTLExp:'+exp_id
                #     gu.addIndividualToGraph(g, exp_id, None, eco_id)

                if score is not None:
                    assoc.set_score(score)  # todo add score type
                # TODO add LOD score?
                assoc.add_association_to_graph(g)

            if not self.testMode and limit is not None \
                    and line_counter > limit:
                break

    logger.info("Done with QTL genetic info")
    return
def _process_qtls_genomic_location(
        self, raw, src_key, txid, build_id, build_label, common_name,
        limit=None):
    """
    Process the gzipped GFF-style file of QTL genomic locations.

    For every data row this adds: the QTL as an individual and as a
    feature with fuzzy start/end positions on the build-specific
    chromosome, the publication named in the attributes (if any), and a
    QTL-to-trait association carrying the evidence code, the publication
    and the p-value.

    Blank lines, '#' comment lines, rows with the wrong number of
    columns and rows lacking a QTL_ID or trait_ID attribute are skipped.

    :param raw: path of the gzipped, tab-separated input file
    :param src_key: key into ``self.files`` whose 'columns' entry lists
        the expected column names
    :param txid: NCBI taxon number (without the 'NCBITaxon:' prefix)
    :param build_id: curie of the genome build the coordinates refer to
    :param build_label: label of that build
    :param common_name: species common name, used as QTL curie prefix
    :param limit: outside of test mode, stop once the reader has passed
        this line number; None reads the whole file
    :return: None
    """
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)
    geno = Genotype(graph)
    # assume that chrs get added to the genome elsewhere

    taxon_curie = 'NCBITaxon:' + txid
    eco_id = self.globaltt['quantitative trait analysis evidence']
    LOG.info("Processing QTL locations for %s from %s", taxon_curie, raw)
    with gzip.open(raw, 'rt', encoding='ISO-8859-1') as tsvfile:
        reader = csv.reader(tsvfile, delimiter="\t")
        # no header in GFF, so no header checking
        col = self.files[src_key]['columns']
        col_len = len(col)
        # resolve the column positions once rather than for every row
        seqname_at = col.index('SEQNAME')
        start_at = col.index('START')
        end_at = col.index('END')
        strand_at = col.index('STRAND')
        attribute_at = col.index('ATTRIBUTE')

        for row in reader:
            # blank lines and comment lines carry no data
            if not row or row[0].startswith('#'):
                continue

            # too few columns, or non-empty surplus columns
            if len(row) < col_len or ''.join(row[col_len:]) != '':
                LOG.warning(
                    "Problem parsing in %s row %s\n"
                    "got %s cols but expected %s",
                    raw, reader.line_num, len(row), col_len)
                LOG.info(row)
                continue

            chromosome = row[seqname_at].strip()
            start_bp = row[start_at].strip()
            stop_bp = row[end_at].strip()
            strand = row[strand_at].strip()
            attr = row[attribute_at].strip()
            # example row:
            # Chr.Z  Animal QTLdb  Production_QTL  33954873  34023581 ...
            # QTL_ID=2242;Name="Spleen percentage";Abbrev="SPLP";
            # PUBMED_ID=17012160;trait_ID=2234;trait="Spleen percentage";
            # breed="leghorn";"FlankMarkers=ADL0022";
            # VTO_name="spleen mass";
            # MO_name="spleen weight to body weight ratio";
            # Map_Type="Linkage";Model="Mendelian";
            # Test_Base="Chromosome-wise";Significance="Significant";
            # P-value="<0.05";F-Stat="5.52";Variance="2.94";
            # Dominance_Effect="-0.002";Additive_Effect="0.01

            # make dictionary of attributes
            # keys are:
            # QTL_ID,Name,Abbrev,PUBMED_ID,trait_ID,trait,FlankMarkers,
            # VTO_name,Map_Type,Significance,P-value,Model,
            # Test_Base,Variance, Bayes-value,PTO_name,gene_IDsrc,peak_cM,
            # CMO_name,gene_ID,F-Stat,LOD-score,Additive_Effect,
            # Dominance_Effect,Likelihood_Ratio,LS-means,Breed,
            # trait (duplicate with Name)

            # deal with poorly formed attributes
            if re.search(r'"FlankMarkers";', attr):
                attr = re.sub(r'FlankMarkers;', '', attr)
            # Keep only key=value items.  Split on the first '=' so a
            # value may itself contain '='; file order is kept so that a
            # repeated key resolves the same way on every run.
            attribute_dict = dict(
                item.split("=", 1)
                for item in re.sub(r'"', '', attr).split(";")
                if "=" in item)

            qtl_num = attribute_dict.get('QTL_ID')
            trait_num = attribute_dict.get('trait_ID')
            if qtl_num is None or trait_num is None:
                LOG.warning(
                    "Skipping row %s of %s: missing QTL_ID or trait_ID",
                    reader.line_num, raw)
                continue

            if self.test_mode and int(qtl_num) not in self.test_ids:
                continue

            # make association between QTL and trait based on taxon
            qtl_id = common_name + 'QTL:' + str(qtl_num)
            model.addIndividualToGraph(qtl_id, None, self.globaltt['QTL'])
            geno.addTaxon(taxon_curie, qtl_id)

            trait_id = 'AQTLTrait:' + trait_num

            # if pub is in attributes, add it to the association
            pub_id = None
            if 'PUBMED_ID' in attribute_dict:
                pub_id = attribute_dict.get('PUBMED_ID')
                if re.match(r'ISU.*', pub_id):
                    pub_id = 'AQTLPub:' + pub_id.strip()
                    reference = Reference(graph, pub_id)
                else:
                    pub_id = 'PMID:' + pub_id.strip()
                    reference = Reference(
                        graph, pub_id, self.globaltt['journal article'])
                reference.addRefToGraph()

            # Add QTL to graph
            assoc = G2PAssoc(
                graph, self.name, qtl_id, trait_id,
                self.globaltt['is marker for'])
            assoc.add_evidence(eco_id)
            assoc.add_source(pub_id)
            if 'P-value' in attribute_dict:
                # values look like "<0.05"; some use a decimal comma
                scr = re.sub(r'<', '', attribute_dict.get('P-value'))
                scr = re.sub(r',', '.', scr)
                # str.isnumeric() rejects any string containing '.',
                # so let float() decide what is a number
                try:
                    assoc.set_score(float(scr))
                except ValueError:
                    LOG.debug(
                        "Unparsable P-value for %s: %s", qtl_id, scr)

            assoc.add_association_to_graph()
            # TODO make association to breed
            # (which means making QTL feature in Breed background)

            # get location of QTL
            chromosome = re.sub(r'Chr\.', '', chromosome)
            chrom_id = makeChromID(chromosome, taxon_curie, 'CHR')

            chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH')
            geno.addChromosomeInstance(
                chromosome, build_id, build_label, chrom_id)
            qtl_feature = Feature(graph, qtl_id, None, self.globaltt['QTL'])
            if start_bp == '':
                start_bp = None
            qtl_feature.addFeatureStartLocation(
                start_bp, chrom_in_build_id, strand,
                [self.globaltt['FuzzyPosition']])
            if stop_bp == '':
                stop_bp = None
            qtl_feature.addFeatureEndLocation(
                stop_bp, chrom_in_build_id, strand,
                [self.globaltt['FuzzyPosition']])
            qtl_feature.addTaxonToFeature(taxon_curie)
            qtl_feature.addFeatureToGraph()

            if not self.test_mode and limit is not None \
                    and reader.line_num > limit:
                break

    # LOG.warning("Bad attribute flags in this file")
    # what does this even mean??
    LOG.info("Done with QTL genomic mappings for %s", taxon_curie)
def _process_qtls_genetic_location(
        self, raw, txid, common_name, limit=None):
    """
    Process the genetic-map (centimorgan) QTL table of one species.

    For every row this adds: the QTL as a feature placed on the species'
    linkage-map chromosome (start/end from the cM range, else the single
    cM position), an optional dbSNP peak marker cross-referenced to the
    QTL, an optional NCBI gene as affected locus, the trait class, the
    publication, and a QTL-to-trait (plus peak-marker-to-trait)
    association carrying the evidence code, the publication and the
    p-value.

    A row without a publication id yields associations without a source,
    and p-values or positions that are not numbers are skipped.

    :param raw: path of the tab-separated input file (read as iso-8859-1)
    :param txid: NCBI taxon number (without the 'NCBITaxon:' prefix)
    :param common_name: species common name; prefixes the QTL curie and
        names the linkage-map "build"; ``common_name + '_cm'`` must be a
        key of ``self.files``
    :param limit: outside of test mode, stop once more than this many
        rows have been read; None reads the whole file
    :return: None
    """
    aql_curie = self.files[common_name + '_cm']['curie']

    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    line_counter = 0
    geno = Genotype(graph)
    model = Model(graph)
    eco_id = self.globaltt['quantitative trait analysis evidence']

    taxon_curie = 'NCBITaxon:' + txid

    LOG.info("Processing genetic location for %s from %s", taxon_curie, raw)
    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            (qtl_id, qtl_symbol, trait_name, assotype, empty, chromosome,
             position_cm, range_cm, flankmark_a2, flankmark_a1, peak_mark,
             flankmark_b1, flankmark_b2, exp_id, model_id, test_base,
             sig_level, lod_score, ls_mean, p_values, f_statistics,
             variance, bayes_value, likelihood_ratio, trait_id, dom_effect,
             add_effect, pubmed_id, gene_id, gene_id_src, gene_id_type,
             empty2) = row

            if self.test_mode and int(qtl_id) not in self.test_ids:
                continue

            qtl_id = common_name + 'QTL:' + qtl_id.strip()
            trait_id = ':'.join((aql_curie, trait_id.strip()))

            # Add QTL to graph
            feature = Feature(
                graph, qtl_id, qtl_symbol, self.globaltt['QTL'])
            feature.addTaxonToFeature(taxon_curie)

            # deal with the chromosome
            chrom_id = makeChromID(chromosome, taxon_curie, 'CHR')

            # add a version of the chromosome which is defined as
            # the genetic map
            build_id = 'MONARCH:'+common_name.strip()+'-linkage'
            build_label = common_name+' genetic map'
            geno.addReferenceGenome(build_id, build_label, taxon_curie)
            chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH')
            geno.addChromosomeInstance(
                chromosome, build_id, build_label, chrom_id)
            start = stop = None
            # range_cm sometimes ends in "(Mb)"  (i.e pig 2016 Nov)
            range_cm = re.split(r'\(', range_cm)[0]

            if re.search(r'[0-9].*-.*[0-9]', range_cm):
                range_parts = re.split(r'-', range_cm)

                # check for poorly formed ranges
                if len(range_parts) == 2 and\
                        range_parts[0] != '' and range_parts[1] != '':
                    (start, stop) = [
                        int(float(x.strip())) for x in range_parts]
                else:
                    LOG.info(
                        "A cM range we can't handle for QTL %s: %s",
                        qtl_id, range_cm)
            elif position_cm != '':
                # keep only a leading decimal number when there is one
                # (drops trailing text); plain integers are used as-is
                match = re.match(r'([0-9]*\.[0-9]*)', position_cm)
                if match is not None:
                    position_cm = match.group()
                try:
                    start = stop = int(float(position_cm))
                except ValueError:
                    LOG.info(
                        "A cM position we can't handle for QTL %s: %s",
                        qtl_id, position_cm)

            # FIXME remove converion to int for start/stop
            # when schema can handle floats add in the genetic location
            # based on the range
            feature.addFeatureStartLocation(
                start, chrom_in_build_id, None,
                [self.globaltt['FuzzyPosition']])
            feature.addFeatureEndLocation(
                stop, chrom_in_build_id, None,
                [self.globaltt['FuzzyPosition']])
            feature.addFeatureToGraph()

            # sometimes there's a peak marker, like a rsid.
            # we want to add that as a variant of the gene,
            # and xref it to the qtl.
            dbsnp_id = None
            if peak_mark != '' and peak_mark != '.' and \
                    re.match(r'rs', peak_mark.strip()):
                dbsnp_id = 'dbSNP:'+peak_mark.strip()

                model.addIndividualToGraph(
                    dbsnp_id, None, self.globaltt['sequence_alteration'])
                model.addXref(qtl_id, dbsnp_id)

            gene_id = gene_id.replace('uncharacterized ', '').strip()
            # only a single token (no embedded blank) can be a gene id
            if gene_id != '' and gene_id != '.' and ' ' not in gene_id:

                # we assume if no src is provided and gene_id is an integer,
                # then it is an NCBI gene ...
                # (okay, lets crank that back a notch)
                if gene_id_src == '' and gene_id.isdigit() and \
                        gene_id in self.gene_info:
                    # LOG.info(
                    #    'Warm & Fuzzy saying %s is a NCBI gene for %s',
                    #    gene_id, common_name)
                    gene_id_src = 'NCBIgene'
                elif gene_id_src == '' and gene_id.isdigit():
                    LOG.warning(
                        'Cold & Prickely saying %s is a NCBI gene for %s',
                        gene_id, common_name)
                    gene_id_src = 'NCBIgene'
                elif gene_id_src == '':
                    LOG.error(
                        ' "%s" is a NOT NCBI gene for %s',
                        gene_id, common_name)
                    gene_id_src = None

                if gene_id_src == 'NCBIgene':
                    gene_id = 'NCBIGene:' + gene_id
                    # we will expect that these will get labels elsewhere
                    geno.addGene(gene_id, None)
                    # FIXME what is the right relationship here?
                    geno.addAffectedLocus(qtl_id, gene_id)

                    if dbsnp_id is not None:
                        # add the rsid as a seq alt of the gene_id
                        vl_id = '_:' + re.sub(
                            r':', '', gene_id) + '-' + peak_mark.strip()
                        geno.addSequenceAlterationToVariantLocus(
                            dbsnp_id, vl_id)
                        geno.addAffectedLocus(vl_id, gene_id)

            # add the trait
            model.addClassToGraph(trait_id, trait_name)

            # Add publication.
            # pub_id is reset for every row so that a row without a
            # publication can neither fail on an unbound name nor inherit
            # the publication of the previous row.
            pub_id = None
            reference = None
            if re.match(r'ISU.*', pubmed_id):
                pub_id = 'AQTLPub:'+pubmed_id.strip()
                reference = Reference(graph, pub_id)
            elif pubmed_id != '':
                pub_id = 'PMID:' + pubmed_id.strip()
                reference = Reference(
                    graph, pub_id, self.globaltt['journal article'])

            if reference is not None:
                reference.addRefToGraph()

            # parse the p-value once; values look like "<0.05"
            score = None
            if p_values != '':
                scr = re.sub(r'<', '', p_values)
                scr = re.sub(r',', '.', scr)  # international notation
                # str.isnumeric() rejects any string containing '.',
                # so let float() decide what is a number
                try:
                    score = float(scr)
                except ValueError:
                    LOG.debug(
                        "Unparsable p-value for %s: %s", qtl_id, p_values)

            # make the association to the QTL and,
            # if a peak marker was found, to the dbsnp_id as well
            for subject_id in (qtl_id, dbsnp_id):
                if subject_id is None:
                    continue
                assoc = G2PAssoc(
                    graph, self.name, subject_id, trait_id,
                    self.globaltt['is marker for'])
                assoc.add_evidence(eco_id)
                if pub_id is not None:
                    assoc.add_source(pub_id)

                # create a description from the contents of the file
                # desc = ''
                # assoc.addDescription(g, assoc_id, desc)

                # TODO add exp_id as evidence
                # if exp_id != '':
                #     exp_id = 'AQTLExp:'+exp_id
                #     gu.addIndividualToGraph(g, exp_id, None, eco_id)

                if score is not None:
                    assoc.set_score(score)  # todo add score type
                # TODO add LOD score?
                assoc.add_association_to_graph()

            if not self.test_mode and limit is not None \
                    and line_counter > limit:
                break

    LOG.info("Done with QTL genetic info")
    return
def _transform_entry(self, ent, graph):
    """
    Add one OMIM entry to ``graph``.

    The entry is added as a class labelled according to its type in
    ``self.omim_type``.  When the entry has a gene map, the matching
    feature (a single mapped NCBI gene, the OMIM record itself, or an
    anonymous feature) gets the gene-map comment as description, its
    chromosome and build coordinates when present, and a subsequence-of
    relation to the first cytogenetic location.  Finally the phenotypic
    series parents, mapped ids, mapped gene ids, publications and allelic
    variants of the entry are processed by the respective helpers.

    Side effect: ``self.graph`` is replaced by ``graph``.

    :param ent: dict with the record under the key 'entry'; the keys
        read here are 'mimNumber', 'titles', 'status' and optionally
        'geneMapExists' / 'geneMap'
    :param graph: graph the triples are written to
    :return: None
    """
    self.graph = graph
    model = Model(graph)
    geno = Genotype(graph)
    tax_label = 'Homo sapiens'
    tax_id = self.globaltt[tax_label]
    build_num = "GRCh38"
    asm_curie = ':'.join(('NCBIAssembly', build_num))

    # get the numbers, labels, and descriptions
    omim_num = str(ent['entry']['mimNumber'])
    titles = ent['entry']['titles']
    label = titles['preferredTitle']

    # NOTE(review): other_labels is collected but never used below;
    # kept because _get_alt_labels is not visible from here.
    other_labels = []
    if 'alternativeTitles' in titles:
        other_labels += self._get_alt_labels(titles['alternativeTitles'])
    if 'includedTitles' in titles:
        other_labels += self._get_alt_labels(titles['includedTitles'])

    # remove the abbreviation (comes after the ;) from the preferredTitle,
    abbrev = None
    lab_lst = label.split(';')
    if len(lab_lst) > 1:
        abbrev = lab_lst[1].strip()
    newlabel = self._cleanup_label(label)

    omim_curie = 'OMIM:' + omim_num
    omimtype = self.omim_type[omim_num]
    nodelabel = newlabel
    gene_like_types = [
        self.globaltt['gene'], self.globaltt['has_affected_feature']]
    # this uses our cleaned-up label
    if omimtype == self.globaltt['heritable_phenotypic_marker']:
        if abbrev is not None:
            nodelabel = abbrev
        # in this special case,
        # make it a disease by not declaring it as a gene/marker
        # ??? and if abbrev is None?
        model.addClassToGraph(omim_curie, nodelabel, description=newlabel)
        # class_type=self.globaltt['disease or disorder'],
    elif omimtype in gene_like_types:
        omimtype = self.globaltt['gene']
        if abbrev is not None:
            nodelabel = abbrev
        # omim is subclass_of gene (provide type term)
        model.addClassToGraph(
            omim_curie, nodelabel, self.globaltt['gene'], newlabel)
    else:
        # omim is NOT subclass_of D|P|or ?...
        model.addClassToGraph(omim_curie, newlabel)

    # KS: commenting out, we will get disease descriptions
    # from MONDO, and gene descriptions from the mygene API

    # if this is a genetic locus (not sequenced) then
    # add the chrom loc info to the ncbi gene identifier,
    # not to the omim id (we reserve the omim id to be the phenotype)
    #################################################################
    # the above makes no sense to me. (TEC)
    # For Monarch, OMIM is authoritative for disease / phenotype
    #   if they say a phenotype is associated with a locus
    #   that is what dipper should report.
    # OMIM is not authoritative for NCBI gene locations, locus or otherwise.
    # and dipper should not be reporting gene locations via OMIM.

    feature_id = None
    feature_label = None
    if 'geneMapExists' in ent['entry'] and ent['entry']['geneMapExists']:
        genemap = ent['entry']['geneMap']
        is_gene = False

        if omimtype == self.globaltt['heritable_phenotypic_marker']:
            # get the ncbigene ids
            ncbifeature = self._get_mapped_gene_ids(ent['entry'], graph)
            if len(ncbifeature) == 1:
                feature_id = 'NCBIGene:' + str(ncbifeature[0])
                # add this feature as a cause for the omim disease
                # TODO SHOULD I EVEN DO THIS HERE?
                assoc = G2PAssoc(graph, self.name, feature_id, omim_curie)
                assoc.add_association_to_graph()
            else:
                LOG.info(
                    "Its ambiguous when %s maps to not one gene id: %s",
                    omim_curie, str(ncbifeature))
        elif omimtype in gene_like_types:
            feature_id = omim_curie
            is_gene = True
            omimtype = self.globaltt['gene']
        else:
            # 158900 falls into this category
            feature_id = self._make_anonymous_feature(omim_num)
            if abbrev is not None:
                feature_label = abbrev
            omimtype = self.globaltt['heritable_phenotypic_marker']

        if feature_id is not None:
            if 'comments' in genemap:
                # add a comment to this feature
                comment = genemap['comments']
                if comment.strip() != '':
                    model.addDescription(feature_id, comment)
            if 'cytoLocation' in genemap:
                cytoloc = genemap['cytoLocation']
                # parse the cytoloc.
                # add this omim thing as
                # a subsequence of the cytofeature
                # 18p11.3-p11.2
                # FIXME
                # add the other end of the range,
                # but not sure how to do that
                # not sure if saying subsequence of feature
                # is the right relationship

                feat = Feature(graph, feature_id, feature_label, omimtype)
                if 'chromosomeSymbol' in genemap:
                    chrom_num = str(genemap['chromosomeSymbol'])
                    chrom = makeChromID(chrom_num, tax_id, 'CHR')
                    geno.addChromosomeClass(chrom_num, tax_id, tax_label)

                    # add the positional information, if available;
                    # -1 marks a coordinate that is absent
                    fstart = fend = -1
                    if 'chromosomeLocationStart' in genemap:
                        fstart = genemap['chromosomeLocationStart']
                    if 'chromosomeLocationEnd' in genemap:
                        fend = genemap['chromosomeLocationEnd']
                    if fstart >= 0:
                        # make the build-specific chromosome
                        chrom_in_build = makeChromID(
                            chrom_num, build_num, 'MONARCH')
                        # then, add the chromosome instance
                        # (from the given build)
                        geno.addChromosomeInstance(
                            chrom_num, asm_curie, build_num, chrom)
                        if omimtype == self.globaltt[
                                'heritable_phenotypic_marker']:
                            postypes = [self.globaltt['FuzzyPosition']]
                        else:
                            postypes = None
                        # NOTE that no strand information
                        # is available in the API
                        feat.addFeatureStartLocation(
                            fstart, chrom_in_build, None, postypes)
                        if fend >= 0:
                            feat.addFeatureEndLocation(
                                fend, chrom_in_build, None, postypes)
                            # only meaningful when an end was given
                            if fstart > fend:
                                LOG.info(
                                    "start>end (%d>%d) for %s",
                                    fstart, fend, omim_curie)
                # add the cytogenic location too
                # for now, just take the first one
                cytoloc = cytoloc.split('-')[0]
                loc = makeChromID(cytoloc, tax_id, 'CHR')
                model.addClassToGraph(loc, None)
                feat.addSubsequenceOfFeature(loc)
                feat.addFeatureToGraph(True, None, is_gene)
        # end adding causative genes/features

    if ent['entry']['status'] in ['moved', 'removed']:
        LOG.warning(
            'UNEXPECTED! not expecting obsolete record %s', omim_curie)

    self._get_phenotypicseries_parents(ent['entry'], graph)
    self._get_mappedids(ent['entry'], graph)
    self._get_mapped_gene_ids(ent['entry'], graph)
    self._get_pubs(ent['entry'], graph)
    self._get_process_allelic_variants(ent['entry'], graph)
def _get_chrbands(self, limit, taxon):
    """
    Read the chromosome-band file of one taxon and add its chromosomes,
    scaffolds and bands to ``self.graph``.

    The work is done in two passes.  The first pass reads the gzipped,
    tab-separated file (columns: scaffold, start, stop, band, type) and
    collects one record per chromosome, scaffold, band and parent band in
    a dict keyed by chromosome number + band name.  The second pass walks
    that dict and writes each record to the graph as a feature located on
    the build-specific chromosome.

    :param limit: not used by this method
    :param taxon: NCBI taxon number as a string; also the key into
        ``self.files`` for the file name, genome label and build number
    :return: None
    """
    model = Model(self.graph)
    # TODO PYLINT figure out what limit was for and why it is unused
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
    logger.info("Processing Chr bands from FILE: %s", myfile)
    geno = Genotype(self.graph)
    monochrom = Monochrom(self.graph_type, self.are_bnodes_skized)

    # used to hold band definitions for a chr
    # in order to compute extent of encompasing bands
    # each value is a dict with the keys
    # min, max, chr, ref, parent, stain, type (and synonym for scaffolds)
    mybands = {}

    # build the organism's genome from the taxon
    genome_label = self.files[taxon]['genome_label']
    taxon_id = 'NCBITaxon:'+taxon

    # add the taxon as a class.  adding the class label elsewhere
    model.addClassToGraph(taxon_id, None)
    model.addSynonym(taxon_id, genome_label)

    geno.addGenome(taxon_id, genome_label)

    # add the build and the taxon it's in
    build_num = self.files[taxon]['build_num']
    build_id = 'UCSC:'+build_num
    geno.addReferenceGenome(build_id, build_num, taxon_id)

    # process the bands
    with gzip.open(myfile, 'rb') as f:
        for line in f:
            # skip comments
            line = line.decode().strip()
            if re.match('^#', line):
                continue

            # chr13	4500000	10000000	p12	stalk
            # (a line without exactly five fields raises ValueError here)
            (scaffold, start, stop, band_num, rtype) = line.split('\t')
            line_counter += 1

            # NOTE some less-finished genomes have
            # placed and unplaced scaffolds
            # * Placed scaffolds:
            #       the scaffolds have been placed within a chromosome.
            # * Unlocalized scaffolds:
            #   although the chromosome within which the scaffold occurs
            #   is known, the scaffold's position or orientation
            #   is not known.
            # * Unplaced scaffolds:
            #   it is not known which chromosome the scaffold belongs to
            #
            # find out if the thing is a full on chromosome, or a scaffold:
            # ex: unlocalized scaffold: chr10_KL568008v1_random
            # ex: unplaced scaffold: chrUn_AABR07022428v1
            placed_scaffold_pattern = r'(chr(?:\d+|X|Y|Z|W|M))'
            unlocalized_scaffold_pattern = \
                placed_scaffold_pattern+r'_(\w+)_random'
            unplaced_scaffold_pattern = r'chr(Un(?:_\w+)?)'

            # the trailing '$' restricts this to a bare chromosome name
            m = re.match(placed_scaffold_pattern+r'$', scaffold)
            if m is not None and len(m.groups()) == 1:
                # the chromosome is the first match of the pattern
                chrom_num = m.group(1)
            else:
                # skip over anything that isn't a placed_scaffold
                # at the class level
                logger.info("Found non-placed chromosome %s", scaffold)
                chrom_num = None

            m_chr_unloc = re.match(unlocalized_scaffold_pattern, scaffold)
            m_chr_unplaced = re.match(unplaced_scaffold_pattern, scaffold)

            scaffold_num = None
            if m:
                # a plain chromosome; nothing more to work out
                pass
            elif m_chr_unloc is not None and\
                    len(m_chr_unloc.groups()) == 2:
                # unlocalized: the parent chromosome is known
                chrom_num = m_chr_unloc.group(1)
                scaffold_num = chrom_num+'_'+m_chr_unloc.group(2)
            elif m_chr_unplaced is not None and\
                    len(m_chr_unplaced.groups()) == 1:
                # unplaced: chrom_num stays None
                scaffold_num = m_chr_unplaced.group(1)
            else:
                logger.error(
                    "There's a chr pattern that we aren't matching: %s",
                    scaffold)

            if chrom_num is not None:
                # the chrom class (generic) id
                chrom_class_id = makeChromID(chrom_num, taxon, 'CHR')

                # first, add the chromosome class (in the taxon)
                geno.addChromosomeClass(
                    chrom_num, taxon_id, self.files[taxon]['genome_label'])

                # then, add the chromosome instance (from the given build)
                geno.addChromosomeInstance(chrom_num, build_id, build_num,
                                           chrom_class_id)

                # add the chr to the hashmap of coordinates for this build
                # the chromosome coordinate space is itself
                # (only the first line seen for a chromosome creates it;
                # its 'max' is raised further down as bands are merged)
                if chrom_num not in mybands.keys():
                    mybands[chrom_num] = {
                        'min': 0,
                        'max': int(stop),
                        'chr': chrom_num,
                        'ref': build_id,
                        'parent': None,
                        'stain': None,
                        'type': Feature.types['chromosome']}

            if scaffold_num is not None:
                # this will put the coordinates of the scaffold
                # in the scaffold-space and make sure that the scaffold
                # is part of the correct parent.
                # if chrom_num is None,
                # then it will attach it to the genome,
                # just like a reg chrom
                # NOTE: min/max are stored as the raw strings from the
                # file here, not converted with int()
                mybands[scaffold_num] = {
                    'min': start,
                    'max': stop,
                    'chr': scaffold_num,
                    'ref': build_id,
                    'parent': chrom_num,
                    'stain': None,
                    'type': Feature.types['assembly_component'],
                    'synonym': scaffold}

            if band_num is not None and band_num.strip() != '':
                # add the specific band
                # NOTE(review): chrom_num + band_num raises TypeError if
                # a band is given for a scaffold without a chromosome
                # (chrom_num is None) -- presumably does not occur in
                # the input; confirm
                mybands[chrom_num+band_num] = {'min': start,
                                               'max': stop,
                                               'chr': chrom_num,
                                               'ref': build_id,
                                               'parent': None,
                                               'stain': None,
                                               'type': None}

                # add the staining intensity of the band
                if re.match(r'g(neg|pos|var)', rtype):
                    mybands[chrom_num+band_num]['stain'] = \
                        Feature.types.get(rtype)

                # get the parent bands, and make them unique
                parents = list(
                    monochrom.make_parent_bands(band_num, set()))
                # alphabetical sort will put them in smallest to biggest,
                # so we reverse
                parents.sort(reverse=True)
                # print('parents of',chrom,band,':',parents)

                # the first entry after sorting is the immediate parent
                if len(parents) > 0:
                    mybands[chrom_num+band_num]['parent'] = \
                        chrom_num+parents[0]
            else:
                # TODO PYLINT why is 'parent'
                # a list() a couple of lines up and a set() here?
                # (the set is empty, so the loop below is a no-op)
                parents = set()

            # loop through the parents and add them to the hash
            # add the parents to the graph, in hierarchical order
            # TODO PYLINT Consider using enumerate
            # instead of iterating with range and len
            for i in range(len(parents)):
                rti = getChrPartTypeByNotation(parents[i])

                pnum = chrom_num+parents[i]
                sta = int(start)
                sto = int(stop)
                if pnum not in mybands.keys():
                    # add the parental band to the hash
                    b = {'min': min(sta, sto),
                         'max': max(sta, sto),
                         'chr': chrom_num,
                         'ref': build_id,
                         'parent': None,
                         'stain': None,
                         'type': rti}
                    mybands[pnum] = b
                else:
                    # band already in the hash means it's a grouping band
                    # need to update the min/max coords
                    b = mybands.get(pnum)
                    b['min'] = min(sta, sto, b['min'])
                    b['max'] = max(sta, sto, b['max'])
                    mybands[pnum] = b

                    # also, set the max for the chrom
                    c = mybands.get(chrom_num)
                    c['max'] = max(sta, sto, c['max'])
                    mybands[chrom_num] = c

                # add the parent relationships to each
                if i < len(parents) - 1:
                    mybands[pnum]['parent'] = chrom_num+parents[i+1]
                else:
                    # add the last one (p or q usually)
                    # as attached to the chromosome
                    mybands[pnum]['parent'] = chrom_num

    # NOTE: the with-statement has already closed the file at this point
    f.close()  # end looping through file

    # loop through the hash and add the bands to the graph
    for b in mybands.keys():
        myband = mybands.get(b)
        # generic (taxon-level) and build-specific ids/labels of this part
        band_class_id = makeChromID(b, taxon, 'CHR')
        band_class_label = makeChromLabel(b, genome_label)
        band_build_id = makeChromID(b, build_num, 'MONARCH')
        band_build_label = makeChromLabel(b, build_num)
        # the build-specific chrom
        chrom_in_build_id = makeChromID(
            myband['chr'], build_num, 'MONARCH')
        # if it's != part, then add the class
        if myband['type'] != Feature.types['assembly_component']:
            model.addClassToGraph(band_class_id, band_class_label,
                                  myband['type'])
            bfeature = Feature(self.graph, band_build_id, band_build_label,
                               band_class_id)
        else:
            # scaffolds get no taxon-level class; the feature is typed
            # with the assembly-component type directly
            bfeature = Feature(self.graph, band_build_id, band_build_label,
                               myband['type'])
            if 'synonym' in myband:
                model.addSynonym(band_build_id, myband['synonym'])

        if myband['parent'] is None:
            if myband['type'] == Feature.types['assembly_component']:
                # since we likely don't know the chr,
                # add it as a part of the build
                geno.addParts(band_build_id, build_id)
        elif myband['type'] == Feature.types['assembly_component']:
            # geno.addParts(band_build_id, chrom_in_build_id)
            parent_chrom_in_build = makeChromID(myband['parent'],
                                                build_num, 'MONARCH')
            bfeature.addSubsequenceOfFeature(parent_chrom_in_build)

        # add the band as a feature
        # (which also instantiates the owl:Individual)
        bfeature.addFeatureStartLocation(myband['min'], chrom_in_build_id)
        bfeature.addFeatureEndLocation(myband['max'], chrom_in_build_id)
        if 'stain' in myband and myband['stain'] is not None:
            # TODO 'has_staining_intensity' being dropped by MB
            bfeature.addFeatureProperty(
                Feature.properties['has_staining_intensity'],
                myband['stain'])

        # type the band as a faldo:Region directly (add_region=False)
        # bfeature.setNoBNodes(self.nobnodes)
        # to come when we merge in ZFIN.py
        bfeature.addFeatureToGraph(False)

    return
def _get_gene_info(self, limit):
    """
    Parse the gzipped NCBI ``gene_info`` file and add its genes to the graph.

    Each retained row is added as a class, or as an individual when its
    mapped type is ``SO:0000110`` (sequence feature, not a gene).  The gene
    gets its label, description, name and synonyms, its cross-references
    (delegated to ``_add_gene_equivalencies``), an ``is_subsequence_of``
    triple to its chromosome or cytogenetic band, and its taxon.

    Rows are filtered on ``self.gene_ids`` in test mode and on
    ``self.tax_ids`` otherwise.  ``self.class_or_indiv`` is filled in for
    every row that passes the filter, even once ``limit`` is exceeded.

    :param limit: maximum number of filtered rows to add to the graph
        (not applied in test mode); ``None`` means no limit
    :return: None
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph

    geno = Genotype(g)
    model = Model(g)

    # the file is read straight from its gzipped form
    logger.info("Processing 'Gene Info' records")
    line_counter = 0
    gene_info = '/'.join((self.rawdir, self.files['gene_info']['file']))
    logger.info("FILE: %s", gene_info)

    # Add taxa and genome classes for those in our filter
    for tax_num in self.tax_ids:
        tax_id = ':'.join(('NCBITaxon', str(tax_num)))
        # tax label can get added elsewhere
        geno.addGenome(tax_id, str(tax_num))
        # label added elsewhere
        model.addClassToGraph(tax_id, None)

    # "regular" cytogenetic band notation, e.g. 4q21.1
    # TODO we probably need a different regex per organism
    band_pattern = re.compile(r'[0-9A-Z]+[pq](\d+)?(\.\d+)?$')

    with gzip.open(gene_info, 'rb') as f:
        row = f.readline().decode().strip().split('\t')
        logger.info("Header has %i columns", len(row))
        for line in f:
            line = line.decode().strip()
            # skip blank lines and comments
            if not line or line.startswith('#'):
                continue

            (tax_num, gene_num, symbol, locustag, synonyms, xrefs, chrom,
             map_loc, desc, gtype, authority_symbol, name,
             nomenclature_status, other_designations, modification_date,
             feature_type) = line.split('\t')

            if self.testMode and int(gene_num) not in self.gene_ids:
                continue
            if not self.testMode and int(tax_num) not in self.tax_ids:
                continue

            line_counter += 1

            gene_id = ':'.join(('NCBIGene', gene_num))
            tax_id = ':'.join(('NCBITaxon', tax_num))
            gene_type_id = self.map_type_of_gene(gtype.strip())

            label = None if symbol == 'NEWENTRY' else symbol

            # sequence feature, not a gene
            if gene_type_id == 'SO:0000110':
                self.class_or_indiv[gene_id] = 'I'
            else:
                self.class_or_indiv[gene_id] = 'C'

            # deliberately `continue`, not `break`: class_or_indiv above
            # must still be recorded for the remaining rows
            if not self.testMode and \
                    limit is not None and line_counter > limit:
                continue

            if self.class_or_indiv[gene_id] == 'C':
                # NCBI will be the default leader,
                # so we will not add the leader designation here.
                model.addClassToGraph(gene_id, label, gene_type_id, desc)
            else:
                # in this case, they aren't genes.
                # so we want someone else to be the leader.
                model.addIndividualToGraph(
                    gene_id, label, gene_type_id, desc)

            if name != '-':
                model.addSynonym(gene_id, name)
            if synonyms.strip() != '-':
                for s in synonyms.split('|'):
                    model.addSynonym(
                        gene_id, s.strip(),
                        Assoc.annotation_properties['hasRelatedSynonym'])
            if other_designations.strip() != '-':
                for s in other_designations.split('|'):
                    model.addSynonym(
                        gene_id, s.strip(),
                        Assoc.annotation_properties['hasRelatedSynonym'])
            if xrefs.strip() != '-':
                self._add_gene_equivalencies(xrefs, gene_id, tax_num)

            # edge cases of id | symbol | chr | map_loc:
            # 263    AMD1P2       X|Y      with Xq28 and Yq12
            # 438    ASMT         X|Y      with Xp22.3 or Yp11.3  # in PAR
            # 419    ART3         4        with 4q21.1|4p15.1-p14
            # 28227  PPP2R3B      X|Y      Xp22.33; Yp11.3        # in PAR
            # 619538 OMS          10|19|3  10q26.3;19q13.42-q13.43;3p25.3
            # 101928066 LOC101928066 1|Un  -   # unlocated scaffold
            # 11435  Chrna1       2        2 C3|2 43.76 cM        # mouse
            # 323212 wu:fb92e12   19|20    -                      # fish
            # The chr placement in this table can't be trusted when more
            # than one is listed; with the exception of human X|Y,
            # we only take those that align to one chr.
            # FIXME remove the chr mapping below
            # when we pull in the genomic coords
            if chrom != '-' and chrom != '':
                if '|' in chrom and chrom not in ['X|Y', 'X; Y']:
                    # means that there's uncertainty in the mapping,
                    # so skip it
                    # TODO we'll need to figure out how to deal with
                    # >1 loc mapping
                    # NOTE(review): this `continue` also skips the
                    # addTaxon call below for these genes - confirm intent
                    logger.info(
                        '%s is non-uniquely mapped to %s.' +
                        ' Skipping for now.',
                        gene_id, chrom)
                    continue

                if chrom == 'X; Y':
                    chrom = 'X|Y'  # rewrite the PAR regions for processing

                band_match = band_pattern.match(map_loc)

                # do this in a loop to allow PAR regions like X|Y
                for c in chrom.split('|'):
                    # assume that the chromosome label is added elsewhere
                    geno.addChromosomeClass(c, tax_id, None)
                    mychrom = makeChromID(c, tax_num, 'CHR')
                    # temporarily use taxnum for the disambiguating label
                    mychrom_syn = makeChromLabel(c, tax_num)
                    model.addSynonym(mychrom, mychrom_syn)

                    if band_match is not None:
                        # map_loc already starts with the chromosome;
                        # strip it (literally, not as a regex) first
                        if map_loc.startswith(c):
                            bid = map_loc[len(c):]
                        else:
                            bid = map_loc
                        # the generic location (no coordinates)
                        maploc_id = makeChromID(c + bid, tax_num, 'CHR')
                        # Assume its type will be added elsewhere
                        band = Feature(g, maploc_id, None, None)
                        band.addFeatureToGraph()
                        # add the band as the containing feature
                        g.addTriple(
                            gene_id,
                            Feature.object_properties['is_subsequence_of'],
                            maploc_id)
                    else:
                        # TODO handle these cases: examples are:
                        # 15q11-q22,Xp21.2-p11.23,15q22-qter,10q11.1-q24,
                        # 12p13.3-p13.2|12p13-p12,1p13.3|1p21.3-p13.1,
                        # 12cen-q21,22q13.3|22q13.3
                        logger.debug(
                            'not regular band pattern for %s: %s',
                            gene_id, map_loc)
                        # add the gene as a subsequence of the chromosome
                        g.addTriple(
                            gene_id,
                            Feature.object_properties['is_subsequence_of'],
                            mychrom)

            geno.addTaxon(tax_id, gene_id)
def _process_genes(self, limit=None):
    """
    Parse the HGNC gene table (tab separated) and add its genes to the graph.

    For each row: withdrawn entries are added as deprecated classes; other
    entries are added as classes when their locus type resolves to a term,
    and are made the leader.  Entrez, Ensembl and gene-typed OMIM ids become
    equivalent classes, PubMed ids become ``is_about`` triples, and the
    gene is made a subsequence of its chromosome band (or of the chromosome
    when no band is recognised in the location).

    :param limit: stop once the reader's line number exceeds this value
        (not applied in test mode); ``None`` means no limit
    :return: None
    :raises SystemExit: when the file header does not match the expected
        columns
    """
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph

    geno = Genotype(graph)
    model = Model(graph)

    raw = '/'.join((self.rawdir, self.files['genes']['file']))
    col = self.files['genes']['columns']
    LOG.info("Processing HGNC genes")

    # NOTE(review): inside a character class `$` is a literal dollar sign,
    # so locations with no p/q arm (e.g. "1 not on reference assembly")
    # never match - confirm whether that is intended
    chr_pattern = re.compile(r'(\d+|X|Y|Z|W|MT)[pq$]')
    band_pattern = re.compile(r'([pq][A-H\d]?\d?(?:\.\d+)?)')

    with open(raw, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')

        row = next(filereader)
        if not self.check_fileheader(col, row):
            # same exception the `exit` builtin raises, without depending
            # on the `site` module having installed it
            raise SystemExit(-1)

        # Only the columns used below are looked up; every other HGNC
        # column is ignored.  Resolve the positions once, not per row.
        i_hgnc_id = col.index('hgnc_id')
        i_symbol = col.index('symbol')
        i_name = col.index('name')
        i_locus_type = col.index('locus_type')
        i_location = col.index('location')
        i_entrez_id = col.index('entrez_id')
        i_ensembl_gene_id = col.index('ensembl_gene_id')
        i_pubmed_id = col.index('pubmed_id')
        i_omim_id = col.index('omim_id')

        for row in filereader:
            hgnc_id = row[i_hgnc_id].strip()
            symbol = row[i_symbol].strip()
            name = row[i_name].strip()
            locus_type = row[i_locus_type].strip()
            location = row[i_location].strip()
            entrez_id = row[i_entrez_id].strip()
            ensembl_gene_id = row[i_ensembl_gene_id].strip()
            pubmed_ids = row[i_pubmed_id].strip()  # pipe separated!
            omim_ids = row[i_omim_id].strip()  # pipe separated!

            if self.test_mode and entrez_id != '' and \
                    entrez_id not in self.gene_ids:
                continue

            if name == '':
                name = None

            if locus_type == 'withdrawn':
                model.addDeprecatedClass(hgnc_id)
            else:
                # resolve() hands back its input when there is no mapping
                gene_type_id = self.resolve(locus_type, False)
                if gene_type_id != locus_type:
                    model.addClassToGraph(
                        hgnc_id, symbol, gene_type_id, name)
                model.makeLeader(hgnc_id)

            if entrez_id != '':
                model.addEquivalentClass(hgnc_id, 'NCBIGene:' + entrez_id)

            if ensembl_gene_id != '':
                model.addEquivalentClass(
                    hgnc_id, 'ENSEMBL:' + ensembl_gene_id)

            for omim_id in omim_ids.split('|'):
                if omim_id in self.omim_replaced:
                    repl = self.omim_replaced[omim_id]
                    LOG.warning('%s is replaced with %s', omim_id, repl)
                    for omim in repl:
                        # a replacement id may be missing from omim_type
                        if omim in self.omim_type and \
                                self.omim_type[omim] == \
                                self.globaltt['gene']:
                            omim_id = omim

                if omim_id in self.omim_type and \
                        self.omim_type[omim_id] == self.globaltt['gene']:
                    model.addEquivalentClass(hgnc_id, 'OMIM:' + omim_id)

            geno.addTaxon(self.hs_txid, hgnc_id)

            # add pubs as "is about"
            for pubmed_id in pubmed_ids.split('|'):
                pubmed_id = pubmed_id.strip()
                # an empty field splits to [''];
                # don't emit a bare 'PMID:' node for it
                if pubmed_id == '':
                    continue
                graph.addTriple(
                    'PMID:' + pubmed_id, self.globaltt['is_about'], hgnc_id)

            # add chr location
            # sometimes two are listed, like: 10p11.2 or 17q25
            # -- there are only 2 of these FRA10A and MPFD
            # sometimes listed like "1 not on reference assembly"
            # sometimes listed like 10q24.1-q24.3
            # sometimes like 11q11 alternate reference locus
            chr_match = chr_pattern.match(location)
            if chr_match is not None:
                chrom = chr_match.group(1)
                chrom_id = makeChromID(chrom, self.hs_txid, 'CHR')
                band_match = band_pattern.search(location)
                feat = Feature(graph, hgnc_id, None, None)
                if band_match is not None:
                    band = chrom + band_match.group(1)
                    # add the chr band as the parent to this gene
                    # as a feature but assume that the band is created
                    # as a class with properties elsewhere in Monochrom
                    band_id = makeChromID(band, self.hs_txid, 'CHR')
                    model.addClassToGraph(band_id, None)
                    feat.addSubsequenceOfFeature(band_id)
                else:
                    model.addClassToGraph(chrom_id, None)
                    feat.addSubsequenceOfFeature(chrom_id)

            if not self.test_mode and limit is not None and \
                    filereader.line_num > limit:
                break
def _get_gene_info(self, limit):
    """
    Read the gzipped NCBI ``gene_info`` table and load its genes.

    A row that passes the filter (``self.gene_ids`` in test mode,
    ``self.tax_ids`` otherwise) is recorded in ``self.class_or_indiv`` as a
    class ('C') or, for type ``SO:0000110``, an individual ('I').  While the
    row count is within ``limit`` the gene is then added to the graph with
    its label, description, name, synonyms, cross-references (through
    ``_add_gene_equivalencies``), an ``is_subsequence_of`` link to its
    chromosome or cytogenetic band, and its taxon.

    :param limit: maximum number of filtered rows written to the graph
        (not applied in test mode); ``None`` means no limit
    :return: None
    """
    g = self.testgraph if self.testMode else self.graph

    geno = Genotype(g)
    model = Model(g)

    # the file is read without unzipping it first
    logger.info("Processing 'Gene Info' records")
    line_counter = 0
    gene_info = '/'.join((self.rawdir, self.files['gene_info']['file']))
    logger.info("FILE: %s", gene_info)

    # Add taxa and genome classes for those in our filter
    for tax_num in self.tax_ids:
        tax_id = ':'.join(('NCBITaxon', str(tax_num)))
        # tax label can get added elsewhere
        geno.addGenome(tax_id, str(tax_num))
        # label added elsewhere
        model.addClassToGraph(tax_id, None)

    # compiled once; matches the "regular" band notation such as 4q21.1
    # TODO we probably need a different regex per organism
    regular_band = re.compile(r'[0-9A-Z]+[pq](\d+)?(\.\d+)?$')

    with gzip.open(gene_info, 'rb') as f:
        header = f.readline().decode().strip().split('\t')
        logger.info("Header has %i columns", len(header))

        for raw_line in f:
            line = raw_line.decode().strip()
            # skip comments, and blank lines that would break the unpack
            if line == '' or line.startswith('#'):
                continue

            (tax_num, gene_num, symbol, locustag, synonyms, xrefs, chrom,
             map_loc, desc, gtype, authority_symbol, name,
             nomenclature_status, other_designations, modification_date,
             feature_type) = line.split('\t')

            if self.testMode:
                if int(gene_num) not in self.gene_ids:
                    continue
            elif int(tax_num) not in self.tax_ids:
                continue

            line_counter += 1

            gene_id = ':'.join(('NCBIGene', gene_num))
            tax_id = ':'.join(('NCBITaxon', tax_num))
            gene_type_id = self.map_type_of_gene(gtype.strip())

            if symbol == 'NEWENTRY':
                label = None
            else:
                label = symbol

            # SO:0000110 is a sequence feature, not a gene
            is_individual = gene_type_id == 'SO:0000110'
            self.class_or_indiv[gene_id] = 'I' if is_individual else 'C'

            # past the limit we keep scanning (no `break`) so that
            # class_or_indiv is still filled in for every remaining row
            if not self.testMode and \
                    limit is not None and line_counter > limit:
                continue

            if is_individual:
                # these aren't genes,
                # so we want someone else to be the leader.
                model.addIndividualToGraph(
                    gene_id, label, gene_type_id, desc)
            else:
                # NCBI will be the default leader,
                # so we will not add the leader designation here.
                model.addClassToGraph(gene_id, label, gene_type_id, desc)

            if name != '-':
                model.addSynonym(gene_id, name)
            if synonyms.strip() != '-':
                for s in synonyms.split('|'):
                    model.addSynonym(
                        gene_id, s.strip(),
                        Assoc.annotation_properties['hasRelatedSynonym'])
            if other_designations.strip() != '-':
                for s in other_designations.split('|'):
                    model.addSynonym(
                        gene_id, s.strip(),
                        Assoc.annotation_properties['hasRelatedSynonym'])
            if xrefs.strip() != '-':
                self._add_gene_equivalencies(xrefs, gene_id, tax_num)

            # edge cases of id | symbol | chr | map_loc:
            # 263    AMD1P2       X|Y      with Xq28 and Yq12
            # 438    ASMT         X|Y      with Xp22.3 or Yp11.3  # in PAR
            # 419    ART3         4        with 4q21.1|4p15.1-p14
            # 28227  PPP2R3B      X|Y      Xp22.33; Yp11.3        # in PAR
            # 619538 OMS          10|19|3  10q26.3;19q13.42-q13.43;3p25.3
            # 101928066 LOC101928066 1|Un  -   # unlocated scaffold
            # 11548  Adra1b       11       11 B1.1|11 25.81 cM    # mouse
            # 323368 ints10       6|18     -                      # fish
            # chr placement in this table can't be trusted when > 1 is
            # listed; except for human X|Y we only take single-chr rows.
            # FIXME remove the chr mapping below
            # when we pull in the genomic coords
            if chrom in ('-', ''):
                geno.addTaxon(tax_id, gene_id)
                continue

            if '|' in chrom and chrom not in ['X|Y', 'X; Y']:
                # uncertainty in the mapping, so skip it
                # TODO we'll need to figure out how to deal with
                # >1 loc mapping
                # NOTE(review): as in the original flow, genes skipped
                # here never get their taxon added - confirm intent
                logger.info(
                    '%s is non-uniquely mapped to %s.' +
                    ' Skipping for now.',
                    gene_id, chrom)
                continue

            if chrom == 'X; Y':
                chrom = 'X|Y'  # rewrite the PAR regions for processing

            has_regular_band = regular_band.match(map_loc) is not None

            # do this in a loop to allow PAR regions like X|Y
            for c in chrom.split('|'):
                # assume that the chromosome label is added elsewhere
                geno.addChromosomeClass(c, tax_id, None)
                mychrom = makeChromID(c, tax_num, 'CHR')
                # temporarily use taxnum for the disambiguating label
                mychrom_syn = makeChromLabel(c, tax_num)
                model.addSynonym(mychrom, mychrom_syn)

                if has_regular_band:
                    # map_loc already carries the chromosome; drop that
                    # prefix literally rather than via an unescaped regex
                    bid = map_loc
                    if bid.startswith(c):
                        bid = bid[len(c):]
                    # the generic location (no coordinates)
                    maploc_id = makeChromID(c + bid, tax_num, 'CHR')
                    # Assume its type will be added elsewhere
                    band = Feature(g, maploc_id, None, None)
                    band.addFeatureToGraph()
                    # add the band as the containing feature
                    g.addTriple(
                        gene_id,
                        Feature.object_properties['is_subsequence_of'],
                        maploc_id)
                else:
                    # TODO handle these cases: examples are:
                    # 15q11-q22,Xp21.2-p11.23,15q22-qter,10q11.1-q24,
                    # 12p13.3-p13.2|12p13-p12,1p13.3|1p21.3-p13.1,
                    # 12cen-q21,22q13.3|22q13.3
                    logger.debug('not regular band pattern for %s: %s',
                                 gene_id, map_loc)
                    # add the gene as a subsequence of the chromosome
                    g.addTriple(
                        gene_id,
                        Feature.object_properties['is_subsequence_of'],
                        mychrom)

            geno.addTaxon(tax_id, gene_id)
def _get_chrbands(self, limit, src_key, genome_id):
    """
    Read a gzipped chromosome-band table and add the chromosomes, scaffolds
    and bands it describes to ``self.graph``.

    The file is scanned once to collect, per chromosome/scaffold/band, its
    coordinate extent, type and parent in the ``mybands`` dict; parent
    ("grouping") bands are derived from each band name and their extent is
    widened to cover their children.  A second pass turns every collected
    entry into a build-specific ``Feature``.

    :param limit: maximum number of file lines to read; ``None`` reads all
    :param src_key: key into ``self.files``; also used as the taxon number
    :param genome_id: genome identifier handed to ``Genotype.addGenome``
    :return: None
    :raises KeyError: when a row carries a stain type that is neither in
        the known list nor in ``self.globaltt``
    """
    tax_num = src_key
    if limit is None:
        limit = sys.maxsize  # practical limit anyway
    model = Model(self.graph)
    line_num = 0
    myfile = '/'.join((self.rawdir, self.files[src_key]['file']))
    LOG.info("Processing Chr bands from FILE: %s", myfile)
    geno = Genotype(self.graph)
    monochrom = Monochrom(self.graph_type, self.are_bnodes_skized)

    # used to hold band definitions for a chr
    # in order to compute extent of encompassing bands
    mybands = {}

    # build the organism's genome from the taxon
    genome_label = self.files[src_key]['genome_label']
    taxon_curie = 'NCBITaxon:' + tax_num
    species_name = self.globaltcid[taxon_curie]  # for logging

    # add the taxon as a class. adding the class label elsewhere
    model.addClassToGraph(taxon_curie, None)
    model.addSynonym(taxon_curie, genome_label)

    geno.addGenome(taxon_curie, genome_label, genome_id)

    # add the build and the taxon it's in
    build_num = self.files[src_key]['build_num']
    build_id = 'UCSC:' + build_num
    geno.addReferenceGenome(build_id, build_num, taxon_curie)

    # cat (at least) also has chr[BDAECF]<number> chromosome names
    if tax_num == self.localtt['Felis catus']:
        placed_scaffold_regex = re.compile(
            r'(chr(?:[BDAECF]\d+|X|Y|Z|W|M|))$')
    else:
        placed_scaffold_regex = re.compile(r'(chr(?:\d+|X|Y|Z|W|M))$')
    # NOTE(review): this pattern has a single group and is applied with
    # match() (anchored at the start of a name that begins with "chr"),
    # so the unlocalized-scaffold branch below appears never to fire.
    # Left unchanged because fixing it would add scaffolds to the output.
    unlocalized_scaffold_regex = re.compile(r'_(\w+)_random')
    unplaced_scaffold_regex = re.compile(r'chr(Un(?:_\w+)?)')

    known_stains = [
        'gneg', 'gpos25', 'gpos33', 'gpos50', 'gpos66', 'gpos75',
        'gpos100', 'acen', 'gvar', 'stalk'
    ]

    # process the bands; resolve the column positions once
    col = self.files[src_key]['columns']
    i_chrom = col.index('chrom')
    i_start = col.index('chromStart')
    i_stop = col.index('chromEnd')
    i_name = col.index('name')
    i_stain = col.index('gieStain')

    with gzip.open(myfile, 'rb') as binreader:
        for line in binreader:
            line_num += 1
            if line_num > limit:
                # nothing past the limit is used, so stop reading
                break
            line = line.decode().strip()
            # skip blank lines and comments
            if not line or line.startswith('#'):
                continue

            # chr13 4500000 10000000 p12 stalk
            row = line.split('\t')
            scaffold = row[i_chrom].strip()
            start = row[i_start]
            stop = row[i_stop]
            band_num = row[i_name].strip()
            rtype = row[i_stain]

            # NOTE some less-finished genomes have
            # placed and unplaced scaffolds
            # * Placed scaffolds:
            #   the scaffolds have been placed within a chromosome.
            # * Unlocalized scaffolds:
            #   although the chromosome within which the scaffold occurs
            #   is known, the scaffold's position or orientation
            #   is not known.
            # * Unplaced scaffolds:
            #   it is not known which chromosome the scaffold belongs to
            #
            # find out if the thing is a full on chromosome, or a scaffold:
            # ex: unlocalized scaffold: chr10_KL568008v1_random
            # ex: unplaced scaffold: chrUn_AABR07022428v1
            mch = placed_scaffold_regex.match(scaffold)
            if mch is not None and len(mch.groups()) == 1:
                # the chromosome is the first match of the pattern
                chrom_num = mch.group(1)
            else:
                # anything that isn't a placed scaffold
                # gets no chromosome at the class level
                chrom_num = None

            m_chr_unloc = unlocalized_scaffold_regex.match(scaffold)
            m_chr_unplaced = unplaced_scaffold_regex.match(scaffold)

            scaffold_num = None
            if mch:
                pass
            elif m_chr_unloc is not None and len(
                    m_chr_unloc.groups()) == 2:
                chrom_num = m_chr_unloc.group(1)
                scaffold_num = chrom_num + '_' + m_chr_unloc.group(2)
            elif m_chr_unplaced is not None and len(
                    m_chr_unplaced.groups()) == 1:
                scaffold_num = m_chr_unplaced.group(1)

            if chrom_num is not None:
                # the chrom class (generic) id
                chrom_class_id = makeChromID(chrom_num, tax_num, 'CHR')

                # first, add the chromosome class (in the taxon)
                geno.addChromosomeClass(
                    chrom_num, taxon_curie,
                    self.files[src_key]['genome_label'])

                # then, add the chromosome instance (from the given build)
                geno.addChromosomeInstance(
                    chrom_num, build_id, build_num, chrom_class_id)

                # add the chr to the hashmap of coordinates for this build
                # the chromosome coordinate space is itself
                if chrom_num not in mybands:
                    mybands[chrom_num] = {
                        'min': 0,
                        'max': int(stop),
                        'chr': chrom_num,
                        'ref': build_id,
                        'parent': None,
                        'stain': None,
                        'type': self.globaltt['chromosome']
                    }
            elif scaffold_num is not None:
                # this will put the coordinates of the scaffold
                # in the scaffold-space and make sure that the scaffold
                # is part of the correct parent.
                # if chrom_num is None,
                # then it will attach it to the genome,
                # just like a reg chrom
                mybands[scaffold_num] = {
                    'min': start,
                    'max': stop,
                    'chr': scaffold_num,
                    'ref': build_id,
                    'parent': chrom_num,
                    'stain': None,
                    'type': self.globaltt['assembly_component'],
                    'synonym': scaffold
                }
            else:
                LOG.info('%s line %i DROPPED chromosome/scaffold %s',
                         species_name, line_num, scaffold)

            # see if new types have showed up
            if rtype is not None and rtype not in known_stains:
                LOG.info('Unknown gieStain type "%s" in %s at %i',
                         rtype, src_key, line_num)
                self.globaltt[rtype]  # blow up (KeyError) on unknown term

            if rtype == 'acen':  # hacky, revisit if ontology improves
                rtype = self.localtt[rtype]

            if band_num is None or band_num == '' or \
                    rtype is None or rtype == '':
                continue

            if chrom_num is None:
                # a named band on something that is not a placed
                # chromosome: there is no chromosome to key it under
                # (building the key would raise a TypeError), so skip it
                LOG.info('%s line %i band %s has no placed chromosome',
                         species_name, line_num, band_num)
                continue

            # add the specific band
            band_key = chrom_num + band_num
            mybands[band_key] = {
                'min': start,
                'max': stop,
                'chr': chrom_num,
                'ref': build_id,
                'parent': None,
                'stain': None,
                'type': self.globaltt[rtype],
            }

            # get the parent bands, and make them unique
            parents = list(monochrom.make_parent_bands(band_num, set()))
            # alphabetical sort will put them in smallest to biggest,
            # so we reverse
            parents.sort(reverse=True)
            if not parents:
                continue  # band has no parents

            mybands[band_key]['parent'] = chrom_num + parents[0]

            sta = int(start)
            sto = int(stop)
            last = len(parents) - 1
            # add the parents to the dict, in hierarchical order
            for i, parent in enumerate(parents):
                rti = getChrPartTypeByNotation(parent, self.graph)
                pnum = chrom_num + parent
                if pnum not in mybands:
                    # add the parental band to the hash
                    mybands[pnum] = {
                        'min': min(sta, sto),
                        'max': max(sta, sto),
                        'chr': chrom_num,
                        'ref': build_id,
                        'parent': None,
                        'stain': None,
                        'type': rti
                    }
                else:
                    # band already in the hash means it's a grouping band
                    # need to update the min/max coords
                    bnd = mybands[pnum]
                    bnd['min'] = min(sta, sto, bnd['min'])
                    bnd['max'] = max(sta, sto, bnd['max'])

                    # also, set the max for the chrom
                    chrom = mybands[chrom_num]
                    chrom['max'] = max(sta, sto, chrom['max'])

                # add the parent relationships to each
                if i < last:
                    mybands[pnum]['parent'] = chrom_num + parents[i + 1]
                else:
                    # add the last one (p or q usually)
                    # as attached to the chromosome
                    mybands[pnum]['parent'] = chrom_num
    # end looping through file (the with-block has closed it)

    # loop through the hash and add the bands to the graph
    for bnd, myband in mybands.items():
        band_class_id = makeChromID(bnd, tax_num, 'CHR')
        band_class_label = makeChromLabel(bnd, genome_label)
        band_build_id = makeChromID(bnd, build_num, 'MONARCH')
        band_build_label = makeChromLabel(bnd, build_num)
        # the build-specific chrom
        chrom_in_build_id = makeChromID(
            myband['chr'], build_num, 'MONARCH')
        is_component = \
            myband['type'] == self.globaltt['assembly_component']

        # if it's != part, then add the class
        if not is_component:
            model.addClassToGraph(
                band_class_id, band_class_label, myband['type'])
            bfeature = Feature(
                self.graph, band_build_id, band_build_label, band_class_id)
        else:
            bfeature = Feature(
                self.graph, band_build_id, band_build_label,
                myband['type'])
            if 'synonym' in myband:
                model.addSynonym(band_build_id, myband['synonym'])

        if is_component:
            if myband['parent'] is None:
                # since we likely don't know the chr,
                # add it as a part of the build
                geno.addParts(band_build_id, build_id)
            else:
                parent_chrom_in_build = makeChromID(
                    myband['parent'], build_num, 'MONARCH')
                bfeature.addSubsequenceOfFeature(parent_chrom_in_build)

        # add the band as a feature
        # (which also instantiates the owl:Individual)
        bfeature.addFeatureStartLocation(myband['min'], chrom_in_build_id)
        bfeature.addFeatureEndLocation(myband['max'], chrom_in_build_id)
        if 'stain' in myband and myband['stain'] is not None:
            bfeature.addFeatureProperty(
                self.globaltt['has_sequence_attribute'], myband['stain'])

        # type the band as a faldo:Region directly (add_region=False)
        bfeature.addFeatureToGraph(False)
def process_feature_loc(self, limit):
    """
    Parse the gzipped, tab-separated feature-location file (nine columns,
    the last a ``key=value;key=value`` attribute string) and add the
    selected features, with their chromosome locations, to the graph.

    Only rows whose feature type is in the accepted list are used.  The
    feature id comes from the ``ID`` attribute (WBGene/WBVar/WBsf ids only),
    else from ``variation``, else from a ``WBsf`` ``Name``; rows without a
    usable id, and polymorphisms, are skipped.  Genes are added as classes,
    everything else as individuals.

    :param limit: stop after this many accepted rows (not applied in test
        mode); ``None`` means no limit
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['feature_loc']['file']))

    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    model = Model(g)
    logger.info("Processing Feature location and attributes")
    line_counter = 0
    geno = Genotype(g)
    # NOTE(review): filled in "for later processing" but never read or
    # returned by this method - confirm whether it is still needed
    strain_to_variant_map = {}
    build_num = self.version_num
    build_id = 'WormBase:' + build_num

    # note biological_regions include balancers
    # other options here: promoter, regulatory_region, reagent
    wanted_types = [
        'gene', 'point_mutation', 'deletion', 'RNAi_reagent',
        'duplication', 'enhancer', 'binding_site', 'biological_region',
        'complex_substitution', 'substitution', 'insertion',
        'inverted_repeat'
    ]

    # only needed (and only looked up) in test mode
    test_ids = None
    if self.testMode:
        test_ids = self.test_ids['gene'] + self.test_ids['allele']

    with gzip.open(raw, 'rb') as csvfile:
        filereader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""),
            delimiter='\t', quotechar='\"')
        for row in filereader:
            if ''.join(row).startswith('#'):
                continue
            (chrom, db, feature_type_label, start, end, score, strand,
             phase, attributes) = row

            # example rows:
            # I interpolated_pmap_position gene 1 559768 . . . ID=gmap:spe-13;gmap=spe-13;status=uncloned;Note=-21.3602 cM (+/- 1.84 cM)
            # I WormBase gene 3747 3909 . - . ID=Gene:WBGene00023193;Name=WBGene00023193;interpolated_map_position=-21.9064;sequence_name=Y74C9A.6;biotype=snoRNA;Alias=Y74C9A.6
            # I absolute_pmap_position gene 4119 10230 . . . ID=gmap:homt-1;gmap=homt-1;status=cloned;Note=-21.8252 cM (+/- 0.00 cM)

            if feature_type_label not in wanted_types:
                continue
            line_counter += 1

            # split each item on its FIRST '=' only, so values that
            # themselves contain '=' are kept whole; items with no '='
            # carry no key/value pair and are ignored
            attribute_dict = {}
            if attributes != '':
                for item in attributes.replace('"', '').split(';'):
                    key, sep, value = item.partition('=')
                    if sep:
                        attribute_dict[key] = value

            fid = flabel = desc = None
            if 'ID' in attribute_dict:
                fid = attribute_dict.get('ID')
                if re.search(r'WB(Gene|Var|sf)', fid):
                    fid = re.sub(r'^\w+:WB', 'WormBase:WB', fid)
                elif re.match(r'(gmap|landmark)', fid):
                    continue
                else:
                    logger.info('other identifier %s', fid)
                    fid = None
            elif 'variation' in attribute_dict:
                # variation=WBVar00604246;public_name=gk320600;strain=VC20384;substitution=C/T
                fid = 'WormBase:' + attribute_dict.get('variation')
                flabel = attribute_dict.get('public_name')
                sub = attribute_dict.get('substitution')
                ins = attribute_dict.get('insertion')
                desc = ''
                if sub is not None:
                    desc = 'substitution=' + sub
                if ins is not None:
                    desc = 'insertion=' + ins

                # keep track of the strains with this variation,
                # for later processing
                strain_list = attribute_dict.get('strain')
                if strain_list is not None:
                    for s in strain_list.split(','):
                        strain_to_variant_map.setdefault(
                            s.strip(), set()).add(fid)

            # TODO finish the RNAi binding location
            # (RNAi_reagent rows carry e.g. Target=WBRNAi00096030 1 4942,
            # which tells us where the RNAi is actually binding)

            name = attribute_dict.get('Name')
            polymorphism = attribute_dict.get('polymorphism')

            if fid is None:
                if name is not None and re.match(r'WBsf', name):
                    fid = 'WormBase:' + name
                    name = None
                else:
                    continue

            if self.testMode \
                    and fid.replace('WormBase:', '') not in test_ids:
                continue

            # these really aren't that interesting
            if polymorphism is not None:
                continue

            # plain substring test: the name is data, not a regex
            if name is not None and name not in fid:
                if flabel is None:
                    flabel = name
                else:
                    model.addSynonym(fid, name)

            if desc is not None:
                model.addDescription(fid, desc)

            alias = attribute_dict.get('Alias')
            biotype = attribute_dict.get('biotype')
            note = attribute_dict.get('Note')
            other_name = attribute_dict.get('other_name')
            for n in [alias, other_name]:
                if n is not None:
                    # add the value being iterated (alias, then other_name)
                    model.addSynonym(fid, n)

            ftype = self.get_feature_type_by_class_and_biotype(
                feature_type_label, biotype)

            chr_id = makeChromID(chrom, build_id, 'CHR')
            geno.addChromosomeInstance(chrom, build_id, build_num)

            feature = Feature(g, fid, flabel, ftype)
            feature.addFeatureStartLocation(start, chr_id, strand)
            # the end coordinate is the file's `end` column
            feature.addFeatureEndLocation(end, chr_id, strand)

            feature_is_class = feature_type_label == 'gene'
            feature.addFeatureToGraph(True, None, feature_is_class)

            if note is not None:
                model.addDescription(fid, note)

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

    # RNAi reagents look like:
    # I RNAi_primary RNAi_reagent 4184 10232 . + . Target=WBRNAi00001601 1 6049 +;laboratory=YK;history_name=SA:yk326e10
    # I RNAi_primary RNAi_reagent 5693 9391 . + . Target=WBRNAi00066135 1 3699 +;laboratory=CH
    # TODO TF binding sites and network:
    # I TF_binding_site_region TF_binding_site 1861 2048 . + . Name=WBsf292777;tf_id=WBTranscriptionFactor000025;tf_name=DAF-16
    # I TF_binding_site_region TF_binding_site 3403 4072 . + . Name=WBsf331847;tf_id=WBTranscriptionFactor000703;tf_name=DPL-1
def _get_gene_info(self, limit):
    """
    Loop through the gzipped gene_info file and add each gene to the graph.

    Genes are created as classes typed through ``self._map_type_of_gene``.
    Their symbol becomes the label; the full name, synonyms and other
    designations become synonyms; cleaned-up xrefs become equivalent
    classes, except HPRD ids which are attached as gene products.
    The gene is made a subsequence of its chromosome band when the
    map location looks like a simple band, otherwise of the chromosome.

    :param limit: stop after this many accepted rows (ignored in test mode);
        ``None`` means no limit
    :return: None
    """
    gu = GraphUtils(curie_map.get())
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    geno = Genotype(g)

    # not unzipping the file
    logger.info("Processing Gene records")
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files['gene_info']['file']))
    logger.info("FILE: %s", myfile)

    # compiled once (raw string, so the backslashes are regex escapes
    # and not invalid string escapes)
    band_regex = re.compile(r'[0-9A-Z]+[pq](\d+)?(\.\d+)?$')

    # Add taxa and genome classes for those in our filter
    for tax_num in self.tax_ids:
        tax_id = ':'.join(('NCBITaxon', str(tax_num)))
        # tax label can get added elsewhere
        geno.addGenome(tax_id, str(tax_num))
        # label added elsewhere
        gu.addClassToGraph(g, tax_id, None)

    with gzip.open(myfile, 'rb') as f:
        for line in f:
            line = line.decode().strip()
            # skip comments
            if line.startswith('#'):
                continue

            # The first 15 columns are fixed; any columns appended after
            # modification_date are collected and ignored so that an
            # extended file layout does not break the unpacking.
            (tax_num, gene_num, symbol, locustag, synonyms, xrefs, chrom,
             map_loc, desc, gtype, authority_symbol, name,
             nomenclature_status, other_designations, modification_date,
             *_extra) = line.split('\t')

            # #### set filter=None in init if you don't want a filter
            # if self.filter is not None:
            #     if ((self.filter == 'taxids' and
            #          (int(tax_num) not in self.tax_ids)) or
            #             (self.filter == 'geneids' and
            #              (int(gene_num) not in self.gene_ids))):
            #         continue
            # #### end filter

            if self.testMode and int(gene_num) not in self.gene_ids:
                continue
            if int(tax_num) not in self.tax_ids:
                continue

            line_counter += 1

            gene_id = ':'.join(('NCBIGene', gene_num))
            tax_id = ':'.join(('NCBITaxon', tax_num))
            gene_type_id = self._map_type_of_gene(gtype)

            if symbol == 'NEWENTRY':
                label = None
            else:
                label = symbol

            # TODO might have to figure out if things aren't genes,
            # and make them individuals
            gu.addClassToGraph(g, gene_id, label, gene_type_id, desc)

            # we have to do special things here for genes,
            # because they're classes not individuals
            # f = Feature(gene_id,label,gene_type_id,desc)

            if name != '-':
                gu.addSynonym(g, gene_id, name)
            if synonyms.strip() != '-':
                for syn in synonyms.split('|'):
                    gu.addSynonym(
                        g, gene_id, syn.strip(),
                        Assoc.annotation_properties['hasRelatedSynonym'])
            if other_designations.strip() != '-':
                for syn in other_designations.split('|'):
                    gu.addSynonym(
                        g, gene_id, syn.strip(),
                        Assoc.annotation_properties['hasRelatedSynonym'])

            # deal with the xrefs
            # MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479|Vega:OTTHUMG00000020696
            if xrefs.strip() != '-':
                for xref in xrefs.strip().split('|'):
                    fixedr = self._cleanup_id(xref)
                    if fixedr is None or fixedr.strip() == '':
                        continue
                    if fixedr.startswith('HPRD'):
                        # proteins are not == genes.
                        gu.addTriple(
                            g, gene_id,
                            self.properties['has_gene_product'], fixedr)
                    elif fixedr.split(':')[0] not in ['Vega', 'IMGT/GENE-DB']:
                        # skip some of these for now
                        gu.addEquivalentClass(g, gene_id, fixedr)

            # edge cases of id | symbol | chr | map_loc:
            # 263 AMD1P2 X|Y with Xq28 and Yq12
            # 438 ASMT X|Y with Xp22.3 or Yp11.3  # in PAR
            # 419 ART3 4 with 4q21.1|4p15.1-p14
            #   no idea why there's two bands listed - possibly 2 assemblies
            # 28227 PPP2R3B X|Y Xp22.33; Yp11.3  # in PAR
            # 619538 OMS 10|19|3 10q26.3;19q13.42-q13.43;3p25.3
            #   this is of "unknown" type == susceptibility
            # 101928066 LOC101928066 1|Un -  # unlocated scaffold
            # 11435 Chrna1 2 2 C3|2 43.76 cM  # mouse --> 2C3
            # 11548 Adra1b 11 11 B1.1|11 25.81 cM  # mouse --> 11B1.1
            # 11717 Ampd3 7 7 57.85 cM|7 E2-E3  # mouse
            # 14421 B4galnt1 10 10 D3|10 74.5 cM  # mouse
            # 323212 wu:fb92e12 19|20 -  # fish
            # 323368 ints10 6|18 -  # fish
            # 323666 wu:fc06e02 11|23 -  # fish

            # feel that the chr placement can't be trusted in this table
            # when there is > 1 listed; with the exception of human X|Y,
            # i will only take those that align to one chr

            # FIXME remove the chr mapping below
            # when we pull in the genomic coords
            if chrom != '-' and chrom != '':
                if '|' in chrom and chrom not in ['X|Y', 'X; Y']:
                    # this means that there's uncertainty in the mapping.
                    # skip it
                    # TODO we'll need to figure out how to deal with
                    # >1 loc mapping
                    logger.info(
                        '%s is non-uniquely mapped to %s. Skipping for now.',
                        gene_id, chrom)
                    # NOTE: this also skips adding the taxon for this gene
                    continue
                    # X|Y Xp22.33;Yp11.3

                # if (not re.match('(\d+|(MT)|[XY]|(Un)$',str(chr).strip())):
                #     print('odd chr=',str(chr))
                if chrom == 'X; Y':
                    chrom = 'X|Y'  # rewrite the PAR regions for processing

                # do this in a loop to allow PAR regions like X|Y
                for c in chrom.split('|'):
                    # assume that the chromosome label gets added elsewhere
                    geno.addChromosomeClass(c, tax_id, None)
                    mychrom = makeChromID(c, tax_num, 'CHR')
                    # temporarily use the taxnum for the disambiguating label
                    mychrom_syn = makeChromLabel(c, tax_num)
                    gu.addSynonym(g, mychrom, mychrom_syn)

                    if band_regex.match(map_loc) is not None:
                        # if tax_num != '9606':
                        #     continue
                        # this matches the regular kind of chrs,
                        # so make that kind of band
                        # not sure why this matches?
                        #   chrX|Y or 10090chr12|Un"
                        # TODO we probably need a different regex
                        # per organism
                        # the maploc_id already has the numeric chromosome
                        # in it, strip it first (escaped so that the name
                        # is always matched literally)
                        bid = re.sub(r'^' + re.escape(c), '', map_loc)
                        # the generic location (no coordinates)
                        maploc_id = makeChromID(c + bid, tax_num, 'CHR')
                        # print(map_loc,'-->',bid,'-->',maploc_id)
                        # Assume its type will be added elsewhere
                        band = Feature(maploc_id, None, None)
                        band.addFeatureToGraph(g)
                        # add the band as the containing feature
                        gu.addTriple(
                            g, gene_id,
                            Feature.object_properties['is_subsequence_of'],
                            maploc_id)
                    else:
                        # TODO handle these cases
                        # examples are: 15q11-q22, Xp21.2-p11.23,
                        # 15q22-qter, 10q11.1-q24,
                        # 12p13.3-p13.2|12p13-p12, 1p13.3|1p21.3-p13.1,
                        # 12cen-q21, 22q13.3|22q13.3
                        logger.debug(
                            'not regular band pattern for %s: %s',
                            gene_id, map_loc)
                        # add the gene as a subsequence of the chromosome
                        gu.addTriple(
                            g, gene_id,
                            Feature.object_properties['is_subsequence_of'],
                            mychrom)

            geno.addTaxon(tax_id, gene_id)

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

    gu.loadProperties(g, Feature.object_properties, gu.OBJPROP)
    gu.loadProperties(g, Feature.data_properties, gu.DATAPROP)
    gu.loadProperties(g, Genotype.object_properties, gu.OBJPROP)
    gu.loadAllProperties(g)

    return
def _process_QTLs_genomic_location(self, raw, taxon_id, build_id,
                                   build_label, limit=None):
    """
    Read a gzipped, tab-delimited GFF-like QTL file and add each QTL,
    its trait association and its genomic location to the graph.

    For every well-formed row this:
      * adds the QTL as an individual and gives it the taxon;
      * adds the publication (PMID, or AQTLPub for ISU ids) if present;
      * adds a QTL -> trait association (``is_marker_for``) with
        ECO:0000061 evidence and, when parsable, the P-value as score;
      * adds the QTL as a feature with fuzzy start/end positions on the
        chromosome of the given build.

    :param raw: path to the gzipped input file
    :param taxon_id: taxon curie for the species in this file
    :param build_id: identifier of the genome build the coordinates use
    :param build_label: label of that genome build
    :param limit: stop after this many lines (ignored in test mode);
        ``None`` means no limit
    :return: None
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    gu = GraphUtils(curie_map.get())
    line_counter = 0
    geno = Genotype(g)
    # assume that chrs get added to the genome elsewhere
    # (value currently unused here)
    genome_id = geno.makeGenomeID(taxon_id)

    eco_id = "ECO:0000061"  # Quantitative Trait Analysis Evidence

    with gzip.open(raw, 'rt', encoding='ISO-8859-1') as tsvfile:
        reader = csv.reader(tsvfile, delimiter="\t")
        for row in reader:
            line_counter += 1
            # blank lines and comment lines carry no data
            if not row or ' '.join(row).startswith('#'):
                continue

            (chromosome, qtl_source, qtl_type, start_bp, stop_bp,
             frame, strand, score, attr) = row

            # Chr.Z Animal QTLdb Production_QTL 33954873 34023581 . . .
            # QTL_ID=2242;Name="Spleen percentage";Abbrev="SPLP";
            # PUBMED_ID=17012160;trait_ID=2234;
            # trait="Spleen percentage";breed="leghorn";
            # "FlankMarkers=ADL0022";VTO_name="spleen mass";
            # CMO_name="spleen weight to body weight ratio";
            # Map_Type="Linkage";Model="Mendelian";
            # Test_Base="Chromosome-wise";Significance="Significant";
            # P-value="<0.05";F-Stat="5.52";Variance="2.94";
            # Dominance_Effect="-0.002";Additive_Effect="0.01"

            # make dictionary of attributes
            # keys are:
            # QTL_ID,Name,Abbrev,PUBMED_ID,trait_ID,trait,
            # FlankMarkers,VTO_name,Map_Type,Significance,P-value,Model,
            # Test_Base,Variance,Bayes-value,PTO_name,gene_IDsrc,peak_cM,
            # CMO_name,gene_ID,F-Stat,LOD-score,Additive_Effect,
            # Dominance_Effect,Likelihood_Ratio,LS-means,Breed,
            # trait (duplicate with Name)

            # deal with poorly formed attributes
            attr = attr.replace('"FlankMarkers";', '')
            attr_items = attr.replace('"', '').split(";")
            if any('=' not in item for item in attr_items):
                logger.error(
                    "Poorly formed data on line %d:\n %s",
                    line_counter, '\t'.join(row))
                continue
            # split on the first '=' only, so a value that itself
            # contains '=' does not break the key/value pairing
            attribute_dict = dict(
                item.split("=", 1) for item in attr_items)

            qtl_num = attribute_dict.get('QTL_ID')
            trait_num = attribute_dict.get('trait_ID')
            if qtl_num is None or trait_num is None:
                # without both ids no association can be built
                logger.error(
                    "Missing QTL_ID or trait_ID on line %d:\n %s",
                    line_counter, '\t'.join(row))
                continue

            if self.testMode and int(qtl_num) not in self.test_ids:
                continue

            # make association between QTL and trait
            qtl_id = 'AQTL:' + str(qtl_num)
            gu.addIndividualToGraph(g, qtl_id, None, geno.genoparts['QTL'])
            geno.addTaxon(taxon_id, qtl_id)

            trait_id = 'AQTLTrait:' + trait_num

            # if pub is in attributes, add it to the association
            pub_id = None
            if 'PUBMED_ID' in attribute_dict:
                pub_id = attribute_dict.get('PUBMED_ID')
                if re.match(r'ISU.*', pub_id):
                    pub_id = 'AQTLPub:' + pub_id.strip()
                    p = Reference(pub_id)
                else:
                    pub_id = 'PMID:' + pub_id.strip()
                    p = Reference(
                        pub_id, Reference.ref_types['journal_article'])
                p.addRefToGraph(g)

            # Add QTL to graph
            assoc = G2PAssoc(
                self.name, qtl_id, trait_id,
                gu.object_properties['is_marker_for'])
            assoc.add_evidence(eco_id)
            assoc.add_source(pub_id)
            if 'P-value' in attribute_dict:
                p_text = attribute_dict.get('P-value')
                try:
                    # values are commonly written as "<0.05"
                    assoc.set_score(float(p_text.replace('<', '')))
                except ValueError:
                    # one odd value should not abort the whole file
                    logger.warning(
                        "Unparsable P-value '%s' on line %d; "
                        "no score set for %s",
                        p_text, line_counter, qtl_id)

            assoc.add_association_to_graph(g)
            # TODO make association to breed
            # (which means making QTL feature in Breed background)

            # get location of QTL
            chromosome = re.sub(r'Chr\.', '', chromosome)
            chrom_id = makeChromID(chromosome, taxon_id, 'CHR')
            chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH')
            geno.addChromosomeInstance(
                chromosome, build_id, build_label, chrom_id)

            qtl_feature = Feature(qtl_id, None, geno.genoparts['QTL'])
            if start_bp == '':
                start_bp = None
            qtl_feature.addFeatureStartLocation(
                start_bp, chrom_in_build_id, strand,
                [Feature.types['FuzzyPosition']])
            if stop_bp == '':
                stop_bp = None
            qtl_feature.addFeatureEndLocation(
                stop_bp, chrom_in_build_id, strand,
                [Feature.types['FuzzyPosition']])
            qtl_feature.addTaxonToFeature(g, taxon_id)
            qtl_feature.addFeatureToGraph(g)

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

    logger.info("Done with QTL genomic mappings for %s", taxon_id)
    return
def _process_data(self, source, limit=None):
    """
    Process one of the data files from Coriell.

    We make the assumption that any alleles listed are variants
    (alternates to w.t.)

    Triples: (examples)

    :NIGMSrepository
        a CLO_0000008 #repository
        label : NIGMS Human Genetic Cell Repository
        foaf:page
         https://catalog.coriell.org/0/sections/collections/NIGMS/?SsId=8

    line_id
        a CL_0000057,  #fibroblast line
        derives_from patient_id
        part_of :NIGMSrepository
        RO:model_of OMIM:disease_id

    patient id
        a foaf:person,
        label: "fibroblast from patient 12345 with disease X"
        member_of family_id  #what is the right thing here?
        SIO:race EFO:caucasian  #subclass of EFO:0001799
        in_taxon NCBITaxon:9606
        dc:description Literal(remark)
        RO:has_phenotype OMIM:disease_id
        GENO:has_genotype genotype_id

    family_id
        a owl:NamedIndividual
        foaf:page
         "https://catalog.coriell.org/0/Sections/BrowseCatalog/FamilyTypeSubDetail.aspx?PgId=402&fam=2104&coll=GM"

    genotype_id
        a intrinsic_genotype
        GENO:has_alternate_part allelic_variant_id
        we don't necessarily know much about the genotype,
        other than the allelic variant. also there's the sex here

    pub_id mentions cell_line_id

    :param source: key into ``self.files`` naming the file to process
        (its ``file`` and ``columns`` entries are read)
    :param limit: stop after this many lines (ignored in test mode);
        ``None`` means no limit
    :return: None
    :raises AssertionError: when the file header is missing or does not
        equal the expected column list
    """
    raw = '/'.join((self.rawdir, self.files[source]['file']))

    LOG.info("Processing Data from %s", raw)

    if self.testMode:  # set the graph to build
        graph = self.testgraph
    else:
        graph = self.graph

    family = Family(graph)
    model = Model(graph)

    line_counter = 1
    geno = Genotype(graph)
    diputil = DipperUtil()
    col = self.files[source]['columns']

    def cell(row, name):
        """Return the whitespace-stripped value of column `name`."""
        return row[col.index(name)].strip()

    # omim-style and odd variant ids, like 610661.p.R401X
    variant_regex = re.compile(r'(\d+)\.+(.*)')

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter=',', quotechar=r'"')
        # we can keep a close watch on changing file formats
        fileheader = next(filereader, None)
        if fileheader is None:
            # an empty file has no header to compare against
            LOG.error('Expected %s to have columns: %s', raw, col)
            raise AssertionError('Incomming data headers have changed.')
        fileheader = [c.lower() for c in fileheader]
        if col != fileheader:  # assert
            LOG.error('Expected %s to have columns: %s', raw, col)
            LOG.error('But Found %s to have columns: %s', raw, fileheader)
            raise AssertionError('Incomming data headers have changed.')

        for row in filereader:
            line_counter += 1
            if len(row) != len(col):
                LOG.warning(
                    'Expected %i values but find %i in row %i',
                    len(col), len(row), line_counter)
                continue

            # (catalog_id, description, omim_number, sample_type,
            # cell_line_available, dna_in_stock, dna_ref, gender, age,
            # race, ethnicity, affected, karyotype, relprob, mutation,
            # gene, family_id, collection, url, cat_remark, pubmed_ids,
            # family_member, variant_id, dbsnp_id, species) = row

            # example:
            # GM00003,HURLER SYNDROME,607014,Fibroblast,Yes,No,
            #       ,Female,26 YR,Caucasian,,,,
            # parent,,,39,NIGMS Human Genetic Cell Repository,
            # http://ccr.coriell.org/Sections/Search/Sample_Detail.aspx?Ref=GM00003,
            # 46;XX; clinically normal mother of a child with Hurler syndrome;
            #       proband not in Repository,,
            # 2,,18343,Homo sapiens

            catalog_id = cell(row, 'catalog_id')

            if self.testMode and catalog_id not in self.test_lines:
                # skip rows not in our test lines, when in test mode
                continue

            # ###########    BUILD REQUIRED VARIABLES    ###########

            # Make the cell line ID
            cell_line_id = 'Coriell:' + catalog_id

            # Map the cell/sample type
            cell_type = self.resolve(cell(row, 'sample_type'))
            # on fail cell_type = self.globaltt['cell'] ?

            # Make a cell line label
            collection = cell(row, 'collection')
            line_label = collection.partition(' ')[0] + '-' + catalog_id

            # Map the repository/collection
            repository = self.localtt[collection]

            # patients are uniquely identified by one of:
            # dbsnp id (which is == an individual haplotype)
            # family id + family member (if present) OR
            # probands are usually family member zero
            # cell line id
            # since some patients have >1 cell line derived from them,
            # we must make sure that the genotype is attached to
            # the patient, and can be inferred to the cell line
            # examples of repeated patients are:
            #   famid=1159, member=1; fam=152,member=1

            # Make the patient ID
            patient_id = '_:person'
            fam_id = cell(row, 'fam')
            fammember = cell(row, 'fammember')
            if fam_id != '':
                patient_id = '-'.join((patient_id, fam_id, fammember))
            else:
                # make an anonymous patient
                patient_id = '-'.join((patient_id, catalog_id))

            # properties of the individual patients:  sex, family id,
            # member/relproband, description descriptions are
            # really long and ugly SCREAMING text, so need to clean up
            # the control cases are so odd with this labeling scheme;
            # but we'll deal with it as-is for now.
            description = cell(row, 'description')
            short_desc = (description.split(';')[0]).capitalize()

            gender = cell(row, 'gender').lower()
            affected = cell(row, 'affected')
            relprob = cell(row, 'relprob')

            if affected == '':
                affected = 'unspecified'
            elif affected in self.localtt:
                affected = self.localtt[affected]
            else:
                LOG.warning(
                    'Novel Affected status %s at row: %i of %s',
                    affected, line_counter, raw)
            patient_label = ' '.join((affected, gender, relprob))
            if relprob == 'proband':
                patient_label = ' '.join(
                    (patient_label.strip(), 'with', short_desc))
            else:
                patient_label = ' '.join(
                    (patient_label.strip(), 'of proband with', short_desc))

            # #############    BUILD THE CELL LINE    #############

            # Adding the cell line as a typed individual.
            cell_line_reagent_id = self.globaltt['cell line']

            model.addIndividualToGraph(
                cell_line_id, line_label, cell_line_reagent_id)

            # add the equivalent id == dna_ref
            dna_ref = cell(row, 'dna_ref')
            if dna_ref != '' and dna_ref != catalog_id:
                equiv_cell_line = 'Coriell:' + dna_ref
                # some of the equivalent ids are not defined
                # in the source data; so add them
                model.addIndividualToGraph(
                    equiv_cell_line, None, cell_line_reagent_id)
                model.addSameIndividual(cell_line_id, equiv_cell_line)

            # Cell line derives from patient
            geno.addDerivesFrom(cell_line_id, patient_id)
            geno.addDerivesFrom(cell_line_id, cell_type)

            # Cell line a member of repository
            family.addMember(repository, cell_line_id)

            cat_remark = cell(row, 'cat_remark')
            if cat_remark != '':
                model.addDescription(cell_line_id, cat_remark)

            # Cell age_at_sampling
            # TODO add the age nodes when modeled properly in #78
            # if (age != ''):
            #    this would give a BNode that is an instance of Age.
            #    but i don't know how to connect
            #    the age node to the cell line? we need to ask @mbrush
            #    age_id = '_'+re.sub('\s+','_',age)
            #    gu.addIndividualToGraph(
            #       graph,age_id,age,self.globaltt['age'])
            #    gu.addTriple(
            #       graph,age_id,self.globaltt['has measurement value'],age,
            #       True)

            # #############    BUILD THE PATIENT    #############

            # Add the patient ID as an individual.
            model.addPerson(patient_id, patient_label)
            # TODO map relationship to proband as a class
            # (what ontology?)

            # Add race of patient
            # FIXME: Adjust for subcategories based on ethnicity field
            # EDIT: There are 743 different entries for ethnicity...
            # Too many to map?
            # Add ethnicity as literal in addition to the mapped race?
            # Adjust the ethnicity txt (if using)
            # to initial capitalization to remove ALLCAPS

            # TODO race should go into the individual's background
            # and abstracted out to the Genotype class punting for now.
            # if race != '':
            #    mapped_race = self.resolve(race)
            #    if mapped_race is not None:
            #        gu.addTriple(
            #           g,patient_id,self.globaltt['race'], mapped_race)
            #        model.addSubClass(
            #           mapped_race,self.globaltt['ethnic_group'])

            # #############    BUILD THE FAMILY    #############

            # Add triples for family_id, if present.
            if fam_id != '':
                family_comp_id = 'CoriellFamily:' + fam_id

                family_label = ' '.join(
                    ('Family of proband with', short_desc))

                # Add the family ID as a named individual
                model.addIndividualToGraph(
                    family_comp_id, family_label, self.globaltt['family'])

                # Add the patient as a member of the family
                family.addMemberOf(patient_id, family_comp_id)

            # #############    BUILD THE GENOTYPE   #############

            # the important things to pay attention to here are:
            # karyotype = chr rearrangements  (somatic?)
            # mutation = protein-level mutation as a label,
            # often from omim
            # gene = gene symbol - TODO get id
            # variant_id = omim variant ids (; delimited)
            # dbsnp_id = snp individual ids = full genotype?

            # note GM00633 is a good example of chromosomal variation
            # - do we have enough to capture this?
            # GM00325 has both abnormal karyotype and variation

            # make an assumption that if the taxon is blank,
            # that it is human!
            species = cell(row, 'species')
            if species == '':
                species = 'Homo sapiens'
            taxon = self.resolve(species)

            # if there's a dbSNP id,
            # this is actually the individual's genotype
            genotype_id = None
            genotype_label = None

            dbsnp_id = cell(row, 'dbsnp_id')
            if dbsnp_id != '':
                genotype_id = 'dbSNPIndividual:' + dbsnp_id

            omim_map = {}
            gvc_id = None
            # reset per row so that nothing leaks in from an earlier row
            gvc_label = None
            vl = None

            # some of the karyotypes are encoded
            # with terrible hidden codes. remove them here
            # i've seen a <98> character
            karyotype = diputil.remove_control_characters(
                cell(row, 'karyotype'))
            karyotype_id = None
            if karyotype.strip() != '':
                karyotype_id = '_:' + re.sub(
                    'MONARCH:', '', self.make_id(karyotype))
                # add karyotype as karyotype_variation_complement
                model.addIndividualToGraph(
                    karyotype_id, karyotype,
                    self.globaltt['karyotype_variation_complement'])
                # TODO break down the karyotype into parts
                # and map into GENO. depends on #77

                # place the karyotype in a location(s).
                karyo_chrs = self._get_affected_chromosomes_from_karyotype(
                    karyotype)
                for chrom in karyo_chrs:
                    chr_id = makeChromID(chrom, taxon, 'CHR')
                    # add an anonymous sequence feature,
                    # each located on chr
                    karyotype_feature_id = '-'.join((karyotype_id, chrom))
                    karyotype_feature_label = \
                        'some karyotype alteration on chr' + str(chrom)
                    feat = Feature(
                        graph, karyotype_feature_id,
                        karyotype_feature_label,
                        self.globaltt['sequence_alteration'])
                    feat.addFeatureStartLocation(None, chr_id)
                    feat.addFeatureToGraph()
                    geno.addParts(
                        karyotype_feature_id, karyotype_id,
                        self.globaltt['has_variant_part'])

            gene = cell(row, 'gene')
            mutation = cell(row, 'mutation')
            if gene != '':
                vl = gene + '(' + mutation + ')'

            # fix the variant_id so it's always in the same order
            variant_id = cell(row, 'variant_id')
            variant_id = ';'.join(sorted(set(variant_id.split(';'))))

            if karyotype.strip() != '' \
                    and not self._is_normal_karyotype(karyotype):
                gvc_id = karyotype_id
                if variant_id != '':
                    gvc_id = '_:' + variant_id.replace(';', '-') + '-' \
                        + re.sub(r'\w*:', '', karyotype_id)
                if mutation.strip() != '':
                    # vl is absent when no gene symbol was given
                    gvc_label = '; '.join(
                        part for part in (vl, karyotype) if part)
                else:
                    gvc_label = karyotype
            elif variant_id.strip() != '':
                gvc_id = '_:' + variant_id.replace(';', '-')
                gvc_label = vl
            else:
                # wildtype?
                pass

            # add the karyotype to the gvc.
            # use reference if normal karyotype
            karyo_rel = self.globaltt['has_variant_part']
            if self._is_normal_karyotype(karyotype):
                karyo_rel = self.globaltt['has_reference_part']
            if karyotype_id is not None \
                    and not self._is_normal_karyotype(karyotype) \
                    and gvc_id is not None and karyotype_id != gvc_id:
                geno.addParts(karyotype_id, gvc_id, karyo_rel)

            if variant_id.strip() != '':
                # split the variants & add them as part of the genotype
                # we don't necessarily know their zygosity,
                # just that they are part of the genotype variant ids
                # are from OMIM, so prefix as such we assume that the
                # sequence alts will be defined in OMIM not here
                # TODO sort the variant_id list, if the omim prefix is
                # the same, then assume it's the locus make a hashmap
                # of the omim id to variant id list;
                # then build the genotype hashmap is also useful for
                # removing the "genes" from the list of "phenotypes"

                # will hold gene/locus id to variant list
                omim_map = {}

                for var in variant_id.split(';'):
                    mch = variant_regex.match(var.strip())
                    if mch is None:
                        # never fall back on the locus/variant of a
                        # previous id; just report and move on
                        LOG.warning(
                            'Unrecognized variant id "%s" in row %i of %s',
                            var, line_counter, raw)
                        continue
                    locus_num, var_num = mch.groups()
                    omim_map.setdefault(locus_num, []).append(var_num)

                for omim in omim_map:
                    # gene_id = 'OMIM:' + omim  # TODO unused
                    vslc_id = '_:' + '-'.join(
                        [omim + '.' + a for a in omim_map.get(omim)])
                    vslc_label = vl
                    # we don't really know the zygosity of
                    # the alleles at all.
                    # so the vslcs are just a pot of them
                    model.addIndividualToGraph(
                        vslc_id, vslc_label,
                        self.globaltt['variant single locus complement'])
                    for var in omim_map.get(omim):
                        # this is actually a sequence alt
                        allele1_id = 'OMIM:' + omim + '.' + var
                        geno.addSequenceAlteration(allele1_id, None)

                        # assume that the sa -> var_loc -> gene
                        # is taken care of in OMIM
                        geno.addPartsToVSLC(
                            vslc_id, allele1_id, None,
                            self.globaltt['indeterminate'],
                            self.globaltt['has_variant_part'])

                    if vslc_id != gvc_id:
                        geno.addVSLCtoParent(vslc_id, gvc_id)

            if affected == 'unaffected':
                # let's just say that this person is wildtype
                model.addType(patient_id, self.globaltt['wildtype'])
            elif genotype_id is None:
                # make an anonymous genotype id (aka blank node)
                genotype_id = '_:geno' + catalog_id.strip()

            # add the gvc
            if gvc_id is not None:
                model.addIndividualToGraph(
                    gvc_id, gvc_label,
                    self.globaltt['genomic_variation_complement'])

                # add the gvc to the genotype
                if genotype_id is not None:
                    if affected == 'unaffected':
                        rel = self.globaltt['has_reference_part']
                    else:
                        rel = self.globaltt['has_variant_part']
                    geno.addParts(gvc_id, genotype_id, rel)

                if karyotype_id is not None \
                        and self._is_normal_karyotype(karyotype):
                    if gvc_label is not None and gvc_label != '':
                        genotype_label = '; '.join((gvc_label, karyotype))
                    elif karyotype is not None:
                        genotype_label = karyotype
                    if genotype_id is None:
                        genotype_id = karyotype_id
                    else:
                        geno.addParts(
                            karyotype_id, genotype_id,
                            self.globaltt['has_reference_part'])
                else:
                    genotype_label = gvc_label
                # use the catalog id as the background;
                # tolerate a missing label instead of raising TypeError
                background = '[' + catalog_id.strip() + ']'
                genotype_label = ' '.join(
                    part for part in (genotype_label, background) if part)

            if genotype_id is not None and gvc_id is not None:
                # only add the genotype if it has some parts
                geno.addGenotype(
                    genotype_id, genotype_label,
                    self.globaltt['intrinsic_genotype'])
                geno.addTaxon(taxon, genotype_id)
                # add that the patient has the genotype
                # TODO check if the genotype belongs to
                # the cell line or to the patient
                graph.addTriple(
                    patient_id, self.globaltt['has_genotype'], genotype_id)
            else:
                geno.addTaxon(taxon, patient_id)

            # TODO: Add sex/gender  (as part of the karyotype?)
            # = row[col.index('')].strip()

            # #############    DEAL WITH THE DISEASES   #############
            omim_num = cell(row, 'omim_num')

            # we associate the disease to the patient
            if affected == 'affected' and omim_num != '':
                for d in omim_num.split(';'):
                    # strip first so the lookup in omim_map and the
                    # curie are built from the same text
                    d = d.strip()
                    if d == '':
                        continue
                    # if the omim number is in omim_map,
                    # then it is a gene not a pheno

                    # TEC - another place to use the mimTitle omim
                    # classifier omia & genereviews are using
                    if d not in omim_map:
                        disease_id = 'OMIM:' + d
                        # assume the label is taken care of in OMIM
                        model.addClassToGraph(disease_id, None)

                        # add the association:
                        #   the patient has the disease
                        assoc = G2PAssoc(
                            graph, self.name, patient_id, disease_id)
                        assoc.add_association_to_graph()

                        # this line is a model of this disease
                        # TODO abstract out model into
                        # it's own association class?
                        graph.addTriple(
                            cell_line_id, self.globaltt['is model of'],
                            disease_id)
                    else:
                        LOG.info('drop gene %s from disease list', d)

            # #############    ADD PUBLICATIONS   #############
            pubmed_ids = cell(row, 'pubmed_ids')
            if pubmed_ids != '':
                for s in pubmed_ids.split(';'):
                    pubmed_id = 'PMID:' + s.strip()
                    ref = Reference(graph, pubmed_id)
                    ref.setType(self.globaltt['journal article'])
                    ref.addRefToGraph()
                    graph.addTriple(
                        pubmed_id, self.globaltt['mentions'], cell_line_id)

            if not self.testMode and (
                    limit is not None and line_counter > limit):
                break
    return
def _get_gene_info(self, limit):
    """
    Loop through the gzipped gene_info file and add each gene to the graph.

    Genes are created as classes (or as individuals when their type
    resolves to ``sequence_feature``), typed with the resolved
    ``type_of_gene``. Their symbol becomes the label; the full name,
    synonyms and other designations become synonyms; dbXrefs are handed
    to ``self._add_gene_equivalencies``. The gene is made a subsequence
    of its chromosome band when the map location looks like a simple
    band, otherwise of the chromosome.

    ``self.class_or_indiv`` is filled for every accepted row, also for
    rows past ``limit``; only the graph additions stop at the limit.

    :param limit: add at most this many lines' worth of genes to the
        graph (ignored in test mode); ``None`` means no limit
    :return: None
    """
    src_key = 'gene_info'
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    geno = Genotype(graph)
    model = Model(graph)

    # not unzipping the file
    LOG.info("Processing 'Gene Info' records")
    line_counter = 0
    gene_info = '/'.join((self.rawdir, self.files[src_key]['file']))
    LOG.info("FILE: %s", gene_info)
    LOG.info('Add taxa and genome classes for those in our filter')

    band_regex = re.compile(r'[0-9A-Z]+[pq](\d+)?(\.\d+)?$')
    for tax_num in self.tax_ids:
        tax_curie = ':'.join(('NCBITaxon', tax_num))
        # tax label can get added elsewhere
        geno.addGenome(tax_curie, tax_num)
        # label added elsewhere
        model.addClassToGraph(tax_curie, None)

    col = self.files[src_key]['columns']
    # look the column positions up once instead of once per field per row
    # unused columns: LocusTag, Symbol_from_nomenclature_authority,
    # Nomenclature_status, Modification_date, Feature_type
    i_tax = col.index('tax_id')
    i_gene = col.index('GeneID')
    i_symbol = col.index('Symbol')
    i_synonyms = col.index('Synonyms')
    i_dbxrefs = col.index('dbXrefs')
    i_chrom = col.index('chromosome')
    i_map_loc = col.index('map_location')
    i_desc = col.index('description')
    i_gtype = col.index('type_of_gene')
    i_name = col.index('Full_name_from_nomenclature_authority')
    i_other = col.index('Other_designations')

    LOG.info('Begin reading & parsing')
    with gzip.open(gene_info, 'rb') as tsv:
        row = tsv.readline().decode().strip().split('\t')
        row[0] = row[0][1:]  # strip comment char
        if not self.check_fileheader(col, row):
            # a header mismatch is tolerated here; parsing continues
            pass

        for line in tsv:
            # decode before testing: indexing bytes yields an int,
            # which never equals the string '#'
            line = line.decode().strip()
            line_counter += 1
            if not line or line.startswith('#'):
                # skip blank lines and comments
                continue
            row = line.split('\t')

            # ##set filter=None in init if you don't want to have a filter
            # if self.id_filter is not None:
            #     if ((self.id_filter == 'taxids' and \
            #          (tax_num not in self.tax_ids))
            #             or (self.id_filter == 'geneids' and \
            #                 (int(gene_num) not in self.gene_ids))):
            #         continue
            # #### end filter

            tax_num = row[i_tax]
            gene_num = row[i_gene]
            symbol = row[i_symbol]
            synonyms = row[i_synonyms].strip()
            dbxrefs = row[i_dbxrefs].strip()
            chrom = row[i_chrom].strip()
            map_loc = row[i_map_loc].strip()
            desc = row[i_desc]
            gtype = row[i_gtype].strip()
            name = row[i_name]
            other_designations = row[i_other].strip()

            if self.test_mode and int(gene_num) not in self.gene_ids:
                continue
            if not self.test_mode and tax_num not in self.tax_ids:
                continue

            tax_curie = ':'.join(('NCBITaxon', tax_num))
            gene_id = ':'.join(('NCBIGene', gene_num))
            gene_type_id = self.resolve(gtype)

            if symbol == 'NEWENTRY':
                label = None
            else:
                label = symbol

            # sequence feature, not a gene
            if gene_type_id == self.globaltt['sequence_feature']:
                self.class_or_indiv[gene_id] = 'I'
            else:
                self.class_or_indiv[gene_id] = 'C'

            # past the limit we keep reading (not break) so that
            # class_or_indiv above is still recorded for every gene
            if not self.test_mode \
                    and limit is not None and line_counter > limit:
                continue

            if self.class_or_indiv[gene_id] == 'C':
                model.addClassToGraph(gene_id, label, gene_type_id, desc)
                # NCBI will be the default leader (for non mods),
                # so we will not add the leader designation here.
            else:
                model.addIndividualToGraph(
                    gene_id, label, gene_type_id, desc)
                # in this case, they aren't genes.
                # so we want someone else to be the leader

            if name != '-':
                model.addSynonym(gene_id, name)

            if synonyms != '-':
                for syn in synonyms.split('|'):
                    syn = syn.strip()
                    # unknown curies may occur here
                    if syn[:12] == 'AnimalQTLdb:' and \
                            tax_curie in self.informal_species:
                        syn = self.informal_species[
                            tax_curie] + 'QTL:' + syn[12:]
                        LOG.info('AnimalQTLdb: CHANGED to: %s', syn)
                    model.addSynonym(
                        gene_id, syn, model.globaltt['has_related_synonym'])

            if other_designations != '-':
                for syn in other_designations.split('|'):
                    model.addSynonym(
                        gene_id, syn.strip(),
                        model.globaltt['has_related_synonym'])

            if dbxrefs != '-':
                self._add_gene_equivalencies(dbxrefs, gene_id, tax_curie)

            # edge cases of id | symbol | chr | map_loc:
            # 263     AMD1P2    X|Y  with   Xq28 and Yq12
            # 438     ASMT      X|Y  with   Xp22.3 or Yp11.3    # in PAR
            # no idea why there's two bands listed - possibly 2 assemblies
            # 419     ART3      4    with   4q21.1|4p15.1-p14
            # 28227   PPP2R3B   X|Y  Xp22.33; Yp11.3            # in PAR
            # this is of "unknown" type == susceptibility
            # 619538  OMS     10|19|3 10q26.3;19q13.42-q13.43;3p25.3
            # unlocated scaffold
            # 101928066       LOC101928066    1|Un    -
            # mouse --> 2C3
            # 11435   Chrna1  2       2 C3|2 43.76 cM
            # mouse --> 11B1.1
            # 11548   Adra1b  11      11 B1.1|11 25.81 cM
            # 11717   Ampd3   7       7 57.85 cM|7 E2-E3        # mouse
            # 14421   B4galnt1        10      10 D3|10 74.5 cM  # mouse
            # 323212  wu:fb92e12      19|20   -                 # fish
            # 323368  ints10  6|18    -                         # fish
            # 323666  wu:fc06e02      11|23   -                 # fish

            # feel that the chr placement can't be trusted in this table
            # when there is > 1 listed
            # with the exception of human X|Y,
            # we will only take those that align to one chr

            # FIXME remove the chr mapping below
            # when we pull in the genomic coords
            if chrom != '-' and chrom != '':
                if '|' in chrom and chrom not in ['X|Y', 'X; Y']:
                    # means that there's uncertainty in the mapping.
                    # so skip it
                    # TODO we'll need to figure out how to deal with
                    # >1 loc mapping
                    LOG.info(
                        '%s is non-uniquely mapped to %s. Skipping for now.',
                        gene_id, chrom)
                    # NOTE: this also skips adding the taxon for this gene
                    continue
                    # X|Y Xp22.33;Yp11.3

                # if(not re.match(
                #        r'(\d+|(MT)|[XY]|(Un)$',str(chr).strip())):
                #    print('odd chr=',str(chr))
                if chrom == 'X; Y':
                    chrom = 'X|Y'  # rewrite the PAR regions for processing

                # do this in a loop to allow PAR regions like X|Y
                for chromosome in chrom.split('|'):
                    # assume that the chromosome label is added elsewhere
                    geno.addChromosomeClass(chromosome, tax_curie, None)
                    mychrom = makeChromID(chromosome, tax_num, 'CHR')
                    # temporarily use taxnum for the disambiguating label
                    mychrom_syn = makeChromLabel(chromosome, tax_num)
                    model.addSynonym(mychrom, mychrom_syn)

                    if band_regex.match(map_loc) is not None:
                        # if tax_num != '9606':
                        #     continue
                        # this matches the regular kind of chrs,
                        # so make that kind of band
                        # not sure why this matches?
                        #   chrX|Y or 10090chr12|Un"
                        # TODO we probably need a different regex
                        # per organism
                        # the maploc_id already has the numeric chromosome
                        # in it, strip it first (escaped so that the name
                        # is always matched literally)
                        bid = re.sub(
                            r'^' + re.escape(chromosome), '', map_loc)
                        # the generic location (no coordinates)
                        maploc_id = makeChromID(
                            chromosome + bid, tax_num, 'CHR')
                        # print(map_loc,'-->',bid,'-->',maploc_id)
                        # Assume its type will be added elsewhere
                        band = Feature(graph, maploc_id, None, None)
                        band.addFeatureToGraph()
                        # add the band as the containing feature
                        graph.addTriple(
                            gene_id, self.globaltt['is subsequence of'],
                            maploc_id)
                    else:
                        # TODO handle these cases: examples are:
                        # 15q11-q22,Xp21.2-p11.23,15q22-qter,10q11.1-q24,
                        # 12p13.3-p13.2|12p13-p12,1p13.3|1p21.3-p13.1,
                        # 12cen-q21,22q13.3|22q13.3
                        LOG.debug(
                            'not regular band pattern for %s: %s',
                            gene_id, map_loc)
                        # add the gene as a subsequence of the chromosome
                        graph.addTriple(
                            gene_id, self.globaltt['is subsequence of'],
                            mychrom)

            geno.addTaxon(tax_curie, gene_id)
def _process_genes(self, limit=None):
    """
    Load HGNC gene records from the tab-separated "genes" file into the graph.

    For each data row this:
      * adds the gene as a class when its locus type resolves to a term,
      * deprecates 'withdrawn' genes and calls makeLeader() on all others,
      * adds NCBIGene / ENSEMBL equivalents, and the OMIM equivalent when
        the OMIM id is single-valued and not a disease,
      * adds the human taxon and 'is_about' triples from each PubMed id,
      * makes the gene a subsequence of its cytogenetic band, or of the
        whole chromosome when the location carries no band.

    :param limit: outside test mode, stop once the line counter (which
        also counts the header line) exceeds this value; None reads all
    :return: None
    """
    if self.testMode:
        graph = self.testgraph
    else:
        graph = self.graph

    geno = Genotype(graph)
    model = Model(graph)
    raw = '/'.join((self.rawdir, self.files['genes']['file']))
    line_counter = 0

    # A chromosome name followed by an arm letter OR the end of the string.
    # The previous pattern ended in `[pq$]`; inside a character class `$`
    # is a literal dollar sign, so chromosome-only locations never matched
    # and the chromosome-level fallback below could not be reached.
    chr_pattern = re.compile(r'(\d+|X|Y|Z|W|MT)(?:[pq]|$)')
    band_pattern = re.compile(r'([pq][A-H\d]?\d?(?:\.\d+)?)')

    logger.info("Processing HGNC genes")

    with open(raw, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')

        # To list the column names of the upstream file:
        # curl -s ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt | head -1 | tr '\t' '\n' | grep -n .
        for row in filereader:
            # the unpacking doubles as a check on the expected column count
            (hgnc_id, symbol, name, locus_group, locus_type, status,
             location, location_sortable, alias_symbol, alias_name,
             prev_symbol, prev_name, gene_family, gene_family_id,
             date_approved_reserved, date_symbol_changed,
             date_name_changed, date_modified, entrez_id,
             ensembl_gene_id, vega_id, ucsc_id, ena, refseq_accession,
             ccds_id, uniprot_ids, pubmed_id, mgd_id, rgd_id, lsdb,
             cosmic, omim_id, mirbase, homeodb, snornabase,
             bioparadigms_slc, orphanet, pseudogene_org, horde_id,
             merops, imgt, iuphar, kznf_gene_catalog, mamit_trnadb, cd,
             lncrnadb, enzyme_id, intermediate_filament_db,
             rna_central_ids) = row

            line_counter += 1

            # skip header
            if line_counter <= 1:
                continue

            if self.testMode and entrez_id != '' and \
                    int(entrez_id) not in self.gene_ids:
                continue

            if name == '':
                name = None

            # resolve() hands back its input unchanged when there is no
            # mapping, which is how an unknown locus type is detected here
            gene_type_id = self.resolve(locus_type, False)
            if gene_type_id != locus_type:
                model.addClassToGraph(hgnc_id, symbol, gene_type_id, name)

            if locus_type == 'withdrawn':
                model.addDeprecatedClass(hgnc_id)
            else:
                model.makeLeader(hgnc_id)

            if entrez_id != '':
                model.addEquivalentClass(hgnc_id, 'NCBIGene:' + entrez_id)
            if ensembl_gene_id != '':
                model.addEquivalentClass(
                    hgnc_id, 'ENSEMBL:' + ensembl_gene_id)
            # multi-valued (pipe-separated) OMIM ids are skipped
            if omim_id != '' and "|" not in omim_id:
                omim_curie = 'OMIM:' + omim_id
                if not DipperUtil.is_omim_disease(omim_curie):
                    model.addEquivalentClass(hgnc_id, omim_curie)

            geno.addTaxon(self.hs_txid, hgnc_id)

            # add pubs as "is about"
            if pubmed_id != '':
                for pmid in re.split(r'\|', pubmed_id.strip()):
                    if str(pmid) != '':
                        graph.addTriple(
                            'PMID:' + str(pmid.strip()),
                            self.globaltt['is_about'], hgnc_id)

            # add chr location
            # sometimes two are listed, like: 10p11.2 or 17q25
            # -- there are only 2 of these FRA10A and MPFD
            # sometimes listed like "1 not on reference assembly"
            # sometimes listed like 10q24.1-q24.3
            # sometimes like 11q11 alternate reference locus
            chr_match = chr_pattern.match(location)
            if chr_match is not None:
                chrom = chr_match.group(1)
                chrom_id = makeChromID(chrom, self.hs_txid, 'CHR')
                band_match = band_pattern.search(location)
                feat = Feature(graph, hgnc_id, None, None)
                if band_match is not None:
                    band = chrom + band_match.group(1)
                    # add the chr band as the parent to this gene
                    # as a feature but assume that the band is created
                    # as a class with properties elsewhere in Monochrom
                    band_id = makeChromID(band, self.hs_txid, 'CHR')
                    model.addClassToGraph(band_id, None)
                    feat.addSubsequenceOfFeature(band_id)
                else:
                    # no band given: attach the gene to the chromosome
                    model.addClassToGraph(chrom_id, None)
                    feat.addSubsequenceOfFeature(chrom_id)

            if not self.testMode and limit is not None and \
                    line_counter > limit:
                break

    return
def _process_qtls_genetic_location(
        self, raw, src_key, txid, common_name, limit=None):
    """
    Parse one species' QTL genetic-map (cM) file and add it to the graph.

    For every accepted row this adds:
      * the QTL feature, with start/stop positions (typed FuzzyPosition)
        on the chromosome of a per-species "genetic map" build,
      * a dbSNP individual cross-referenced to the QTL when the peak
        marker starts with 'rs',
      * the NCBI gene as affected locus when a usable gene id is present,
      * the trait class and the publication reference,
      * an 'is marker for' association from the QTL (and from the dbSNP
        marker, if any) to the trait, with evidence, source and score.

    :param raw: path of the tab-separated input file (no header row)
    :param src_key: key into self.files; its entry supplies the 'curie'
        prefix for trait ids and the expected 'columns' list
    :param txid: taxon number as a string, appended to 'NCBITaxon:'
    :param common_name: species name used to build the QTL ids and the
        genetic-map build id/label
    :param limit: outside test mode, stop once the reader's line number
        exceeds this value; None reads everything
    :return: None
    """
    aql_curie = self.files[src_key]['curie']
    common_name = common_name.strip()
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    geno = Genotype(graph)
    model = Model(graph)
    eco_id = self.globaltt['quantitative trait analysis evidence']
    taxon_curie = 'NCBITaxon:' + txid

    LOG.info("Processing genetic location for %s from %s", taxon_curie, raw)
    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        # no header in these files, so no header checking
        col = self.files[src_key]['columns']
        col_len = len(col)
        for row in reader:
            # only rows with non-empty surplus columns are rejected
            if len(row) != col_len and ''.join(row[col_len:]) != '':
                LOG.warning(
                    "Problem parsing %s line %i containing: \n%s\n"
                    "got %i cols but expected %i",
                    raw, reader.line_num, row, len(row), col_len)
                continue

            # Only the columns used below are read; the remaining columns
            # named in `col` are ignored by this method.
            qtl_id = row[col.index('QTL_ID')].strip()
            qtl_symbol = row[col.index('QTL_symbol')].strip()
            trait_name = row[col.index('Trait_name')].strip()
            chromosome = row[col.index('Chromosome')].strip()
            position_cm = row[col.index('Position_cm')].strip()
            range_cm = row[col.index('range_cm')].strip()
            peak_mark = row[col.index('Peak_Mark')].strip()
            p_values = row[col.index('P_values')].strip()
            trait_id = row[col.index('TRAIT_ID')].strip()
            pubmed_id = row[col.index('PUBMED_ID')].strip()
            gene_id = row[col.index('geneID')].strip()
            gene_id_src = row[col.index('geneIDsrc')].strip()

            if self.test_mode and int(qtl_id) not in self.test_ids:
                continue

            qtl_id = common_name + 'QTL:' + qtl_id.strip()
            trait_id = ':'.join((aql_curie, trait_id.strip()))

            # Add QTL to graph
            feature = Feature(
                graph, qtl_id, qtl_symbol, self.globaltt['QTL'])
            feature.addTaxonToFeature(taxon_curie)

            # deal with the chromosome
            chrom_id = makeChromID(chromosome, taxon_curie, 'CHR')

            # add a version of the chromosome which is defined as
            # the genetic map
            build_id = 'MONARCH:' + common_name + '-linkage'
            build_label = common_name + ' genetic map'
            geno.addReferenceGenome(build_id, build_label, taxon_curie)
            chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH')
            geno.addChromosomeInstance(
                chromosome, build_id, build_label, chrom_id)

            start = stop = None
            # range_cm sometimes ends in "(Mb)" (i.e pig 2016 Nov);
            # keep only the part before the parenthesis
            range_cm = re.split(r'\(', range_cm)[0]

            if re.search(r'[0-9].*-.*[0-9]', range_cm):
                range_parts = re.split(r'-', range_cm)
                # check for poorly formed ranges
                if len(range_parts) == 2 and \
                        range_parts[0] != '' and range_parts[1] != '':
                    (start, stop) = [
                        int(float(part.strip())) for part in range_parts]
                else:
                    LOG.info(
                        "A cM range we can't handle for QTL %s: %s",
                        qtl_id, range_cm)
            elif position_cm != '':
                match = re.match(r'([0-9]*\.[0-9]*)', position_cm)
                if match is not None:
                    position_cm = match.group()
                    start = stop = int(float(position_cm))

            # FIXME remove conversion to int for start/stop
            # when schema can handle floats
            # add in the genetic location based on the range
            feature.addFeatureStartLocation(
                start, chrom_in_build_id, None,
                [self.globaltt['FuzzyPosition']])
            feature.addFeatureEndLocation(
                stop, chrom_in_build_id, None,
                [self.globaltt['FuzzyPosition']])
            feature.addFeatureToGraph()

            # sometimes there's a peak marker, like a rsid.
            # we want to add that as a variant of the gene,
            # and xref it to the qtl.
            dbsnp_id = None
            if peak_mark != '' and peak_mark != '.' and \
                    re.match(r'rs', peak_mark.strip()):
                dbsnp_id = 'dbSNP:' + peak_mark.strip()
                model.addIndividualToGraph(
                    dbsnp_id, None, self.globaltt['sequence_alteration'])
                model.addXref(
                    qtl_id, dbsnp_id,
                    xref_category=blv.terms['SequenceVariant'])

            gene_id = gene_id.replace('uncharacterized ', '').strip()
            gene_id = gene_id.strip(',')  # for "100157483," in pig_QTLdata.txt
            if gene_id != '' and gene_id != '.' and \
                    re.fullmatch(r'[^ ]*', gene_id) is not None:
                # we assume if no src is provided and gene_id is an integer,
                # then it is an NCBI gene ... (okay, lets crank that back a notch)
                if gene_id_src == '' and gene_id.isdigit() and \
                        gene_id in self.gene_info:
                    gene_id_src = 'NCBIgene'
                elif gene_id_src == '' and gene_id.isdigit():
                    LOG.warning(
                        'Cold & Prickely saying %s is a NCBI gene for %s',
                        gene_id, common_name)
                    gene_id_src = 'NCBIgene'
                elif gene_id_src == '':
                    LOG.error(
                        ' "%s" is a NOT NCBI gene for %s',
                        gene_id, common_name)
                    gene_id_src = None

                if gene_id_src == 'NCBIgene':
                    gene_id = 'NCBIGene:' + gene_id
                    # we will expect that these will get labels elsewhere
                    geno.addGene(gene_id, None)
                    # FIXME what is the right relationship here?
                    geno.addAffectedLocus(qtl_id, gene_id)

                    if dbsnp_id is not None:
                        # add the rsid as a seq alt of the gene_id as a bnode
                        vl_id = self.make_id(
                            re.sub(r':', '', gene_id) + '-' +
                            peak_mark.strip(), '_')
                        geno.addSequenceAlterationToVariantLocus(
                            dbsnp_id, vl_id)
                        geno.addAffectedLocus(vl_id, gene_id)

            # add the trait
            model.addClassToGraph(
                trait_id, trait_name,
                class_category=blv.terms['PhenotypicFeature'])

            # Add publication.  pub_id is reset for every row so that a row
            # without a publication can never inherit the previous row's id.
            pub_id = None
            reference = None
            if re.match(r'ISU.*', pubmed_id):
                pub_id = 'AQTLPub:' + pubmed_id.strip()
                reference = Reference(graph, pub_id)
            elif pubmed_id != '':
                pub_id = 'PMID:' + pubmed_id.strip()
                reference = Reference(
                    graph, pub_id, self.globaltt['journal article'])

            if reference is not None:
                reference.addRefToGraph()

            # Parse the p-value once for both associations.  A leading '<'
            # is dropped and a decimal comma (international notation) is
            # turned into a point.  float() is used rather than
            # str.isnumeric(), which is False for any string containing
            # '.' and so rejected every decimal p-value.
            score = None
            if p_values != '':
                scr = p_values.replace('<', '').replace(',', '.')
                try:
                    score = float(scr)
                except ValueError:
                    score = None

            # make the association to the QTL and, if one was found,
            # the same association to the dbSNP peak marker
            for subject_id in (qtl_id, dbsnp_id):
                if subject_id is None:
                    continue
                assoc = G2PAssoc(
                    graph, self.name, subject_id, trait_id,
                    self.globaltt['is marker for'])
                assoc.add_evidence(eco_id)
                if pub_id is not None:
                    assoc.add_source(pub_id)
                # TODO add exp_id as evidence
                if score is not None:
                    assoc.set_score(score)  # todo add score type
                # TODO add LOD score?
                assoc.add_association_to_graph()

            # off by one - the following actually gives us (limit + 1) records
            if not self.test_mode and limit is not None and \
                    reader.line_num > limit:
                break

    LOG.info("Done with QTL genetic info")
def process_feature_loc(self, limit):
    """
    Parse the gzipped, tab-separated feature-location file and add the
    selected features, with their chromosome positions, to the graph.

    Only rows whose type label is in the accepted list are processed.
    Each kept feature gets an identifier (from the ID, variation or Name
    attribute), an optional label, synonyms and descriptions, and a
    start/end location on a chromosome of the build
    'WormBase:' + self.version_num.

    :param limit: outside test mode, stop once more than this many
        accepted rows have been counted; None reads everything
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['feature_loc']['file']))

    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    gu = GraphUtils(curie_map.get())
    logger.info("Processing Feature location and attributes")
    line_counter = 0
    geno = Genotype(g)
    # strain -> set of variant ids; filled below but neither returned nor
    # read again inside this method
    strain_to_variant_map = {}
    build_num = self.version_num
    build_id = 'WormBase:'+build_num

    # note biological_regions include balancers
    # other options here: promoter, regulatory_region, reagent
    accepted_types = (
        'gene', 'point_mutation', 'deletion', 'RNAi_reagent',
        'duplication', 'enhancer', 'binding_site',
        'biological_region', 'complex_substitution', 'substitution',
        'insertion', 'inverted_repeat')

    with gzip.open(raw, 'rb') as csvfile:
        filereader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""),
            delimiter='\t', quotechar='\"')
        for row in filereader:
            # skip comment lines
            if re.match(r'\#', ''.join(row)):
                continue
            (chrom, db, feature_type_label, start, end, score, strand,
             phase, attributes) = row

            # example rows:
            # I interpolated_pmap_position gene 1 559768 . . . ID=gmap:spe-13;gmap=spe-13;status=uncloned;Note=-21.3602 cM (+/- 1.84 cM)
            # I WormBase gene 3747 3909 . - . ID=Gene:WBGene00023193;Name=WBGene00023193;interpolated_map_position=-21.9064;sequence_name=Y74C9A.6;biotype=snoRNA;Alias=Y74C9A.6
            # I absolute_pmap_position gene 4119 10230 . . . ID=gmap:homt-1;gmap=homt-1;status=cloned;Note=-21.8252 cM (+/- 0.00 cM)

            if feature_type_label not in accepted_types:
                continue

            line_counter += 1

            # Split "key=value;key=value" on the FIRST '=' only and ignore
            # items without one, so a value containing '=' or a stray ';'
            # no longer raises while building the dict.
            attribute_dict = {}
            if attributes != '':
                for item in re.sub(r'"', '', attributes).split(";"):
                    key, sep, value = item.partition("=")
                    if sep:
                        attribute_dict[key] = value

            fid = flabel = desc = None
            if 'ID' in attribute_dict:
                fid = attribute_dict.get('ID')
                if re.search(r'WB(Gene|Var|sf)', fid):
                    fid = re.sub(r'^\w+:WB', 'WormBase:WB', fid)
                elif re.match(r'(gmap|landmark)', fid):
                    continue
                else:
                    logger.info('other identifier %s', fid)
                    fid = None
            elif 'variation' in attribute_dict:
                # e.g. variation=WBVar00604246;public_name=gk320600;strain=VC20384;substitution=C/T
                fid = 'WormBase:'+attribute_dict.get('variation')
                flabel = attribute_dict.get('public_name')
                sub = attribute_dict.get('substitution')
                ins = attribute_dict.get('insertion')
                desc = ''
                if sub is not None:
                    desc = 'substitution='+sub
                if ins is not None:
                    desc = 'insertion='+ins

                # keep track of the strains with this variation,
                # for later processing
                strain_list = attribute_dict.get('strain')
                if strain_list is not None:
                    for strain in re.split(r',', strain_list):
                        strain_to_variant_map.setdefault(
                            strain.strip(), set()).add(fid)

            # TODO finish the RNAi binding location: for RNAi_reagent rows
            # the Target attribute (e.g. "WBRNAi00096030 1 4942") is not
            # used yet

            name = attribute_dict.get('Name')
            polymorphism = attribute_dict.get('polymorphism')

            if fid is None:
                if name is not None and re.match(r'WBsf', name):
                    fid = 'WormBase:'+name
                    name = None
                else:
                    continue

            if self.testMode \
                    and re.sub(r'WormBase:', '', fid) \
                    not in self.test_ids['gene']+self.test_ids['allele']:
                continue

            # these really aren't that interesting
            if polymorphism is not None:
                continue

            # Plain substring test: the name is data, not a regular
            # expression, so it must not be handed to re.search().
            if name is not None and name not in fid:
                if flabel is None:
                    flabel = name
                else:
                    gu.addSynonym(g, fid, name)

            if desc is not None:
                gu.addDescription(g, fid, desc)

            alias = attribute_dict.get('Alias')
            biotype = attribute_dict.get('biotype')
            note = attribute_dict.get('Note')
            other_name = attribute_dict.get('other_name')
            # add each present value itself (previously `other_name` was
            # added on every pass, so Alias was never recorded)
            for synonym in [alias, other_name]:
                if synonym is not None:
                    gu.addSynonym(g, fid, synonym)

            ftype = self.get_feature_type_by_class_and_biotype(
                feature_type_label, biotype)

            chr_id = makeChromID(chrom, build_id, 'CHR')
            geno.addChromosomeInstance(chrom, build_id, build_num)

            f = Feature(fid, flabel, ftype)
            f.addFeatureStartLocation(start, chr_id, strand)
            # the end coordinate comes from the `end` column
            # (previously `start` was written for both ends)
            f.addFeatureEndLocation(end, chr_id, strand)

            feature_is_class = feature_type_label == 'gene'

            f.addFeatureToGraph(g, True, None, feature_is_class)

            if note is not None:
                gu.addDescription(g, fid, note)

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

            # RNAi reagents:
            # I RNAi_primary RNAi_reagent 4184 10232 . + . Target=WBRNAi00001601 1 6049 +;laboratory=YK;history_name=SA:yk326e10
            # I RNAi_primary RNAi_reagent 4223 10147 . + . Target=WBRNAi00033465 1 5925 +;laboratory=SV;history_name=MV_SV:mv_G_YK5052
            # I RNAi_primary RNAi_reagent 5693 9391 . + . Target=WBRNAi00066135 1 3699 +;laboratory=CH

            # TODO TF binding sites and network:
            # I TF_binding_site_region TF_binding_site 1861 2048 . + . Name=WBsf292777;tf_id=WBTranscriptionFactor000025;tf_name=DAF-16
            # I TF_binding_site_region TF_binding_site 3403 4072 . + . Name=WBsf331847;tf_id=WBTranscriptionFactor000703;tf_name=DPL-1

    return
def _process_all(self, limit):
    """
    Fetch OMIM entries as JSON from the OMIM API, in batches, for the
    identifiers returned by self._get_omim_ids(), and add them to the graph.

    This will create OMIM classes, with the label, definition,
    and some synonyms.
    If an entry is "removed", it is added as a deprecated class.
    If an entry is "moved", it is deprecated and the ids it was moved to
    are passed along to addDeprecatedClass().

    Additionally, per entry, the phenotypic-series parents, mapped ids,
    publications and allelic variants are handed to the corresponding
    self._get_* helpers.

    If set to testMode, only identifiers that are also in self.test_ids
    are requested, and they are written to the testgraph.

    :param limit: outside test mode, the maximum number of identifiers to
        request (capped at the number available); None requests them all
    :return: None
    """
    omimids = self._get_omim_ids()  # store the set of omim identifiers

    omimparams = {
        'format': 'json',
        'include': 'all',
    }
    # you will need to add the API key into the conf.json file, like:
    # keys : { 'omim' : '<your api key here>' }
    omimparams.update({'apiKey': config.get_config()['keys']['omim']})

    # http://api.omim.org/api/entry?mimNumber=100100&include=all

    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph

    gu = GraphUtils(curie_map.get())

    it = 0  # for counting

    # note that you can only do request batches of 20
    # see info about "Limits" at http://omim.org/help/api
    groupsize = 20
    if not self.testMode and limit is not None:
        # just in case the limit is larger than the number of records,
        # max it out
        max_records = min((limit, len(omimids)))
    else:
        max_records = len(omimids)

    # TODO write the json to local files - make the assumption that
    # downloads within 24 hrs are the same
    # now, loop through the omim numbers and pull the records as json docs
    while it < max_records:
        end = min((max_records, it+groupsize))
        # iterate through the omim ids list,
        # and fetch from the OMIM api in batches of 20

        if self.testMode:
            intersect = list(
                {str(i) for i in self.test_ids} & set(omimids[it:end]))
            if len(intersect) > 0:
                # some of the test ids are in the omimids
                logger.info("found test ids: %s", intersect)
                omimparams.update({'mimNumber': ','.join(intersect)})
            else:
                it += groupsize
                continue
        else:
            omimparams.update({'mimNumber': ','.join(omimids[it:end])})

        p = urllib.parse.urlencode(omimparams)
        url = '/'.join((self.OMIM_API, 'entry'))+'?%s' % p
        # NOTE(review): the logged URL includes the apiKey parameter
        logger.info('fetching: %s', url)

        # close the HTTP response as soon as the body has been read
        with urllib.request.urlopen(url) as d:
            resp = d.read().decode()
        request_time = datetime.now()
        it += groupsize

        myjson = json.loads(resp)
        entries = myjson['omim']['entryList']

        geno = Genotype(g)

        # add genome and taxon
        tax_num = '9606'
        tax_id = 'NCBITaxon:9606'
        tax_label = 'Human'

        # tax label can get added elsewhere
        geno.addGenome(tax_id, str(tax_num))
        gu.addClassToGraph(g, tax_id, None)  # label added elsewhere

        for e in entries:
            entry = e['entry']

            # get the numbers, labels, and descriptions
            omimnum = entry['mimNumber']
            titles = entry['titles']
            label = titles['preferredTitle']

            other_labels = []
            if 'alternativeTitles' in titles:
                other_labels += self._get_alt_labels(
                    titles['alternativeTitles'])
            if 'includedTitles' in titles:
                other_labels += self._get_alt_labels(
                    titles['includedTitles'])

            # add synonyms of alternate labels
            # preferredTitle": "PFEIFFER SYNDROME",
            # "alternativeTitles": "ACROCEPHALOSYNDACTYLY, TYPE V; ACS5;;\nACS V;;\nNOACK SYNDROME",
            # "includedTitles": "CRANIOFACIAL-SKELETAL-DERMATOLOGIC DYSPLASIA, INCLUDED"

            # remove the abbreviation (comes after the ;) from the
            # preferredTitle, and add it as a synonym
            abbrev = None
            label_parts = re.split(';', label)
            if len(label_parts) > 1:
                abbrev = label_parts[1].strip()
            newlabel = self._cleanup_label(label)

            description = self._get_description(entry)
            omimid = 'OMIM:'+str(omimnum)

            if entry['status'] == 'removed':
                gu.addDeprecatedClass(g, omimid)
            else:
                omimtype = self._get_omimtype(entry)
                # this uses our cleaned-up label
                gu.addClassToGraph(g, omimid, newlabel, omimtype)

                # add the original OMIM label as a synonym
                gu.addSynonym(g, omimid, label)

                # add the alternate labels and includes as synonyms
                for syn in other_labels:
                    gu.addSynonym(g, omimid, syn)

                # for OMIM, we're adding the description as a definition
                gu.addDefinition(g, omimid, description)
                if abbrev is not None:
                    gu.addSynonym(g, omimid, abbrev)

                # if this is a genetic locus (but not sequenced)
                # then add the chrom loc info
                if omimtype == Genotype.genoparts['biological_region']:
                    if 'geneMapExists' in entry and entry['geneMapExists']:
                        genemap = entry['geneMap']
                        if 'cytoLocation' in genemap:
                            cytoloc = genemap['cytoLocation']
                            # parse the cytoloc. add this omim thing as
                            # a subsequence of the cytofeature
                            # 18p11.3-p11.2
                            # for now, just take the first one
                            # FIXME add the other end of the range,
                            # but not sure how to do that
                            # not sure if saying subsequence of feature
                            # is the right relationship
                            cytoloc = cytoloc.split('-')[0]
                            f = Feature(omimid, None, None)
                            if 'chromosome' in genemap:
                                geno.addChromosomeClass(
                                    str(genemap['chromosome']),
                                    tax_id, tax_label)
                                loc = makeChromID(cytoloc, tax_num, 'CHR')
                                # this is the chr band
                                gu.addClassToGraph(g, loc, cytoloc)
                                f.addSubsequenceOfFeature(g, loc)
                                f.addFeatureToGraph(g)

                # check if moved, if so, make it deprecated and
                # replaced/consider class to the other thing(s)
                # some entries have been moved to multiple other entries
                # and use the joining raw word "and"
                # 612479 is movedto: "603075 and 603029" OR
                # others use a comma-delimited list, like:
                # 610402 is movedto: "609122,300870"
                if entry['status'] == 'moved':
                    moved_to = str(entry['movedTo'])
                    if re.search('and', moved_to):
                        # split the movedTo entry on 'and'
                        newids = re.split('and', moved_to)
                    else:
                        # split on the comma; a single id comes back
                        # as a one-element list
                        newids = moved_to.split(',')
                    # cleanup whitespace and add OMIM prefix
                    # to numeric portion
                    fixedids = ['OMIM:'+i.strip() for i in newids]

                    gu.addDeprecatedClass(g, omimid, fixedids)

                self._get_phenotypicseries_parents(entry, g)
                self._get_mappedids(entry, g)

                self._get_pubs(entry, g)

                self._get_process_allelic_variants(entry, g)

        # end iterating over batch of entries

        # can't have more than 4 req per sec,
        # so wait the remaining time, if necessary
        dt = datetime.now() - request_time
        rem = 0.25 - dt.total_seconds()
        if rem > 0:
            # `rem` is already in seconds, which is the unit time.sleep()
            # expects; dividing by 1000 made the pause ineffective
            logger.info("waiting %.3f sec", rem)
            time.sleep(rem)

    gu.loadAllProperties(g)

    return