def print_vkey(self, snpcnt):
    """Write one tab-delimited row per collected variant key, then clean up.

    Each row holds ``snpcnt``, the variant's chrom/pos/ref/alt, its rsids
    joined by ',' and, for every annotation table listed below, the index
    value stored in ``cVkey.dbs`` (list values are de-duplicated and joined
    by ','; empty values are written as 'NULL').

    Nothing is written when ``self.vKeys`` is empty.
    """
    # NOTE(review): cleanup() is taken to belong to the non-empty branch;
    # confirm against the uncollapsed source.
    if not self.vKeys:
        return

    db_names = (
        'EXAC', 'SNPDB', 'KGDB', 'ESP', 'CLINVARDB', 'CLINVITAE',
        'REGULOME', 'CLNPHESNP', 'HGMDDB',
        'COSMIC', 'NSFP', 'SPLICE', 'MIRNA', 'OMIM',
    )

    for vkey in self.vKeys.itervalues():
        row = [
            snpcnt,
            vkey.chrom,
            vkey.pos,
            vkey.ref,
            vkey.alt,
            lib_utils.joined(vkey.rsids, ','),
        ]
        for db_name in db_names:
            db_idx = vkey.dbs[db_name]
            if isinstance(db_idx, list):
                # collapse repeated indices before joining
                db_idx = lib_utils.joined(list(set(db_idx)), ',')
            # any falsy value (empty string/list, None, 0) is reported as NULL
            row.append(db_idx if db_idx else 'NULL')
        self.fpw.write('%s\n' % (lib_utils.joined(row, '\t')))

    self.cleanup()
def to_file(rows, Header, out_fn, fmode='wb'):
    """Write ``rows`` to ``out_fn`` as tab-delimited text under a '#' header.

    Args:
        rows: sequence of records (each an iterable of fields).
        Header: header line, either a ready-made string or a sequence of
            column names that is joined with tabs.
        out_fn: output path handed to ``anyopen.openfile``.
        fmode: open mode handed to ``anyopen.openfile``.

    The first row is passed to ``get_decimal_idx``; when it reports any
    field positions, every row is run through ``reformat_fields`` with
    those positions before being written.
    """
    fp2 = anyopen.openfile(out_fn, fmode)
    # try/finally rather than `with`: the handle type returned by
    # anyopen.openfile is not visible here, so only .close() is relied on.
    try:
        if isinstance(Header, basestring):
            headStr = Header
        else:
            headStr = lib_utils.joined(Header, '\t')
        fp2.write('#%s\n' % headStr)

        if len(rows) > 0:
            # field positions are detected once, from the first row only
            decimal_idx = get_decimal_idx(rows[0])
            fix_record = len(decimal_idx) > 0

            for r in rows:
                r = list(r)
                if fix_record:
                    r = reformat_fields(r, decimal_idx)
                fp2.write('%s\n' % lib_utils.joined(r, '\t'))
    finally:
        # close the handle even if formatting or writing a row fails
        fp2.close()
def gdna_to_vcf(self, mutalyzer_batch_outfn):
    """Convert records of ``self.tsv`` into a sorted VCF using a mutalyzer batch file.

    The batch file is read as tab-delimited text with one header line.
    For each data line, column 0 is the key later matched against
    ``cvt.nt_change``, a non-empty column 1 causes the line to be skipped
    (presumably an error message -- TODO confirm), and column 2 is handed to
    ``Hgvs2.gdna_to_vcf``.

    The result is written to ``self.out_vcf`` (set here) via a temporary
    ``.tmp`` file that is sorted with ``lib_utils.sort_tsv_by_col2`` and
    then removed.

    Args:
        mutalyzer_batch_outfn: path to the mutalyzer batch output file.

    Raises:
        RuntimeError: if ``mutalyzer_batch_outfn`` does not exist.
    """
    if not os.path.exists(mutalyzer_batch_outfn):
        raise RuntimeError('check if input file [%s] exists' %
                           mutalyzer_batch_outfn)

    cHgvs = Hgvs2()
    cHgvs.load_resource()

    # key (column 0) -> variants produced by cHgvs.gdna_to_vcf
    gdna_cache = {}
    with open(mutalyzer_batch_outfn, 'r') as fp:
        # skip the header; the default keeps an empty file from raising
        # StopIteration
        next(fp, None)
        for mutalyzer in fp:
            mut = mutalyzer.split('\t')
            if len(mut) < 3:
                # blank or truncated line: nothing to convert
                continue
            if mut[1].strip():
                continue
            gdna = mut[2].strip()
            variants = cHgvs.gdna_to_vcf(gdna)
            if variants:
                gdna_cache[mut[0].strip()] = variants

    self.out_vcf = lib_utils.file_tag(self.tsv, None, 'vcf')
    tmp_vcf = self.out_vcf + '.tmp'

    # constant VCF columns shared by every written record
    qual = 100
    vcf_filter = 'PASS'
    rsid = '.'

    with open(tmp_vcf, 'w') as fpw:
        self._write_vcf_head(fpw)
        for cvt in self._iterfile():
            if self.may_pass(cvt):
                continue
            if cvt.nt_change not in gdna_cache:
                continue

            for chrom, pos, ref, alt in gdna_cache[cvt.nt_change]:
                # alleles longer than 100 characters are not written
                if len(ref) > 100 or len(alt) > 100:
                    continue

                info = 'cDNA=%s;' % cvt.nt_change
                info += 'VC=%s;' % self.determine_vclass(cvt.rep_class)
                info += 'SRC=%s;' % cvt.source
                info += 'UPD=%s;' % cvt.last_upd
                info += 'URL=%s' % cvt.url

                # drop the 'chr' prefix; chrM* is written as 'MT'
                if chrom.startswith('chr'):
                    if chrom.startswith('chrM'):
                        chrom = 'MT'
                    else:
                        chrom = chrom[3:]

                cols = [chrom, pos, rsid, ref, alt, qual, vcf_filter, info]
                fpw.write('%s\n' % lib_utils.joined(cols, '\t'))

    lib_utils.sort_tsv_by_col2(tmp_vcf, [1, 2],
                               ['V', 'n'], False, self.out_vcf)
    os.unlink(tmp_vcf)
def print_header(self, tsv):
    """Create ``tsv``, write its '#'-prefixed header line and return the handle.

    The header consists of six fixed variant columns followed by one
    '<DB>.<table>.<field>' column per annotation source. The file is left
    open; the caller owns the returned handle.
    """
    # (database, table, field) triples; each becomes one dotted column name
    db_columns = (
        ('EXAC', 'snps', 'idx'),
        ('SNPDB', 'snps', 'idx'),
        ('KGDB', 'snps', 'idx'),
        ('ESP', 'snps', 'idx'),
        ('CLINVARDB', 'snps', 'idx'),
        ('CLINVITAE', 'snps', 'idx'),
        ('REGULOME', 'regulome', 'idx'),
        ('CLNPHESNP', 'clnsnp', 'idx'),
        ('HGMDDB', 'snps', 'idx'),
        ('COSMIC', 'snps', 'idx'),
        ('NSFP', 'nsfp', 'idx'),
        ('SPLICE', 'splice', 'idx'),
        ('MIRNA', 'mirna', 'idx'),
        ('OMIM', 'omim', 'idx'),
    )

    column_names = ['variant_index', 'chrom', 'pos', 'ref', 'alt', 'rsid']
    column_names.extend('.'.join(triple) for triple in db_columns)

    out_handle = open(tsv, 'w')
    out_handle.write('#%s\n' % (lib_utils.joined(column_names, '\t')))
    return out_handle
def run_cmd(self, cmd, job_name=None):
    """Run ``cmd`` through the shell and raise if it fails.

    Args:
        cmd: command parts; they are joined with single spaces into one
            shell command line.
        job_name: when given, stdout/stderr of the child are sent to the
            handles returned by ``self.get_process_msg_handler(job_name)``
            (and closed afterwards); otherwise the output is captured via
            pipes and discarded.

    Raises:
        RuntimeError: if the command exits with a non-zero return code
            (including negative codes, i.e. termination by a signal).
    """
    cmd_str = lib_utils.joined(cmd, ' ')
    lib_utils.msgout('notice', cmd_str)  # debug
    self.logger.info('running [%s] ...' % cmd_str)

    if job_name:
        stdofp, stdefp = self.get_process_msg_handler(job_name)
    else:
        stdofp = sp.PIPE
        stdefp = sp.PIPE

    try:
        # NOTE(review): shell=True executes cmd_str verbatim; callers must
        # not pass untrusted text in `cmd`.
        proc = sp.Popen(cmd_str, stdout=stdofp, stderr=stdefp, shell=True)
        # communicate() drains the pipes while waiting; a bare wait() can
        # deadlock once the child fills an unread PIPE buffer.
        proc.communicate()
        retcode = proc.returncode
    finally:
        if job_name:
            stdofp.close()
            stdefp.close()

    # a negative return code means the child was killed by a signal
    if retcode != 0:
        self.logger.error('[%s] failed' % cmd_str)
        raise RuntimeError('[%s] failed' % cmd_str)

    self.logger.info('done. [%s]' % job_name)
def gene_ontology_enrichment(self):
    '''
    Objective: Gene-ontology enrichment. Genes that are genetically
    perturbed but have no phenotype score ("private" genes) receive an
    indirect phenotype score when GO similarity links them to a seed
    phenotype gene.

    Input (instance state):
        -self.pheno_dmg = {gene1:0.2,gene2:0.9,...} #e.g. phenotype score
        -self.genetic_dmg = {gene2:0.4,gene3:0.3,...} #e.g. genetic score

    Effect:
        self.pheno_dmg gains an entry for every enriched gene; its value is
        the maximum of seed_score * go_similarity * self.dm.go_penalty over
        all matching pairs (never below 0.).
    '''
    job_name = 'gene_ontology_enrichment'
    msg = 'enriching perturbed genes with GO semantic similarity [%s] ...' % job_name
    lib_utils.msgout('notice', msg)
    self.logger.info(msg)

    # collect genes from both phenotype and genotype perturbation
    pgenes = list(self.pheno_dmg.keys())
    P = len(pgenes)
    msg = 'total phenotypic genes before enrichment:%d' % P
    lib_utils.msgout('notice', msg, job_name)
    self.logger.info(msg)

    ggenes = list(self.genetic_dmg.keys())
    msg = 'total perturbed genes:%d' % len(ggenes)
    lib_utils.msgout('notice', msg, job_name)
    self.logger.info(msg)

    # genes not reported by phenotypes among the genetically damaged genes
    priv_ggenes = lib_utils.difference(pgenes, ggenes)
    msg = 'the number of genes not associated with the given phenotypes:%d' % len(
        priv_ggenes)
    lib_utils.msgout('notice', msg, job_name)
    self.logger.info(msg)

    # seed phenotype genes used for the GO similarity query
    pgenes2 = self.get_GO_seeds(self.dm.seed_rate)  #update self.go_seeds

    msg = 'quering total [%d] seed phenotype genes into SQL ...' % len(
        pgenes2)
    lib_utils.msgout('notice', msg, job_name)
    self.logger.info(msg)

    go = geneontology.Geneontology()
    goSimScores = go.get_funsim(pgenes2,
                                priv_ggenes,
                                min_score=self.dm.gosim_min)

    # Snapshot of the genes that had a phenotype score BEFORE enrichment.
    # self.pheno_dmg grows inside the loop, so testing membership against it
    # would misclassify an already-enriched gene as a seed on later pairs.
    orig_pheno_genes = set(pgenes)

    pheno_delta = []  # enriched genes, in first-seen order
    enriched_seen = set()
    for pair, go_sc in goSimScores.iteritems():
        first_is_seed = pair[0] in orig_pheno_genes
        second_is_seed = pair[1] in orig_pheno_genes
        if first_is_seed == second_is_seed:
            # both or neither gene had an original phenotype score:
            # there is no seed -> private relation to propagate
            continue

        if first_is_seed:
            seed_gene, gene_enriched = pair[0], pair[1]
        else:
            seed_gene, gene_enriched = pair[1], pair[0]
        seed_sc = self.pheno_dmg[seed_gene]

        #initialize score
        if gene_enriched not in self.pheno_dmg:
            self.pheno_dmg[gene_enriched] = 0.

        #keep only maximum
        indirect_sc = seed_sc * go_sc * self.dm.go_penalty
        if indirect_sc > self.pheno_dmg[gene_enriched]:
            self.pheno_dmg[gene_enriched] = indirect_sc

        if gene_enriched not in enriched_seen:
            enriched_seen.add(gene_enriched)
            pheno_delta.append(gene_enriched)

    P_delta = len(self.pheno_dmg) - P
    msg = 'Total %d perturbed genes are added by phenotype gene enrichment!\ndone. [%s]' % (
        P_delta, job_name)
    lib_utils.msgout('notice', msg)
    self.logger.info(msg)

    msg = 'genes enriched by GO:[%s]' % lib_utils.joined(pheno_delta, ',')
    lib_utils.msgout('notice', msg)
    self.logger.info(msg)
def reformat_to_lite(self, infile, vtype, outfile, min_cnt=1):
    """Collapse consecutive VCF records sharing chrom/pos/ref/alt into one.

    For each run of consecutive records with the same
    chrom_pos_ref_alt key, the first record of the run is written to
    ``outfile`` with its id replaced by '.', the ids of the whole run stored
    in INFO 'COSMIC_ID', ``vtype`` stored in INFO 'REG', and the INFO keys
    GENE/STRAND/CDS/AA/SNP removed. When ``vtype == NONCODING`` INFO 'CNT'
    is set to '1'.

    Args:
        infile: input VCF path handed to ``vcf.VCFParser``.
        vtype: value written to INFO 'REG'.
        outfile: output path; created with a header if missing, otherwise
            appended to without a header.
        min_cnt: accepted for interface compatibility; not used here.
    """
    jobname = "reformat_to_lite"
    msg = "working on vcf file [%s] ..." % infile
    print(msg)

    infoKeys = ['GENE', 'STRAND', 'CDS', 'AA', 'SNP']
    v = vcf.VCFParser(infile)
    v.add_meta_info('COSMIC_ID', '.', 'String', 'cosmic ID')
    v.add_meta_info('REG', '2', 'Integer', '1:coding, 0:noncoding')

    def _flush(record, cosmic_ids):
        # finalize one collapsed record and write it to the output stream
        record.id = '.'
        record.info['COSMIC_ID'] = cosmic_ids
        record.info['REG'] = vtype
        for info_key in infoKeys:
            v.delete_info(record, info_key)
        if vtype == NONCODING:
            record.info['CNT'] = '1'
        v.write(ostream, record)

    ostream = None
    try:
        if not os.path.exists(outfile):
            ostream = open(outfile, 'w')
            v.writeheader(ostream, to_del_info=infoKeys)
        else:
            # header is assumed to be present already when appending
            ostream = open(outfile, 'a')

        pk0 = 'NA'
        cosmics = []
        prev_rec = None
        for rec in v:
            v.parseinfo(rec)
            pk = lib_utils.joined([rec.chrom, rec.pos, rec.ref, rec.alt], '_')
            if pk != pk0:
                # a new key starts: emit the finished run, then open a new one
                pk0 = pk
                if prev_rec:
                    _flush(prev_rec, cosmics)
                cosmics = [rec.id[0]]
                prev_rec = rec
            else:
                cosmics.append(rec.id[0])

        # emit the last run
        if prev_rec:
            _flush(prev_rec, cosmics)
    finally:
        # release both streams even when parsing or writing fails
        if ostream is not None:
            ostream.close()
        v.stream.close()

    msg = "Done [%s]." % jobname
    print(msg)
def enrich_pheno_genes(self, ggenes):
    '''
    Objective: phenotype-score enrichment of genetically perturbed genes
    using GO semantic similarity and KEGG pathways. Private members of the
    perturbed genes (those without a phenotype score) that match seed
    phenotype genes receive a predicted phenotype score instead of a
    de-novo prior.

    Input:
        -ggenes: genetically perturbed genes
        -self.pheno_dmg = {gene1:PhenoGene,...}; the .score and .disId
         attributes of each entry are read here
    Effect:
        -self.pheno_dmg gets a new PhenoGene entry per enriched gene
        -self.omim.cDis[disId].enriched_genes is updated for genes whose
         GO match carries a disease id
    '''
    job_name = 'enrich_pheno_genes'
    msg = 'enriching perturbed genes with both GO semantic similarity and KEGG pathways [%s] ...' % job_name
    lib_utils.msgout('notice', msg); self.logger.info(msg)

    # collect genes from both phenotype and genotype perturbation
    pgenes = list(self.pheno_dmg.keys()) # assuming that its score > 0
    P = len(pgenes)
    msg = 'total phenotypic genes before enrichment:%d' % P
    lib_utils.msgout('notice', msg, job_name); self.logger.info(msg)
    msg = 'total perturbed genes:%d' % len(ggenes)
    lib_utils.msgout('notice', msg, job_name); self.logger.info(msg)

    # get genes not reported by phenotypes among the genetically damaged genes
    # (presumably ggenes minus pgenes -- confirm lib_utils.difference argument order)
    priv_ggenes = lib_utils.difference(pgenes, ggenes)
    msg = 'the number of genes not associated with the given phenotypes:%d' % len(priv_ggenes)
    lib_utils.msgout('notice', msg, job_name); self.logger.info(msg)

    # ---- step 1: GO similarity enrichment ----
    # collect seed phenotype genes to query against the private genes
    seed_pheno_genes, seed_scores, _ = \
        self.get_seed_genes(self.dm.go_seed_k)

    # query seed phenotype genes against private genetic-perturbed genes and bring high-matched ones
    msg = 'Using [%d] seed genes to enrich [%d] genetic variant genes with GO similarity ...' % (len(seed_pheno_genes),len(priv_ggenes))
    lib_utils.msgout('notice', msg, job_name); self.logger.info(msg)
    go = geneontology.Geneontology()
    goSimScores = go.get_funsim(seed_pheno_genes, priv_ggenes, min_score=self.dm.gosim_min)

    # NOTE(review): this KEGG message is emitted before the GO scores are
    # collected; the KEGG query itself only happens further below.
    msg = 'Using [%d] seed genes to enrich [%d] genetic variant genes with KEGG similarity ...' % (len(seed_pheno_genes), len(priv_ggenes))
    lib_utils.msgout('notice', msg, job_name); self.logger.info(msg)

    # delta_pheno: enriched gene -> struct with
    #   go    = [best GO-derived score, seed gene of that score, disId of that seed]
    #   kegg  = [best KEGG-derived score, ...]
    #   score = combined score used for the final scaling
    delta_pheno = {}
    if goSimScores:
        msg = 'Collecting [%d] GO enriched genes, enrichment_penality_ratio [%g] ...' % (len(goSimScores),self.dm.go_penalty)
        lib_utils.msgout('notice', msg, job_name); self.logger.info(msg)
        for pair, go_sc in goSimScores.iteritems():
            # decide which member of the pair is the private (to-be-enriched) gene;
            # the other member is treated as the seed phenotype gene
            if pair[0] in priv_ggenes:
                new_gene = pair[0]
                seed_gene = pair[1]
            else:
                new_gene = pair[1]
                seed_gene = pair[0]
            # indirect score: GO similarity x penalty x seed phenotype score
            score2 = go_sc * self.dm.go_penalty * self.pheno_dmg[seed_gene].score
            if score2 > 0.:
                # register enriched genes
                if new_gene not in delta_pheno:
                    delta_pheno[new_gene] = lib_utils.py_struct(go=[0., None, None], kegg=[0.,0.], score=0.)
                    delta_pheno[new_gene].go[2] = self.pheno_dmg[seed_gene].disId
                if score2 > delta_pheno[new_gene].go[0]: #keep only max score
                    delta_pheno[new_gene].go[0] = score2
                    delta_pheno[new_gene].go[1] = seed_gene
                    delta_pheno[new_gene].go[2] = self.pheno_dmg[seed_gene].disId
                    delta_pheno[new_gene].score = delta_pheno[new_gene].go[0]
        msg = 'Genes enriched by GO similarity:[%s]' % lib_utils.joined(delta_pheno.keys(), ',')
        lib_utils.msgout('notice', msg)
        self.logger.info(msg)

    # ---- step 2: KEGG pathway enrichment ----
    # seeds are re-drawn with a 4x larger k for the KEGG step
    seed_pheno_genes, seed_scores, mean_seed_score = \
        self.get_seed_genes(self.dm.go_seed_k * 4) # update self.go_seeds
    msg = 'Using [%d] seed genes to enrich [%d] genetic variant genes with KEGG pathway genes ...' % ( len(seed_pheno_genes), len(priv_ggenes))
    lib_utils.msgout('notice', msg, job_name); self.logger.info(msg)

    # query seed_pheno_genes to KEGG matrix and normalize the matched genes and ranking!
    keggEnriched = run_bp(seed_pheno_genes, seed_scores, priv_ggenes, kegg_genes_fn=self.entries['kegg_hsa'])
    if keggEnriched:
        msg = 'Collecting [%d] KEGG enriched genes with mean seed score [%g]...' % (len(keggEnriched),mean_seed_score)
        lib_utils.msgout('notice', msg, job_name); self.logger.info(msg)
        for kgene, kscore in keggEnriched.iteritems():
            # KEGG-derived score is weighted by the mean score of the seeds
            score2 = kscore * mean_seed_score
            if score2 > 0.:
                # register enriched genes
                # NOTE(review): kegg is initialised as [0] here but as [0.,0.] in the GO step
                if kgene not in delta_pheno:
                    delta_pheno[kgene] = lib_utils.py_struct(go=[0., None, None], kegg=[0], score=0.)
                if score2 > delta_pheno[kgene].kegg[0]: #keep only max score and sum two enriched scores
                    delta_pheno[kgene].kegg[0] = score2
                    delta_pheno[kgene].score = delta_pheno[kgene].go[0] + delta_pheno[kgene].kegg[0]
        msg = 'Genes enriched by KEGG bipartite network difussion:[%s]' % lib_utils.joined(keggEnriched.keys(), ',')
        lib_utils.msgout('notice', msg)
        self.logger.info(msg)

    # ---- step 3: rescale enriched scores and register them ----
    max_score = -1.
    max_seed_gene = None
    msg = 'Total [%d] mutated genes that did not have any phenotype score previously are enriched. Assigning a new phenotype score to each enriched gene ...' % len(delta_pheno)
    lib_utils.msgout('notice', msg, job_name)
    self.logger.info(msg)
    if delta_pheno:
        # find the largest combined enriched score and, when available,
        # the GO seed gene recorded for that entry
        for gene, deltaP in delta_pheno.iteritems():
            if deltaP.score > max_score:
                max_score = deltaP.score
                if deltaP.go[1]:
                    max_seed_gene = deltaP.go[1]
        if max_score > 0:
            if max_seed_gene:
                max_enriched_score = self.pheno_dmg[max_seed_gene].score
            else:
                # no GO seed recorded: fall back to get_max_pheno_dmg()
                max_seed_gene = self.get_max_pheno_dmg()
                max_enriched_score = self.pheno_dmg[max_seed_gene].score
            # upper bound for enriched scores: reference seed score x penalty x 2
            max_scaled = max_enriched_score * self.dm.go_penalty * 2
            for ngene,deltaP in delta_pheno.iteritems():
                self.pheno_dmg[ngene] = PhenoGene()
                # linear rescale so that the best enriched gene gets max_scaled
                self.pheno_dmg[ngene].score = delta_pheno[ngene].score*max_scaled/max_score
                self.pheno_dmg[ngene].disId = deltaP.go[2]
                if deltaP.go[2]:
                    # record the enriched gene on its disease entry, keyed to
                    # the GO seed gene when one is known (None otherwise)
                    self.omim.cDis[deltaP.go[2]].enriched_genes[ngene] = None
                    if deltaP.go[1]:
                        self.omim.cDis[deltaP.go[2]].enriched_genes[ngene] = deltaP.go[1]
            msg = 'max scaled phenotype score[%g], raw max enriched score[%g]' % (max_scaled,max_score)
            lib_utils.msgout('notice', msg, job_name)
            self.logger.info(msg)
def create_disease_rank_tab(self):
    """Write the per-disease ranking table to ``self.dv.disease_rank_fn``.

    One tab-delimited row is written for every entry of
    ``self.dv.omim.cDis`` whose ``pheno_score`` is non-zero. Multi-valued
    cells are joined with ';'. In the gene list, genes found in
    ``self.dv.gt_dmg`` are suffixed with '*' when they are also in
    ``self.dv.vknown_genes`` (and ``self.dv.vknown`` is set), otherwise
    with '^'.

    ``self.dv.gene_dmg[gene]`` is read as a pair: index 0 feeds the
    combined-damage columns and index 1 the harmonic-score columns.
    """
    headCols = [
        'disease_ID',
        'disease_description',
        'inheritance',
        'assoc_pheno_genes(^:mutated,*:known_pathogenic)',
        'num_of_assoc_pheno_genes',
        'num_of_gt_dmg_genes',
        'pheno_match_score',
        'avg_combined_dmg_score',
        'max_combined_dmg_score',
        'avg_harmonic_score',
        'max_harmonic_score',
        'external_genes_of_interest(kegg-ppi_or_GO_enriched[harmonic_score])',
        'PPI-KEGG_pathway_desc',
    ]
    cell_delim = ';'

    # `with` guarantees the table is closed even if KEGG loading or a row
    # computation below raises
    with open(self.dv.disease_rank_fn, 'w') as fpw:
        fpw.write('#%s\n' % lib_utils.joined(headCols, '\t'))

        cKegg = kegg_pathway.Kegg(hsa_fn=self.dv.entries['kegg_hsa'])
        cKegg.get_hsa()

        #annotate kegg_pathway to disease
        self.dv.omim.to_kegg_hsa(cKegg.cHsa)

        #browse all known disease entries, skipping those with a zero phenotype score
        for cD in self.dv.omim.cDis.itervalues():
            if cD.pheno_score == 0.:
                continue

            to_print = []
            to_print.append(cD.id)  #disID
            to_print.append(cD.desc)  #disDesc
            to_print.append(divine_inc.inheritStr[cD.inherit])  #inheritance

            # Genes[0]: genes in gt_dmg (marked), Genes[1]: the remaining genes
            Genes = [[], []]
            # index 0: combined damage score, index 1: harmonic score
            max_rw_score = [0., 0.]
            sum_rw_score = [0., 0.]
            cnt_gene_dmg = 0

            for gene in cD.genes:  #for each gene assoc with the disease
                #split into two groups (one having gt_dmg, or else), and collect max & sum
                if gene in self.dv.gt_dmg:
                    if self.dv.vknown and gene in self.dv.vknown_genes:
                        Genes[0].append('%s*' % gene)
                    else:
                        Genes[0].append('%s^' % gene)

                    if self.dv.gene_dmg[gene][0] > max_rw_score[0]:
                        max_rw_score[0] = self.dv.gene_dmg[gene][0]
                    sum_rw_score[0] += self.dv.gene_dmg[gene][0]
                else:
                    Genes[1].append(gene)

                #to collect max & sum on harmonic scores
                if gene in self.dv.gene_dmg:
                    if self.dv.gene_dmg[gene][1] > max_rw_score[1]:
                        max_rw_score[1] = self.dv.gene_dmg[gene][1]
                    sum_rw_score[1] += self.dv.gene_dmg[gene][1]
                    cnt_gene_dmg += 1

            #bring KEGG genes (PPI) interacted with non-mutated phenotype genes
            goi, hsa_desc = self.external_goi(
                Genes[1], cD.kegg_hsa, cKegg.cHsa)

            #bring GO enriched genes
            for gene2 in cD.enriched_genes:
                geneStr2 = 'go(%s:%s' % (cD.enriched_genes[gene2], gene2)
                if self.dv.vknown and (gene2 in self.dv.vknown_genes):
                    geneStr2 = geneStr2 + '*'
                else:
                    geneStr2 = geneStr2 + '^'
                # only enriched genes that have a damage score are reported
                if gene2 in self.dv.gene_dmg:
                    goi.append('%s[%g])' %
                               (geneStr2, self.dv.gene_dmg[gene2][1]))

            to_print.append(cell_delim.join(Genes[0] + Genes[1]))  #assoc_pheno_genes

            G = len(cD.genes)
            G_mt = len(Genes[0])
            to_print.append(G)  #num_of_assoc_pheno_genes
            to_print.append(G_mt)  #num_of_gt_dmg_genes
            to_print.append(cD.pheno_score)  #pheno_match_score

            if G_mt > 0:
                to_print.append(sum_rw_score[0] / G_mt)  #avg_combined_dmg_score
            else:
                to_print.append(0.)
            to_print.append(max_rw_score[0])  #max_combined_dmg_score

            if cnt_gene_dmg > 0:
                to_print.append(sum_rw_score[1] / cnt_gene_dmg)  #avg_harmonic_score
                to_print.append(max_rw_score[1])  #max_harmonic_score
            else:
                to_print.append(0.)
                to_print.append(0.)

            to_print.append(cell_delim.join(goi))  #partner_in_protein_network_of_interest

            if hsa_desc:
                to_print.append(cell_delim.join(hsa_desc))  #kegg-pathway desc if exist
            else:
                to_print.append('NA')  #kegg-pathway desc if exist

            fpw.write('%s\n' % (lib_utils.joined(to_print, '\t')))