def collapse_bed(self, tmp_bed, job_name, ext_bp):
    """Sort a BED file and merge overlapping intervals into ``self.bed_fn``.

    ``tmp_bed`` is sorted on its first three columns with
    ``lib_utils.sort_tsv_by_col2`` into a temporary file under
    ``self.work_dir``.  Consecutive records on the same chromosome whose
    start does not exceed the running end are then merged into a single
    interval; the annotations of merged records are joined with ``|``.
    The temporary sorted file is removed afterwards.

    Args:
        tmp_bed: path of the unsorted, tab-separated 4-column file
            (chrom, start, end, annotation).
        job_name: label appended to the progress messages.
        ext_bp: integer used to build the temporary file name
            ``refGene_e<ext_bp>_so.bed``.

    Raises:
        ValueError: if a line does not consist of exactly four
            tab-separated fields or a coordinate is not an integer.
    """

    def _notify(text):
        lib_utils.msgout('notice', text)
        if self.logger:
            self.logger.info(text)

    _notify('sorting bed file ... @ %s' % job_name)
    tmp_so_bed = os.path.join(self.work_dir, 'refGene_e%d_so.bed' % ext_bp)
    lib_utils.sort_tsv_by_col2(tmp_bed, [1, 2, 3], ['V', 'n', 'n'], True,
                               tmp_so_bed)

    _notify('merging exon coordinates overlapped each other... @ %s' % job_name)

    # The previous implementation read the first record, rewound the file and
    # then merged that record with itself, duplicating its annotation.  Here
    # every record is visited exactly once; an empty input simply yields an
    # empty output instead of raising StopIteration.
    with open(tmp_so_bed, 'r') as fp, open(self.bed_fn, 'w') as fp2:
        chromp = None
        e1p = e2p = 0
        annots = []
        for line in fp:
            chrom, e1, e2, annot = line.rstrip().split('\t')
            e1 = int(e1)
            e2 = int(e2)
            if chromp is not None and chrom == chromp and e1 <= e2p:
                # overlapping (or touching) the running interval: extend it
                if e2p < e2:
                    e2p = e2
                annots.append(annot)
                continue
            if chromp is not None:
                fp2.write('%s\t%d\t%d\t%s\n'
                          % (chromp, e1p, e2p, '|'.join(annots)))
            chromp, e1p, e2p = chrom, e1, e2
            annots = [annot]
        # flush the last running interval
        if chromp is not None:
            fp2.write('%s\t%d\t%d\t%s\n'
                      % (chromp, e1p, e2p, '|'.join(annots)))

    os.unlink(tmp_so_bed)
    _notify('done. @ %s' % job_name)
def gdna_to_vcf(self, mutalyzer_batch_outfn):
    """Write a sorted VCF built from Mutalyzer batch output.

    The first line of ``mutalyzer_batch_outfn`` is skipped.  For every
    remaining line whose second column is empty, the third column is
    converted with ``Hgvs2.gdna_to_vcf`` and the resulting variants are
    cached under the first column.  Records from ``self._iterfile()`` whose
    ``nt_change`` is in that cache are then written as VCF rows to a
    temporary file, which is sorted into ``self.out_vcf`` and removed.

    Args:
        mutalyzer_batch_outfn: path of the tab-separated Mutalyzer batch
            output file.

    Raises:
        RuntimeError: if ``mutalyzer_batch_outfn`` does not exist.
    """
    if not os.path.exists(mutalyzer_batch_outfn):
        raise RuntimeError('check if input file [%s] exists'
                           % mutalyzer_batch_outfn)

    cHgvs = Hgvs2()
    cHgvs.load_resource()

    gdna_cache = {}
    with open(mutalyzer_batch_outfn, 'r') as fp:
        next(fp, None)  # skip the header; tolerate an empty file
        for mutalyzer in fp:
            mut = mutalyzer.split('\t')
            # presumably column 2 carries an error message -- TODO confirm
            if mut[1].strip():
                continue
            variants = cHgvs.gdna_to_vcf(mut[2].strip())
            if variants:
                gdna_cache[mut[0].strip()] = variants

    self.out_vcf = lib_utils.file_tag(self.tsv, None, 'vcf')
    tmp_vcf = self.out_vcf + '.tmp'

    qual = 100
    vcf_filter = 'PASS'  # renamed from `filter` to avoid shadowing the builtin
    rsid = '.'
    with open(tmp_vcf, 'w') as fpw:
        self._write_vcf_head(fpw)
        for cvt in self._iterfile():
            if self.may_pass(cvt):
                continue
            if cvt.nt_change not in gdna_cache:
                continue
            for chrom, pos, ref, alt in gdna_cache[cvt.nt_change]:
                # alleles longer than 100 characters are not reported
                if len(ref) > 100 or len(alt) > 100:
                    continue
                info = 'cDNA=%s;' % cvt.nt_change
                info += 'VC=%s;' % self.determine_vclass(cvt.rep_class)
                info += 'SRC=%s;' % cvt.source
                info += 'UPD=%s;' % cvt.last_upd
                info += 'URL=%s' % cvt.url
                # drop the 'chr' prefix; any 'chrM*' contig becomes 'MT'
                if chrom.startswith('chr'):
                    if chrom.startswith('chrM'):
                        chrom = 'MT'
                    else:
                        chrom = chrom[3:]
                cols = [chrom, pos, rsid, ref, alt, qual, vcf_filter, info]
                fpw.write('%s\n' % lib_utils.joined(cols, '\t'))

    lib_utils.sort_tsv_by_col2(tmp_vcf, [1, 2], ['V', 'n'], False,
                               self.out_vcf)
    os.unlink(tmp_vcf)
def rank_pheno_gene(self):
    """Write every gene in ``self.pheno_dmg`` with its score, sorted.

    A two-column table (``#gene``, ``phenotypic_score``) is written to
    ``<self.gene_rank_fn>.tmp``, sorted on column 2 with
    ``lib_utils.sort_tsv_by_col2`` into ``self.gene_rank_fn``, and the
    temporary file is removed.
    """
    job_name = 'rank_pheno_gene'

    def _notify(text):
        lib_utils.msgout('notice', text)
        # guard added so a missing logger does not abort the ranking
        if self.logger:
            self.logger.info(text)

    _notify('selecting genes matched by patient phenotypes ... [%s;%s]'
            % (job_name, self.hpo_query))

    tmp_fn = '%s.tmp' % self.gene_rank_fn
    with open(tmp_fn, 'w') as fp2:
        fp2.write('#gene\tphenotypic_score\n')
        # .items() behaves the same on Python 2 and Python 3
        for gene, cPhenoGene in self.pheno_dmg.items():
            fp2.write('%s\t%g\n' % (gene, cPhenoGene.score))

    lib_utils.sort_tsv_by_col2(tmp_fn, [2], ['gr'], False, self.gene_rank_fn)
    os.unlink(tmp_fn)

    _notify('done. [%s]' % job_name)
def convert_node2gene(FinalNodeScores, PerturbedGenes, dProtein2gene,
                      lnkProteins, rank_fn):
    """Write per-gene predicted and seed scores, sorted, to ``rank_fn``.

    Only genes whose ``gdmg`` attribute is greater than zero are written.
    Rows go to a temporary file (named via ``lib_utils.file_tag2``), which
    is sorted on column 2 into ``rank_fn`` and then removed.

    Args:
        FinalNodeScores: pair ``(nodeScores, dangledScores)``; ``nodeScores``
            is indexed like ``lnkProteins`` and ``dangledScores`` like the
            dangled gene list.
        PerturbedGenes: pair ``(cPerturbedGenes, dangledGenes)``;
            ``cPerturbedGenes`` maps a gene to an object exposing ``score``
            and ``gdmg``.
        dProtein2gene: mapping from protein name to gene name.
        lnkProteins: sequence of protein (node) names.
        rank_fn: path of the final sorted output file.

    Raises:
        KeyError: if a dangled gene is missing from ``cPerturbedGenes``.
    """

    def _transform(score):
        # -1/log10(x); non-positive scores map to 0 as before.
        if score > 0:
            log_score = math.log10(score)
            if log_score == 0.:
                # x == 1 used to raise ZeroDivisionError; use the limit of
                # -1/log10(x) as x approaches 1 from below.
                return float('inf')
            return -1. / log_score
        return 0.

    nodeScores, dangledScores = FinalNodeScores
    cPerturbedGenes, dangledGenes = PerturbedGenes

    rank_fn2 = lib_utils.file_tag2(rank_fn, 'tmp', None)
    fp2 = lib_utils.open2(rank_fn2, 'w')
    try:
        fp2.write('#gene\tpredicted_score[-1/log10(x)]\tseed_score\n')

        for n, protein in enumerate(lnkProteins):
            seed_score = 0.
            genetic_dmg_score = 0.
            gene = protein
            # NOTE(review): the perturbed-gene lookup is kept under the
            # mapping check; confirm that unmapped nodes should not be
            # looked up by their own name.
            if protein in dProtein2gene:
                gene = dProtein2gene[protein]
                if gene in cPerturbedGenes:
                    seed_score = cPerturbedGenes[gene].score
                    genetic_dmg_score = cPerturbedGenes[gene].gdmg

            pred_score = _transform(nodeScores[n])
            if genetic_dmg_score > 0.:
                fp2.write('%s\t%g\t%g\n' % (gene, pred_score, seed_score))

        # add dangled node score
        for n, gene in enumerate(dangledGenes):
            pred_score = _transform(dangledScores[n])
            if cPerturbedGenes[gene].gdmg > 0.:
                fp2.write('%s\t%g\t%g\n'
                          % (gene, pred_score, cPerturbedGenes[gene].score))
    finally:
        # close the handle even when a lookup above fails
        fp2.close()

    # sort by score
    lib_utils.sort_tsv_by_col2(rank_fn2, [2], ['gr'], False, rank_fn)
    os.unlink(rank_fn2)
def _preproces(self, min_go_score=0.5):
    """Convert the pairwise score file to gene level and keep one row per pair.

    Steps:
      1. Every entry from ``self._iterfile()`` whose score is at least
         ``min_go_score`` is rewritten with both products translated through
         ``self.uni2gene`` (the two names are sorted within the pair) into
         ``<self.inname>.gene``.
      2. That file is sorted with ``lib_utils.sort_tsv_by_col2`` using a
         ``tmp`` directory next to the input as scratch space.
      3. The sorted file is re-read through ``self._iterfile()`` and, for
         each pair, the highest score among entries with denominator > 1 is
         written; if the pair has none, the highest score among the
         remaining entries is written instead.

    On return ``self.inname`` points at the gene-level file; the sorted
    intermediate and the scratch directory are removed.

    Args:
        min_go_score: minimum score an entry needs to be kept.

    Raises:
        KeyError: if a product is missing from ``self.uni2gene``.
    """
    import shutil

    def _emit(out, pair, score1, denom1, score2, denom2):
        # prefer the best entry with denominator > 1, else the best other one
        if denom2 > 0:
            out.write('%s\t%g\t%d\n' % (pair, score2, denom2))
        elif denom1 > 0:
            out.write('%s\t%g\t%d\n' % (pair, score1, denom1))

    # to get temp dir
    tmpD = os.path.join(os.path.dirname(self.inname), 'tmp')
    if not os.path.exists(tmpD):
        os.makedirs(tmpD)

    self.uniprot_to_gene()

    # convert uniprot to hgnc
    inname_g = self.inname + '.gene'
    with open(inname_g, 'w') as fp2:
        # uniprot1, uniprot2, score=(BP+MF+CC)/3, denominator (1<=x<=3)
        for entry in self._iterfile():
            if float(entry.score) >= min_go_score:
                gene1 = self.uni2gene[entry.prod1]
                gene2 = self.uni2gene[entry.prod2]
                fp2.write('%s\t%s\t%s\n' % ('\t'.join(sorted([gene1, gene2])),
                                            entry.score, entry.denominator))

    # sorting
    inname_gso = inname_g + '.so'
    lib_utils.sort_tsv_by_col2(inname_g, [1, 2, 3], ['V', 'V', 'g'], False,
                               inname_gso, tmpD)

    # take max funSim for each pair; self._iterfile() reads self.inname
    # (presumably -- it is re-pointed before each pass), so aim it at the
    # sorted file for the second pass.
    self.inname = inname_gso
    # A None sentinel replaces peeking at the first line of the sorted file,
    # which raised StopIteration when no entry passed the threshold.
    prev_pair = None
    mx_score1, mx_denom1 = 0., 0
    mx_score2, mx_denom2 = 0., 0
    with open(inname_g, 'w') as fp2:
        for entry in self._iterfile():
            pair = '%s\t%s' % (entry.prod1, entry.prod2)
            if pair != prev_pair:
                # wrap up the previous pair (nothing is written the first time)
                _emit(fp2, prev_pair, mx_score1, mx_denom1,
                      mx_score2, mx_denom2)
                prev_pair = pair
                mx_score1, mx_denom1 = 0., 0
                mx_score2, mx_denom2 = 0., 0

            score = float(entry.score)
            denominator = int(entry.denominator)
            if denominator > 1:
                if score > mx_score2:
                    mx_score2 = score
                    mx_denom2 = denominator
            elif score > mx_score1:
                mx_score1 = score
                mx_denom1 = denominator

        # don't forget the last entries
        _emit(fp2, prev_pair, mx_score1, mx_denom1, mx_score2, mx_denom2)

    self.inname = inname_g
    os.unlink(inname_gso)
    # same effect as `rm -rf`, without building a shell command from a path
    shutil.rmtree(tmpD, ignore_errors=True)
def convert_node2gene(self):
    '''
    Write the gene ranking table to ``self.dv.gene_rank_fn``.

    For every node in ``self.Prots`` and every gene in
    ``self.dangledGenes`` the harmonic score is stored into
    ``self.dv.gene_dmg[gene][1]``.  A row (gene, predicted score, seed
    score, gt_dmg score, phenotype score, known-pathogenic flag) is written
    when ``self.dv.gt_dmg`` is empty, or when the gene has a positive score
    in it.  Rows go to ``<gene_rank_fn>.tmp``, which is sorted on column 2
    into ``self.dv.gene_rank_fn`` and then removed.

    When ``self.dv.gt_dmg`` is empty the gt_dmg column is reported as 0;
    previously that case failed while indexing ``self.dv.gt_dmg``.

    Raises:
        KeyError: if a dangled gene is missing from ``self.dv.gene_dmg``.
    '''
    dv = self.dv

    def _is_reportable(gene):
        # NOTE that print only a gene having at one mutation
        return (not dv.gt_dmg) or \
            (gene in dv.gt_dmg and dv.gt_dmg[gene].score > 0.)

    def _gt_score(gene):
        # no genotype-damage table: nothing to look up, report 0
        if dv.gt_dmg:
            return dv.gt_dmg[gene].score
        return 0.

    def _pheno_and_known(gene):
        pheno_sc = 0.
        if gene in dv.pheno_dmg:
            pheno_sc = dv.pheno_dmg[gene].score
        if not dv.vknown:
            return pheno_sc, 'NA'
        if gene in dv.vknown_genes:
            return pheno_sc, 'Y'
        return pheno_sc, 'N'

    rank_fn_tmp = '%s.tmp' % dv.gene_rank_fn
    fp2 = lib_utils.open2(rank_fn_tmp, 'w')
    try:
        fp2.write(
            '#gene\tpredicted_score\tseed_score\tgt_dmg_score\tpheno_score\tcontain_known_pathogenic\n'
        )
        genes_printed = set()

        # browsing each node in the whole (original) ppi network
        for n, protein in enumerate(self.Prots):
            seed_score = 0.
            gene = protein

            # check if this node (restart value) was assigned previously
            # NOTE(review): the seed lookup is kept under the mapping check;
            # confirm unmapped nodes should not be looked up by own name.
            if protein in self.dProt2Gene:
                gene = self.dProt2Gene[protein]
                if gene in dv.gene_dmg:
                    seed_score = dv.gene_dmg[gene][0]

            # to get harmonic score and save into dv.gene_dmg
            pred_score = 0.
            if self.harmonic_sc[n][0] > 0.:
                pred_score = self.harmonic_sc[n][0]
            # NOTE(review): stored even when the score is 0, mirroring the
            # unconditional store used for dangled nodes below -- confirm.
            if gene in dv.gene_dmg:
                dv.gene_dmg[gene][1] = pred_score

            if not _is_reportable(gene):
                continue

            pheno_sc, is_vknown = _pheno_and_known(gene)
            # a gene reached through several proteins is disambiguated by
            # appending the protein name from its second occurrence on
            if gene in genes_printed:
                gene2 = '%s|%s' % (gene, protein)
            else:
                gene2 = gene
                genes_printed.add(gene)

            fp2.write('%s\t%g\t%g\t%g\t%g\t%s\n' %
                      (gene2, pred_score, seed_score,
                       _gt_score(gene), pheno_sc, is_vknown))

        # repeat the same procedure to dangled nodes
        for n, gene in enumerate(self.dangledGenes):
            dv.gene_dmg[gene][1] = self.harmonic_dng_sc[n][0]
            if not _is_reportable(gene):
                continue

            pheno_sc, is_vknown = _pheno_and_known(gene)
            # NOTE(review): the gt_dmg column uses %s here but %g above;
            # left unchanged so existing output is not altered.
            fp2.write('%s\t%g\t%g\t%s\t%g\t%s\n' %
                      (gene, dv.gene_dmg[gene][1], dv.gene_dmg[gene][0],
                       _gt_score(gene), pheno_sc, is_vknown))
    finally:
        # close the handle even when a lookup above fails
        fp2.close()

    # sort by score
    lib_utils.sort_tsv_by_col2(
        rank_fn_tmp, [2], ['gr'], False, dv.gene_rank_fn)
    os.unlink(rank_fn_tmp)
def create_bed(self, ext_bp=0, reuse=False):
    """Build a sorted, merged BED file from ``self.refGene_fn``.

    Each line of ``self.refGene_fn`` contributes one interval per pair of
    comma-separated values in columns 10 and 11 (presumably exon starts and
    ends of a UCSC refGene table -- TODO confirm), widened by ``ext_bp`` on
    both sides and annotated with ``<column 13>;<column 2>``.  The intervals
    are sorted with ``lib_utils.sort_tsv_by_col2`` and consecutive records
    on the same chromosome whose start does not exceed the running end are
    merged, joining their annotations with ``|``.  Intermediate files are
    removed.

    Args:
        ext_bp: number of bases subtracted from each start and added to
            each end.
        reuse: when true and ``lib_utils.check_if_file_valid`` accepts an
            existing output file, return it without regenerating.

    Returns:
        Path of the merged BED file (also stored in ``self.bed_fn``).

    Raises:
        ValueError: if a coordinate is not an integer.
    """
    job_name = 'RefGeneUcscTB.create_bed'

    def _notify(text):
        lib_utils.msgout('notice', text)
        if self.logger:
            self.logger.info(text)

    self.bed_fn = os.path.join(self.work_dir,
                               'refGene_e%d_so_merged.bed' % ext_bp)
    _notify('creating a bed file[%s] containing RefGene coding region (cmpl/incmpl/unk) @ %s'
            % (self.bed_fn, job_name))

    if reuse and lib_utils.check_if_file_valid(self.bed_fn):
        _notify('reuse bed file [%s] generated previously @ %s'
                % (self.bed_fn, job_name))
        return self.bed_fn

    # one BED record per exon, extended by ext_bp on both sides
    tmp_bed = os.path.join(self.work_dir, 'refGene_e%d.bed' % ext_bp)
    with open(self.refGene_fn, 'r') as fp, open(tmp_bed, 'w') as fp2:
        for line in fp:
            j = line.rstrip().split('\t')
            chrom = j[2]
            # [:-1] drops the empty field left by the trailing comma
            starts = j[9].split(',')[:-1]
            ends = j[10].split(',')[:-1]
            for e1, e2 in zip(starts, ends):
                fp2.write('%s\t%d\t%d\t%s;%s\n'
                          % (chrom, int(e1) - ext_bp, int(e2) + ext_bp,
                             j[12], j[1]))

    _notify('sorting bed file ... @ %s' % job_name)
    tmp_so_bed = os.path.join(self.work_dir, 'refGene_e%d_so.bed' % ext_bp)
    lib_utils.sort_tsv_by_col2(tmp_bed, [1, 2, 3], ['V', 'n', 'n'], True,
                               tmp_so_bed)

    _notify('merging exon coordinates overlapped each other... @ %s' % job_name)

    # merge boundaries if any overlapped.  The previous implementation read
    # the first record, rewound the file and merged that record with itself,
    # duplicating its annotation; here every record is visited once and an
    # empty input yields an empty output instead of raising StopIteration.
    with open(tmp_so_bed, 'r') as fp, open(self.bed_fn, 'w') as fp2:
        chromp = None
        e1p = e2p = 0
        annots = []
        for line in fp:
            chrom, e1, e2, annot = line.rstrip().split('\t')
            e1 = int(e1)
            e2 = int(e2)
            if chromp is not None and chrom == chromp and e1 <= e2p:
                if e2p < e2:
                    e2p = e2
                annots.append(annot)
                continue
            if chromp is not None:
                fp2.write('%s\t%d\t%d\t%s\n'
                          % (chromp, e1p, e2p, '|'.join(annots)))
            chromp, e1p, e2p = chrom, e1, e2
            annots = [annot]
        # flush the last running interval
        if chromp is not None:
            fp2.write('%s\t%d\t%d\t%s\n'
                      % (chromp, e1p, e2p, '|'.join(annots)))

    os.unlink(tmp_bed)
    os.unlink(tmp_so_bed)

    _notify('done. @ %s' % job_name)
    return self.bed_fn
def reformat_go_sim_fns(go_sim_fns, out_fn, method_id=1):
    """Merge per-category GO similarity files into one row per protein pair.

    Every input file is appended (minus its first line) to ``out_fn`` with a
    category label as the last column; two-column rows get ``-1.`` as their
    score.  ``out_fn`` is then sorted in place on columns 1, 2 and 4 and
    collapsed so that each protein pair yields a single row::

        prot1 <TAB> prot2 <TAB> BP <TAB> MF <TAB> CC <TAB> method_id

    where a category without a score is reported as ``-1``.

    Args:
        go_sim_fns: mapping whose values are paths of the similarity files.
            When a key is one of ``'BP'``, ``'MF'``, ``'CC'`` it is used as
            that file's category label; otherwise the label is taken by
            iteration position (BP, MF, CC), as before.
        out_fn: path of the output file; it is overwritten.
        method_id: integer written as the last column; the original comment
            states that 1 means SimRel.

    Raises:
        KeyError: if a row carries a category other than BP, MF or CC.
    """
    suflabs = ['BP', 'MF', 'CC']
    idx = {'BP': 0, 'MF': 1, 'CC': 2}

    fp2 = lib_utils.open2(out_fn, 'w')
    try:
        # .items() behaves the same on Python 2 and Python 3
        for v, (key, go_sim_fn) in enumerate(go_sim_fns.items()):  # BP,MF,CC
            print('appending root node at the end of [%s]' % go_sim_fn)
            # Plain dict iteration order is not guaranteed to be BP, MF, CC,
            # so trust the key when it names a category.
            suflab = key if key in idx else suflabs[v]
            fp = lib_utils.open2(go_sim_fn, 'r')
            try:
                next(fp, None)  # strip off head
                for line in fp:
                    j = line.rstrip().split('\t')
                    if len(j) == 2:
                        j.append('-1.')
                    # uniprot1,uniprot2,score,BP
                    fp2.write('%s\t%s\n' % ('\t'.join(j), suflab))
            finally:
                fp.close()
            print('done.')
    finally:
        fp2.close()

    print('sorting...')
    # to get temporary file to sort
    out_fn2 = lib_utils.file_tag2(out_fn, 'sort', None)
    temp_sort_dir, _, _, _ = lib_utils.separateDirFn2(out_fn)
    lib_utils.sort_tsv_by_col2(out_fn, [1, 2, 4], ['V', 'V', 'V'], False,
                               out_fn2, temp_dir=temp_sort_dir)
    os.rename(out_fn2, out_fn)
    print('done.')

    # grouping
    print('collapsing GO sim scores to make the format easier to import SQL [%s] ...' % out_fn)
    out_fn2 = lib_utils.file_tag2(out_fn, 'dense', None)
    fp2 = lib_utils.open2(out_fn2, 'w')
    try:
        fp = lib_utils.open2(out_fn, 'r')
        try:
            prev_key = None
            gosim_holder = ['-1', '-1', '-1']  # -1 means N/A
            for line in fp:
                prot1, prot2, score, go_class = line.rstrip().split('\t')
                key = '%s\t%s' % (prot1, prot2)
                if key != prev_key:
                    if prev_key is not None:
                        # wrap up the previous pair
                        fp2.write('%s\t%s\t%d\n'
                                  % (prev_key, '\t'.join(gosim_holder),
                                     method_id))
                        gosim_holder = ['-1', '-1', '-1']
                    prev_key = key
                gosim_holder[idx[go_class]] = score

            # don't forget the last entry; nothing is written for empty
            # input (previously a bogus 'None' row was emitted)
            if prev_key is not None:
                fp2.write('%s\t%s\t%d\n'
                          % (prev_key, '\t'.join(gosim_holder), method_id))
        finally:
            fp.close()
    finally:
        fp2.close()

    os.rename(out_fn2, out_fn)
    print('done.')