def record_commandline(self):
    """
    objective: record the divine run condition into logger

    Logs the host name, user, working directory, the full command line,
    the configuration file path and the exclude_non_coding flag.
    """
    import socket

    job_name = 'record_commandline'
    msg = 'capturing user command line [%s] ...' % job_name
    lib_utils.msgout('notice', msg)
    self.logger.info(msg)

    try:
        host_name = socket.gethostname()
    except Exception:
        # best effort only: the host name is purely informational
        host_name = 'N/A'
    self.logger.info('host:%s' % host_name)

    # os.environ.get() never raises, so the fallback has to be passed as
    # the default value instead of living in an except branch.
    user = os.environ.get('USER', '******')
    self.logger.info('user:%s' % user)

    pwd = os.environ.get('PWD', 'N/A')
    self.logger.info('pwd:%s' % pwd)

    self.logger.info('cmd:%s' % (' '.join(sys.argv)))
    self.logger.info("divine configuration file:%s" % self.config_fn)
    self.logger.info('exclude_non_coding:%s' % self.excl_non_coding)

    msg = 'done. [%s]' % job_name
    lib_utils.msgout('notice', msg)
    self.logger.info(msg)
def run_vcf2xls(self):
    """
    objective: convert the VCF file into an excel file
    input: self.rank_fn, self.vcf
    output: self.xls (path assigned via self._assign_out_fn)

    The first two columns of the rank file (comment lines removed) are
    written to a temporary file that is handed to the vcf2xls script.
    The temporary file is removed even when one of the commands fails.
    """
    job_name = 'run_vcf2xls'
    msg = 'converting vcf file to excel file [%s] ...' % job_name
    lib_utils.msgout('notice', msg)
    self.logger.info(msg)

    rank_fn_tmp = self.rank_fn + '.tmp'
    try:
        # the pipe/redirect tokens rely on run_cmd() joining the list and
        # running it through a shell
        cmd = ["cut", "-f1,2", self.rank_fn, "|",
               "grep", "-v", "'#'", ">", rank_fn_tmp]
        self.run_cmd(cmd, "extract_pred_rank")

        self.xls = self._assign_out_fn('divine', 'xls')
        cmd = ["python", self.entries['vcf2xls'],
               "-i", self.vcf,
               "-o", self.xls,
               "-l", self.log_dir,
               "-g", rank_fn_tmp]
        self.run_cmd(cmd, job_name)

        msg = 'done. [%s]' % job_name
        lib_utils.msgout('notice', msg)
        self.logger.info(msg)
    finally:
        # the shell redirect may have created the file before a failure
        if os.path.exists(rank_fn_tmp):
            os.unlink(rank_fn_tmp)
def hpo_to_diseases(self):
    """
    objective: match HPO IDs from a given patient phenotype to known
        disease database
    input: hpo_query, hpo database
    method: system call (the hposim script configured in self.entries)
    output: phenotype matching score w.r.t disease, written to
        self.hpo2disease_fn
    """
    job_name = 'hpo_to_diseases'

    def _announce(text):
        # every progress message goes to both the console and the logger
        lib_utils.msgout('notice', text)
        self.logger.info(text)

    # prepare output file
    self.hpo2disease_fn = self._assign_out_fn(job_name, 'tsv')
    _announce(
        'matching query phenotypes to diseases in semantic HPO ontology[%s;%s]'
        % (job_name, self.hpo2disease_fn))

    # run hpo similarity
    hposim_cmd = ["python", self.entries['hposim']]
    hposim_cmd += ["-q", self.hpo_query]
    hposim_cmd += ["-b", self.entries['hpo_obo']]
    hposim_cmd += ["-f", self.entries['ext_disease_to_gene']]
    hposim_cmd += ["--normalize"]
    hposim_cmd += ["-o", self.hpo2disease_fn]
    self.run_cmd(hposim_cmd, job_name)

    _announce('done. [%s]' % job_name)
def run_vcf2xls(self):
    """
    objective: convert the VCF file into an excel file
    input: self.gene_rank_fn, self.vcf, self.vknown
    output: self.xls (path assigned via self._assign_out_fn)

    raises: RuntimeError when the gene rank file does not exist.

    The temporary rank file is removed even when a command fails.
    """
    job_name = 'run_vcf2xls'
    msg = 'converting vcf file to excel file [%s] ...' % job_name
    lib_utils.msgout('notice', msg)
    self.logger.info(msg)

    if not os.path.exists(self.gene_rank_fn):
        msg = "check if gene rank file [%s] exist" % self.gene_rank_fn
        print(msg)
        # the logger object itself is not callable, and the exception has
        # to be raised explicitly to stop the run
        self.logger.error(msg)
        raise RuntimeError(msg)

    rank_fn_tmp = self.gene_rank_fn + '.tmp'
    try:
        # NOTE(review): the "|" and ">" tokens presumably rely on
        # lib_utils.runcmd2 running the joined command in a shell -- confirm
        cmd = ["cut", "-f1,2", self.gene_rank_fn, "|",
               "grep", "-v", "'#'", ">", rank_fn_tmp]
        lib_utils.runcmd2(cmd, self.log_dir, self.logger, "extract_pred_rank")

        self.xls = self._assign_out_fn('divine', 'xls')
        cmd = ["python", self.entries['vcf2xls'],
               "-i", self.vcf,
               "-o", self.xls,
               "-l", self.log_dir,
               "-g", rank_fn_tmp,
               "-k", self.vknown]
        lib_utils.runcmd2(cmd, self.log_dir, self.logger, job_name)

        msg = 'done. [%s]' % job_name
        lib_utils.msgout('notice', msg)
        self.logger.info(msg)
    finally:
        if os.path.exists(rank_fn_tmp):
            os.unlink(rank_fn_tmp)
def norm_pheno_dmg(self):
    """
    objective: normalize phenotype scores by their sum over the genes
        present in self.gt_dmg

    - genes in self.gt_dmg without a phenotype score receive the minimum
      observed phenotype score scaled by self.dm.min_dmg_prior
    - the normalized score is stored in self.gt_dmg[gene].pheno_score and,
      when the gene is also in self.pheno_dmg, in self.pheno_dmg[gene].score
    - entries of self.pheno_dmg whose gene is not in the merged table are
      set to None
    """
    msg = 'normalizing phenogenes by sum ...'
    lib_utils.msgout('notice', msg)
    self.logger.info(msg)

    # keys()/values() of an unmodified dict iterate in the same order
    gt_dmg = pd.DataFrame({
        'gene': list(self.gt_dmg.keys()),
        'score': [gt.score for gt in self.gt_dmg.values()]})
    pn_dmg = pd.DataFrame({
        'gene': list(self.pheno_dmg.keys()),
        'pheno_score': [pn.score for pn in self.pheno_dmg.values()]})

    # left join: keep every genotype gene, attach pheno score when known
    gt_dmg = pd.merge(gt_dmg, pn_dmg, how='left', on='gene')

    missing = gt_dmg['pheno_score'].isna()
    gt_dmg.loc[missing, 'pheno_score'] = \
        gt_dmg['pheno_score'].min() * self.dm.min_dmg_prior
    gt_dmg['pheno_score'] = \
        gt_dmg['pheno_score'] / gt_dmg['pheno_score'].sum()

    for r in gt_dmg.itertuples():
        self.gt_dmg[r.gene].pheno_score = r.pheno_score
        if r.gene in self.pheno_dmg:
            self.pheno_dmg[r.gene].score = r.pheno_score

    # one set lookup per gene instead of scanning the whole column
    merged_genes = set(gt_dmg['gene'])
    for pgene in self.pheno_dmg:
        if pgene not in merged_genes:
            self.pheno_dmg[pgene] = None

    msg += ', done.'
    lib_utils.msgout('notice', msg)
    self.logger.info(msg)
def gather_pdomain_scores(self, vcfParser):
    """
    objective: gather pathogenic variant density in domains
    input: vcfParser - iterable of VCF records; parseinfo(rec) is called
        on each record before rec.info.PATHO_DOMAIN is read
    output: (pdomains, pdomains_default)
        pdomains - DataFrame with one row per record carrying a
            PATHO_DOMAIN value; columns ridx, denoms, benign_dens,
            vus_dens, patho_dens, phat_lo, patho_dens_p
        pdomains_default - lib_utils.py_struct holding the 15th percentile
            of phat_lo and of patho_dens_p
    """
    msg = 'gathering pathogenic variant density in domains ...'
    lib_utils.msgout('notice', msg)
    self.logger.info(msg)

    ridxs, denoms, benign_dens, vus_dens, patho_dens = [], [], [], [], []
    for ridx, rec in enumerate(vcfParser):
        vcfParser.parseinfo(rec)
        # to collect pdomain info
        if rec.info.PATHO_DOMAIN:
            # PATHO_DOMAIN is read as "denom,benign,vus,patho"
            pdoms = [float(pdom)
                     for pdom in rec.info.PATHO_DOMAIN.split(',')]
            ridxs.append(ridx)
            denoms.append(pdoms[0])
            benign_dens.append(pdoms[1])
            vus_dens.append(pdoms[2])
            patho_dens.append(pdoms[3])

    pdomains = pd.DataFrame({'ridx': ridxs,
                             'denoms': denoms,
                             'benign_dens': benign_dens,
                             'vus_dens': vus_dens,
                             'patho_dens': patho_dens,
                             'phat_lo': None,
                             'patho_dens_p': None})

    # fraction of pathogenic density among all three classes
    phat = pdomains.patho_dens / (pdomains.benign_dens
                                  + pdomains.vus_dens
                                  + pdomains.patho_dens)
    tgt_z = damaging_model.get_z(confidence=0.75)
    # a list (not a lazy map object) so the column assignment also works
    # on Python 3
    pdomains['phat_lo'] = [
        damaging_model.ci_lower_bound(p, denom, z=tgt_z)
        for p, denom in zip(phat, pdomains.denoms)]

    tgt_pctile = 50
    pdensl = np.log10(pdomains.patho_dens + 1e-12)
    tgt_pctile_sc = np.percentile(pdensl, tgt_pctile)

    # np.float was only an alias of the builtin float and is gone from
    # current NumPy
    y = (pdensl >= tgt_pctile_sc).astype(float)
    # go through the ndarray: multi-dimensional indexing of a Series is
    # not supported by current pandas
    X = np.asarray(pdensl)[:, np.newaxis]
    model2 = LogisticRegression().fit(X, y)
    pdomains['patho_dens_p'] = model2.predict_proba(X)[:, 1]

    pdomains_default = lib_utils.py_struct(
        phat_lo=np.percentile(pdomains['phat_lo'], 15),
        patho_dens_p=np.percentile(pdomains['patho_dens_p'], 15))

    return pdomains, pdomains_default
def _read_config(self, vcf_filter_cfg=None):
    """
    objective: read configuration file

    input: vcf_filter_cfg - optional path overriding the
        'vcf_filter_conf' entry of the configuration file
    output: self.entries
    raises: RuntimeError when vcf_filter_cfg is given but does not exist;
        IOError when lib_utils.check_if_file_valid rejects any entry
    """
    job_name = '_read_config'

    def _announce(text):
        lib_utils.msgout('notice', text)
        self.logger.info(text)

    _announce('reading configuration file [%s;%s] ...'
              % (job_name, self.config_fn))

    self.sparser.read(self.config_fn)

    for section, option in (('program_paths', 'varant'),
                            ('program_paths', 'hposim'),
                            ('program_paths', 'vcf2xls'),
                            ('config', 'temp_dir')):
        self._set_config(section, option)

    if vcf_filter_cfg:
        # a user-supplied filter configuration overrides the config file
        if not os.path.exists(vcf_filter_cfg):
            raise RuntimeError(
                'check if the file [%s] is valid' % vcf_filter_cfg)
        self.entries['vcf_filter_conf'] = vcf_filter_cfg
    else:
        self._set_config('config', 'vcf_filter_conf')

    for option in ('ext_disease_to_gene', 'disease_desc', 'hpo_obo',
                   'beta_fit', 'string_link'):
        self._set_config('database', option)

    # to access to UCSC mysql database(hg19)
    #   select e2g.value, gtp.protein from ensGtp as gtp
    #   inner join ensemblToGeneName as e2g on e2g.name=gtp.transcript;
    for option in ('esp_to_gene', 'kegg_hsa'):
        self._set_config('database', option)

    # check if the file or directory all exists before long journey!
    for key, path2 in self.entries.items():
        if lib_utils.check_if_file_valid(path2):
            continue
        raise IOError('check [%s = %s] in the file [%s]' %
                      (key, path2, self.config_fn))

    _announce('done. [%s]' % job_name)

    return self.entries
def disease_to_genes(self):
    """
    objective: from hit scores of query hpo to disease, associate disease
        to genes
    input:
        -hpo2disease_fn: a file generated by hpo_to_diseases(), generated
         on demand when it is still None; '#omim\tgenes\tscore\n'
    output: self.pheno_dmg, the per-gene maximum disease score passed
        through lib_utils.normalize_dic(..., 'sum'); rank_pheno_gene() is
        then called to print the scores
    """
    job_name = 'disease_to_genes'

    def _announce(text):
        lib_utils.msgout('notice', text)
        self.logger.info(text)

    if self.hpo2disease_fn is None:
        self.hpo_to_diseases()

    _announce('aggregating HPO hit scores of disease to each gene [%s;%s]...' %
              (job_name, self.hpo2disease_fn))

    # accumulating phenotype-matching score into genes associated with
    # the disease
    best_score = {}
    handle = lib_utils.open2(self.hpo2disease_fn, 'r')
    for row in handle:  # for each disease
        if row.startswith('#'):
            continue
        omim, gene_field, raw_score = row.rstrip().split('\t')
        gene_names = gene_field.split(',')
        score = float(raw_score)
        if not score > 0.:
            # only positive hits contribute
            continue
        for gene in gene_names:  # for each gene
            # keep only maximum
            if score > best_score.get(gene, 0.):
                best_score[gene] = score
    handle.close()

    self.pheno_dmg = lib_utils.normalize_dic(best_score, 'sum')

    # print phenotypic damage scores
    self.rank_pheno_gene()

    _announce('done. [%s]' % job_name)
def vannotate(self, reuse=False):
    """
    objective: run varant (GCN) annotator
    input: self.vcf
        reuse - when True and a valid annotated VCF already exists in
            self.out_dir, both masking and annotation are skipped
    output: annotated vcf; self.vcf is repointed to it
    """
    job_name = 'vannotate'

    def _announce(text):
        lib_utils.msgout('notice', text)
        self.logger.info(text)

    _announce('annotating VCF file[%s;%s] ...' % (job_name, self.vcf))

    # prepare output file
    annotated_fn = os.path.join(self.out_dir, 'divine.vcf')

    # if necessary, masking the raw vcf file
    masked_fn = None
    if self.ref_exon_only > 0 and \
            not (lib_utils.check_if_file_valid(annotated_fn) and reuse):
        ref_table = annotateRegion.RefGeneUcscTB(work_dir=self.out_dir,
                                                 logger=self.logger)
        coding_bed_fn = ref_table.create_bed(ext_bp=20, reuse=False)

        _announce('extracting variants in coding region from [%s] @ %s ...'
                  % (self.vcf, job_name))

        masked_fn = os.path.join(self.out_dir, 'refgene_e20.vcf')
        self.vcf = vcf_mask.by_bed(self.vcf, coding_bed_fn, masked_fn,
                                   logger=self.logger)

        _announce('done. @ %s' % job_name)

    if not (lib_utils.check_if_file_valid(annotated_fn) and reuse):
        self.logger.info('annotating [%s,%s] ...' % (job_name, self.vcf))

        varant_cmd = ["python", self.entries['varant'],
                      "-i", self.vcf,
                      "-o", annotated_fn,
                      "-l", self.log_dir]
        if self.capkit:
            varant_cmd += ["-c", self.capkit, "-e", "180"]
        if self.hgmd > 0:
            varant_cmd += ["--hgmd"]

        self.run_cmd(varant_cmd, job_name)

    self.vcf = annotated_fn

    # the masked intermediate is no longer needed once annotation is done
    if masked_fn:
        os.unlink(masked_fn)

    _announce('done. [%s]' % job_name)
def rank_pheno_gene(self):
    """
    objective: write the phenotype score of every gene in self.pheno_dmg
        to self.gene_rank_fn
    input: self.pheno_dmg (values expose a .score attribute)
    output: self.gene_rank_fn, produced by lib_utils.sort_tsv_by_col2 from
        an intermediate '<gene_rank_fn>.tmp' file

    The intermediate file is closed and removed even when writing or
    sorting fails.
    """
    job_name = 'rank_pheno_gene'
    msg = 'selecting genes matched by patient phenotypes ... [%s;%s]' % (
        job_name, self.hpo_query)
    lib_utils.msgout('notice', msg)
    self.logger.info(msg)

    tmp_fn = '%s.tmp' % self.gene_rank_fn
    try:
        with open(tmp_fn, 'w') as fp2:
            fp2.write('#gene\tphenotypic_score\n')
            for gene, cPhenoGene in self.pheno_dmg.items():
                fp2.write('%s\t%g\n' % (gene, cPhenoGene.score))

        # NOTE(review): arguments presumably request a descending numeric
        # sort on column 2 -- confirm against lib_utils.sort_tsv_by_col2
        lib_utils.sort_tsv_by_col2(tmp_fn, [2], ['gr'], False,
                                   self.gene_rank_fn)
    finally:
        if os.path.exists(tmp_fn):
            os.unlink(tmp_fn)

    msg = 'done. [%s]' % job_name
    lib_utils.msgout('notice', msg)
    self.logger.info(msg)
def run(self, vcf_fn, masked_vcf_fn):
    """
    objective: keep only the VCF records overlapping the regions of the
        bed file self.bed
    input: vcf_fn - VCF file to mask
    output: masked_vcf_fn - header lines are copied, and two provenance
        lines are inserted before the first '##contig' line

    A record is kept when its BED-converted position falls inside a
    region, or otherwise when the position shifted by
    vcf_get_max_sv_len(REF, ALT) does.
    """
    job_name = 'BedMaskingVCF.run'
    cRef = annotateRegion.RefGeneUcscTB(logger=self.logger)
    cRef.bed_fn = self.bed
    eBnds = cRef.get_boundary()

    msg = 'masking the vcf file [%s] by the bed file [%s] @ %s' % (
        vcf_fn, cRef.bed_fn, job_name)
    lib_utils.msgout('notice', msg)
    if self.logger:
        self.logger.info(msg)

    def _inside(bnds, pos):
        # True when pos lies in any half-open [start, end) region
        return bool(np.any((bnds[:, 0] <= pos) & (pos < bnds[:, 1])))

    cmd_head = '##original_vcf=%s\n##masking_bed=%s' % (vcf_fn, self.bed)
    head_written = False

    # context managers close both files even when a record fails to parse
    with open(vcf_fn, 'r') as fp, open(masked_vcf_fn, 'w') as fp2:
        for i in fp:
            if i[0] == '#':
                if i.startswith('##contig') and not head_written:
                    fp2.write('%s\n' % cmd_head)
                    head_written = True
                fp2.write(i)
                continue

            j = i.split('\t')
            chrom = to_ucsc_chrom(j[0])
            if chrom not in eBnds:
                continue

            bnds = eBnds[chrom]
            pos1 = int(j[1]) - 1  # adjust VCF to BED coordinate
            if _inside(bnds, pos1):
                fp2.write(i)
                continue

            # start is outside: test the far end of a longer variant
            pos2 = pos1 + vcf_get_max_sv_len(j[3], j[4])
            if pos2 > pos1 and _inside(bnds, pos2):
                fp2.write(i)

    msg = 'done. @ %s' % job_name
    lib_utils.msgout('notice', msg)
    if self.logger:
        self.logger.info(msg)
def heat_diffusion_core(self, gamma=2., M=100, alpha=0.9,
                        maxIter=150, logger=None):
    """
    objective: iterate the heat-diffusion update on self.A / self.Y
        (and self.Y0) until the change drops below 1e-4 or maxIter
        iterations have run
    input:
        gamma, M - the update step is gamma / M
        alpha - weight of the network term self.A.dot(s); (1 - alpha)
            weights the seed vectors self.Y and self.Y0
        maxIter - maximum number of iterations
        logger - optional logger
    output: (s, s0), jointly normalized at every iteration so that their
        sums add up to 1
    """
    job_name = 'pagerank|heat_diffusion_core'

    N = len(self.Y)
    s = np.zeros(shape=(N, 1))

    N0 = len(self.Y0)
    s0 = np.zeros(shape=(N0, 1))

    epsilon = 1e-4
    n_iter = 0  # do not shadow the builtin iter()

    msg = 'running heat diffusion on [%dx%d, gamma=%g, alpha=%g, max_iter=%d, M=%d]. Please, be patient ...' % (
        N, N, gamma, alpha, maxIter, M)
    lib_utils.msgout('notice', msg, job_name)
    if logger:
        logger.info(msg)

    # float() keeps the step non-zero when gamma and M are both integers
    # under Python 2 division
    rate = float(gamma) / M

    e = 1.
    while e >= epsilon and n_iter < maxIter:
        # heat diffusion
        s_new = (1. - rate) * s + rate * (alpha * self.A.dot(s) +
                                          (1. - alpha) * self.Y)
        s0_new = (1. - rate) * s0 + rate * (1. - alpha) * self.Y0

        # normalize
        denom = np.sum(s_new) + np.sum(s0_new)
        s_new = s_new / denom
        s0_new = s0_new / denom

        e = cal_array_distance(s_new, s) + cal_array_distance(s0_new, s0)

        # s_new / s0_new are fresh arrays, rebinding is enough
        s, s0 = s_new, s0_new
        n_iter += 1

    msg = 'done. [iteration:%d/%d,e:%g]' % (n_iter, maxIter, e)
    lib_utils.msgout('notice', msg, job_name)
    if logger:
        logger.info(msg)

    return s, s0
def get_sparse_elements(proteinLinkFile, min_edge_weight):
    """
    to store ppi network
    input:
        proteinLinkFile - ppi link file; the first line is skipped and
            every other line holds "protein1 protein2 weight" separated
            by single spaces
        min_edge_weight - links with a smaller weight are dropped
    output: (nNodes, ppi, lnkProteins)
        nNodes - number of distinct proteins kept
        ppi - [from-node indices, to-node indices, link weights]
        lnkProteins - protein names; the list index is the node number
    """
    lib_utils.msgout(
        'notice',
        'preparing a genetic network matrix. Please, be patient......',
        'pagerank|heat_diffusion')

    nNodes = 0
    dProtein2num = {}
    lnkProteins = []
    ppi = [[], [], []]  # from protein, to protein, link weight

    # read string DB and assign an integer to each protein symbol
    fp = lib_utils.open2(proteinLinkFile, 'r')
    try:
        # skip the first line; next() works on Python 2 and 3 iterators
        next(fp)
        for i in fp:
            prot_from, prot_to, weight = i.rstrip().split(' ')
            weight = float(weight)
            if weight < min_edge_weight:
                continue

            for c, raw_name in enumerate((prot_from, prot_to)):
                protein = extract_ensembl_protein(raw_name)
                # to register a protein node
                if protein not in dProtein2num:
                    dProtein2num[protein] = nNodes
                    # item index corresponds to a node number of the protein
                    lnkProteins.append(protein)
                    nNodes += 1
                ppi[c].append(dProtein2num[protein])
            ppi[2].append(weight)
    finally:
        # release the handle even when a line cannot be parsed
        fp.close()

    return nNodes, ppi, lnkProteins
def preprocess_dmg_scores(self):
    """
    -objective: prepare per-gene damage inputs from the VCF and/or the
        HPO query
    -output: self.gt_dmg is filled; with a VCF this is delegated to
        self._predict_gt_dmg(), otherwise (HPO query only) each gene of
        self.pheno_dmg gets an SnvGene whose score is its phenotype score
    """
    job_name = 'preprocess_dmg_scores'
    gdmg = []

    if self.vcf:
        msg = 'start to predict genetic damage score from variants in the provided VCF [%s]' % (job_name)
        lib_utils.msgout('notice', msg)
        self.logger.info(msg)

        msg = 'loading training model of CADD/GERP w.r.t AA change...'
        lib_utils.msgout('notice', msg)
        self.logger.info(msg)

        try:
            beta_fit_dill = self.entries['beta_fit']
            msg = 'loading beta fit cdf[%s] for conservation score w.r.t. AA' % beta_fit_dill
            lib_utils.msgout('notice', msg)
            self.logger.info(msg)
            # NOTE: dill.load executes arbitrary code while unpickling;
            # the configured file must come from a trusted source
            with open(beta_fit_dill, 'rb') as fp:
                beta_fits = dill.load(fp)
        except Exception as err:
            # best effort: keep going without the fitted model, but do
            # not hide the reason
            self.logger.warning(
                'beta fit model is not available (%s)' % err)
            beta_fits = [None, None, None]

        # to extract some info from annotated/filtered VCF to evaluate the
        # genetic mutation damage
        # [gene, indel, class_tag, protein_len, in-silico pred score,
        #  maf_offset, zygosity]
        mutation_info = self._extract_mutation_info(beta_fits)

        # to get a gene list having genetic mutations
        gdmg = list(set(minfo[0] for minfo in mutation_info))

    if self.hpo2disease_fn:
        self._store_hposim_outfn(self.hpo2disease_fn,
                                 self.top_k_disease, gdmg)

    # to enrich phenogenes (update self.pheno_dmg)
    if self.hpo_query and self.dm.go_seed_k > 0 and gdmg:
        self.enrich_pheno_genes(gdmg)

    if self.vcf:
        # combine variant location and conservation pred dmg
        self._predict_gt_dmg(mutation_info)
    elif self.hpo_query:
        for gene in self.pheno_dmg:
            if gene not in self.gt_dmg:
                self.gt_dmg[gene] = SnvGene()
            self.gt_dmg[gene].score = self.pheno_dmg[gene].score

    msg = 'done. [%s]' % job_name
    lib_utils.msgout('notice', msg)
    self.logger.info(msg)
def get_kth_score(self, dmg, topR):
    """
    objective: pick a score from the scores of dmg sorted in descending
        order
    input:
        dmg - dict whose values expose a .score attribute
        topR - when < 1, a fraction of the number of scores (rounded to
            the nearest index); otherwise used directly as the
            (0-based) index
    output: the selected score; the index is clamped to the last
        (smallest) score
    raises: IndexError when dmg is empty
    """
    msg = 'getting [%d]-th top pheno_score...' % topR
    lib_utils.msgout('notice', msg)
    self.logger.info(msg)

    scores = sorted((scDid.score for scDid in dmg.values()), reverse=True)

    # round() yields a float on Python 2 and topR itself may be a float;
    # a list index has to be an int
    if topR < 1.:
        s1 = int(round(topR * len(scores)))
    else:
        s1 = int(topR)

    # a fraction close to 1 rounds up to len(scores): stay on the last item
    s1 = min(s1, len(scores) - 1)

    msg = 'selected pheno score:%g' % scores[s1]
    lib_utils.msgout('notice', msg)
    self.logger.info(msg)

    return scores[s1]
def get_sparse_elements(self):
    """
    to store ppi network
    input: the ppi link file self.dv.entries['string_link']; the first
        line is skipped and every other line holds
        "protein1 protein2 weight" separated by whitespace
    output (instance attributes):
        self.nNodes - number of distinct proteins kept
        self.Prots - protein names; the list index is the node number
        self.dProt2Idx - protein name -> node number
        self.ppi - from/to node indices and weights are appended to
            self.ppi[0], self.ppi[1] and self.ppi[2], which must already
            exist
    Links lighter than self.min_edge_weight are dropped.
    """
    self.nNodes = 0
    self.Prots = []
    self.dProt2Idx = {}

    lib_utils.msgout(
        'notice',
        'preparing a genetic network matrix. Please, be patient ...',
        'pagerank|heat_diffusion')

    # read string DB and assign an integer to each protein symbol
    fp = lib_utils.open2(self.dv.entries['string_link'], 'r')
    try:
        # skip the first line; next() works on Python 2 and 3 iterators
        next(fp)
        for i in fp:
            prot_from, prot_to, weight = i.rstrip().split()
            weight = float(weight)
            if weight < self.min_edge_weight:
                continue

            for c, raw_name in enumerate((prot_from, prot_to)):
                protein = extract_ensembl_protein(raw_name)
                # to register a protein node
                if protein not in self.dProt2Idx:
                    self.dProt2Idx[protein] = self.nNodes
                    # item index corresponds to a node number of the protein
                    self.Prots.append(protein)
                    self.nNodes += 1
                self.ppi[c].append(self.dProt2Idx[protein])
            self.ppi[2].append(weight)
    finally:
        # release the handle even when a line cannot be parsed
        fp.close()
def get_boundary(self, cds_stats=('cmpl', 'incmpl', 'unk', 'none'),
                 ext_bp=0):
    """
    objective: load region boundaries per chromosome from self.bed_fn
    input:
        self.bed_fn - tab-separated file with exactly four columns
            (chrom, start, end, annotation)
        cds_stats, ext_bp - not used by this method; kept for interface
            compatibility
    output: self.boundary, mapping chrom -> integer array of shape (n, 2)
        holding (start, end) in file order
    raises: RuntimeError when self.bed_fn is not set
    """
    job_name = 'RefGeneUcscTB.get_boundary'
    if self.bed_fn is None:
        raise RuntimeError('Bed file should be set first!')

    msg = 'storing coding region boundaries from [%s] @ %s' % (self.bed_fn,
                                                               job_name)
    lib_utils.msgout('notice', msg)
    if self.logger:
        self.logger.info(msg)

    # collect rows per chromosome first; this avoids pre-allocating a
    # million-row array per chromosome and also copes with chromosomes
    # that are not stored contiguously
    regions = {}
    with open(self.bed_fn, 'r') as fp:
        for i in fp:
            chrom, e1, e2, _ = i.rstrip().split('\t')
            regions.setdefault(chrom, []).append((int(e1), int(e2)))

    for chrom, pairs in regions.items():
        self.boundary[chrom] = np.array(pairs, dtype=int)

    msg = 'done. @ %s' % job_name
    lib_utils.msgout('notice', msg)
    if self.logger:
        self.logger.info(msg)

    return self.boundary
def vfilter(self):
    """
    objective: apply a standard filter to VCF file and classify variants
    input: annotated vcf from varant (GCN) annotator (self.vcf)
    output: filtered vcf; self.vcf is repointed to it
    """
    job_name = 'vfilter'

    start_msg = 'filtering the annotated VCF [%s;%s] ...' % (job_name,
                                                              self.vcf)
    lib_utils.msgout('notice', start_msg)
    self.logger.info(start_msg)

    out_vcf = self._assign_out_fn(job_name, 'vcf')

    step_msg = 'applying a standard filter/class tagging [%s]' % self.vcf
    lib_utils.msgout('notice', step_msg, job_name)
    self.logger.info(step_msg)

    # the filter script lives inside the divine source tree
    filter_script = os.path.join(self.entries['divine_root'],
                                 'gcn', 'lib', 'utils', 'filter_cj.py')
    filter_conf = self.entries['vcf_filter_conf']

    filter_cmd = ["python", filter_script,
                  "-i", self.vcf,
                  "-o", out_vcf,
                  "-f", filter_conf]

    self.logger.info('filter config [%s] is applied' % filter_conf)
    self.run_cmd(filter_cmd, job_name)
    self.vcf = out_vcf

    end_msg = 'done. [%s]' % job_name
    lib_utils.msgout('notice', end_msg)
    self.logger.info(end_msg)
def ranking_vcf(self):
    """
    this function is obsolete and replaced by vcf2xls_varant()

    objective: add the INFO field DVN to every record of self.vcf, set
        to the highest self.gene_dmg value among the genes reported for
        the record (0. when none of them has a score)
    output: self.vcf is rewritten in place through a '<vcf>.ranked'
        intermediate file
    """
    import gcn.lib.io.vcf as vcf

    job_name = 'ranking_vcf'
    msg = 'annotating Divine prediction score into filtered VCF ... [%s;%s]' % (
        job_name, self.vcf)
    lib_utils.msgout('notice', msg)
    self.logger.info(msg)

    ranked_vcf = '%s.ranked' % self.vcf
    ostream = open(ranked_vcf, 'w')
    v = vcf.VCFParser(self.vcf)
    try:
        v.add_meta_info("DVN", "1", "Float",
                        "Gene damage score predicted by Divine:%s" % self.command)
        v.writeheader(ostream)

        for rec in v:
            v.parseinfo(rec)
            vpop = vp.parse(rec.info)
            # restart from 0 for every record and track one single
            # variable, so the value is a true per-record maximum
            max_dmg_sc = 0.
            for _altnum, val in vpop.items():
                for gene, _gd in val.items():
                    if gene in self.gene_dmg and \
                            self.gene_dmg[gene] > max_dmg_sc:
                        max_dmg_sc = self.gene_dmg[gene]
            rec.info.DVN = max_dmg_sc
            v.write(ostream, rec)
    finally:
        # close both streams even when a record cannot be processed
        ostream.close()
        v.stream.close()

    os.rename(ranked_vcf, self.vcf)

    msg = 'done. [%s]' % job_name
    lib_utils.msgout('notice', msg)
    self.logger.info(msg)
def gen_adj_matrix(self, logger=None, reuse=True):
    """
    objective: build the normalized adjacency matrix self.A from the
        ppi edges collected in self.ppi
    input:
        logger - optional logger
        reuse - when True and '<string_link>.dill' exists, the matrix
            is loaded from that file instead of being recomputed
    output: self.A; a freshly computed matrix is also dumped to
        '<string_link>.dill'
    raises: RuntimeError when self.ppi is empty
    """
    if not self.ppi:
        raise RuntimeError(
            'edge info is not available yet. run get_sparse_elements() first to load ppi edge info ...'
        )

    dill_fn = self.dv.entries['string_link'] + '.dill'

    if reuse and os.path.exists(dill_fn):
        msg = "loading adjacent matrix computed previously and stored in [%s]" % dill_fn
        lib_utils.msgout('notice', msg)
        if logger:
            logger.info(msg)

        # NOTE: dill.load executes arbitrary code while unpickling; the
        # cache file must come from a trusted location
        with open(dill_fn, 'rb') as in_strm:
            self.A = dill.load(in_strm)
        return

    job_name = 'gen_adj_matrix'

    # np.float was only an alias of the builtin float and is gone from
    # current NumPy
    self.A = coo_matrix((self.ppi[2], (self.ppi[0], self.ppi[1])),
                        dtype=float, shape=(self.nNodes, self.nNodes))

    # convert to csr_matrix for faster/reliable matrix operation
    msg = 'reformatting the genetic network matrix.'
    lib_utils.msgout('notice', msg, job_name)
    if logger:
        logger.info(msg)
    self.A = self.A.tocsr()

    # normalize PPI matrix
    msg = 'normalizing (graph laplacian) the genetic network matrix. (it will take 4 hours!)'
    lib_utils.msgout('notice', msg, job_name)
    if logger:
        logger.info(msg)
    self.A = normalize_glap(self.A)

    # cache the result for the next run
    with open(dill_fn, 'wb') as out_strm:
        dill.dump(self.A, out_strm)
def get_GO_seeds(self, seed_rate):
    """
    to collect genes associated a disease whose matching score to HPO is
    relatively high

    input: seed_rate - fraction of the diseases with a positive score
        that determines how many leading rows of self.hpo2disease_fn
        are used
    output: list of distinct gene names taken from those rows
    """
    job_name = 'get_GO_seeds'
    msg = 'collecting genes associated with diseases [%s] showing high HPO matching' % self.hpo2disease_fn
    lib_utils.msgout('notice', msg)
    self.logger.info(msg)

    # first pass: count the total number of disease hit whose score > 0.
    n_hits = 0
    handle = anyopen.openfile(self.hpo2disease_fn)
    for row in handle:
        if row[0] == '#':
            continue
        omim, gene_field, raw_score = row.rstrip().split('\t')
        if float(raw_score) > 0.:
            n_hits += 1
    handle.close()

    limit = round(n_hits * seed_rate)

    # second pass: take the genes of the leading rows
    # NOTE(review): the bound is inclusive, so limit + 1 rows are read
    # while the message below reports limit -- confirm which is intended
    picked = set()
    n_used = 0
    handle = anyopen.openfile(self.hpo2disease_fn)
    for row in handle:
        if row[0] == '#':
            continue
        if n_used > limit:
            break
        omim, gene_field, raw_score = row.rstrip().split('\t')
        picked.update(gene_field.split(','))
        n_used += 1
    handle.close()

    go_seeds = list(picked)

    msg = 'total [%d] genes are chosen for GO seeds in [%d] out of [%d] diseases\n' % (
        len(go_seeds), limit, n_hits)
    msg += 'done. [%s]' % job_name
    lib_utils.msgout('notice', msg)
    self.logger.info(msg)

    return go_seeds
def run_cmd(self, cmd, job_name=None):
    """
    objective: run an external command through the shell and wait for it
    input:
        cmd - list of command tokens, joined with blanks
        job_name - when given, stdout/stderr are redirected to the
            handlers returned by self.get_process_msg_handler(job_name);
            otherwise the output is captured and discarded
    raises: RuntimeError when the command ends with a non-zero status
    """
    cmd_str = lib_utils.joined(cmd, ' ')
    lib_utils.msgout('notice', cmd_str)  # debug
    self.logger.info('running [%s] ...' % cmd_str)

    # NOTE: shell=True is kept on purpose because callers pass pipes and
    # redirections as tokens; never feed untrusted text into cmd.
    if job_name:
        stdofp, stdefp = self.get_process_msg_handler(job_name)
        try:
            proc = sp.Popen(cmd_str, stdout=stdofp, stderr=stdefp,
                            shell=True)
            retcode = proc.wait()
        finally:
            # close the log handles even when the process cannot start
            stdofp.close()
            stdefp.close()
    else:
        proc = sp.Popen(cmd_str, stdout=sp.PIPE, stderr=sp.PIPE, shell=True)
        # communicate() drains both pipes; wait() alone can block forever
        # once a pipe buffer fills up
        proc.communicate()
        retcode = proc.returncode

    # a negative code means the process was killed by a signal
    if retcode != 0:
        self.logger.error('[%s] failed' % cmd_str)
        raise RuntimeError('[%s] failed' % cmd_str)

    self.logger.info('done. [%s]' % job_name)
def create_bed(self, ext_bp=0, reuse=False):
    """
    objective: create a bed file of exon regions from self.refGene_fn
    input:
        ext_bp - number of bases added on both sides of every exon
        reuse - return an existing valid bed file instead of rebuilding
    output: self.bed_fn (path inside self.work_dir); overlapping regions
        are merged by self.collapse_bed()

    NOTE(review): the progress message mentions cmpl/incmpl/unk, but no
    row is filtered here -- every exon of every row is written.
    """
    job_name = 'RefGeneUcscTB.create_bed'

    self.bed_fn = os.path.join(self.work_dir,
                               'refGene_e%d_so_merged.bed' % ext_bp)

    msg = 'creating a bed file[%s] containing RefGene coding region (cmpl/incmpl/unk) @ %s' % (
        self.bed_fn, job_name)
    lib_utils.msgout('notice', msg)
    if self.logger:
        self.logger.info(msg)

    if reuse and lib_utils.check_if_file_valid(self.bed_fn):
        msg = 'reuse bed file [%s] generated previously @ %s' % (
            self.bed_fn, job_name)
        lib_utils.msgout('notice', msg)
        if self.logger:
            self.logger.info(msg)
        return self.bed_fn

    tmp_bed = os.path.join(self.work_dir, 'refGene_e%d.bed' % ext_bp)
    try:
        with open(self.refGene_fn, 'r') as fp, open(tmp_bed, 'w') as fp2:
            for i in fp:
                j = i.rstrip().split('\t')
                chrom = j[2]
                # columns 9 and 10 hold comma-terminated start/end lists;
                # the last split item is dropped
                starts = j[9].split(',')[:-1]
                ends = j[10].split(',')[:-1]
                for e1, e2 in zip(starts, ends):
                    fp2.write('%s\t%d\t%d\t%s;%s\n' % (
                        chrom, int(e1) - ext_bp, int(e2) + ext_bp,
                        j[12], j[1]))

        self.collapse_bed(tmp_bed, job_name, ext_bp)
    finally:
        # the intermediate file is not needed, whether or not we succeeded
        if os.path.exists(tmp_bed):
            os.unlink(tmp_bed)

    return self.bed_fn
def norm_genetic_dmg(self):
    """
    objective: normalize the genetic damage scores in self.gt_dmg so that
        they sum to 1
    input: self.gt_dmg (values expose a .score attribute)
    output: each score is divided in place by the sum of all scores

    An empty self.gt_dmg is left untouched.
    """
    msg = 'normalizing genetic_dmg by sum...'
    lib_utils.msgout('notice', msg)
    self.logger.info(msg)

    denom = 0.
    for cSnvGene in self.gt_dmg.values():
        denom += cSnvGene.score

    msg = '# of mutated genes:%d' % len(self.gt_dmg)
    msg += ', denom for normalization:%g' % denom

    for cSnvGene in self.gt_dmg.values():
        cSnvGene.score /= denom

    msg += ', done.'
    lib_utils.msgout('notice', msg)
    self.logger.info(msg)
def known_pathov_stats(reuse=True, has_hgmd_license=False):
    """
    to retrieve variant types (LOF, missense, etc) from known pathogenic
    mutation database (clinvar or HGMD)

    :param reuse: load the profile cached at
        fileconfig.FILECONFIG['PATHOG_PROF'] when that file exists
    :param has_hgmd_license: forwarded as hgmd_on to pathogenic_per_gene()
    :return: the per-gene profile, either loaded from the cache or built
        by pathogenic_per_gene() and then written to the cache
    """
    pathog_prof_pyv = fileconfig.FILECONFIG['PATHOG_PROF']

    if reuse and os.path.exists(pathog_prof_pyv):
        msg = 'loading some statistics on known pathogenic variants (%s) ...' % pathog_prof_pyv
        msgout('notice', msg)
        # NOTE: dill.load executes arbitrary code while unpickling; the
        # cache file must come from a trusted location
        with open(pathog_prof_pyv, 'rb') as fp:
            pathov_prof_gene = dill.load(fp)
    else:
        refgene = Refgene()
        cds_len_per_gene = refgene.get_cds_len_per_gene()
        pathov_prof_gene = pathogenic_per_gene(cds_len_per_gene,
                                               hgmd_on=has_hgmd_license)
        # the context manager closes the cache file even if dumping fails
        with open(pathog_prof_pyv, 'wb') as fpw:
            dill.dump(pathov_prof_gene, fpw)

    # TODO: use SVM to infer optimal variables to classify benign vs.
    # pathogenic
    return pathov_prof_gene
def hpo_to_diseases(self, top_k_disease=0):
    """
    objective: match HPO IDs from a given patient phenotype to known
        disease database
    input: hpo_query, hpo database
        top_k_disease - not used by this method
    method: hposim (funSimMax)
    output: phenotype similarity between patient and known diseases,
        written by the hposim script to self.hpo2disease_fn
    """
    job_name = 'hpo_to_diseases'

    def _announce(text):
        lib_utils.msgout('notice', text)
        self.logger.info(text)

    _announce(
        'matching query phenotypes to diseases in semantic HPO ontology[%s;%s]'
        % (job_name, self.hpo2disease_fn))

    # run hpo similarity
    hposim_cmd = ["python", self.entries['hposim']]
    hposim_cmd += ["-q", self.hpo_query]
    hposim_cmd += ["-b", self.entries['hpo_obo']]
    hposim_cmd += ["-f", self.entries['ext_disease_to_gene']]
    hposim_cmd += ["--normalize"]
    hposim_cmd += ["-o", self.hpo2disease_fn]
    lib_utils.runcmd2(hposim_cmd, self.log_dir, self.logger, job_name)

    _announce('done. [%s]' % job_name)
def combine_damage_scores(self):
    """
    objective: combine phenotype and genotype damage scores per gene
    input: self.pheno_dmg, self.genetic_dmg and the weights in self.dm
    output: self.gene_dmg, one PerturbedGene per gene of self.genetic_dmg
    raises: ValueError when the minimum phenotype or genetic score is 0

    Gene-ontology enrichment (select private members of perturbed gene
    that highly matched with phenotypic-scored genes and assign predicted
    phenotypic score instead of assigning de-novo prior) runs first when
    phenotype scores exist and self.dm.seed_rate is positive.
    """
    job_name = 'combine_damage_scores'

    def _announce(text, *tag):
        lib_utils.msgout('notice', text, *tag)
        self.logger.info(text)

    _announce('combining both phenotypes[%s] and geneotype[%s] damage scores ... [%s]' %
              (self.hpo_query, self.vcf, job_name))

    pheno_info = bool(self.pheno_dmg and self.dm.seed_rate > 0.)
    if pheno_info:
        # to select perturbed genes whose GO is highly similar to
        # phenotype genes
        self.gene_ontology_enrichment()

        # to obtain min damage score for both pheno and genetic perturb
        pdmg_min = lib_utils.get_stat_dic(self.pheno_dmg, 'min')
        if pdmg_min == 0.:
            raise ValueError('pheno has 0 dmg score[self.pheno_dmg]')

    gdmg_min = lib_utils.get_stat_dic(self.genetic_dmg, 'min')
    if gdmg_min == 0.:
        raise ValueError('genetic has 0 dmg score[self.genetic_dmg]')

    _announce('calculating damage scores in a Bayesian framework...',
              job_name)

    for gene, raw_gdmg in self.genetic_dmg.items():
        entry = PerturbedGene()
        self.gene_dmg[gene] = entry
        entry.gdmg = raw_gdmg  # unweighted genetic score is kept as-is

        g_wt = raw_gdmg * self.dm.gtwt
        if not pheno_info:
            entry.score = g_wt
            continue

        # genes without a phenotype score fall back to a scaled minimum
        p_wt = self.pheno_dmg.get(gene, pdmg_min * self.dm.prior)
        p_wt *= self.dm.ptwt

        joint = p_wt * g_wt
        entry.score = joint / (joint + (1. - p_wt) * (1. - g_wt))

    # skip normalization since it will be done in heat_diffusion
    _announce('done. [%s]' % job_name)

    return self.gene_dmg
def collapse_bed(self, tmp_bed, job_name, ext_bp):
    """
    objective: sort a bed file and merge regions that overlap or touch
    input:
        tmp_bed - 4-column bed file (chrom, start, end, annotation)
        job_name - tag used in the progress messages
        ext_bp - only used to name the intermediate sorted file
    output: self.bed_fn; annotations of merged regions are joined
        with '|'
    """
    msg = 'sorting bed file ... @ %s' % job_name
    lib_utils.msgout('notice', msg)
    if self.logger:
        self.logger.info(msg)

    tmp_so_bed = os.path.join(self.work_dir, 'refGene_e%d_so.bed' % ext_bp)

    try:
        # sort
        lib_utils.sort_tsv_by_col2(tmp_bed, [1, 2, 3], ['V', 'n', 'n'],
                                   True, tmp_so_bed)

        msg = 'merging exon coordinates overlapped each other... @ %s' % job_name
        lib_utils.msgout('notice', msg)
        if self.logger:
            self.logger.info(msg)

        # merge boundaries if any overlapped
        with open(tmp_so_bed, 'r') as fp, open(self.bed_fn, 'w') as fp2:
            # the first record opens the first pending region; iteration
            # continues from the SECOND record so that the first one is
            # not merged with itself (which duplicated its annotation)
            chromp, e1p, e2p, annotp = next(fp).rstrip().split('\t')
            e1p = int(e1p)
            e2p = int(e2p)

            for i in fp:
                chrom, e1, e2, annot = i.rstrip().split('\t')
                e1 = int(e1)
                e2 = int(e2)

                if chrom == chromp and e1 <= e2p:
                    # overlapping or touching: extend the pending region
                    if e2p < e2:
                        e2p = e2
                    annotp += '|%s' % annot
                else:
                    # wrap up the pending region and start a new one
                    fp2.write('%s\t%d\t%d\t%s\n' % (chromp, e1p, e2p, annotp))
                    chromp, e1p, e2p, annotp = chrom, e1, e2, annot

            fp2.write('%s\t%d\t%d\t%s\n' % (chromp, e1p, e2p, annotp))
    finally:
        if os.path.exists(tmp_so_bed):
            os.unlink(tmp_so_bed)

    msg = 'done. @ %s' % job_name
    lib_utils.msgout('notice', msg)
    if self.logger:
        self.logger.info(msg)
def combine_pheno_gt_dmg(self):
    """
    objective: combine phenotype and genotype damage scores per gene
    input: self.gt_dmg, self.vcf, self.hpo_query, self.dm.ptwt
    output: self.gene_dmg, filled with the value returned by
        self.simple_bayesian_pred(pdmg, gdmg) for each gene
    raises: RuntimeError when self.gt_dmg is empty

    Without a VCF, or without an HPO query, the missing side is replaced
    by the uniform value 1 / (number of genes).
    """
    job_name = 'combine_pheno_gt_dmg'
    msg = 'combining both phenotypes[%s] and geneotype[%s] damage scores ... [%s]' % \
        (self.hpo_query, self.vcf, job_name)
    lib_utils.msgout('notice', msg)
    self.logger.info(msg)

    n_genes = len(self.gt_dmg)
    msg = "total number of genes to investigate [%d]" % n_genes
    lib_utils.msgout('notice', msg)

    # to prepare final gene-level dmg score self.gene_dmg
    if n_genes == 0:
        msg = 'combine_phenotype_gt_dmg() should not be called when neither VCF nor HPO query is given!'
        lib_utils.msgout('error', msg)
        raise RuntimeError(msg)

    if not self.vcf:
        # phenotype only: uniform genetic term
        uniform = 1. / n_genes
        for gene in self.gt_dmg:
            pheno_term = (1. - self.dm.ptwt) * self.gt_dmg[gene].score
            self.gene_dmg[gene] = self.simple_bayesian_pred(pheno_term,
                                                            uniform)
    elif not self.hpo_query:
        # genotype only: uniform phenotype term
        uniform = 1. / n_genes
        for gene in self.gt_dmg:
            genetic_term = (1. - self.dm.ptwt) * self.gt_dmg[gene].score
            self.gene_dmg[gene] = self.simple_bayesian_pred(uniform,
                                                            genetic_term)
    else:
        self.logger.info(msg)
        for gene in self.gt_dmg:
            genetic_term = (1. - self.dm.ptwt) * self.gt_dmg[gene].score
            pheno_term = self.dm.ptwt * self.gt_dmg[gene].pheno_score
            self.gene_dmg[gene] = self.simple_bayesian_pred(pheno_term,
                                                            genetic_term)

    msg = 'done. [%s]' % job_name
    lib_utils.msgout('notice', msg)
    self.logger.info(msg)

    return self.gene_dmg
def __init__(self, uargs):
    """
    objective: initialize Divine from the parsed user arguments
    input: uargs - argument object; exp_tag, vknown, cadd, hgmd,
        indel_mode, seed_rate and ref_exon_only are read here, the rest
        is handed to self._set_args()
    raises:
        EnvironmentError when the DIVINE environment variable is unset
        IOError when the divine configuration file is not valid
    """
    # transferring user input arguments to class member variables
    self.exp_tag = uargs.exp_tag
    self.vknown = uargs.vknown
    self.cadd = uargs.cadd
    self.excl_non_coding = False
    self.sparser = SafeConfigParser()

    # score containers, filled later in the pipeline
    self.pheno_dmg = {}
    self.genetic_dmg = {}
    self.gene_dmg = {}

    # inputs/outputs that are not known yet
    self.hpo2disease_fn = None
    self.pheno_dmg_fn = None
    self.hpo_query = None
    self.vcf = None
    self.xls = None
    self.hgmd = uargs.hgmd

    lib_utils.msgout('notice', 'initializing Divine ...', 'Divine')

    root_dir = os.environ.get("DIVINE")
    if not root_dir:
        raise EnvironmentError("set DIVINE variable properly!")

    conf_path = os.path.join(root_dir, 'gcn', 'config', 'divine.conf')
    if not lib_utils.check_if_file_valid(conf_path):
        raise IOError("check if the configuration file[%s] is valid!" %
                      conf_path)

    self.config_fn = conf_path
    self.entries = {'divine_root': root_dir}

    # NOTE(review): self.logger is used below but never assigned in this
    # method; presumably _set_args() creates it -- confirm
    self._set_args(uargs)

    # damage factor w.r.t the location of variant within the transcript
    self.dm = damaging_model.DmgCoeff(uargs.indel_mode, uargs.seed_rate,
                                      self.logger)

    if uargs.ref_exon_only == 1:
        mask_msg = 'VCF will be masked by RefGene coding region'
        lib_utils.msgout('notice', mask_msg)
        self.logger.info(mask_msg)

    self.ref_exon_only = uargs.ref_exon_only

    lib_utils.msgout('notice', 'done. initialization')