Ejemplo n.º 1
0
	def record_commandline(self):
		'''
		objective: record the divine run condition into logger

		Logs, one logger.info() record each: host name, user, working
		directory, the full command line (sys.argv), the configuration file
		and the exclude_non_coding flag.
		'''
		import socket
		job_name = 'record_commandline'
		msg='capturing user command line [%s] ...'%job_name
		lib_utils.msgout('notice',msg);self.logger.info(msg)

		# gethostname() is the only lookup here that can actually raise
		try:
			host_name = socket.gethostname()
		except socket.error:
			host_name = 'N/A'
		self.logger.info('host:%s'%host_name)

		# os.environ.get() never raises: a missing variable must be handled
		# through the default value, otherwise 'None' is written to the log
		user = os.environ.get('USER', '******')
		self.logger.info('user:%s'%user)

		pwd = os.environ.get('PWD', 'N/A')
		self.logger.info('pwd:%s'%pwd)

		self.logger.info('cmd:%s'%(' '.join(sys.argv)))
		self.logger.info("divine configuration file:%s" % self.config_fn)

		self.logger.info('exclude_non_coding:%s'%self.excl_non_coding)

		msg = 'done. [%s]' % job_name
		lib_utils.msgout('notice',msg);self.logger.info(msg)
Ejemplo n.º 2
0
    def run_vcf2xls(self):
        '''
        objective: convert the VCF file into an excel file
        input: self.vcf, self.rank_fn
        output: self.xls (file name obtained from _assign_out_fn)
        '''
        job_name = 'run_vcf2xls'
        start_msg = 'converting vcf file to excel file [%s] ...' % job_name
        lib_utils.msgout('notice', start_msg)
        self.logger.info(start_msg)

        # step 1: columns 1-2 of the rank file, lines containing '#' removed,
        # written to a temporary file next to the rank file
        rank_fn_tmp = self.rank_fn + '.tmp'
        extract_cmd = ["cut", "-f1,2", self.rank_fn]
        extract_cmd += ["|", "grep", "-v", "'#'"]
        extract_cmd += [">", rank_fn_tmp]
        self.run_cmd(extract_cmd, "extract_pred_rank")

        # step 2: run the vcf2xls script on the VCF and the temporary ranks
        self.xls = self._assign_out_fn('divine', 'xls')
        convert_cmd = ["python", self.entries['vcf2xls']]
        convert_cmd.extend(["-i", self.vcf])
        convert_cmd.extend(["-o", self.xls])
        convert_cmd.extend(["-l", self.log_dir])
        convert_cmd.extend(["-g", rank_fn_tmp])
        self.run_cmd(convert_cmd, job_name)

        end_msg = 'done. [%s]' % job_name
        lib_utils.msgout('notice', end_msg)
        self.logger.info(end_msg)

        # the temporary rank file is no longer needed
        os.unlink(rank_fn_tmp)
Ejemplo n.º 3
0
    def hpo_to_diseases(self):
        '''
        objective: match HPO IDs from a given patient phenotype to known disease database
        input: self.hpo_query and the 'hposim', 'hpo_obo', 'ext_disease_to_gene' entries
        method: system call (hposim script run through self.run_cmd)
        output: self.hpo2disease_fn, the tsv file name handed to hposim as -o
        '''
        job_name = 'hpo_to_diseases'

        # output file name for the similarity run
        out_fn = self._assign_out_fn(job_name, 'tsv')
        self.hpo2disease_fn = out_fn

        start_msg = 'matching query phenotypes to diseases in semantic HPO ontology[%s;%s]' % (
            job_name, out_fn)
        lib_utils.msgout('notice', start_msg)
        self.logger.info(start_msg)

        # assemble the hposim command line option by option
        cmd = ["python", self.entries['hposim']]
        cmd += ["-q", self.hpo_query]
        cmd += ["-b", self.entries['hpo_obo']]
        cmd += ["-f", self.entries['ext_disease_to_gene']]
        cmd += ["--normalize"]
        cmd += ["-o", self.hpo2disease_fn]

        self.run_cmd(cmd, job_name)

        end_msg = 'done. [%s]' % job_name
        lib_utils.msgout('notice', end_msg)
        self.logger.info(end_msg)
Ejemplo n.º 4
0
	def run_vcf2xls(self):
		'''
		objective: convert the VCF file and the gene ranking into an excel file
		input: self.vcf, self.gene_rank_fn, self.vknown
		output: self.xls (file name obtained from _assign_out_fn)
		raises: RuntimeError when the gene rank file does not exist
		'''
		job_name = 'run_vcf2xls'
		msg = 'converting vcf file to excel file [%s] ...'%job_name
		lib_utils.msgout('notice',msg); self.logger.info(msg)

		if not os.path.exists(self.gene_rank_fn):
			msg = "check if gene rank file [%s] exist"%self.gene_rank_fn
			# report through the logger's error method and actually raise;
			# calling the logger object and building an unraised exception
			# let the run continue towards a failing 'cut' command
			print(msg); self.logger.error(msg)
			raise RuntimeError(msg)

		rank_fn_tmp = self.gene_rank_fn + '.tmp'

		# columns 1-2 of the gene rank file, lines containing '#' removed
		cmd = ["cut","-f1,2",self.gene_rank_fn,"|","grep","-v","'#'",">",rank_fn_tmp]
		lib_utils.runcmd2(cmd,self.log_dir,self.logger,"extract_pred_rank")

		self.xls = self._assign_out_fn('divine','xls')

		cmd = ["python", self.entries['vcf2xls'], \
					"-i", self.vcf, \
					"-o", self.xls, \
					"-l", self.log_dir, \
					"-g", rank_fn_tmp, \
					"-k", self.vknown]

		lib_utils.runcmd2(cmd,self.log_dir,self.logger,job_name)

		msg = 'done. [%s]'%job_name
		lib_utils.msgout('notice',msg); self.logger.info(msg)

		# the temporary rank file is no longer needed
		os.unlink(rank_fn_tmp)
Ejemplo n.º 5
0
	def norm_pheno_dmg(self):
		'''
		objective: normalize the phenotype scores of the genes in self.gt_dmg by their sum
		method:
		  -genes of self.gt_dmg are left-joined with the scores of self.pheno_dmg
		  -genes without a phenotype score get (smallest phenotype score * self.dm.min_dmg_prior)
		  -the resulting scores are divided by their sum
		output:
		  -self.gt_dmg[gene].pheno_score is set for every gene of self.gt_dmg
		  -self.pheno_dmg[gene].score is updated for genes present in both
		  -entries of self.pheno_dmg whose gene is not in self.gt_dmg are set to None
		'''

		msg = 'normalizing phenogenes by sum ...'
		lib_utils.msgout('notice', msg);
		self.logger.info(msg)

		# keys() and values() of an unmodified dict iterate in the same order
		gt_dmg = pd.DataFrame({
			'gene':list(self.gt_dmg.keys()),
			'score':[gt.score for gt in self.gt_dmg.values()]}
		)

		pn_dmg = pd.DataFrame({
			'gene':list(self.pheno_dmg.keys()),
			'pheno_score':[pn.score for pn in self.pheno_dmg.values()]}
		)

		gt_dmg = pd.merge(gt_dmg, pn_dmg, how='left', on='gene')
		gt_dmg.loc[gt_dmg.pheno_score.isna(), 'pheno_score'] = \
			gt_dmg.pheno_score.min() * self.dm.min_dmg_prior

		gt_dmg.pheno_score /= gt_dmg.pheno_score.sum()

		for r in gt_dmg.itertuples():
			self.gt_dmg[r.gene].pheno_score = r.pheno_score
			if r.gene in self.pheno_dmg:
				self.pheno_dmg[r.gene].score = r.pheno_score

		# one set lookup per phenotype gene instead of comparing the whole
		# gene column once per gene
		ranked_genes = set(gt_dmg.gene)
		for pgene in self.pheno_dmg:
			if pgene not in ranked_genes:
				self.pheno_dmg[pgene] = None

		msg += ', done.'
		lib_utils.msgout('notice', msg);
		self.logger.info(msg)
Ejemplo n.º 6
0
	def gather_pdomain_scores(self, vcfParser):
		'''
		objective: collect the PATHO_DOMAIN annotation of the VCF records and derive two scores from it
		input: vcfParser - iterable over VCF records that also provides parseinfo(rec)
		method:
		  -PATHO_DOMAIN is read as 4 comma separated numbers: denom, benign, vus and pathogenic density
		  -phat_lo: damaging_model.ci_lower_bound() of patho/(benign+vus+patho) given denom, z taken at confidence=0.75
		  -patho_dens_p: logistic regression probability that log10(patho density) reaches its median
		output: (pdomains, pdomains_default)
		  -pdomains: DataFrame, one row per record carrying PATHO_DOMAIN; ridx is the 0-based record index
		  -pdomains_default: py_struct with the 15th percentile of phat_lo and patho_dens_p
		'''

		msg = 'gathering pathogenic variant density in domains ...'
		lib_utils.msgout('notice', msg)
		self.logger.info(msg)

		ridxs = []
		denoms = []
		benign_dens = []
		vus_dens = []
		patho_dens = []

		for ridx, rec in enumerate(vcfParser):
			vcfParser.parseinfo(rec)
			# to collect pdomain info
			if rec.info.PATHO_DOMAIN:
				pdoms = [float(pdom) for pdom in rec.info.PATHO_DOMAIN.split(',')]
				ridxs.append(ridx)
				denoms.append(pdoms[0])
				benign_dens.append(pdoms[1])
				vus_dens.append(pdoms[2])
				patho_dens.append(pdoms[3])

		pdomains = pd.DataFrame({'ridx': ridxs,
			'denoms': denoms,
			'benign_dens': benign_dens,
			'vus_dens': vus_dens,
			'patho_dens': patho_dens,
			'phat_lo':None,
			'patho_dens_p':None})

		phat = pdomains.patho_dens / (pdomains.benign_dens + pdomains.vus_dens + pdomains.patho_dens)

		tgt_z = damaging_model.get_z(confidence=0.75)

		# explicit list: map() is lazy on Python 3 and cannot be assigned as a column
		pdomains['phat_lo'] = [damaging_model.ci_lower_bound(x1, x2, z=tgt_z)
			for x1, x2 in zip(phat, pdomains.denoms)]

		tgt_pctile = 50
		# small offset keeps log10 finite for a zero density
		pdensl = np.log10(pdomains.patho_dens+1e-12)

		tgt_pctile_sc = np.percentile(pdensl, tgt_pctile)

		# builtin float: np.float was only an alias of it and is gone from recent NumPy
		y = (pdensl >= tgt_pctile_sc).astype(float)
		# index the underlying array; multi-dimensional indexing of a Series is deprecated
		X = pdensl.values[:, np.newaxis]

		model2 = LogisticRegression().fit(X, y)
		pdomains['patho_dens_p'] = model2.predict_proba(X)[:, 1]

		pdomains_default = lib_utils.py_struct(
			phat_lo=np.percentile(pdomains['phat_lo'], 15),
			patho_dens_p=np.percentile(pdomains['patho_dens_p'], 15))

		return pdomains, pdomains_default
Ejemplo n.º 7
0
	def _read_config(self,vcf_filter_cfg=None):
		'''
		objective: read configuration file
		input: vcf_filter_cfg - optional file path stored as the 'vcf_filter_conf' entry
		  instead of the value from the configuration file
		output: self.entries
		raises:
		  -RuntimeError when vcf_filter_cfg is given but the path does not exist
		  -IOError when an entry does not pass lib_utils.check_if_file_valid()
		'''
		job_name = '_read_config'
		start_msg = 'reading configuration file [%s;%s] ...'%(job_name,self.config_fn)
		lib_utils.msgout('notice',start_msg)
		self.logger.info(start_msg)

		self.sparser.read(self.config_fn)

		for prog_key in ('varant', 'hposim', 'vcf2xls'):
			self._set_config('program_paths', prog_key)

		self._set_config('config', 'temp_dir')

		# a filter config handed in by the caller wins over the configured one
		if vcf_filter_cfg:
			if not os.path.exists(vcf_filter_cfg):
				raise RuntimeError('check if the file [%s] is valid'%vcf_filter_cfg)
			self.entries['vcf_filter_conf'] = vcf_filter_cfg
		else:
			self._set_config('config', 'vcf_filter_conf')

		# note kept from the original source, placed before 'esp_to_gene':
		#   to access to UCSC mysql database(hg19)
		#   select e2g.value, gtp.protein from ensGtp as gtp
		#   inner join ensemblToGeneName as e2g on e2g.name=gtp.transcript;
		db_keys = (
			'ext_disease_to_gene',
			'disease_desc',
			'hpo_obo',
			'beta_fit',
			'string_link',
			'esp_to_gene',
			'kegg_hsa',
		)
		for db_key in db_keys:
			self._set_config('database', db_key)

		# check if the file or directory all exists before long journey!
		for key, path2 in self.entries.iteritems():
			if lib_utils.check_if_file_valid(path2):
				continue
			raise IOError('check [%s = %s] in the file [%s]' %\
				(key, path2, self.config_fn))

		end_msg = 'done. [%s]' % job_name
		lib_utils.msgout('notice',end_msg)
		self.logger.info(end_msg)

		return self.entries
Ejemplo n.º 8
0
    def disease_to_genes(self):
        '''
        objective: from hit scores of query hpo to disease, associate disease to genes
        input:
          -self.hpo2disease_fn: tab separated file with 3 columns per line
            (omim, comma separated genes, score); lines starting with '#'
            are skipped. hpo_to_diseases() is called first when it is None.
        method: each gene keeps the maximum positive score over the diseases
          listing it; the result goes through lib_utils.normalize_dic(.., 'sum')
        output: self.pheno_dmg; rank_pheno_gene() is called afterwards
        '''

        job_name = 'disease_to_genes'
        if self.hpo2disease_fn is None:
            self.hpo_to_diseases()

        msg = 'aggregating HPO hit scores of disease to each gene [%s;%s]...' % \
             (job_name,self.hpo2disease_fn)
        lib_utils.msgout('notice', msg)
        self.logger.info(msg)

        # accumulating phenotype-matching score into genes associated with the disease
        pheno_genes = {}

        fp = lib_utils.open2(self.hpo2disease_fn, 'r')
        try:
            for line in fp:  # for each disease
                if line.startswith('#'):
                    continue
                omim, geneStr, funsimMatAvg = line.rstrip().split('\t')
                funsimMatAvg = float(funsimMatAvg)
                # only positive scores contribute
                if not funsimMatAvg > 0.:
                    continue
                for gene in geneStr.split(','):  # for each gene
                    # keep only maximum
                    if funsimMatAvg > pheno_genes.get(gene, 0.):
                        pheno_genes[gene] = funsimMatAvg
        finally:
            # release the handle even when a malformed line aborts the parsing
            fp.close()

        self.pheno_dmg = lib_utils.normalize_dic(pheno_genes, 'sum')

        # print phenotypic damage scores
        self.rank_pheno_gene()

        msg = 'done. [%s]' % job_name
        lib_utils.msgout('notice', msg)
        self.logger.info(msg)
Ejemplo n.º 9
0
    def vannotate(self, reuse=False):
        '''
        objective: run varant (GCN) annotator
        input: self.vcf; reuse - when True an existing valid divine.vcf in
          self.out_dir is kept and the annotation is not run again
        output: annotated vcf; self.vcf points to it on return
        '''
        job_name = 'vannotate'
        start_msg = 'annotating VCF file[%s;%s] ...' % (job_name, self.vcf)
        lib_utils.msgout('notice', start_msg)
        self.logger.info(start_msg)

        # prepare output file
        varant_vcf = os.path.join(self.out_dir, 'divine.vcf')

        # if necessary, masking the raw vcf file
        coding_vcf = None
        if self.ref_exon_only > 0 and not (
                lib_utils.check_if_file_valid(varant_vcf) and reuse):
            ref_gene = annotateRegion.RefGeneUcscTB(work_dir=self.out_dir,
                                                    logger=self.logger)
            coding_bed_fn = ref_gene.create_bed(ext_bp=20, reuse=False)

            mask_msg = 'extracting variants in coding region from [%s] @ %s ...' % (
                self.vcf, job_name)
            lib_utils.msgout('notice', mask_msg)
            self.logger.info(mask_msg)

            # from here on self.vcf is whatever by_bed() returns
            coding_vcf = os.path.join(self.out_dir, 'refgene_e20.vcf')
            self.vcf = vcf_mask.by_bed(self.vcf,
                                       coding_bed_fn,
                                       coding_vcf,
                                       logger=self.logger)

            mask_msg = 'done. @ %s' % job_name
            lib_utils.msgout('notice', mask_msg)
            self.logger.info(mask_msg)

        # annotate unless a valid previous result may be reused
        if not (lib_utils.check_if_file_valid(varant_vcf) and reuse):
            self.logger.info('annotating [%s,%s] ...' % (job_name, self.vcf))

            cmd = ["python", self.entries['varant']]
            cmd += ["-i", self.vcf]
            cmd += ["-o", varant_vcf]
            cmd += ["-l", self.log_dir]
            if self.capkit:
                cmd += ["-c", self.capkit, "-e", "180"]

            if self.hgmd > 0:
                cmd.append("--hgmd")

            self.run_cmd(cmd, job_name)
        self.vcf = varant_vcf

        # the masked intermediate vcf is only needed as annotator input
        if coding_vcf:
            os.unlink(coding_vcf)

        end_msg = 'done. [%s]' % job_name
        lib_utils.msgout('notice', end_msg)
        self.logger.info(end_msg)
Ejemplo n.º 10
0
	def rank_pheno_gene(self):
		'''
		objective: write the phenotype score of each gene of self.pheno_dmg into self.gene_rank_fn
		method: scores are first written to '<gene_rank_fn>.tmp', which is then
		  sorted into self.gene_rank_fn by lib_utils.sort_tsv_by_col2()
		output: self.gene_rank_fn; the temporary file is always removed
		'''
		job_name = 'rank_pheno_gene'

		msg = 'selecting genes matched by patient phenotypes ... [%s;%s]'%(job_name,self.hpo_query)
		lib_utils.msgout('notice',msg); self.logger.info(msg)

		tmp_fn = '%s.tmp' % self.gene_rank_fn
		try:
			# 'with' closes the handle even when a score cannot be formatted
			with open(tmp_fn,'w') as fp2:
				fp2.write('#gene\tphenotypic_score\n')
				for gene,cPhenoGene in self.pheno_dmg.items():
					fp2.write('%s\t%g\n'%(gene,cPhenoGene.score))

			# presumably sorts on the 2nd column (the score) - confirm against lib_utils
			lib_utils.sort_tsv_by_col2(tmp_fn,[2],['gr'],False,self.gene_rank_fn)
		finally:
			# do not leave the temporary file behind on failure
			if os.path.exists(tmp_fn):
				os.unlink(tmp_fn)

		msg = 'done. [%s]'%job_name
		lib_utils.msgout('notice',msg); self.logger.info(msg)
Ejemplo n.º 11
0
    def run(self, vcf_fn, masked_vcf_fn):
        '''
        objective: copy into masked_vcf_fn the records of vcf_fn located in the regions of self.bed
        input: vcf_fn - VCF file to read; masked_vcf_fn - VCF file to write
        method:
          -header lines ('#') are copied; two '##' lines naming the original
            vcf and the bed file are inserted before the first '##contig' line
            (they are not written when the vcf has no '##contig' line)
          -a record is kept when its start (POS-1) falls in [start, end) of a
            region of its chromosome, or else when the position shifted by
            vcf_get_max_sv_len(REF, ALT) does
          -records on chromosomes without regions are dropped
        '''

        def _in_region(bnds, pos):
            # True when pos lies in at least one half-open [start, end) row
            return bool(np.any((bnds[:, 0] <= pos) & (pos < bnds[:, 1])))

        job_name = 'BedMaskingVCF.run'
        cRef = annotateRegion.RefGeneUcscTB(logger=self.logger)
        cRef.bed_fn = self.bed
        eBnds = cRef.get_boundary()

        msg = 'masking the vcf file [%s] by the bed file [%s] @ %s' % (
            vcf_fn, cRef.bed_fn, job_name)
        lib_utils.msgout('notice', msg)
        if self.logger: self.logger.info(msg)

        cmd_head = '##original_vcf=%s\n##masking_bed=%s' % (vcf_fn, self.bed)
        head_written = False

        # context managers close both files even when a record fails to parse
        with open(vcf_fn, 'r') as fp:
            with open(masked_vcf_fn, 'w') as fp2:
                for line in fp:
                    if line[0] == '#':
                        if not head_written and line.startswith('##contig'):
                            fp2.write('%s\n' % cmd_head)
                            head_written = True
                        fp2.write(line)
                        continue

                    fields = line.split('\t')
                    chrom = to_ucsc_chrom(fields[0])
                    if chrom not in eBnds:
                        continue

                    bnds = eBnds[chrom]
                    pos1 = int(fields[1]) - 1  #adjust VCF to BED coordinate
                    keep = _in_region(bnds, pos1)
                    if not keep:
                        # second chance: test the far end of the variant
                        pos2 = pos1 + vcf_get_max_sv_len(fields[3], fields[4])
                        keep = pos2 > pos1 and _in_region(bnds, pos2)
                    if keep:
                        fp2.write(line)

        msg = 'done. @ %s' % job_name
        lib_utils.msgout('notice', msg)
        if self.logger: self.logger.info(msg)
Ejemplo n.º 12
0
    def heat_diffusion_core(self, gamma=2., M=100, alpha=0.9,
                            maxIter=150, logger=None):
        '''
        objective: iterate the heat diffusion update on the network until convergence
        input: gamma, M, alpha - coefficients of the update; maxIter - iteration limit;
          logger - optional logger
        method: starting from zero vectors, each round computes
            s  <- (1-gamma/M)*s  + (gamma/M)*(alpha*A.dot(s) + (1-alpha)*Y)
            s0 <- (1-gamma/M)*s0 + (gamma/M)*(1-alpha)*Y0
          and divides both by their joint sum. The loop stops when the summed
          cal_array_distance() to the previous round drops below 1e-4 or
          maxIter rounds were done.
        output: (s, s0)
        '''
        job_name = 'pagerank|heat_diffusion_core'
        epsilon = 1e-4

        N = len(self.Y)
        N0 = len(self.Y0)
        s = np.zeros(shape=(N, 1))
        s0 = np.zeros(shape=(N0, 1))

        start_msg = 'running heat diffusion on [%dx%d, gamma=%g, alpha=%g, max_iter=%d, M=%d]. Please, be patient ...' % (
            N, N, gamma, alpha, maxIter, M)
        lib_utils.msgout('notice', start_msg, job_name)
        if logger:
            logger.info(start_msg)

        n_iter = 0
        e = 1.
        while e >= epsilon and n_iter < maxIter:
            # heat diffusion
            rate = gamma / M
            s_next = (1. - rate) * s + rate * (alpha * self.A.dot(s) +
                                               (1. - alpha) * self.Y)
            s0_next = (1. - rate) * s0 + rate * (1. - alpha) * self.Y0

            # normalize
            total = np.sum(s_next) + np.sum(s0_next)
            s_next = s_next / total
            s0_next = s0_next / total

            # change against the previous round
            e = cal_array_distance(s_next, s) + cal_array_distance(s0_next, s0)

            s = np.copy(s_next)
            s0 = np.copy(s0_next)
            n_iter += 1

        end_msg = 'done. [iteration:%d/%d,e:%g]' % (n_iter, maxIter, e)
        lib_utils.msgout('notice', end_msg, job_name)
        if logger:
            logger.info(end_msg)

        return s, s0
Ejemplo n.º 13
0
def get_sparse_elements(proteinLinkFile, min_edge_weight):
    '''
    to store ppi network
    input:
      -proteinLinkFile: ppi link file; the first line is skipped, every other
        line holds 'protein1 protein2 weight' separated by single spaces
      -min_edge_weight: links with a smaller weight are ignored
    output: (nNodes, ppi, lnkProteins)
      -nNodes: number of distinct proteins found in the kept links
      -ppi: [from-node numbers, to-node numbers, link weights]
      -lnkProteins: protein names; the list index is the node number
    '''
    #read string DB and assign an integer to each protein symbol
    fp = lib_utils.open2(proteinLinkFile, 'r')

    nNodes = 0
    dProtein2num = {}
    lnkProteins = []
    ppi = [[], [], []]  #from protein, to protein, link weight

    try:
        lib_utils.msgout(
            'notice',
            'preparing a genetic network matrix. Please, be patient......',
            'pagerank|heat_diffusion')

        #store col,row,weight from ppi file
        # builtin next() works on Python 2 and 3 iterators alike;
        # presumably the skipped first line is a header - confirm
        next(fp)
        for line in fp:
            prot_a, prot_b, weight = line.rstrip().split(' ')
            weight = float(weight)
            if weight < min_edge_weight:
                continue
            for c, raw_name in enumerate((prot_a, prot_b)):
                protein = extract_ensembl_protein(raw_name)

                #to register a protein node
                if protein not in dProtein2num:
                    dProtein2num[protein] = nNodes
                    #item index corresponds to a node number of the protein
                    lnkProteins.append(protein)
                    nNodes += 1

                ppi[c].append(dProtein2num[protein])
            ppi[2].append(weight)
    finally:
        # release the handle even when a malformed line aborts the parsing
        fp.close()

    return nNodes, ppi, lnkProteins
Ejemplo n.º 14
0
	def preprocess_dmg_scores(self):
		'''
		-objective: prepare the genetic (self.gt_dmg) and phenotypic (self.pheno_dmg) damage scores
		-method:
		  with a VCF: load the beta fit model (best effort), extract the mutation info,
		    collect the mutated genes and predict the genetic damage from the mutation info
		  with an HPO similarity file: _store_hposim_outfn() is called with the mutated genes
		  with an HPO query, go_seed_k>0 and mutated genes: enrich_pheno_genes() is called
		  without a VCF but with an HPO query: the phenotype score is copied into self.gt_dmg
		-output: none; results are stored on the instance
		'''
		job_name = 'preprocess_dmg_scores'
		gdmg = []

		if self.vcf:
			msg='start to predict genetic damage score from variants in the provided VCF [%s]' % (job_name)
			lib_utils.msgout('notice',msg);self.logger.info(msg)

			msg = 'loading training model of CADD/GERP w.r.t AA change...'
			lib_utils.msgout('notice',msg);self.logger.info(msg)
			try:
				beta_fit_dill = self.entries['beta_fit']
				msg='loading beta fit cdf[%s] for conservation score w.r.t. AA'%beta_fit_dill
				lib_utils.msgout('notice',msg); self.logger.info(msg)
				with open(beta_fit_dill, 'rb') as fp:
					beta_fits = dill.load(fp)
			except Exception as e:
				# deliberate best effort: the run continues without the model,
				# but the reason is logged instead of being dropped silently
				self.logger.info('beta fit is not available (%s); continuing without it' % e)
				beta_fits = [None, None, None]

			# to extract some info from annotated/filterd VCF to evaluate the genetic mutation damage
			# [gene, indel, class_tag, protein_len, in-sillico pred score, maf_offset, zygosity]
			mutation_info = self._extract_mutation_info(beta_fits)

			# to get a gene list having genetic mutations (each gene once);
			# built in one pass instead of de-duplicating inside the loop
			gdmg = list(set(minfo[0] for minfo in mutation_info))

		if self.hpo2disease_fn:
			self._store_hposim_outfn(self.hpo2disease_fn, self.top_k_disease, gdmg)

		# to enrich phenogenes (update self.pheno_dmg)
		if self.hpo_query and self.dm.go_seed_k>0 and gdmg:
			self.enrich_pheno_genes(gdmg)

		if self.vcf:
			# combine variant location and conservation pred dmg
			self._predict_gt_dmg(mutation_info)
		elif self.hpo_query:
			# no variants: the phenotype score stands in for the genetic score
			for gene in self.pheno_dmg:
				if gene not in self.gt_dmg:
					self.gt_dmg[gene] = SnvGene()
				self.gt_dmg[gene].score = self.pheno_dmg[gene].score

		msg = 'done. [%s]'%job_name
		lib_utils.msgout('notice',msg); self.logger.info(msg)
Ejemplo n.º 15
0
	def get_kth_score(self,dmg,topR):
		'''
		objective: pick one score from dmg by its rank in descending order
		input:
		  -dmg: dictionary whose values carry a .score attribute
		  -topR: when < 1, a fraction of the number of scores (rounded to an index);
		    otherwise used as a 0-based index into the sorted scores.
		    An index past the end is clamped to the last (smallest) score.
		output: the selected score
		raises: IndexError when dmg is empty
		'''
		msg = 'getting [%d]-th top pheno_score...'%topR
		lib_utils.msgout('notice',msg);self.logger.info(msg)

		scores = sorted((scDid.score for scDid in dmg.values()), reverse=True)

		# round() returns a float on Python 2, which is not a valid list index
		if topR<1.:
			s1 = int(round(topR*len(scores)))
		else:
			s1 = int(topR)

		# a fraction close to 1 rounds up to len(scores); stay inside the list
		s1 = min(s1, len(scores)-1)

		msg = 'selected pheno score:%g'%scores[s1]
		lib_utils.msgout('notice',msg);self.logger.info(msg)

		return scores[s1]
Ejemplo n.º 16
0
    def get_sparse_elements(self):
        '''
        to store ppi network
        input: the 'string_link' entry of self.dv.entries (ppi link file; the
          first line is skipped, every other line holds 'protein1 protein2 weight'
          separated by whitespace) and self.min_edge_weight (links with a
          smaller weight are ignored)
        output: stored on the instance
          -self.nNodes: number of distinct proteins found in the kept links
          -self.Prots: protein names; the list index is the node number
          -self.dProt2Idx: protein name -> node number
          -self.ppi: node numbers and weights are appended to self.ppi[0..2];
            self.ppi is not created here and must already hold three lists
        '''
        #read string DB and assign an integer to each protein symbol
        fp = lib_utils.open2(self.dv.entries['string_link'], 'r')

        try:
            self.nNodes = 0
            self.Prots = []
            self.dProt2Idx = {}

            lib_utils.msgout(
                'notice',
                'preparing a genetic network matrix. Please, be patient ...',
                'pagerank|heat_diffusion')

            #store col,row,weight from ppi file
            # builtin next() works on Python 2 and 3 iterators alike;
            # presumably the skipped first line is a header - confirm
            next(fp)
            for line in fp:
                prot_a, prot_b, weight = line.rstrip().split()
                weight = float(weight)
                if weight < self.min_edge_weight:
                    continue

                for c, raw_name in enumerate((prot_a, prot_b)):
                    protein = extract_ensembl_protein(raw_name)

                    #to register a protein node
                    if protein not in self.dProt2Idx:
                        self.dProt2Idx[protein] = self.nNodes

                        # item index corresponds to a node number of the protein
                        self.Prots.append(protein)
                        self.nNodes += 1

                    self.ppi[c].append(self.dProt2Idx[protein])
                self.ppi[2].append(weight)
        finally:
            # release the handle even when a malformed line aborts the parsing
            fp.close()
Ejemplo n.º 17
0
    def get_boundary(self,
                     cds_stats=('cmpl', 'incmpl', 'unk', 'none'),
                     ext_bp=0):
        '''
        objective: load the region boundaries of self.bed_fn into self.boundary
        input: cds_stats, ext_bp - accepted but not used by this method
        method: every line of the bed file must hold 4 tab separated columns
          (chrom, start, end, name); start/end are collected per chromosome
        output: self.boundary, a dictionary {chrom: int array of shape (n, 2)}
          with the rows in file order. Entries of chromosomes present in the
          bed file are replaced; an empty bed file leaves it untouched.
        raises: RuntimeError when self.bed_fn is not set
        '''

        job_name = 'RefGeneUcscTB.get_boundary'
        if self.bed_fn is None:
            raise RuntimeError('Bed file should be set first!')

        msg = 'storing coding region boundaries from [%s] @ %s' % (self.bed_fn,
                                                                   job_name)
        lib_utils.msgout('notice', msg)
        if self.logger: self.logger.info(msg)

        # grow plain lists per chromosome: no fixed-size scratch array, no
        # limit on the number of rows, and a chromosome may reappear later
        # in the file without corrupting its rows
        exons = {}
        with open(self.bed_fn, 'r') as fp:
            for line in fp:
                chrom, e1, e2, _ = line.rstrip().split('\t')
                exons.setdefault(chrom, []).append((int(e1), int(e2)))

        for chrom, bnds in exons.items():
            self.boundary[chrom] = np.array(bnds, dtype=int).reshape(-1, 2)

        msg = 'done. @ %s' % job_name
        lib_utils.msgout('notice', msg)
        if self.logger: self.logger.info(msg)

        return self.boundary
Ejemplo n.º 18
0
    def vfilter(self):
        '''
        objective: apply a standard filter to VCF file and classify variants
        input: annotated vcf from varant (GCN) annotator (self.vcf)
        method: system call of gcn/lib/utils/filter_cj.py below the
          'divine_root' entry, using the 'vcf_filter_conf' entry as -f
        output: filtered vcf; self.vcf points to it on return
        '''
        job_name = 'vfilter'
        start_msg = 'filtering the annotated VCF [%s;%s] ...' % (job_name, self.vcf)
        lib_utils.msgout('notice', start_msg)
        self.logger.info(start_msg)

        filtered_vcf = self._assign_out_fn(job_name, 'vcf')

        tag_msg = 'applying a standard filter/class tagging [%s]' % self.vcf
        lib_utils.msgout('notice', tag_msg, job_name)
        self.logger.info(tag_msg)

        # location of the filter script inside the divine installation
        script_parts = ('gcn', 'lib', 'utils', 'filter_cj.py')
        gcn_filter = os.path.join(self.entries['divine_root'], *script_parts)

        cmd = ["python", gcn_filter]
        cmd += ["-i", self.vcf]
        cmd += ["-o", filtered_vcf]

        filter_conf = self.entries['vcf_filter_conf']
        cmd += ["-f", filter_conf]

        self.logger.info('filter config [%s] is applied' % filter_conf)
        self.run_cmd(cmd, job_name)
        self.vcf = filtered_vcf

        end_msg = 'done. [%s]' % job_name
        lib_utils.msgout('notice', end_msg)
        self.logger.info(end_msg)
Ejemplo n.º 19
0
    def ranking_vcf(self):
        '''
        this function is obsolete and replaced by vcf2xls_varant()

        objective: add a DVN info field to every record of self.vcf
        method: DVN is the largest self.gene_dmg score among the genes that
          vp.parse() reports for the record (0. when none of them is scored);
          the records are written to '<self.vcf>.ranked', which then replaces
          self.vcf
        '''
        import gcn.lib.io.vcf as vcf
        job_name = 'ranking_vcf'

        msg = 'annotating Divine prediction score into filtered VCF ... [%s;%s]' % (
            job_name, self.vcf)
        lib_utils.msgout('notice', msg)
        self.logger.info(msg)

        ranked_vcf = '%s.ranked' % self.vcf
        ostream = open(ranked_vcf, 'w')
        v = vcf.VCFParser(self.vcf)

        v.add_meta_info("DVN", "1", "Float",\
         "Gene damage score predicted by Divine:%s"%self.command)

        v.writeheader(ostream)

        for rec in v:
            v.parseinfo(rec)
            vpop = vp.parse(rec.info)
            # one variable for the running maximum, reset for every record;
            # a second, differently spelled name left the maximum untracked
            # and undefined for records without a scored gene
            max_dmg_sc = 0.
            for altnum, val in vpop.items():
                for gene, gd in val.items():
                    if gene in self.gene_dmg:
                        if self.gene_dmg[gene] > max_dmg_sc:
                            max_dmg_sc = self.gene_dmg[gene]
            rec.info.DVN = max_dmg_sc
            v.write(ostream, rec)

        ostream.close()
        v.stream.close()

        # the ranked file takes the place of the input vcf
        os.rename(ranked_vcf, self.vcf)
        msg = 'done. [%s]' % job_name
        lib_utils.msgout('notice', msg)
        self.logger.info(msg)
Ejemplo n.º 20
0
    def gen_adj_matrix(self, logger=None, reuse=True):
        '''
        objective: build the adjacency matrix self.A of the ppi network, or reload it
        input: logger - optional logger; reuse - when True and the file
          '<string_link entry>.dill' exists, the matrix is loaded from it
        method: otherwise a sparse matrix is built from self.ppi
          ([rows, cols, weights]) with shape (nNodes, nNodes), converted to
          csr, passed through normalize_glap() and dumped to the dill file
        output: self.A
        raises: RuntimeError when self.ppi is empty or not set
        '''

        if not self.ppi:
            raise RuntimeError(
                'edge info is not available yet. run get_sparse_elements() first to load ppi edge info ...'
            )

        dill_fn = self.dv.entries['string_link'] + '.dill'
        if reuse and os.path.exists(dill_fn):
            msg = "loading adjacent matrix computed previously and stored in [%s]" % dill_fn
            lib_utils.msgout('notice', msg)
            if logger: logger.info(msg)

            with open(dill_fn, 'rb') as in_strm:
                self.A = dill.load(in_strm)
        else:
            job_name = 'gen_adj_matrix'

            # builtin float: np.float was only an alias of it and is gone
            # from recent NumPy releases
            self.A = coo_matrix((self.ppi[2], (self.ppi[0], self.ppi[1])), \
                    dtype=float, shape=(self.nNodes, self.nNodes))

            # convert to csr_matrix for faster/reliable matrix operation
            msg = 'reformatting the genetic network matrix.'
            lib_utils.msgout('notice', msg, job_name)
            if logger: logger.info(msg)
            self.A = self.A.tocsr()

            # normalize PPI matrix
            msg = 'normalizing (graph laplacian) the genetic network matrix. (it will take 4 hours!)'
            lib_utils.msgout('notice', msg, job_name)
            if logger: logger.info(msg)

            self.A = normalize_glap(self.A)
            #self.A = normalize(self.A, norm='l1', axis=0)

            # keep the result for the next run (see reuse)
            with open(dill_fn, 'wb') as out_strm:
                dill.dump(self.A, out_strm)
Ejemplo n.º 21
0
    def get_GO_seeds(self, seed_rate):
        '''
        to collect genes associated a disease whose matching score to HPO is relatively high
        input: seed_rate - multiplied with the number of diseases scoring > 0
          to get the limit T on the diseases read
        method: self.hpo2disease_fn (tab separated: omim, genes, score; '#'
          lines skipped) is read twice: once to count the diseases scoring
          > 0, once to take the genes of the leading rows
        output: list of distinct gene names
        '''
        job_name = 'get_GO_seeds'
        start_msg = 'collecting genes associated with diseases [%s] showing high HPO matching' % self.hpo2disease_fn
        lib_utils.msgout('notice', start_msg)
        self.logger.info(start_msg)

        # pass 1: count the total number of disease hit whose score > 0.
        num_omim = 0
        fp = anyopen.openfile(self.hpo2disease_fn)
        for line in fp:
            if line[0] != '#':
                omim, genes, score = line.rstrip().split('\t')
                if float(score) > 0.:
                    num_omim += 1
        fp.close()

        T = round(num_omim * seed_rate)

        # pass 2: genes of the leading rows of the file.
        # NOTE(review): the row counter starts at 0 and reading stops once it
        # exceeds T, so up to T+1 rows are taken - confirm this is intended
        go_seeds = []
        n_taken = 0
        fp = anyopen.openfile(self.hpo2disease_fn)
        for line in fp:
            if line[0] != '#':
                if n_taken > T:
                    break
                omim, genes, score = line.rstrip().split('\t')
                go_seeds += genes.split(',')
                n_taken += 1
        fp.close()

        # drop duplicated genes
        go_seeds = list(set(go_seeds))

        end_msg = 'total [%d] genes are chosen for GO seeds in [%d] out of [%d] diseases\n' % (
            len(go_seeds), T, num_omim)
        end_msg += 'done. [%s]' % job_name
        lib_utils.msgout('notice', end_msg)
        self.logger.info(end_msg)

        return go_seeds
Ejemplo n.º 22
0
    def run_cmd(self, cmd, job_name=None):
        '''
        objective: run an external command and wait for it
        input:
          -cmd: list of command tokens; joined with blanks and run through
            the shell as one string
          -job_name: when given, stdout/stderr of the command go to the
            handlers returned by get_process_msg_handler(job_name);
            otherwise the output is captured and discarded
        raises: RuntimeError when the command ends with a non-zero return code
        '''

        cmd_str = lib_utils.joined(cmd, ' ')
        lib_utils.msgout('notice', cmd_str)  #debug
        self.logger.info('running [%s] ...' % cmd_str)

        if job_name:
            stdofp, stdefp = self.get_process_msg_handler(job_name)
        else:
            stdofp = sp.PIPE
            stdefp = sp.PIPE

        try:
            proc = sp.Popen(cmd_str, stdout=stdofp, stderr=stdefp, shell=True)
            # communicate() drains the pipes while waiting; wait() alone can
            # block forever once a pipe buffer is full
            proc.communicate()
            retcode = proc.returncode
        finally:
            # close the log handlers even when the command could not be started
            if job_name:
                stdofp.close()
                stdefp.close()

        # a negative return code means the child was killed by a signal
        if retcode != 0:
            self.logger.error('[%s] failed' % cmd_str)
            raise RuntimeError('[%s] failed' % cmd_str)
        self.logger.info('done. [%s]' % job_name)
Ejemplo n.º 23
0
    def create_bed(self, ext_bp=0, reuse=False):
        '''
        objective: create a bed file from the exons listed in self.refGene_fn
        input:
          -ext_bp: number of bases each exon is extended by on both sides
          -reuse: when True and the bed file already exists and is valid, it
            is returned without being rebuilt
        method: columns 10 and 11 (comma separated exon starts and ends) of
          every refGene line give one bed row each, named '<column 13>;<column 2>';
          the rows are written to a temporary bed file that collapse_bed()
          processes and that is removed afterwards
        output: self.bed_fn ('refGene_e<ext_bp>_so_merged.bed' in self.work_dir)
        '''

        job_name = 'RefGeneUcscTB.create_bed'

        self.bed_fn = os.path.join(self.work_dir,'refGene_e%d_so_merged.bed'%ext_bp)

        msg = 'creating a bed file[%s] containing RefGene coding region (cmpl/incmpl/unk) @ %s'%(self.bed_fn,job_name)

        lib_utils.msgout('notice',msg)
        if self.logger: self.logger.info(msg)

        if reuse and lib_utils.check_if_file_valid(self.bed_fn):
            msg = 'reuse bed file [%s] generated previously @ %s'%(self.bed_fn,job_name)
            lib_utils.msgout('notice',msg)
            if self.logger: self.logger.info(msg)
            return self.bed_fn

        tmp_bed = os.path.join(self.work_dir,'refGene_e%d.bed'%ext_bp)

        try:
            # context managers close both files even when a line fails to parse
            with open(self.refGene_fn,'r') as fp:
                with open(tmp_bed,'w') as fp2:
                    for line in fp:
                        j = line.rstrip().split('\t')
                        chrom = j[2]

                        # both lists end with a comma, hence the dropped last item
                        starts = j[9].split(',')[:-1]
                        ends = j[10].split(',')[:-1]
                        for e1,e2 in zip(starts,ends):
                            e1_ext = int(e1)-ext_bp
                            e2_ext = int(e2)+ext_bp
                            fp2.write('%s\t%d\t%d\t%s;%s\n'%(chrom,e1_ext,e2_ext,j[12],j[1]))

            self.collapse_bed(tmp_bed,job_name,ext_bp)
        finally:
            # do not leave the temporary bed file behind on failure
            if os.path.exists(tmp_bed):
                os.unlink(tmp_bed)

        return self.bed_fn
Ejemplo n.º 24
0
	def norm_genetic_dmg(self):
		'''
		objective: divide the score of every gene in self.gt_dmg by the sum of all scores
		output: none; self.gt_dmg[gene].score is updated in place
		raises: ZeroDivisionError when the scores are plain floats adding up to zero.
		  An empty self.gt_dmg is left untouched.
		'''

		msg = 'normalizing genetic_dmg by sum...'
		lib_utils.msgout('notice',msg); self.logger.info(msg)

		# float start value keeps the division below a true division
		denom = sum((cSnvGene.score for cSnvGene in self.gt_dmg.values()), 0.)

		msg = '# of mutated genes:%d'%len(self.gt_dmg)
		msg += ', denom for normalization:%g'%denom

		# the former running minimum was never used, and dividing it by a
		# zero sum crashed on an empty gene set; it is gone
		for cSnvGene in self.gt_dmg.values():
			cSnvGene.score /= denom

		msg += ', done.'
		lib_utils.msgout('notice',msg); self.logger.info(msg)
Ejemplo n.º 25
0
def known_pathov_stats(reuse=True, has_hgmd_license=False):
    """
    to retrieve variant types (LOF, missense, etc) from known pathogenic mutation database (clinvar or HGMD)

    The result is cached with dill in the file named by
    fileconfig.FILECONFIG['PATHOG_PROF'].

    :param reuse: if True and the cache file exists, load the result from it
        instead of recomputing
    :param has_hgmd_license: forwarded to pathogenic_per_gene() as ``hgmd_on``
    :return: the per-gene profile built by pathogenic_per_gene(), or its cached copy
    """
    pathog_prof_pyv = fileconfig.FILECONFIG['PATHOG_PROF']
    if reuse and os.path.exists(pathog_prof_pyv):
        msg = 'loading some statistics on known pathogenic variants (%s) ...' % pathog_prof_pyv
        msgout('notice', msg)
        # NOTE(security): dill.load can execute arbitrary code while unpickling;
        # the cache file must only ever come from a trusted location.
        with open(pathog_prof_pyv, 'rb') as fp:
            pathov_prof_gene = dill.load(fp)
    else:
        refgene = Refgene()
        cds_len_per_gene = refgene.get_cds_len_per_gene()
        pathov_prof_gene = pathogenic_per_gene(cds_len_per_gene,
                                               hgmd_on=has_hgmd_license)
        # refresh the cache; the handle is closed even if dumping fails
        with open(pathog_prof_pyv, 'wb') as fpw:
            dill.dump(pathov_prof_gene, fpw)

    #TODO: use SVM to infer optimal variables to classify benign vs. pathogenic

    return pathov_prof_gene
Ejemplo n.º 26
0
	def hpo_to_diseases(self,top_k_disease=0):
		'''
		objective: score the patient's HPO query against the known disease database
		input: self.hpo_query plus the 'hposim', 'hpo_obo' and 'ext_disease_to_gene' items of self.entries
		method: run the hposim script as an external python command with --normalize
		output: hposim is asked to write its result to self.hpo2disease_fn
		note: top_k_disease is accepted but not used in this method
		'''
		task = 'hpo_to_diseases'

		start_msg = 'matching query phenotypes to diseases in semantic HPO ontology[%s;%s]'%(task,self.hpo2disease_fn)
		lib_utils.msgout('notice',start_msg)
		self.logger.info(start_msg)

		# assemble the hposim command line one option at a time
		hposim_cmd = ["python", self.entries['hposim']]
		hposim_cmd += ["-q", self.hpo_query]
		hposim_cmd += ["-b", self.entries['hpo_obo']]
		hposim_cmd += ["-f", self.entries['ext_disease_to_gene']]
		hposim_cmd.append("--normalize")
		hposim_cmd += ["-o", self.hpo2disease_fn]

		lib_utils.runcmd2(hposim_cmd,self.log_dir,self.logger,task)

		end_msg = 'done. [%s]' % task
		lib_utils.msgout('notice',end_msg)
		self.logger.info(end_msg)
Ejemplo n.º 27
0
    def combine_damage_scores(self):
        '''
        objective: fill self.gene_dmg with one PerturbedGene per gene of self.genetic_dmg
        input: self.genetic_dmg, self.pheno_dmg and the weights held by self.dm (gtwt, ptwt, prior, seed_rate)
        method: when phenotype scores exist and self.dm.seed_rate > 0, run
            gene_ontology_enrichment() and combine the weighted phenotype and
            genetic scores as p*g / (p*g + (1-p)*(1-g)); otherwise the weighted
            genetic score is used on its own
        output: self.gene_dmg
        raises: ValueError when the minimum phenotype or genetic score is 0
        '''
        job_name = 'combine_damage_scores'

        start_msg = 'combining both phenotypes[%s] and geneotype[%s] damage scores ... [%s]' % (
            self.hpo_query, self.vcf, job_name)
        lib_utils.msgout('notice', start_msg)
        self.logger.info(start_msg)

        # phenotype evidence is used only if some exists and seeding is enabled
        use_pheno = bool(self.pheno_dmg) and self.dm.seed_rate > 0.
        if use_pheno:
            # select perturbed genes whose GO is highly similar to phenotype genes
            self.gene_ontology_enrichment()

            # smallest phenotype score, later scaled into the default for unmatched genes
            pheno_floor = lib_utils.get_stat_dic(self.pheno_dmg, 'min')
            if pheno_floor == 0.:
                raise ValueError('pheno has 0 dmg score[self.pheno_dmg]')

        if lib_utils.get_stat_dic(self.genetic_dmg, 'min') == 0.:
            raise ValueError('genetic has 0 dmg score[self.genetic_dmg]')

        bayes_msg = 'calculating damage scores in a Bayesian framework...'
        lib_utils.msgout('notice', bayes_msg, job_name)
        self.logger.info(bayes_msg)

        for gene in self.genetic_dmg.keys():
            perturbed = PerturbedGene()
            self.gene_dmg[gene] = perturbed

            g_score = self.genetic_dmg[gene]
            perturbed.gdmg = g_score  # keep the unweighted genetic score
            g_score *= self.dm.gtwt

            if not use_pheno:
                perturbed.score = g_score
                continue

            # genes without a phenotype score fall back to the scaled minimum
            fallback = pheno_floor * self.dm.prior
            p_score = self.pheno_dmg[gene] if gene in self.pheno_dmg else fallback
            p_score *= self.dm.ptwt

            joint = p_score * g_score
            perturbed.score = joint / (joint + (1. - p_score) * (1. - g_score))

        # no normalization here; the original author notes it is done in heat_diffusion

        end_msg = 'done. [%s]' % job_name
        lib_utils.msgout('notice', end_msg)
        self.logger.info(end_msg)

        return self.gene_dmg
Ejemplo n.º 28
0
    def collapse_bed(self,tmp_bed,job_name,ext_bp):
        '''
        objective: sort a 4-column BED file and merge intervals that overlap each other
        input:
            tmp_bed: path of the unsorted BED file (chrom, start, end, annotation; tab separated)
            job_name: tag appended to the progress messages
            ext_bp: exon extension size, only used to name the intermediate sorted file
        method: sort with lib_utils.sort_tsv_by_col2(), then scan the sorted file once;
            an interval on the same chromosome that starts at or before the end of the
            running interval is merged into it, anything else closes the running interval
        output: None; merged intervals are written to self.bed_fn (empty when the
            sorted file has no records) and the intermediate sorted file is removed
        '''
        msg = 'sorting bed file ... @ %s' % job_name
        lib_utils.msgout('notice', msg)
        if self.logger: self.logger.info(msg)

        tmp_so_bed = os.path.join(self.work_dir, 'refGene_e%d_so.bed' % ext_bp)
        # sort on columns 1-3
        # NOTE(review): presumably chromosome (version sort) then numeric start/end; confirm against lib_utils
        lib_utils.sort_tsv_by_col2(tmp_bed, [1, 2, 3], ['V', 'n', 'n'], True, tmp_so_bed)

        msg = 'merging exon coordinates overlapped each other... @ %s' % job_name
        lib_utils.msgout('notice', msg)
        if self.logger: self.logger.info(msg)

        # merge boundaries if any overlapped; both files are closed even on a parse error
        with open(tmp_so_bed, 'r') as fp:
            with open(self.bed_fn, 'w') as fp2:
                have_prev = False
                for line in fp:
                    chrom, e1, e2, annot = line.rstrip().split('\t')
                    e1 = int(e1)
                    e2 = int(e2)

                    if not have_prev:
                        # the first record just opens the running interval
                        chromp, e1p, e2p, annotp = chrom, e1, e2, annot
                        have_prev = True
                        continue

                    if chrom != chromp or e2p < e1:
                        # no overlap: flush the running interval and start a new one
                        fp2.write('%s\t%d\t%d\t%s\n' % (chromp, e1p, e2p, annotp))
                        chromp, e1p, e2p, annotp = chrom, e1, e2, annot
                    elif e2p < e2:
                        # overlap reaching further right: extend and record the annotation;
                        # an interval fully inside the running one adds nothing
                        e2p = e2
                        annotp += '|%s' % annot

                if have_prev:
                    fp2.write('%s\t%d\t%d\t%s\n' % (chromp, e1p, e2p, annotp))

        os.unlink(tmp_so_bed)

        msg = 'done. @ %s' % job_name
        lib_utils.msgout('notice', msg)
        if self.logger: self.logger.info(msg)
Ejemplo n.º 29
0
	def combine_pheno_gt_dmg(self):
		'''
		objective: fill self.gene_dmg with a combined damage score for every gene in self.gt_dmg
		input: self.gt_dmg (gene -> object with .score and .pheno_score), self.vcf, self.hpo_query, self.dm.ptwt
		method: for each gene, pass a phenotype term and a genetic term to simple_bayesian_pred();
			when the VCF or the HPO query is missing, the missing side is replaced by the
			uniform value 1/(number of genes)
		output: self.gene_dmg
		raises: RuntimeError when self.gt_dmg is empty
		'''

		job_name = 'combine_pheno_gt_dmg'

		msg = 'combining both phenotypes[%s] and geneotype[%s] damage scores ... [%s]' % \
					(self.hpo_query, self.vcf, job_name)
		lib_utils.msgout('notice', msg)
		self.logger.info(msg)

		L = len(self.gt_dmg)
		msg = "total number of genes to investigate [%d]" % L
		lib_utils.msgout('notice', msg)
		# log the gene count for every case, not only when both VCF and HPO are given
		self.logger.info(msg)

		# to prepare final gene-level dmg score self.gene_dmg
		if L==0:
			msg = 'combine_pheno_gt_dmg() should not be called when neither VCF nor HPO query is given!'
			lib_utils.msgout('error',msg)
			raise RuntimeError(msg)
		elif not self.vcf:
			# no genotype evidence: every gene gets the same genetic term
			gdmg0 = 1. / L
			for gene in self.gt_dmg:
				# NOTE(review): the phenotype term is built from .score weighted by (1 - ptwt),
				# unlike the last branch (ptwt * .pheno_score); kept as is, confirm it is intended
				pdmg = (1. - self.dm.ptwt) * self.gt_dmg[gene].score
				gdmg = gdmg0
				self.gene_dmg[gene] = self.simple_bayesian_pred(pdmg, gdmg)
		elif not self.hpo_query:
			# no phenotype evidence: every gene gets the same phenotype term
			pdmg0 = 1. / L
			for gene in self.gt_dmg:
				gdmg = (1. - self.dm.ptwt) * self.gt_dmg[gene].score
				pdmg = pdmg0
				self.gene_dmg[gene] = self.simple_bayesian_pred(pdmg, gdmg)
		else:
			# both available: weight genetic and phenotype scores by (1 - ptwt) and ptwt
			for gene in self.gt_dmg:
				gdmg = (1. - self.dm.ptwt) * self.gt_dmg[gene].score
				pdmg = self.dm.ptwt * self.gt_dmg[gene].pheno_score
				self.gene_dmg[gene] = self.simple_bayesian_pred(pdmg, gdmg)

		msg = 'done. [%s]' % job_name
		lib_utils.msgout('notice', msg)
		self.logger.info(msg)

		return self.gene_dmg
Ejemplo n.º 30
0
    def __init__(self, uargs):
        '''
        objective: set up the object from the parsed user arguments
        input: uargs, an object exposing at least exp_tag, vknown, cadd, hgmd,
            indel_mode, seed_rate and ref_exon_only (it is also handed to _set_args)
        method: copy the user options, reset the score tables and file names,
            locate gcn/config/divine.conf under the directory named by the DIVINE
            environment variable, call _set_args() and build the damage model
        raises:
            EnvironmentError: the DIVINE environment variable is unset or empty
            IOError: lib_utils.check_if_file_valid() rejects the configuration file
        '''
        #transferring user input arguments to class member variables

        self.exp_tag = uargs.exp_tag
        self.vknown = uargs.vknown
        self.cadd = uargs.cadd

        self.excl_non_coding = False
        self.sparser = SafeConfigParser()

        # damage-score tables, empty until filled by later steps
        self.pheno_dmg = {}
        self.genetic_dmg = {}
        self.gene_dmg = {}

        # input/output file names, unset at this point
        self.hpo2disease_fn = None
        self.pheno_dmg_fn = None
        self.hpo_query = None
        self.vcf = None
        self.xls = None
        self.hgmd = uargs.hgmd

        lib_utils.msgout('notice', 'initializing Divine ...', 'Divine')

        # the installation root must be provided through the environment
        divine_root_dir = os.environ.get("DIVINE")
        if not divine_root_dir:
            raise EnvironmentError("set DIVINE variable properly!")

        config_fn = os.path.join(divine_root_dir, 'gcn', 'config',
                                 'divine.conf')

        if not lib_utils.check_if_file_valid(config_fn):
            raise IOError("check if the configuration file[%s] is valid!" %
                          config_fn)

        self.config_fn = config_fn
        self.entries = {'divine_root': divine_root_dir}
        # NOTE(review): self.logger is used below but never assigned in this method;
        # presumably _set_args() creates it - confirm
        self._set_args(uargs)

        # damage factor w.r.t the location of variant within the transcript
        self.dm = damaging_model.DmgCoeff(uargs.indel_mode, uargs.seed_rate,
                                          self.logger)

        if uargs.ref_exon_only == 1:
            msg = 'VCF will be masked by RefGene coding region'
            lib_utils.msgout('notice', msg)
            self.logger.info(msg)

        self.ref_exon_only = uargs.ref_exon_only

        lib_utils.msgout('notice', 'done. initialization')