Example #1
0
    def print_vkey(self, snpcnt):
        """Write one tab-separated row per variant key, then call cleanup().

        Each row starts with ``snpcnt``, the variant coordinates
        (chrom, pos, ref, alt) and the comma-joined rsids, followed by one
        column per annotation database named below.  A database column
        holds the comma-joined de-duplicated entries when the stored value
        is a list, the stored value itself otherwise, and 'NULL' whenever
        the resulting value is empty.
        """
        db_names = (
            'EXAC', 'SNPDB', 'KGDB', 'ESP', 'CLINVARDB', 'CLINVITAE',
            'REGULOME', 'CLNPHESNP', 'HGMDDB',
            'COSMIC', 'NSFP', 'SPLICE', 'MIRNA', 'OMIM',
        )

        if self.vKeys:
            for vkey in self.vKeys.itervalues():
                row = [snpcnt, vkey.chrom, vkey.pos, vkey.ref, vkey.alt]
                row.append(lib_utils.joined(vkey.rsids, ','))

                for db_name in db_names:
                    entry = vkey.dbs[db_name]
                    if isinstance(entry, list):
                        # collapse repeated indices before joining
                        entry = lib_utils.joined(list(set(entry)), ',')
                    row.append(entry if entry else 'NULL')

                self.fpw.write('%s\n' % (lib_utils.joined(row, '\t')))

        # cleanup() runs whether or not any variant key was written
        self.cleanup()
Example #2
0
def to_file(rows,Header,out_fn,fmode='wb'):
	"""Write ``rows`` to ``out_fn`` as tab-separated text under a '#' header line.

	Args:
		rows: sequence of row sequences; ``len(rows)`` and ``rows[0]`` are used.
		Header: either a ready-made header string or a sequence of column
			names that is joined with tabs.
		out_fn: output path, opened through ``anyopen.openfile``.
		fmode: file mode handed to ``anyopen.openfile``.

	The first row is inspected with ``get_decimal_idx``; when it reports any
	column indices, every row is passed through ``reformat_fields`` with
	those indices before being written.
	"""
	# open first so that an unwritable out_fn fails before any other work
	fp2 = anyopen.openfile(out_fn,fmode)
	try:
		if isinstance(Header,basestring):
			headStr = Header
		else:
			headStr = lib_utils.joined(Header,'\t')
		fp2.write('#%s\n'%headStr)

		if len(rows)>0:
			decimal_idx = get_decimal_idx(rows[0])
			fix_record = len(decimal_idx)>0

			for r in rows:
				r = list(r)
				if fix_record:
					r = reformat_fields(r, decimal_idx)
				fp2.write('%s\n'%lib_utils.joined(r,'\t'))
	finally:
		# always release the handle, even if formatting or writing fails
		fp2.close()
Example #3
0
    def gdna_to_vcf(self, mutalyzer_batch_outfn):
        """Build a sorted VCF (``self.out_vcf``) from a Mutalyzer batch output.

        The batch file is tab-separated with one header line.  For each
        data line the first column is the key later matched against
        ``cvt.nt_change``, a non-empty second column causes the line to be
        skipped (presumably an error message column -- confirm), and the
        third column is the genomic description converted by
        ``Hgvs2.gdna_to_vcf``.

        Records from ``self._iterfile()`` that are not filtered out by
        ``self.may_pass`` and have a converted variant are written to a
        temporary VCF, which is then sorted by the first two columns into
        ``self.out_vcf`` and removed.

        Args:
            mutalyzer_batch_outfn: path to the Mutalyzer batch result file.

        Raises:
            RuntimeError: if ``mutalyzer_batch_outfn`` does not exist.
        """
        if not os.path.exists(mutalyzer_batch_outfn):
            raise RuntimeError('check if input file [%s] exists' %
                               mutalyzer_batch_outfn)

        cHgvs = Hgvs2()
        cHgvs.load_resource()

        gdna_cache = {}
        with open(mutalyzer_batch_outfn, 'r') as fp:
            # skip the header; the default keeps an empty file from raising
            next(fp, None)
            for mutalyzer in fp:
                mut = mutalyzer.split('\t')
                if len(mut) < 3:
                    # blank or truncated line: nothing to convert
                    continue
                if mut[1].strip():
                    continue
                gdna = mut[2].strip()
                variants = cHgvs.gdna_to_vcf(gdna)
                if variants:
                    gdna_cache[mut[0].strip()] = variants

        self.out_vcf = lib_utils.file_tag(self.tsv, None, 'vcf')

        tmp_vcf = self.out_vcf + '.tmp'
        qual = 100
        filter_status = 'PASS'
        rsid = '.'

        with open(tmp_vcf, 'w') as fpw:
            self._write_vcf_head(fpw)

            for cvt in self._iterfile():
                if self.may_pass(cvt):
                    continue
                if cvt.nt_change not in gdna_cache:
                    continue

                for chrom, pos, ref, alt in gdna_cache[cvt.nt_change]:
                    # alleles longer than 100 bases are dropped
                    if len(ref) > 100 or len(alt) > 100:
                        continue
                    info = 'cDNA=%s;' % cvt.nt_change
                    info += 'VC=%s;' % self.determine_vclass(cvt.rep_class)
                    info += 'SRC=%s;' % cvt.source
                    info += 'UPD=%s;' % cvt.last_upd
                    info += 'URL=%s' % cvt.url
                    # strip the 'chr' prefix; 'chrM...' becomes 'MT'
                    if chrom.startswith('chr'):
                        if chrom.startswith('chrM'):
                            chrom = 'MT'
                        else:
                            chrom = chrom[3:]
                    cols = [chrom, pos, rsid, ref, alt, qual,
                            filter_status, info]
                    fpw.write('%s\n' % lib_utils.joined(cols, '\t'))

        lib_utils.sort_tsv_by_col2(tmp_vcf, [1, 2],
                                   ['V', 'n'], False, self.out_vcf)
        os.unlink(tmp_vcf)
Example #4
0
    def print_header(self, tsv):
        """Create ``tsv``, write its '#'-prefixed header line, return the handle.

        The header has six fixed variant columns followed by one
        dot-joined 'NAME.table.idx' column per annotation database.  The
        file is left open and the handle is returned to the caller.
        """
        db_specs = (
            ('EXAC', 'snps', 'idx'),
            ('SNPDB', 'snps', 'idx'),
            ('KGDB', 'snps', 'idx'),
            ('ESP', 'snps', 'idx'),
            ('CLINVARDB', 'snps', 'idx'),
            ('CLINVITAE', 'snps', 'idx'),
            ('REGULOME', 'regulome', 'idx'),
            ('CLNPHESNP', 'clnsnp', 'idx'),
            ('HGMDDB', 'snps', 'idx'),
            ('COSMIC', 'snps', 'idx'),
            ('NSFP', 'nsfp', 'idx'),
            ('SPLICE', 'splice', 'idx'),
            ('MIRNA', 'mirna', 'idx'),
            ('OMIM', 'omim', 'idx'),
        )

        columns = ['variant_index', 'chrom', 'pos', 'ref', 'alt', 'rsid']
        columns.extend('.'.join(spec) for spec in db_specs)

        out = open(tsv, 'w')
        out.write('#%s\n' % (lib_utils.joined(columns, '\t')))
        return out
Example #5
0
    def run_cmd(self, cmd, job_name=None):
        """Run a command through the shell and raise if it fails.

        Args:
            cmd: command tokens; they are joined with single spaces by
                ``lib_utils.joined`` and the result is run with
                ``shell=True``.
            job_name: optional job label.  When given, stdout/stderr are
                redirected to the two handles returned by
                ``self.get_process_msg_handler(job_name)`` and those
                handles are closed afterwards; otherwise the output is
                captured through pipes and discarded.

        Raises:
            RuntimeError: if the command exits with a non-zero status.
        """
        cmd_str = lib_utils.joined(cmd, ' ')
        lib_utils.msgout('notice', cmd_str)  #debug
        self.logger.info('running [%s] ...' % cmd_str)

        if job_name:
            stdofp, stdefp = self.get_process_msg_handler(job_name)
        else:
            stdofp = sp.PIPE
            stdefp = sp.PIPE

        try:
            # NOTE(review): the command string is interpreted by the shell;
            # callers must not build ``cmd`` from untrusted input.
            proc = sp.Popen(cmd_str, stdout=stdofp, stderr=stdefp, shell=True)
            # communicate() drains the pipes while waiting; wait() alone can
            # deadlock once the child fills an OS pipe buffer.
            proc.communicate()
            retcode = proc.returncode
        finally:
            if job_name:
                stdofp.close()
                stdefp.close()

        # a negative status means the child was killed by a signal, which is
        # a failure as well
        if retcode != 0:
            self.logger.error('[%s] failed' % cmd_str)
            raise RuntimeError('[%s] failed' % cmd_str)
        self.logger.info('done. [%s]' % job_name)
Example #6
0
    def gene_ontology_enrichment(self):
        '''
        Objective: Gene-ontology enrichment. Genes that are genetically
        perturbed but have no phenotype score are compared with seed
        phenotype genes by GO functional similarity; each matched gene is
        given an indirect phenotype score
        (seed score * GO similarity * self.dm.go_penalty), keeping the
        maximum over all of its matches.

        Input (instance attributes):
            -self.pheno_dmg = {gene1:0.2,gene2:0.9,...} #e.g. phenotype score
            -self.genetic_dmg = {gene2:0.4,gene3:0.3,...} #e.g. genetic score

        Side effect: self.pheno_dmg gains an entry for every enriched gene.
        '''
        job_name = 'gene_ontology_enrichment'
        msg = 'enriching perturbed genes with GO semantic similarity [%s] ...' % job_name
        lib_utils.msgout('notice', msg)
        self.logger.info(msg)

        # collect genes from both phenotype and genotype perturbation
        pgenes = list(self.pheno_dmg.keys())
        P = len(pgenes)

        msg = 'total phenotypic genes before enrichment:%d' % P
        lib_utils.msgout('notice', msg, job_name)
        self.logger.info(msg)

        ggenes = list(self.genetic_dmg.keys())
        msg = 'total perturbed genes:%d' % len(ggenes)
        lib_utils.msgout('notice', msg, job_name)
        self.logger.info(msg)

        # draw a venn diagram and get genes not reported by phenotypes among genetic dmg genes
        priv_ggenes = lib_utils.difference(pgenes, ggenes)
        msg = 'the number of genes not associated with the given phenotypes:%d' % len(
            priv_ggenes)
        lib_utils.msgout('notice', msg, job_name)
        self.logger.info(msg)

        # to collect genes highly matched to do GO enrichment
        pgenes2 = self.get_GO_seeds(self.dm.seed_rate)  #update self.go_seeds

        #query high-scored phenotype genes against private genetic-perturbed genes and bring high-matched ones
        msg = 'quering total [%d] seed phenotype genes into SQL ...' % len(
            pgenes2)
        lib_utils.msgout('notice', msg, job_name)
        self.logger.info(msg)
        go = geneontology.Geneontology()
        goSimScores = go.get_funsim(pgenes2,
                                    priv_ggenes,
                                    min_score=self.dm.gosim_min)

        # updating the original phenotype damage score
        # weighting enriched phenotype matching score to the gene not reported in the original phenotypes

        # Decide which side of a pair is the seed from a snapshot of the
        # ORIGINAL phenotype genes.  self.pheno_dmg grows inside the loop, so
        # testing against it would stop recognising a gene as "enriched" as
        # soon as it received its first score, and later pairs would silently
        # reuse the previous iteration's gene and seed score.
        orig_pheno_genes = set(pgenes)
        pheno_delta = []
        delta_seen = set()
        for pair, go_sc in goSimScores.iteritems():

            #search for a gene matched to seed pheno gene
            if pair[0] not in orig_pheno_genes:
                gene_enriched, seed_gene = pair[0], pair[1]
            elif pair[1] not in orig_pheno_genes:
                gene_enriched, seed_gene = pair[1], pair[0]
            else:
                # both genes already carried a phenotype score
                continue

            if seed_gene not in orig_pheno_genes:
                # neither side is a phenotype gene: no seed score to propagate
                continue
            seed_sc = self.pheno_dmg[seed_gene]

            #initialize score
            if gene_enriched not in self.pheno_dmg:
                self.pheno_dmg[gene_enriched] = 0.

            #keep only maximum
            indirect_sc = seed_sc * go_sc * self.dm.go_penalty
            if indirect_sc > self.pheno_dmg[gene_enriched]:
                self.pheno_dmg[gene_enriched] = indirect_sc
                if gene_enriched not in delta_seen:
                    delta_seen.add(gene_enriched)
                    pheno_delta.append(gene_enriched)

        P_delta = len(self.pheno_dmg) - P

        msg = 'Total %d perturbed genes are added by phenotype gene enrichment!\ndone. [%s]' % (
            P_delta, job_name)
        lib_utils.msgout('notice', msg)
        self.logger.info(msg)
        msg = 'genes enriched by GO:[%s]' % lib_utils.joined(pheno_delta, ',')
        lib_utils.msgout('notice', msg)
        self.logger.info(msg)
Example #7
0
    def reformat_to_lite(self, infile, vtype, outfile, min_cnt=1):
        """Collapse consecutive VCF records of the same variant into one.

        Records are read from ``infile``; consecutive records sharing
        chrom/pos/ref/alt are merged into the first record of the run.  The
        merged record gets id '.', INFO ``COSMIC_ID`` set to the list of the
        first id of every record in the run, INFO ``REG`` set to ``vtype``,
        the GENE/STRAND/CDS/AA/SNP INFO keys removed and, when ``vtype`` is
        ``NONCODING``, INFO ``CNT`` set to '1'.

        Args:
            infile: input VCF path.
            vtype: value stored in the REG INFO field.
            outfile: output path.  Created with a header when it does not
                exist, otherwise appended to without a header.
            min_cnt: accepted for interface compatibility; not used here.
        """
        jobname = "reformat_to_lite"
        msg = "working on vcf file [%s] ..." % infile
        print(msg)

        infoKeys = ['GENE', 'STRAND', 'CDS', 'AA', 'SNP']

        v = vcf.VCFParser(infile)

        v.add_meta_info('COSMIC_ID', '.', 'String', 'cosmic ID')
        v.add_meta_info('REG', '2', 'Integer', '1:coding, 0:noncoding')

        if os.path.exists(outfile):
            ostream = open(outfile, 'a')
            need_header = False
        else:
            ostream = open(outfile, 'w')
            need_header = True

        def _write_merged(record, cosmic_ids):
            # finalise one run of identical variants and emit it
            record.id = '.'
            record.info['COSMIC_ID'] = cosmic_ids
            record.info['REG'] = vtype
            for info_key in infoKeys:
                v.delete_info(record, info_key)
            if vtype == NONCODING:
                record.info['CNT'] = '1'
            v.write(ostream, record)

        try:
            if need_header:
                v.writeheader(ostream, to_del_info=infoKeys)

            pk0 = 'NA'
            cosmics = []
            prev_rec = None

            for rec in v:
                v.parseinfo(rec)
                pk = lib_utils.joined(
                    [rec.chrom, rec.pos, rec.ref, rec.alt], '_')

                if pk != pk0:
                    # a new variant starts: flush the previous run first
                    pk0 = pk
                    if prev_rec:
                        _write_merged(prev_rec, cosmics)
                    cosmics = [rec.id[0]]
                    prev_rec = rec
                else:
                    cosmics.append(rec.id[0])

            # flush the last run
            if prev_rec:
                _write_merged(prev_rec, cosmics)
        finally:
            # release both streams even if parsing or writing fails
            ostream.close()
            v.stream.close()

        msg = "Done [%s]." % jobname
        print(msg)
Example #8
0
	def enrich_pheno_genes(self, ggenes):
		'''
		Objective: assign a predicted phenotype score to genetically perturbed
		genes that have no phenotype score yet, using (1) GO functional
		similarity to seed phenotype genes and (2) KEGG-based enrichment
		computed by run_bp. The two raw scores are summed per gene, rescaled
		and stored in self.pheno_dmg as new PhenoGene entries.

		Input:
			-ggenes: genetically perturbed genes
			-self.pheno_dmg = {gene: obj,...}; each value is read through its
			 .score and .disId attributes

		Side effects:
			-self.pheno_dmg gains one PhenoGene per enriched gene
			-self.omim.cDis[disId].enriched_genes[gene] is set to the GO seed
			 gene (or None) for enriched genes that have a disease id
		'''

		job_name = 'enrich_pheno_genes'
		msg = 'enriching perturbed genes with both GO semantic similarity and KEGG pathways [%s] ...' % job_name
		lib_utils.msgout('notice', msg)
		self.logger.info(msg)

		# collect genes from both phenotype and genotype perturbation
		pgenes = list(self.pheno_dmg.keys())  # assuming that it's score >0
		P = len(pgenes)

		msg = 'total phenotypic genes before enrichment:%d' % P
		lib_utils.msgout('notice', msg, job_name)
		self.logger.info(msg)

		msg = 'total perturbed genes:%d' % len(ggenes)
		lib_utils.msgout('notice', msg, job_name)
		self.logger.info(msg)

		# draw a venn diagram and get genes not reported by phenotypes among genetic dmg genes
		priv_ggenes = lib_utils.difference(pgenes, ggenes)
		# set copy for O(1) membership tests inside the GO loop
		priv_gene_set = set(priv_ggenes)
		msg = 'the number of genes not associated with the given phenotypes:%d' % len(priv_ggenes)
		lib_utils.msgout('notice', msg, job_name)
		self.logger.info(msg)

		# to collect genes highly matched to do GO enrichment
		seed_pheno_genes, seed_scores, _ = \
			self.get_seed_genes(self.dm.go_seed_k)

		# query high-scored phenotype genes against private genetic-perturbed genes and bring high-matched ones
		msg = 'Using [%d] seed genes to enrich [%d] genetic variant genes with GO similarity ...' % (len(seed_pheno_genes),len(priv_ggenes))
		lib_utils.msgout('notice', msg, job_name)
		self.logger.info(msg)
		go = geneontology.Geneontology()
		goSimScores = go.get_funsim(seed_pheno_genes, priv_ggenes, min_score=self.dm.gosim_min)

		# updating the original phenotype damage score
		# weighting enriched phenotype matching score to the gene not reported in the original phenotypes

		# delta_pheno[gene].go   = [best GO score, its seed gene, seed's disId]
		# delta_pheno[gene].kegg = [best KEGG score, unused]
		# delta_pheno[gene].score = go[0] + kegg[0]
		delta_pheno = {}
		if goSimScores:
			msg = 'Collecting [%d] GO enriched genes, enrichment_penality_ratio [%g] ...' % (len(goSimScores),self.dm.go_penalty)
			lib_utils.msgout('notice', msg, job_name)
			self.logger.info(msg)

			for pair, go_sc in goSimScores.iteritems():
				# search for a gene matched to seed pheno gene
				if pair[0] in priv_gene_set:
					new_gene = pair[0]
					seed_gene = pair[1]
				else:
					new_gene = pair[1]
					seed_gene = pair[0]

				score2 = go_sc * self.dm.go_penalty * self.pheno_dmg[seed_gene].score

				if score2 > 0.:
					# register enriched genes
					if new_gene not in delta_pheno:
						delta_pheno[new_gene] = lib_utils.py_struct(go=[0., None, None],
							kegg=[0., 0.],
							score=0.)

					if score2 > delta_pheno[new_gene].go[0]: #keep only max score
						delta_pheno[new_gene].go[0] = score2
						delta_pheno[new_gene].go[1] = seed_gene
						delta_pheno[new_gene].go[2] = self.pheno_dmg[seed_gene].disId
						delta_pheno[new_gene].score = delta_pheno[new_gene].go[0]

			msg = 'Genes enriched by GO similarity:[%s]' % lib_utils.joined(delta_pheno.keys(), ',')
			lib_utils.msgout('notice', msg)
			self.logger.info(msg)

		# the KEGG step uses a seed set four times larger than the GO step
		seed_pheno_genes, seed_scores, mean_seed_score = \
			self.get_seed_genes(self.dm.go_seed_k * 4)  # update self.go_seeds

		msg = 'Using [%d] seed genes to enrich [%d] genetic variant genes with KEGG pathway genes ...' % (
		len(seed_pheno_genes), len(priv_ggenes))
		lib_utils.msgout('notice', msg, job_name)
		self.logger.info(msg)

		# query seed_pheno_genes to KEGG matrix and normalize the matched genes and ranking!
		keggEnriched = run_bp(seed_pheno_genes, seed_scores, priv_ggenes, kegg_genes_fn=self.entries['kegg_hsa'])

		if keggEnriched:
			msg = 'Collecting [%d] KEGG enriched genes with mean seed score [%g]...' % (len(keggEnriched),mean_seed_score)
			lib_utils.msgout('notice', msg, job_name)
			self.logger.info(msg)

			for kgene, kscore in keggEnriched.iteritems():
				score2 = kscore * mean_seed_score

				if score2 > 0.:
					# register enriched genes
					if kgene not in delta_pheno:
						delta_pheno[kgene] = lib_utils.py_struct(go=[0., None, None],
							kegg=[0., 0.],
							score=0.)
					if score2 > delta_pheno[kgene].kegg[0]: #keep only max score and sum two enriched scores
						delta_pheno[kgene].kegg[0] = score2
						delta_pheno[kgene].score = delta_pheno[kgene].go[0] + delta_pheno[kgene].kegg[0]

			msg = 'Genes enriched by KEGG bipartite network difussion:[%s]' % lib_utils.joined(keggEnriched.keys(), ',')
			lib_utils.msgout('notice', msg)
			self.logger.info(msg)

		max_score = -1.
		max_seed_gene = None
		# defined up front so the final log line works when nothing is enriched
		max_scaled = 0.
		msg = 'Total [%d] mutated genes that did not have any phenotype score previously are enriched. Assigning a new phenotype score to each enriched gene ...' % len(delta_pheno)
		lib_utils.msgout('notice', msg, job_name)
		self.logger.info(msg)
		if delta_pheno:
			for deltaP in delta_pheno.itervalues():
				if deltaP.score > max_score:
					max_score = deltaP.score
					# only GO-enriched genes carry a seed gene
					if deltaP.go[1]:
						max_seed_gene = deltaP.go[1]

		if max_score > 0:
			if not max_seed_gene:
				# no GO seed recorded: fall back to the top phenotype gene
				max_seed_gene = self.get_max_pheno_dmg()
			max_enriched_score = self.pheno_dmg[max_seed_gene].score
			max_scaled = max_enriched_score * self.dm.go_penalty * 2

			for ngene,deltaP in delta_pheno.iteritems():
				# rescale so that the best raw enriched score maps to max_scaled
				self.pheno_dmg[ngene] = PhenoGene()
				self.pheno_dmg[ngene].score = deltaP.score*max_scaled/max_score
				self.pheno_dmg[ngene].disId = deltaP.go[2]

				if deltaP.go[2]:
					self.omim.cDis[deltaP.go[2]].enriched_genes[ngene] = None
					if deltaP.go[1]:
						self.omim.cDis[deltaP.go[2]].enriched_genes[ngene] = deltaP.go[1]

		msg = 'max scaled phenotype score[%g], raw max enriched score[%g]' % (max_scaled,max_score)
		lib_utils.msgout('notice', msg, job_name)
		self.logger.info(msg)
Example #9
0
    def create_disease_rank_tab(self):
        """Write the per-disease ranking table to ``self.dv.disease_rank_fn``.

        One tab-separated row is written for every disease in
        ``self.dv.omim.cDis`` whose ``pheno_score`` is not 0.  Each row
        lists the disease id/description/inheritance, its associated genes
        (mutated genes are suffixed '^', or '*' when listed in
        ``self.dv.vknown_genes``), gene counts, the phenotype match score,
        average/maximum of the two per-gene scores held in
        ``self.dv.gene_dmg``, external genes of interest (KEGG partners and
        GO-enriched genes) and KEGG pathway descriptions ('NA' if none).
        Multi-valued cells are joined with ';'.
        """
        headCols = [
            'disease_ID',
            'disease_description',
            'inheritance',
            'assoc_pheno_genes(^:mutated,*:known_pathogenic)',
            'num_of_assoc_pheno_genes',
            'num_of_gt_dmg_genes',
            'pheno_match_score',
            'avg_combined_dmg_score',
            'max_combined_dmg_score',
            'avg_harmonic_score',
            'max_harmonic_score',
            'external_genes_of_interest(kegg-ppi_or_GO_enriched[harmonic_score])',
            'PPI-KEGG_pathway_desc',
        ]
        cell_delim = ';'

        # the context manager closes the table even if a row fails to build
        with open(self.dv.disease_rank_fn, 'w') as fpw:
            fpw.write('#%s\n' % lib_utils.joined(headCols, '\t'))

            cKegg = kegg_pathway.Kegg(hsa_fn=self.dv.entries['kegg_hsa'])
            cKegg.get_hsa()

            #annotate kegg_pathway to disease
            self.dv.omim.to_kegg_hsa(cKegg.cHsa)

            #browsing whole known disease entries whose HPO sim score with the patient > 0.
            for cD in self.dv.omim.cDis.itervalues():
                if cD.pheno_score == 0.: continue

                to_print = []
                to_print.append(cD.id)  #disID
                to_print.append(cD.desc)  #disDesc
                to_print.append(divine_inc.inheritStr[cD.inherit])  #inheritance

                # index 0: genes with genetic damage, index 1: the rest
                Genes = [[], []]
                # index 0: combined score, index 1: harmonic score
                max_rw_score = [0., 0.]
                sum_rw_score = [0., 0.]
                cnt_gene_dmg = 0

                for gene in cD.genes:  #for each gene assoc with the disease
                    #split into two groups (one having gt_dmg, or else), and collect max & sum act score
                    if gene in self.dv.gt_dmg:
                        if self.dv.vknown and gene in self.dv.vknown_genes:
                            Genes[0].append('%s*' % gene)
                        else:
                            Genes[0].append('%s^' % gene)

                        if self.dv.gene_dmg[gene][0] > max_rw_score[0]:
                            max_rw_score[0] = self.dv.gene_dmg[gene][0]
                        sum_rw_score[0] += self.dv.gene_dmg[gene][0]
                    else:
                        Genes[1].append(gene)

                    #to collect max & sum on harmonic scores
                    if gene in self.dv.gene_dmg:
                        if self.dv.gene_dmg[gene][1] > max_rw_score[1]:
                            max_rw_score[1] = self.dv.gene_dmg[gene][1]
                        sum_rw_score[1] += self.dv.gene_dmg[gene][1]
                        cnt_gene_dmg += 1

                #bring KEGG genes (PPI) interacted with non-mutated phenotype genes
                goi, hsa_desc = self.external_goi(
                    Genes[1], cD.kegg_hsa, cKegg.cHsa)

                #bring GO enriched genes
                for gene2 in cD.enriched_genes:
                    geneStr2 = 'go(%s:%s' % (cD.enriched_genes[gene2], gene2)
                    if self.dv.vknown and (gene2 in self.dv.vknown_genes):
                        geneStr2 = geneStr2 + '*'
                    else:
                        geneStr2 = geneStr2 + '^'

                    # the closing ')' of 'go(' is supplied by this format
                    if gene2 in self.dv.gene_dmg:
                        goi.append('%s[%g])' %
                                   (geneStr2, self.dv.gene_dmg[gene2][1]))

                to_print.append(cell_delim.join(Genes[0] +
                                                Genes[1]))  #assoc_pheno_genes
                G = len(cD.genes)
                G_mt = len(Genes[0])
                to_print.append(G)  #num_of_assoc_pheno_genes
                to_print.append(G_mt)  #num_of_gt_dmg_genes
                to_print.append(cD.pheno_score)  #pheno_match_score
                if G_mt > 0:
                    to_print.append(sum_rw_score[0] /
                                    G_mt)  #avg_combined_dmg_score
                else:
                    to_print.append(0.)
                to_print.append(max_rw_score[0])  #max_combined_dmg_score

                if cnt_gene_dmg > 0:
                    to_print.append(sum_rw_score[1] /
                                    cnt_gene_dmg)  #avg_harmonic_score
                    to_print.append(max_rw_score[1])  #max_harmonic_score
                else:
                    to_print.append(0.)
                    to_print.append(0.)
                to_print.append(
                    cell_delim.join(goi))  #partner_in_protein_network_of_interest
                if hsa_desc:
                    to_print.append(
                        cell_delim.join(hsa_desc))  #kegg-pathway desc if exist
                else:
                    to_print.append('NA')  #kegg-pathway desc if exist
                fpw.write('%s\n' % (lib_utils.joined(to_print, '\t')))