Beispiel #1
0
def get_varcnt(invcf):
    '''Count qualifying coding-exonic variants per (chromosome, gene).

    A record contributes to a (rec.chrom, gene) key when the prioritized
    transcript of that gene has 'CodingExonic' among the '_'-separated parts
    of its region, its mutation is not 'Syn', and rec.info['ESPAF'] < 5.0.
    Each key is counted at most once per VCF record, even when several
    alternate alleles hit the same gene.

    Args:
        invcf(str):    VARANT annotated VCF

    Returns:
        varcnt(dictionary):    Maps (chrom, gene) to its variant count
    '''
    counts = {}
    parser = VCFParser(invcf)
    for rec in parser:
        parser.parseinfo(rec)
        prioritized = vp.prio_trans(vp.parse(rec.info))
        # Keys already counted for this record.
        seen = set()
        for altid, per_gene in prioritized.items():
            if altid == 'intergenic':
                continue
            for gene in per_gene.keys():
                transcript = per_gene[gene]['TRANSCRIPT']
                key = (rec.chrom, gene)
                if 'CodingExonic' not in transcript.region.split('_'):
                    continue
                if transcript.mutation == 'Syn':
                    continue
                # TODO (to be replaced by ExAC?)
                if not (rec.info['ESPAF'] < 5.0):
                    continue
                if key in seen:
                    continue
                seen.add(key)
                counts[key] = counts.get(key, 0) + 1
    return counts
Beispiel #2
0
    def vtexonic(self, rec):
        """Tell whether any annotated transcript of the record is flagged.

        A transcript is flagged when its mutation is one of the
        protein-altering types listed below, its splice annotation is a
        donor/acceptor site, or it carries the CDS_NOT_MULTIPLE_OF_3 warning.

        Returns True on the first flagged transcript, False otherwise.
        """
        damaging = [
            'StopGain', 'StopLoss', 'StartLoss', 'NonSyn',
            'FrameShiftInsert', 'FrameShiftDelete',
            'NonFrameShiftInsert', 'NonFrameShiftDelete',
        ]
        splice_sites = ['SpliceDonor', 'SpliceAcceptor']
        flagged_warnings = ['CDS_NOT_MULTIPLE_OF_3']

        for per_gene in vp.parse(rec.info).values():
            for gene_data in per_gene.values():
                if not gene_data:
                    continue
                for tx in gene_data['TRANSCRIPTS']:
                    if (tx.mutation in damaging
                            or tx.splice in splice_sites
                            or tx.warning in flagged_warnings):
                        return True
        return False
Beispiel #3
0
    def ranking_vcf(self):
        '''
        Annotate every record of self.vcf with a DVN INFO field and replace
        the file in place.

        DVN is the largest self.gene_dmg value among the genes annotated on
        the record, or 0.0 when none of its genes has a score. The annotated
        records are written to '<self.vcf>.ranked', which is then renamed
        over self.vcf.

        This function is obsolete and replaced by vcf2xls_varant().
        '''
        import gcn.lib.io.vcf as vcf
        job_name = 'ranking_vcf'

        msg = 'annotating Divine prediction score into filtered VCF ... [%s;%s]' % (
            job_name, self.vcf)
        lib_utils.msgout('notice', msg)
        self.logger.info(msg)

        ranked_vcf = '%s.ranked' % self.vcf
        v = vcf.VCFParser(self.vcf)
        try:
            v.add_meta_info("DVN", "1", "Float",
                            "Gene damage score predicted by Divine:%s" % self.command)

            with open(ranked_vcf, 'w') as ostream:
                v.writeheader(ostream)

                for rec in v:
                    v.parseinfo(rec)
                    vpop = vp.parse(rec.info)
                    # Reset per record so a record without any scored gene
                    # gets 0.0 rather than a value left over from an earlier
                    # record.
                    max_dmg_sc = 0.
                    for val in vpop.values():
                        for gene in val:
                            if gene in self.gene_dmg \
                                    and self.gene_dmg[gene] > max_dmg_sc:
                                max_dmg_sc = self.gene_dmg[gene]
                    rec.info.DVN = max_dmg_sc
                    v.write(ostream, rec)
        finally:
            # Release the input stream even when annotation fails part-way.
            v.stream.close()

        # Only reached on success, so a failed run never clobbers self.vcf.
        os.rename(ranked_vcf, self.vcf)
        msg = 'done. [%s]' % job_name
        lib_utils.msgout('notice', msg)
        self.logger.info(msg)
0
    def _store_variants(self, beta_fits):
        '''
        Gather, for each gene touched by each variant of self.vcf, the
        annotation used to score genetic damage.

        Args:
            beta_fits: passed through unchanged to vcf.get_CADD_scores and
                vcf.get_GERP_scores.

        Returns:
            list of [gene, rec.info.INDEL, rec.info.CLASS_TAG, protein_len,
            px, maf_offset]; one entry per distinct gene of each record.
        '''
        job_name = '_store_variants'
        msg = 'collecting variant information and class label to determine genetic damage [%s;%s]...' % (
            job_name, self.vcf)
        lib_utils.msgout('notice', msg)
        self.logger.info(msg)

        # (presence flag, allele-frequency field), in order of preference.
        maf_sources = (('EXACDB', 'EXACAF'), ('ESPDB', 'ESPAF'),
                       ('KGDB', 'KGAF'))

        mutation_info = []
        parser = vcf.VCFParser(self.vcf)
        for rec in parser:
            parser.parseinfo(rec)
            info = rec.info

            normalized = normalize_variant(rec.chrom, rec.pos, rec.ref,
                                           rec.alt)
            mut_type = normalized[0][-1]
            # A ':' in the first record id overrides the type to 'mnp'.
            if ':' in rec.id[0]:
                mut_type = 'mnp'

            # Conservation prediction scores (CADD first, then GERP++).
            px_cadd, cadd_aa = None, './.'
            if info.CADD_raw:
                px_cadd, cadd_aa = vcf.get_CADD_scores(
                    mut_type, info.CADD_aa, info.CADD_raw, beta_fits)

            px_gerp = None
            if info.GerpConserve:
                px_gerp = vcf.get_GERP_scores(mut_type, cadd_aa,
                                              info.GerpRSScore, beta_fits)

            # Prefer CADD when enabled and available, else GERP++, else 0.5.
            if self.cadd > 0 and px_cadd is not None:
                px = px_cadd
            elif px_gerp is not None:
                px = px_gerp
            else:
                px = 0.5

            annotations = vp.parse(info)
            seen_genes = set()

            # MAF from the first available source: ExAC, ESP, then 1K.
            maf = 0.
            for db_flag, af_field in maf_sources:
                if getattr(info, db_flag):
                    maf = float(getattr(info, af_field)[0])
                    break

            # Significance offset of the MAF, floored at zero.
            maf_offset = 0.
            if maf > 0:
                decay = self.dm.beta1 * math.exp(1000. * maf)
                maf_offset = (1. - decay) / self.dm.beta2
                if maf_offset < 0.:
                    maf_offset = 0.

            for per_gene in annotations.values():
                for gene, gene_data in per_gene.items():
                    # Fall back to the average length unless a transcript
                    # reports its own protein length.
                    protein_len = self.dm.avg_protein_len
                    if gene_data:
                        for tx in gene_data['TRANSCRIPTS']:
                            if tx.protein_len:
                                protein_len = float(tx.protein_len)
                                break

                    if gene in seen_genes:
                        continue
                    seen_genes.add(gene)
                    mutation_info.append([
                        gene, info.INDEL, info.CLASS_TAG, protein_len, px,
                        maf_offset
                    ])

        # done reading filtered VCF file
        parser.stream.close()
        msg = 'done. [%s]' % job_name
        lib_utils.msgout('notice', msg)
        self.logger.info(msg)

        return mutation_info
Beispiel #5
0
    def vtexonic(self, rec):
        """
        Collect the transcripts of a varant-annotated record that fall in a
        region of interest and tag each one with the reason it was kept.

        Each kept transcript is reported as
        [trans_id, region, protein_len, inherited, tag], where `inherited` is
        'as_dominant' if any MIM_PHENS entry of the gene contains
        'AUTOSOMAL_DOMINANT' and 'as_recessive' otherwise, and `tag` is one
        of 'splc_coding', 'splc_coding;hpm', 'hpm', 'warn' or
        'splc_ext_intron'.

        Only transcripts whose region passes self.regincl ('all' or a
        container of region names) are considered.

        Returns:
            - a list of the entries above when at least one transcript was
              kept;
            - otherwise, when self.dconf['regulome'] is truthy, a single flat
              entry for the first transcript of the first annotated gene:
              [trans_id, region, protein_len, inherited, 'regulome'] if
              rec.info.RegulomeScore is truthy, else
              [trans_id, None, None, inherited, None];
            - otherwise None (implicit fall-through).

        NOTE(review): the first case returns a list of lists while the
        regulome fallback returns one flat list -- confirm callers handle
        both shapes.

        TODO: apply a max distance of donor/acceptor distance; a cascade filter
        """
        # Mutation types treated as protein-altering ("hpm" tag).
        hpm = 'StopGain StopLoss StartLoss NonSyn FrameShiftInsert FrameShiftDelete NonFrameShiftInsert NonFrameShiftDelete'.split(
        )
        # NOTE: splc is defined but not used in this method.
        splc = ['SpliceDonor', 'SpliceAcceptor']
        splc_coding = ['ESS', 'ESE']
        warn = ['CDS_NOT_MULTIPLE_OF_3']
        intronic = ['CodingIntronic', 'NonCodingIntronic']
        exonic = ['CodingExonic', 'NonCodingExonic']
        utr = ['UTR5', 'UTR3']

        vpop = vp.parse(rec.info)
        regions = []

        for altnum, val in vpop.items():
            for gene, gd in val.items():
                if gd:
                    #to extract recessive/dominant
                    inherited = 'as_recessive'
                    for mim in gd['MIM_PHENS']:
                        if 'AUTOSOMAL_DOMINANT' in mim:  #TODO:https://ghr.nlm.nih.gov/handbook/inheritance/inheritancepatterns
                            inherited = 'as_dominant'

                    for t in gd['TRANSCRIPTS']:
                        if self.regincl == 'all' or t.region in self.regincl:
                            # Entry prefix; the tag is appended below only
                            # if the transcript qualifies.
                            region = [
                                t.trans_id, t.region, t.protein_len, inherited
                            ]
                            if t.region in exonic:  #CodingExonic,NonCodingExonic
                                # Precedence: splice annotation (ESS/ESE),
                                # then mutation type, then warning.
                                if t.splice in splc_coding:
                                    region_tag = 'splc_coding'
                                    if t.mutation in hpm:
                                        region_tag += ';hpm'
                                    region.append(region_tag)
                                    regions.append(region)
                                elif t.mutation in hpm:
                                    region.append('hpm')
                                    regions.append(region)
                                elif t.warning in warn:
                                    region.append('warn')
                                    regions.append(region)
                            elif self.dconf['splice_dist'] > 0:
                                # Non-exonic transcript: keep it only when a
                                # numeric offset parsed from the cDNA string
                                # is within the configured splice distance.
                                dists = []
                                # t.cdna may hold two '_'-separated
                                # positions; each is parsed separately.
                                tcdna = t.cdna.split('_')
                                T = len(tcdna)
                                if t.region in intronic:
                                    # Offset is the digits after the last
                                    # '+'/'-' of the position.
                                    mObj = re.search(r'c\.(.+)[+-](\d+)',
                                                     tcdna[0])
                                    if mObj:
                                        dists.append(int(mObj.group(2)))
                                    if T > 1:
                                        mObj = re.search(
                                            r'(.+)[+-](\d+)', tcdna[1])
                                        if mObj:
                                            dists.append(int(mObj.group(2)))
                                elif t.region in utr:
                                    # Offset is the digits following a '-'
                                    # or '*' prefix.
                                    mObj = re.search(r'c\.[-\*](\d+)',
                                                     tcdna[0])
                                    if mObj:
                                        dists.append(int(mObj.group(1)))
                                    if T > 1:
                                        mObj = re.search(
                                            r'[-\*](\d+)', tcdna[1])
                                        if mObj:
                                            dists.append(int(mObj.group(1)))

                                if dists:
                                    if min(dists) <= self.dconf['splice_dist']:
                                        region.append('splc_ext_intron')
                                        regions.append(region)

        if regions:
            return regions
        elif self.dconf['regulome']:
            # Fallback: nothing qualified above. Report the first transcript
            # of the first annotated gene; this branch does not apply the
            # self.regincl filter.
            for altnum, val in vpop.items():
                for gene, gd in val.items():
                    if gd:
                        inherited = 'as_recessive'
                        for mim in gd['MIM_PHENS']:
                            if 'AUTOSOMAL_DOMINANT' in mim:  #TODO:https://ghr.nlm.nih.gov/handbook/inheritance/inheritancepatterns
                                inherited = 'as_dominant'
                                break
                        for t in gd['TRANSCRIPTS']:
                            # Both branches return, so only the first
                            # transcript is ever inspected.
                            if rec.info.RegulomeScore:
                                return [
                                    t.trans_id, t.region, t.protein_len,
                                    inherited, 'regulome'
                                ]
                            else:
                                return [
                                    t.trans_id, None, None, inherited, None
                                ]
Beispiel #6
0
 def in_gene(self, rec, genes):
     """Return True if any gene annotated on the record is in `genes`.

     Falls through (returning None) when no annotated gene matches.
     """
     for per_gene in vp.parse(rec.info).values():
         for gene in per_gene:
             if gene in genes:
                 return True
Beispiel #7
0
def _has_lof_variant(variants, homozygous_only=False):
    """Return True if any [mutation, genotype, ...] entry is a FrameShift*/StopGain.

    With homozygous_only, the entry's genotype string must also carry the
    same allele at positions 0 and 2 (e.g. '1/1').
    """
    for entry in variants:
        mutation, genotype = entry[0], entry[1]
        if mutation.startswith('FrameShift') or mutation == 'StopGain':
            if not homozygous_only or int(genotype[0]) == int(genotype[2]):
                return True
    return False


def _select_lof_groups(data, homozygous_only=False):
    """Keep, per key, the variant groups with >1 variant and a loss-of-function hit."""
    selected = {}
    for gi, groups in data.items():
        for group in groups:
            if len(group) > 1 and \
                    _has_lof_variant(group.values(), homozygous_only):
                selected.setdefault(gi, []).append(group)
    return selected


def get_gene_data(vcffile, pedigree, GQ_THRES):
    """Retrieves gene_transcript wise variants where there exits at least one
    frameshift/stopgain mutation.

    Only records whose filter contains one of the accepted FILTER values and
    for which check_genotype() returns genotypes are used. For each
    non-reference allele of the phased child genotype, the CodingExonic
    transcripts of the first gene annotated for that allele are stored under
    the key (gene, trans_id). A variant group is reported only when it holds
    more than one variant.

    Args:
        - vcffile(str):    Input VCF file.
                           Note - VCF should be VARANT annotated.
        - pedigree(list):    [Father SampleID, Mother SampleID,
                        Child SampleID]. Expects the order in which
                        the SampleIDs are mentioned above.
        - GQ_THRES(int):    Threshold Genotype Quality

    Returns:
        - gene_data_phased(dictionary):    Genotype Phased gene_transcript
                                           wise variants where there is
                                           at least one Frameshift/
                                           Stopgain mutation.
        - gene_data_unphased(dictionary):    Genotype Unphased gene_transcript
                                             wise variants where there is
                                             at least one Frameshift/Stopgain
                                             mutation in homozygous state.
    """
    phased_data = {}
    unphased_data = {}
    FILTER = ['PASS', 'VQSRTrancheSNP99.00to99.90']
    v = vcf.VCFParser(vcffile)
    for rec in v:
        v.parseinfo(rec)
        v.parsegenotypes(rec)
        varfltr = rec['filter']
        if not any(flt in varfltr for flt in FILTER):
            continue
        genotypes = check_genotype(rec, pedigree, GQ_THRES)
        if not genotypes:
            continue

        pg = phase(*genotypes)
        is_phased = pg[1] == '|'
        alleles = (int(pg[0]), int(pg[-1]))
        va = vp.parse(rec.info)
        for idx, altid in enumerate(alleles):
            # Skip the reference allele and alleles without annotation.
            if altid == 0 or altid not in va:
                continue
            gene_ants = va[altid]
            if not gene_ants:
                continue
            # Only the first annotated gene is considered; next(iter(...))
            # also works where dict.keys() is not subscriptable.
            gene = next(iter(gene_ants))
            gd = gene_ants[gene]
            if not gd:
                continue
            for ta in gd['TRANSCRIPTS']:
                if ta.region != 'CodingExonic':
                    continue
                key = (rec.chrom, rec.pos, ','.join(rec.id), rec.ref,
                       rec.alt[altid - 1], altid)
                gi = (gene, ta.trans_id)
                entry = [ta.mutation, pg, genotypes[0], genotypes[1]]
                if is_phased:
                    # One variant dict per haplotype, indexed by the
                    # allele's position in the phased genotype.
                    phased_data.setdefault(gi, [{}, {}])[idx][key] = entry
                else:
                    unphased_data.setdefault(gi, [{}])[0][key] = entry

    gene_data_phased = _select_lof_groups(phased_data)
    del phased_data
    gene_data_unphased = _select_lof_groups(unphased_data,
                                            homozygous_only=True)
    del unphased_data
    return gene_data_phased, gene_data_unphased
Beispiel #8
0
	def _extract_mutation_info(self,beta_fits):
		'''
		Extract one mutation record per (variant, gene) from the annotated/filtered VCF.

		Each record is
		[gene, vartype, rec.info.CLASS_TAG, protein_len, px, maf_offset, patho_p, patho_pden].

		When self.proband_id is set, the VCF is also rewritten in place: the
		INFO keys 'Exonic', 'Annotation' and 'Compounds' are removed and, when
		rec.info.GeneticModels is present, the tag returned by
		lib_ped.parse_genmod_inherit_model() is appended to CLASS_TAG.

		Args:
			beta_fits: passed through unchanged to vcf.get_CADD_scores and
				vcf.get_GERP_scores.

		Returns:
			list of the records described above.
		'''
		job_name = '_extract_mutation_info'
		msg = 'collecting variant information and class label to determine genetic damage [%s;%s]...'%(job_name,self.vcf)
		lib_utils.msgout('notice',msg)
		self.logger.info(msg)

		mutation_info = []
		rewrite_vcf = bool(self.proband_id)
		rmInfo = ['Exonic','Annotation','Compounds']
		vcf_tmp = self.vcf+'.tmp'
		ostream = None

		# first pass over the VCF: pathogenic-domain scores
		if rewrite_vcf:
			v = vcf.VCFParser(self.vcf,sampleids=[self.proband_id])
		else:
			v = vcf.VCFParser(self.vcf)
		try:
			pdom,pdom0 = self.gather_pdomain_scores(v)
		finally:
			v.stream.close()

		# second pass: per-record annotation (and optional rewrite)
		v = vcf.VCFParser(self.vcf)
		try:
			if rewrite_vcf:
				ostream = open(vcf_tmp, 'w')
				v.writeheader(ostream,to_del_info = rmInfo)

			msg = 'Importing max transcript length for each gene ...'
			lib_utils.msgout('notice', msg)
			self.logger.info(msg)

			refgene = Refgene()
			cds_lens = refgene.get_max_cds_length()
			# CDS length in bases -> length in codons
			tx_lens = {}
			for gene, cds_len in cds_lens.items():
				tx_lens[gene] = int(cds_len/3.)

			ridx = 0
			for rec in v:

				v.parseinfo(rec)

				#to remove redundant gene symbols annotated by genmod but add transcript version
				if rewrite_vcf:
					for rkey in rmInfo:
						v.delete_info(rec, rkey)

					if rec.info.GeneticModels:
						genmod_tag = lib_ped.parse_genmod_inherit_model(\
													rec.info.GeneticModels[0].split(':')[1])
						rec.info.CLASS_TAG += genmod_tag

					v.write(ostream, rec)

				varlist = normalize_variant(rec.chrom, rec.pos, rec.ref, rec.alt)
				mut_type = varlist[0][-1]

				if ':' in rec.id[0]:
					mut_type = 'mnp'

				# collect conservation prediction score (CADD and GERP++)
				cadd_aa = './.'
				px_cadd = None
				if rec.info.CADD_raw:
					# to get CADD_raw (average)
					px_cadd, cadd_aa = vcf.get_CADD_scores(mut_type, rec.info.CADD_aa, rec.info.CADD_raw, beta_fits)

				# to get GERP++ score
				px_gerp = None
				if rec.info.GerpConserve:
					px_gerp = vcf.get_GERP_scores(mut_type, cadd_aa, rec.info.GerpRSScore, beta_fits)

				# which score can be chosen
				px = 0.5
				if self.cadd>0 and px_cadd is not None:
					px = px_cadd
				elif px_gerp is not None:
					px = px_gerp

				vpop = vp.parse(rec.info)
				genes = set()

				# to get MAF in the order of ExAC, ESP, and 1K
				if rec.info.EXACDB:
					maf = get_min_maf(rec.info.EXACAF[0])
				elif rec.info.ESPDB:
					maf = get_min_maf(rec.info.ESPAF[0])
				elif rec.info.KGDB:
					maf = get_min_maf(rec.info.KGAF[0])
				else:
					maf = 0.

				# to compute a significance of MAF
				maf_offset = self.dm.get_maf_xoffset(maf)

				# rows of pdom whose ridx column matches this record's ordinal
				pdom_idx = pdom.index[pdom.ridx == ridx].tolist()
				if pdom_idx:
					patho_p = pdom.phat_lo[pdom_idx[0]]
					patho_pden = pdom.patho_dens_p[pdom_idx[0]]
				else:
					# assign a default pathogenic domain value (15% quantile value)
					patho_p = pdom0.phat_lo
					patho_pden = pdom0.patho_dens_p

				vartype = get_var_type(rec.ref,rec.alt)

				# to get transcript length
				for altnum, val in vpop.items():
					# for each gene involved with the variant
					for gene in val:
						# store each gene only once per record
						if gene in genes:
							continue
						genes.add(gene)

						protein_len = self.dm.avg_protein_len
						if gene in tx_lens:
							protein_len = tx_lens[gene]

						# store a set of essential annotation to be used for genetic damage
						mutation_info.append([gene, vartype, rec.info.CLASS_TAG, protein_len, px, maf_offset, patho_p, patho_pden])

				ridx += 1
		finally:
			# done reading filtered VCF file; release both streams whether or
			# not the VCF is being rewritten and even if a record failed
			v.stream.close()
			if ostream is not None:
				ostream.close()

		# only reached on success, so a failed run never replaces self.vcf
		if rewrite_vcf:
			os.rename(vcf_tmp,self.vcf)

		msg = 'done. [%s]'%job_name
		lib_utils.msgout('notice',msg)
		self.logger.info(msg)

		return mutation_info
Beispiel #9
0
def _load(invcf, thres_af, nmethod, data=None, sc=1):
    """Collect per-sample, per-gene candidate variants from a VCF.

    For every record passing _is_HQVar and every sample whose genotype
    passes _genotype_check, the second allele of the '/'-separated genotype
    is examined. The variant is skipped when isRare() does not flag it, when
    the record carries an 'LCR' INFO key, or when the allele has no entry in
    the prioritized annotation. Remaining variants are kept according to
    the search criteria `sc`:

        1 - ClinVar significance is 'Pathogenic' or 'Likely pathogenic'
        2 - criteria 1, or a protein-altering SNV judged damaging, a
            nonsense/splicing variant, a protein-altering indel, or a
            'Damaging' dbscSNV prediction
        3 - criteria 2 without the damaging test on SNVs, plus intronic and
            UTR variants

    Any other `sc` value stores nothing.

    Args:
        invcf: path of the annotated VCF handed to VCFParser.
        thres_af: allele-frequency threshold forwarded to isRare().
        nmethod: forwarded to _is_Damaging().
        data: existing result dictionary to extend; a new one is created
            when it is None or empty.
        sc: search criteria (1, 2 or 3).

    Returns:
        dict mapping sample id -> gene -> list of tuples
        (ta, af, cadd, gt, eqtl_flag, var, dn, cln_sig, lcr, sc_ant,
        snps3d_pred).
    """
    if not data:
        data = {}
    pathogenic = ['Pathogenic', 'Likely pathogenic']

    vcfs = VCFParser(invcf)
    samples = vcfs.samples
    for rec in vcfs:
        vcfs.parseinfo(rec)
        vcfs.parsegenotypes(rec)

        if not _is_HQVar(rec.filter):  # Checks variant is PASS
            continue

        for sid in samples:
            if sid not in data:
                data[sid] = {}

            gi = rec[sid]
            gt, gq = gi.GT, gi.GQ

            # Checks if genotype is not reference or GQ >= 30
            if not _genotype_check(gt, gq):
                continue

            altid = int(gt.split('/')[1])
            alt = rec.alt[altid - 1]
            var = rec.chrom + ':' + str(rec.pos) + ':' + rec.ref + ':' + alt
            af, flag = isRare(altid, rec.info, thres_af)
            if not flag:  # Checks if variant is not Rare (AF < 5%) in ExAC
                continue

            if 'LCR' in rec.info:
                continue
            # LCR records never get past the check above, so the label
            # stored with each variant is always empty.
            lcr = ''

            # ClinVar disease name and significance; both blank unless a
            # usable significance code is present for this allele.
            dn, cln_sig = '', ''
            if 'CLNDBN' in rec.info:
                dn = rec.info.CLNDBN[altid - 1]
                sig_num = rec.info.CLNSIG[altid - 1]
                if '|' in sig_num:
                    codes = [int(e) for e in sig_num.split('|') if e != '.']
                    if codes:
                        # Several codes: report the highest one.
                        cln_sig = CLNSIG_MAP[max(codes)]
                    else:
                        dn = ''
                elif sig_num != '.':
                    cln_sig = CLNSIG_MAP[int(sig_num)]
                else:
                    dn = ''

            cadd = ''
            if 'CADD_phred' in rec.info:
                val = rec.info['CADD_phred'][altid - 1]
                if val != '.':
                    cadd = float(val)

            # dbscSNV scores are only looked up for single-base substitutions.
            if len(rec.ref) == 1 and len(alt) == 1:
                ada_score, rf_score = get_dbscSNV_ant(rec.chrom, rec.pos,
                                                      rec.ref, alt)
                if (ada_score and ada_score > 0.6) or \
                        (rf_score and rf_score > 0.6):
                    scpred = 'Damaging'
                else:
                    scpred = ''
            else:
                ada_score, rf_score, scpred = '', '', ''
            sc_ant = [scpred, ada_score, rf_score]

            # Parse annotation and prioritize transcript
            pa = vp.prio_trans(vp.parse(rec.info))

            # Ignore the intergenic variants
            if altid not in pa:
                continue

            is_pathogenic = cln_sig in pathogenic
            eqtl_flag = False
            for gene, ant in pa[altid].items():
                ta = ant['TRANSCRIPT']
                # Handed to _is_Damaging (criteria 2) and stored with the
                # variant.
                snps3d_pred = ['', '', '', '']

                if sc == 1:
                    # SC-1 variant present in Clinvar as Pathogenic or
                    # Likely Pathogenic
                    keep = is_pathogenic
                elif sc == 2:
                    # SC-2 variant is protein altering + SC-1
                    keep = ((_is_PASnv(ta)
                             and _is_Damaging(altid, rec.info, ta,
                                              snps3d_pred, nmethod))
                            or _is_NonSense(ta) or _is_Splicing(ta)
                            or _is_PAIndel(ta) or scpred == 'Damaging'
                            or is_pathogenic)
                elif sc == 3:
                    # SC-3 intronic and UTR variants + SC-1 + SC-2
                    keep = (_is_Intronic(ta) or _is_UTR(ta) or _is_PASnv(ta)
                            or _is_NonSense(ta) or _is_Splicing(ta)
                            or _is_PAIndel(ta) or scpred == 'Damaging'
                            or is_pathogenic)
                else:
                    keep = False

                if keep:
                    data[sid].setdefault(gene, []).append(
                        (ta, af, cadd, gt, eqtl_flag, var, dn, cln_sig,
                         lcr, sc_ant, snps3d_pred))
    return data