Exemple #1
0
def train_conservation_coeff(tr_varant_filt_vcf, include_hgmd):
  """Accumulate CADD and GERP training score sets from a filtered VCF.

  A record contributes only if one of its ``CLINSIG_CLASS`` entries
  contains 'HGMD' (checked only when ``include_hgmd`` is truthy),
  'CLINVARDB' or '1kMAF'.

  Args:
    tr_varant_filt_vcf: path handed to ``vcf.VCFParser``.
    include_hgmd: when truthy, HGMD-tagged records are also accepted.

  Returns:
    Tuple ``(cadd_trset, mnp_cadd_trset, gerp_trset)`` as built up by
    ``vcf.get_CADD_scores_tr`` and ``vcf.get_GERP_scores_tr``.
  """
  cadd_scores = {}
  mnp_cadd_scores = {}
  gerp_scores = {}

  parser = vcf.VCFParser(tr_varant_filt_vcf)

  for record in parser:
    parser.parseinfo(record)
    info = record.info

    # HGMD is consulted first (if enabled); ClinVar / 1k MAF is the fallback.
    selected = False
    if include_hgmd:
      selected = any('HGMD' in tag for tag in info.CLINSIG_CLASS)
    if not selected:
      selected = any('CLINVARDB' in tag or '1kMAF' in tag
                     for tag in info.CLINSIG_CLASS)
    if not selected:
      continue

    normalized = normalize_variant(record.chrom, record.pos,
                                   record.ref, record.alt)
    base_type = normalized[0][-1]
    # an id containing ':' marks the record as a multi-nucleotide variant
    mut_type = 'mnp' if ':' in record.id[0] else base_type

    # placeholder passed to the GERP call when no CADD score is available
    aaconv = '.'
    if info.CADD_raw:
      cadd_scores, mnp_cadd_scores, aaconv = vcf.get_CADD_scores_tr(
          mut_type, info.CADD_aa, info.CADD_raw,
          cadd_scores, mnp_cadd_scores)

    if info.GerpConserve:
      gerp_scores = vcf.get_GERP_scores_tr(
          mut_type, aaconv, info.GerpRSScore, gerp_scores)

  parser.stream.close()
  return cadd_scores, mnp_cadd_scores, gerp_scores
Exemple #2
0
def predict_gender_from_VCF(single_vcf, sample_id):
    """Guess the gender of one sample from its chrX / chrY records.

    Three counters are gathered while reading the VCF:
      * number of chrX records,
      * number of chrY records whose genotype is not './.',
      * number of chrY records for which ``Filter.in_gene`` reports the
        record is not in the pseudoautosomal gene list.

    If any chrX record exists, the sample is MALE when
    (both chrY counters summed) / chrX count exceeds 0.01, else FEMALE.
    Without chrX records it is MALE if the third counter is positive,
    otherwise UNKNOWN.

    Returns:
        int: 0 (UNKNOWN), 1 (MALE) or 2 (FEMALE).
    """
    from gcn.data import pseudoautosomal_genes
    UNKNOWN, MALE, FEMALE = range(3)

    par_filter = Filter()
    par_filter.geneincl = pseudoautosomal_genes.PSEUDO_AUTO_GENES

    parser = vcf.VCFParser(single_vcf, sampleids=[sample_id])

    n_chrx = 0
    n_chry_called = 0
    n_chry_outside_par = 0

    print("predicting gender from the sample [%s]" % single_vcf)

    for record in parser:
        chrom = record['chrom']
        if chrom in ('chrY', 'Y'):
            parser.parsegenotypes(record)
            parser.parseinfo(record)

            if record[parser.samples[0]].GT != './.':
                n_chry_called += 1

            if not par_filter.in_gene(record, par_filter.geneincl):
                n_chry_outside_par += 1
        elif chrom in ('chrX', 'X'):
            n_chrx += 1

    if n_chrx > 0:
        y_to_x_rate = 1. * (n_chry_called + n_chry_outside_par) / n_chrx
        gender = MALE if y_to_x_rate > 0.01 else FEMALE
    elif n_chry_outside_par > 0:
        gender = MALE
    else:
        gender = UNKNOWN

    parser.stream.close()
    print("gender identified [%d], Done." % gender)

    return gender
Exemple #3
0
    def ranking_vcf(self):
        '''
        this function is obsolete and replaced by vcf2xls_varant()

        Rewrites self.vcf in place, adding an INFO field "DVN" to every
        record. The value is the largest self.gene_dmg score among the
        genes reported by vp.parse() for that record, or 0. when none of
        the genes has a score.
        '''
        import gcn.lib.io.vcf as vcf
        job_name = 'ranking_vcf'

        msg = 'annotating Divine prediction score into filtered VCF ... [%s;%s]' % (
            job_name, self.vcf)
        lib_utils.msgout('notice', msg)
        self.logger.info(msg)

        ranked_vcf = '%s.ranked' % self.vcf
        ostream = open(ranked_vcf, 'w')
        v = None
        try:
            v = vcf.VCFParser(self.vcf)

            v.add_meta_info("DVN", "1", "Float",\
             "Gene damage score predicted by Divine:%s"%self.command)

            v.writeheader(ostream)

            for rec in v:
                v.parseinfo(rec)
                vpop = vp.parse(rec.info)
                # Reset per record. The original assigned the maximum to a
                # differently named variable, so the value written below was
                # either undefined or left over from an earlier record.
                max_dmg_sc = 0.
                for altnum, val in vpop.items():
                    for gene, gd in val.items():
                        if gene in self.gene_dmg:
                            if self.gene_dmg[gene] > max_dmg_sc:
                                max_dmg_sc = self.gene_dmg[gene]
                rec.info.DVN = max_dmg_sc
                v.write(ostream, rec)
        finally:
            # release both handles even when parsing/writing fails
            ostream.close()
            if v is not None:
                v.stream.close()

        # replace the input VCF with the annotated copy
        os.rename(ranked_vcf, self.vcf)
        msg = 'done. [%s]' % job_name
        lib_utils.msgout('notice', msg)
        self.logger.info(msg)
Exemple #4
0
    # NOTE(review): this is the body of a command-line entry point whose
    # `def` line is not visible here. It filters an input VCF and writes the
    # retained records to an output VCF.
    import gcn.lib.utils.fileutils as fileutils
    #import gcn.lib.io.snpeff as snpeff
    import os
    import gcn.lib.io.vcf as vcf

    # command-line interface
    parser = argparse.ArgumentParser(description='Filter variants in the vcf')
    parser.add_argument('-i', dest='infile', help='input vcf file')
    parser.add_argument('-f', dest='filterconf', default="", help='filterfile')
    parser.add_argument('-o', dest='outfile', default=None, help='output file')
    parser.add_argument('-l',
                        dest="filterlist",
                        default="",
                        help='Comma separated input list to include')

    options = parser.parse_args()
    fl = []
    if options.filterlist:
        # -l is opened as a file: each line is stripped and split on ','
        # and the resulting list of lists is handed to Filter
        for el in open(options.filterlist, 'rU'):
            fl.append(el.strip().split(','))
        f = Filter(fl)
    else:
        # no list given: build the filter from the -f configuration value
        f = Filter(options.filterconf)
    v = vcf.VCFParser(options.infile)
    # NOTE(review): -o defaults to None, which open() cannot handle --
    # the option is effectively required
    ostream = open(options.outfile, 'w')
    v.writeheader(ostream)
    for rec in v:
        # INFO and genotype columns are parsed before the filter is consulted
        v.parseinfo(rec)
        v.parsegenotypes(rec)
        # only records accepted by the filter are written out
        if f.retain(rec):
            v.write(ostream, rec)
Exemple #5
0
    def _store_variants(self, beta_fits):
        '''
        Read self.vcf and build one entry per (record, gene) pair.

        Each entry is
        [gene, INFO.INDEL, INFO.CLASS_TAG, protein_len, px, maf_offset]
        where px is the CADD-based score (when self.cadd > 0 and available),
        else the GERP-based score, else 0.5.

        beta_fits is forwarded unchanged to vcf.get_CADD_scores and
        vcf.get_GERP_scores. Returns the list of entries.
        '''
        job_name = '_store_variants'
        msg = 'collecting variant information and class label to determine genetic damage [%s;%s]...' % (
            job_name, self.vcf)
        lib_utils.msgout('notice', msg)
        self.logger.info(msg)

        # (presence flag, allele-frequency field), consulted in this order
        maf_sources = (('EXACDB', 'EXACAF'),
                       ('ESPDB', 'ESPAF'),
                       ('KGDB', 'KGAF'))

        mutation_info = []
        parser = vcf.VCFParser(self.vcf)
        for record in parser:
            parser.parseinfo(record)
            info = record.info

            normalized = normalize_variant(record.chrom, record.pos,
                                           record.ref, record.alt)
            base_type = normalized[0][-1]
            mut_type = 'mnp' if ':' in record.id[0] else base_type

            # conservation prediction scores (CADD first, then GERP++)
            cadd_aa = './.'
            px_cadd = None
            if info.CADD_raw:
                px_cadd, cadd_aa = vcf.get_CADD_scores(
                    mut_type, info.CADD_aa, info.CADD_raw, beta_fits)

            px_gerp = None
            if info.GerpConserve:
                px_gerp = vcf.get_GERP_scores(
                    mut_type, cadd_aa, info.GerpRSScore, beta_fits)

            # pick the score to report; 0.5 when neither is usable
            if self.cadd > 0 and px_cadd is not None:
                px = px_cadd
            elif px_gerp is not None:
                px = px_gerp
            else:
                px = 0.5

            vpop = vp.parse(info)

            # MAF from the first database that has the variant
            maf = 0.
            for db_flag, af_field in maf_sources:
                if getattr(info, db_flag):
                    maf = float(getattr(info, af_field)[0])
                    break

            # significance of the MAF, never allowed to go below zero
            maf_offset = 0.
            if maf > 0:
                maf_offset = (
                    1. - self.dm.beta1 * math.exp(1000. * maf)) / self.dm.beta2
                if maf_offset < 0.:
                    maf_offset = 0.

            seen_genes = []
            for altnum, per_gene in vpop.items():
                for gene, gd in per_gene.items():
                    # first transcript with a protein length wins; otherwise
                    # fall back to the model's average protein length
                    protein_len = self.dm.avg_protein_len
                    if gd:
                        protein_len = next(
                            (float(t.protein_len)
                             for t in gd['TRANSCRIPTS'] if t.protein_len),
                            protein_len)

                    # a gene is reported once per record
                    if gene in seen_genes:
                        continue
                    mutation_info.append([
                        gene, info.INDEL, info.CLASS_TAG,
                        protein_len, px, maf_offset
                    ])
                    seen_genes.append(gene)

        # finished reading the filtered VCF file
        parser.stream.close()
        msg = 'done. [%s]' % job_name
        lib_utils.msgout('notice', msg)
        self.logger.info(msg)

        return mutation_info
Exemple #6
0
    # NOTE(review): fragment of a larger routine; `options` is defined
    # above the visible part and the statement after this span is cut off.

    #create filter class
    f = Filter(options.filterconf)

    #read gene list file to tell which genes are to be included/excluded
    f.geneincl = f.store_genelist('incl', options.genelist)
    f.geneexcl = f.store_genelist('excl', options.genelist)

    # minimum depth comes from the filter configuration when present, else 0
    min_depth = 0
    if 'min_depth' in f.dconf:
        min_depth = f.dconf['min_depth']

    f.get_clinvar_pathogenes()

    # restrict the parser to one sample when a sample id was requested
    if options.sample_id:
        v = vcf.VCFParser(options.infile, sampleids=[options.sample_id])
    else:
        v = vcf.VCFParser(options.infile)

    # gt_sample is True only if genotype parsing is not skipped AND the
    # parser holds at least one sample id; sidx is then the first of them
    if options.skip_parse_genotype:
        gt_sample = False
    else:
        if v._sampleids:
            sidx = v._sampleids[0]
            gt_sample = True
        else:
            gt_sample = False

    ostream = open(options.outfile, 'w')
    v.add_meta_info(
        "CLASS_TAG", "1", "String",
Exemple #7
0
    def reformat_to_lite(self, infile, vtype, outfile, min_cnt=1):
        """Collapse consecutive records sharing chrom/pos/ref/alt into one.

        For every run of records with the same (chrom, pos, ref, alt) key the
        first record of the run is written to ``outfile`` with its id set to
        '.', INFO ``COSMIC_ID`` set to the list of first ids collected over
        the run, INFO ``REG`` set to ``vtype`` and the INFO keys GENE,
        STRAND, CDS, AA and SNP deleted. When ``vtype == NONCODING`` the INFO
        key ``CNT`` is set to '1'.

        ``outfile`` is created (with header) if missing, otherwise appended
        to. ``min_cnt`` is accepted but currently unused.
        """
        jobname = "reformat_to_lite"
        print("working on vcf file [%s] ..." % infile)

        infoKeys = ['GENE', 'STRAND', 'CDS', 'AA', 'SNP']

        parser = vcf.VCFParser(infile)

        parser.add_meta_info('COSMIC_ID', '.', 'String', 'cosmic ID')
        parser.add_meta_info('REG', '2', 'Integer', '1:coding, 0:noncoding')

        if os.path.exists(outfile):
            ostream = open(outfile, 'a')
        else:
            # header is only written when the output file is newly created
            ostream = open(outfile, 'w')
            parser.writeheader(ostream, to_del_info=infoKeys)

        def emit(record, cosmic_ids):
            # finalize the representative record of a run and write it out
            record.id = '.'
            record.info['COSMIC_ID'] = cosmic_ids
            record.info['REG'] = vtype
            for info_key in infoKeys:
                parser.delete_info(record, info_key)
            if vtype == NONCODING:
                record.info['CNT'] = '1'
            parser.write(ostream, record)

        current_key = 'NA'
        cosmics = []
        pending = None

        for rec in parser:
            parser.parseinfo(rec)
            key = lib_utils.joined([rec.chrom, rec.pos, rec.ref, rec.alt], '_')

            if key == current_key:
                # same variant as the previous line: just collect its id
                cosmics.append(rec.id[0])
                continue

            # a new variant starts: flush the previous run, if any
            current_key = key
            if pending:
                emit(pending, cosmics)
            cosmics = [rec.id[0]]
            pending = rec

        # flush the last run
        if pending:
            emit(pending, cosmics)

        ostream.close()
        parser.stream.close()

        print("Done [%s]." % jobname)
Exemple #8
0
def _is_frameshift_or_stopgain(mutation):
    """Tell whether a transcript mutation label is FrameShift* or StopGain."""
    return mutation.startswith('FrameShift') or mutation == 'StopGain'


def _add_coding_exonic_variants(store, n_slots, slot, rec, annot, altid, pg,
                                genotypes):
    """Record the CodingExonic transcripts of one ALT allele into *store*.

    *store* maps (gene, transcript id) to a list of *n_slots* dicts; the
    variant is written into the dict at index *slot*.
    """
    if altid == 0 or altid not in annot:
        return
    # only the first gene reported for this allele is considered
    gene = list(annot[altid].keys())[0]
    gene_annot = annot[altid][gene]
    if len(gene_annot) == 0:
        return
    for ta in gene_annot['TRANSCRIPTS']:
        if ta.region != 'CodingExonic':
            continue
        key = (rec.chrom, rec.pos, ','.join(rec.id), rec.ref,
               rec.alt[altid - 1], altid)
        gi = (gene, ta.trans_id)
        if gi not in store:
            store[gi] = [{} for _ in range(n_slots)]
        store[gi][slot][key] = [ta.mutation, pg, genotypes[0], genotypes[1]]


def get_gene_data(vcffile, pedigree, GQ_THRES):
    """Retrieves gene_transcript wise variants where there exists at least one
    frameshift/stopgain mutation.
    Args:
        - vcffile(str):    Input VCF file.
                           Note - VCF should be VARANT annotated.
        - pedigree(list):    [Father SampleID, Mother SampleID,
                        Child SampleID]. Expects the order in which
                        the SampleIDs are mentioned above.
        - GQ_THRES(int):    Threshold Genotype Quality

    Returns:
        - gene_data_phased(dictionary):    Genotype Phased gene_transcript
                                           wise variants where there is
                                           at least one Frameshift/
                                           Stopgain mutation.
        - gene_data_unphased(dictionary):    Genotype Unphased gene_transcript
                                             wise variants where there is
                                             at least one Frameshift/Stopgain
                                             mutation in homozygous state.

    Both dictionaries are keyed by (gene, transcript id); each value is a
    list of dicts mapping
    (chrom, pos, ids, ref, alt allele, alt index) to
    [mutation, phased genotype, genotypes[0], genotypes[1]].
    Only variant groups holding more than one variant are reported.
    """
    phased = {}
    unphased = {}
    FILTER = ['PASS', 'VQSRTrancheSNP99.00to99.90']
    parser = vcf.VCFParser(vcffile)
    try:
        for rec in parser:
            parser.parseinfo(rec)
            parser.parsegenotypes(rec)
            varfltr = rec['filter']
            if not any(flt in varfltr for flt in FILTER):
                continue
            genotypes = check_genotype(rec, pedigree, GQ_THRES)
            if not genotypes:
                continue
            pg = phase(*genotypes)
            is_phased = pg[1] == '|'
            alleles = [int(pg[0]), int(pg[-1])]
            annot = vp.parse(rec.info)
            for idx, altid in enumerate(alleles):
                if is_phased:
                    # phased: keep the two allele positions apart
                    _add_coding_exonic_variants(phased, 2, idx, rec, annot,
                                                altid, pg, genotypes)
                else:
                    # unphased: both alleles share a single bucket
                    _add_coding_exonic_variants(unphased, 1, 0, rec, annot,
                                                altid, pg, genotypes)
    finally:
        # the original never released the parser's file handle
        parser.stream.close()

    gene_data_phased = {}
    for gi, buckets in phased.items():
        for variants in buckets:
            if len(variants) > 1 and any(
                    _is_frameshift_or_stopgain(entry[0])
                    for entry in variants.values()):
                gene_data_phased.setdefault(gi, []).append(variants)
    del phased

    gene_data_unphased = {}
    for gi, buckets in unphased.items():
        for variants in buckets:
            # the truncating mutation must be homozygous: the allele digits
            # at positions 0 and 2 of the genotype string must be equal
            if len(variants) > 1 and any(
                    _is_frameshift_or_stopgain(entry[0])
                    and int(entry[1][0]) == int(entry[1][2])
                    for entry in variants.values()):
                gene_data_unphased.setdefault(gi, []).append(variants)
    del unphased
    return gene_data_phased, gene_data_unphased
Exemple #9
0
def _write_column_definitions(sheets, col_map, colnames, opthdrs):
    """Fill the 'coldef' sheet with (name, description) rows.

    A col_map key is written when its lowercase form is a substring of one
    of the VARANT/optional header names or of one of the column names.
    """
    known_hdrs = IN_TRANS_HDRS + opthdrs + ['Gene_Name'] + IN_VAR_HDRS
    rowidx = 1
    for key, value in sorted(col_map.items()):
        needle = key.lower()
        if (any(needle in hdr.lower() for hdr in known_hdrs)
                or any(needle in col[0].lower() for col in colnames)):
            sheets['coldef'].row(rowidx).set_cell_text(0, key)
            sheets['coldef'].row(rowidx).set_cell_text(1, value)
            rowidx += 1


def run(infile,
        outfile,
        hdrfile,
        genefile,
        vknown,
        samples=None,
        fl=None):
    """Convert a VCF file into one or more xls workbooks.

    Args:
        infile: input VCF path.
        outfile: output workbook path; follow-up workbooks are named
            ``outfile[:-4] + '_<n>.xls'`` once more than 60000 rows were
            written to the current one.
        hdrfile: optional tab-separated file (name, description) used for
            the column-definition sheet.
        genefile: optional tab-separated file whose rows are passed on to
            ``write_varant_links``.
        vknown: forwarded to ``write_varant_links``.
        samples: sample ids to export; defaults to all samples in the VCF.
        fl: argument for ``Filter``; defaults to ``[[], [], [], []]``.
    """
    # a fresh list per call: the former list default was shared across calls
    if fl is None:
        fl = [[], [], [], []]

    logger.info('Running vcf to xls conversion script')
    genescores = []
    if genefile:
        with open(genefile, 'r') as g:
            genescores = [line.strip('\n').split('\t') for line in g]
    if samples:
        parser = vcf.VCFParser(infile, samples)
    else:
        parser = vcf.VCFParser(infile)
        samples = parser.samples

    # Write the column definitions to a tab
    col_map = defaultdict(str)
    ikeys = parser.meta['INFO'].keys()
    if hdrfile:
        with open(hdrfile, 'r') as hf:
            col_map = {}
            for line in hf:
                fields = line.strip('\n').split('\t')
                col_map[fields[0]] = fields[1]
    # descriptions from the VCF header override those from hdrfile
    for key in ikeys:
        col_map[key] = parser.meta['INFO'][key][-1]
    colnames, infokeys, formkeys, samplehdrs = columnnames(parser, samples)
    opthdrs = []
    if 'VARANT_IMPACTCODE' in infokeys:
        opthdrs.append('VARANT_IMPACTCODE')
    if genefile:
        opthdrs.append('Comments_on_Genes')
    book, sheets = create_book(colnames, samplehdrs, opthdrs)
    logger.info('Writing the column definitions in Column Def sheet')
    _write_column_definitions(sheets, col_map, colnames, opthdrs)

    f = Filter(fl)
    rowidx = 1
    excelrows = 0
    # NOTE: assumes outfile ends with a 4-character extension such as '.xls'
    basename = outfile[:-4]
    booknum = 2
    logger.info('Writing data to VCF tab')
    logger.info('The fields from the varant annotation are '\
        + ','.join(IN_TRANS_HDRS + IN_VAR_HDRS))
    for rec in parser:
        parser.parseinfo(rec)
        if excelrows > 60000:
            # start a new workbook before the current sheet gets too long
            logger.info('Write out book and create a new one as the number '
                        'of rows reached 60000')
            write(book, outfile)
            outfile = basename + '_' + str(booknum) + '.xls'
            book, sheets = create_book(colnames, samplehdrs, opthdrs)
            _write_column_definitions(sheets, col_map, colnames, opthdrs)
            excelrows = 0
            booknum += 1
            rowidx = 1
        if not f.retain(rec):
            continue
        excelrows += 1

        if samples:
            parser.parsegenotypes(rec)

        row = sheets['vcf'].row(rowidx)
        idx = 0
        # fixed VCF columns
        for e, t in colnames[:7]:
            field = e.lower()
            # compare by value; identity checks on strings are unreliable
            if e == 'FILTER' and rec[field][0] == '.':
                write_cell(row, idx, field, None, t, ';')
            else:
                write_cell(row, idx, field, rec[field], t, ';')
            idx += 1
        info = rec.info
        for e, t in infokeys:
            if e not in IN_EFF:
                write_cell(row, idx, e, info.get(e, None), t)
                idx += 1
        # NOTE(review): the result of parse() was never used; the call is
        # kept in case it validates the annotation -- confirm and remove
        parse(info)
        ga = get_prior_geneannot(info, alltrans=True)
        idx = write_varant_links(row, rec, ga, samples, formkeys, colnames,
                                 idx, vknown, genescores)
        sheets['vcf'].panes_frozen = True
        sheets['vcf'].remove_splits = True
        sheets['vcf'].vert_split_pos = 2
        sheets['vcf'].horz_split_pos = 1
        # per-sample FORMAT columns
        for s in samples:
            ss = getattr(rec, s)
            for e, t in formkeys:
                value = ss.get(e, None)
                write_cell(row, idx, e, value, t)
                idx += 1

        rowidx += 1
        if not excelrows % 500:
            sheets['vcf'].flush_row_data()
    parser.stream.close()
    write(book, outfile)
Exemple #10
0
	def _extract_mutation_info(self,beta_fits):
		'''
		objective: to extract, for every (record, gene) pair of the annotated/filtered VCF file,
			[gene, variant_type, variant_class_tag, protein_length, in_silico_prediction_score,
			 MAF_significance_offset, pathogenic_domain_p, pathogenic_domain_density_p]

		When self.proband_id is set the VCF is also rewritten in place: the INFO keys
		Exonic, Annotation and Compounds are removed and the genmod inheritance model
		tag is appended to CLASS_TAG.

		beta_fits is forwarded to vcf.get_CADD_scores / vcf.get_GERP_scores.
		Returns the list of extracted entries.
		'''
		job_name = '_extract_mutation_info'
		msg='collecting variant information and class label to determine genetic damage [%s;%s]...'%(job_name,self.vcf)
		lib_utils.msgout('notice',msg);self.logger.info(msg)

		mutation_info = []
		# first pass over the VCF: pathogenic-domain scores
		if self.proband_id:
			rewrite_vcf = True
			v = vcf.VCFParser(self.vcf,sampleids=[self.proband_id])
			pdom,pdom0 = self.gather_pdomain_scores(v)
			v.stream.close()
			vcf_tmp = self.vcf+'.tmp'
			ostream = open(vcf_tmp, 'w')
			rmInfo = ['Exonic','Annotation','Compounds']
			v = vcf.VCFParser(self.vcf)
			v.writeheader(ostream,to_del_info = rmInfo)
		else:
			rewrite_vcf = False
			v = vcf.VCFParser(self.vcf)
			pdom,pdom0 = self.gather_pdomain_scores(v)
			v.stream.close()
			v = vcf.VCFParser(self.vcf)

		msg = 'Importing max transcript length for each gene ...'
		lib_utils.msgout('notice', msg);
		self.logger.info(msg)

		# protein length per gene = max CDS length / 3
		refgene = Refgene()
		cds_lens = refgene.get_max_cds_length()
		tx_lens = {}
		for gene, cds_len in cds_lens.iteritems():
			tx_lens[gene] = int(cds_len/3.)

		# ridx is the 0-based record index, matched against pdom.ridx below
		ridx = 0
		for rec in v:

			v.parseinfo(rec)

			#to remove redundant gene symbols annotated by genmod but add transcript version
			if rewrite_vcf:
				for rkey in rmInfo:
					v.delete_info(rec, rkey)

				if rec.info.GeneticModels:
					genmod_tag = lib_ped.parse_genmod_inherit_model(\
												rec.info.GeneticModels[0].split(':')[1])
					rec.info.CLASS_TAG += genmod_tag

				v.write(ostream, rec)

			varlist = normalize_variant(rec.chrom, rec.pos, rec.ref, rec.alt)
			mut_type = varlist[0][-1]

			if ':' in rec.id[0]:
				mut_type = 'mnp'

			# collect conservation prediction score (CADD and GERP++)
			cadd_aa = './.'
			px_cadd = None
			if rec.info.CADD_raw:
				# to get CADD_raw (average)
				px_cadd, cadd_aa = vcf.get_CADD_scores(mut_type, rec.info.CADD_aa, rec.info.CADD_raw, beta_fits)

			# to get GERP++ score
			px_gerp = None
			if rec.info.GerpConserve:
				px_gerp = vcf.get_GERP_scores(mut_type, cadd_aa, rec.info.GerpRSScore, beta_fits)

			# which score can be chosen (0.5 when neither is available)
			px = 0.5
			if self.cadd>0 and px_cadd is not None:
				px = px_cadd
			elif px_gerp is not None:
				px = px_gerp

			vpop = vp.parse(rec.info)
			genes = []

			# to get MAF in the order of ExAC, ESP, and 1K
			if rec.info.EXACDB:
				maf = get_min_maf(rec.info.EXACAF[0])
			elif rec.info.ESPDB:
				maf = get_min_maf(rec.info.ESPAF[0])
			elif rec.info.KGDB:
				maf = get_min_maf(rec.info.KGAF[0])
			else:
				maf = 0.

			# to compute a significance of MAF
			maf_offset = self.dm.get_maf_xoffset(maf)

			# look up the pathogenic-domain scores gathered for this record index
			pdom_idx = pdom.index[pdom.ridx == ridx].tolist()
			if pdom_idx:
				patho_p = pdom.phat_lo[pdom_idx[0]]
				patho_pden = pdom.patho_dens_p[pdom_idx[0]]
			else:
				# assign a default pathogenic domain value (15% quantile value)
				patho_p = pdom0.phat_lo
				patho_pden = pdom0.patho_dens_p

			vartype = get_var_type(rec.ref,rec.alt)

			# to get transcript length
			for altnum, val in vpop.items():
				# for each gene involved with the variant
				for gene, gd in val.items():
					protein_len = self.dm.avg_protein_len
					if gene in tx_lens:
						protein_len = tx_lens[gene]

					# store a set of essential annotation to be used for genetic damage
					# (each gene is reported once per record)
					if gene not in genes:
						mutation_info.append([gene, vartype, rec.info.CLASS_TAG, protein_len, px, maf_offset, patho_p, patho_pden])
						genes.append(gene)

			ridx += 1

		# done reading filtered VCF file; the parser is closed in both modes
		# (previously it was left open when the VCF was not rewritten)
		v.stream.close()
		if rewrite_vcf:
			ostream.close()
			os.rename(vcf_tmp,self.vcf)

		msg = 'done. [%s]'%job_name
		lib_utils.msgout('notice',msg); self.logger.info(msg)

		return mutation_info
Exemple #11
0
	def __init__(self, uargs):
		'''
		Transfer user input arguments (uargs) to class member variables.

		Requires the environment variable DIVINE to point at the Divine root
		directory, under which gcn/config/divine.conf must be a valid file.

		Raises:
			EnvironmentError: DIVINE is not set.
			IOError: the configuration file is not valid.
			RuntimeError: a ped file is given without a proband ID, or a VCF
				without ped file contains more than one sample.
		'''
		self.to_delete_fns = []
		self.exp_tag = uargs.exp_tag
		self.vknown = uargs.vknown
		self.cadd = uargs.cadd
		self.top_k_disease = uargs.top_k_disease

		self.excl_non_coding = False
		self.sparser = SafeConfigParser()

		self.omim = None

		self.pheno_dmg = {}
		self.gt_dmg = {}
		self.gene_dmg = {}
		self.vknown_genes = {}

		lib_utils.msgout('notice','initializing Divine ...','Divine')

		divine_root_dir = os.environ.get("DIVINE")
		if not divine_root_dir:
			raise EnvironmentError("set DIVINE variable properly!")

		config_fn = os.path.join(divine_root_dir,'gcn','config','divine.conf')

		if not lib_utils.check_if_file_valid(config_fn):
			raise IOError("check if the configuration file[%s] is valid!" % config_fn)

		self.config_fn = config_fn
		self.entries = {'divine_root':divine_root_dir}
		# NOTE(review): self.out_dir and self.logger are used below but not
		# assigned in this method -- presumably set by _set_args; confirm
		self._set_args(uargs)

		# phenotype-driven output files exist only when an HPO query is given
		self.hpo_query = uargs.hpo_query
		if self.hpo_query is None:
			self.hpo2disease_fn = None
			self.pheno_dmg_fn = None
			self.disease_rank_fn = None
		else:
			self.hpo2disease_fn = self._assign_out_fn('hpo_to_diseases','tsv')
			self.pheno_dmg_fn = self._assign_out_fn('pheno_gene_rank','tsv')
			self.disease_rank_fn = self._assign_out_fn('diseases_rank','tsv')

		self.gene_rank_fn = self._assign_out_fn('gene_rank', 'tsv')
		self.vcf = uargs.vcf
		self.ped = None
		self.proband_id = None
		self.genotype = True
		# defined up front so the attribute also exists when no VCF is given
		self.is_family_vcf = False

		if self.vcf:
			if uargs.ped:
				self.is_family_vcf = True
				if uargs.proband_id:
					lib_ped.check_consistency_ped_vcf(\
															self.vcf,uargs.ped,uargs.proband_id)
					self.ped = uargs.ped
					self.proband_id = uargs.proband_id
				else:
					msg = "A family file [%s] was provided but you didn't provide a proband ID to examine. Specify the probrand ID available in the VCF [%s] using an option -p."\
						%(uargs.ped,self.vcf)
					print(msg)
					raise RuntimeError(msg)

			else:
				#get sample_ids contained into VCF file
				v = vcf.VCFParser(self.vcf)
				n_samples = len(v.samples)
				# only the sample count is needed; release the file handle now
				v.stream.close()
				if n_samples > 1:
					raise RuntimeError('VCF file [%s] contains more than one sample. Let me know which sample is a proband to diagnose!'%self.vcf)
				elif n_samples == 1:
					#search sample_id and create a temp ped for the proband
					self.ped = os.path.join(self.out_dir,'proband_tmp.ped')
					self.proband_id = lib_ped.create_proband_ped(self.vcf,self.ped)
					self.to_delete_fns.append(self.ped)
				else:
					# no sample column at all: genotypes cannot be used
					self.genotype = False

		self.xls = None
		self.hgmd = uargs.hgmd
		self.cosmic = uargs.cosmic
		self.dblink = uargs.dblink

		# damage factor w.r.t the location of variant within the transcript
		self.dm = damaging_model.DmgCoeff(\
			uargs.indel_fidel,uargs.go_seed_k,self.logger)

		if uargs.ref_exon_only==1:
			msg = 'VCF is going to be masked by RefGene coding region'
			lib_utils.msgout('notice',msg);self.logger.info(msg)

		self.ref_exon_only = uargs.ref_exon_only

		lib_utils.msgout('notice','done. initialization')
def append_annotation_to_vcf2(vcf_fn, vars_to_summuary, submissions, out_vcf):
    """Copy a ClinVar VCF to ``out_vcf`` with extra INFO annotations.

    For each record the accession ids in INFO ``CLNACC`` are split on '|'
    and de-duplicated (order kept). The first id whose part before '.' is a
    key of ``vars_to_summuary`` supplies REFTX and, when non-empty, HGVSc,
    HGVSp, DATE and REV. SPLOC is set from the signed offset found in HGVSc
    when its absolute value is below 3. CLNMETHOD is the '|'-joined set of
    collection methods of the matching ``submissions`` entry, if any.
    Escaped commas ('\\x2c_' and '\\x2c') in ``CLNDBN`` become ','.
    """
    print('appending annotation to clinvar VCF file ...')
    parser = vcf.VCFParser(vcf_fn)
    ostream = open2(out_vcf, 'w')

    new_info_fields = (
        ("REFTX", "1", "String", "RefSeq Transcript Name"),
        ("HGVSc", "1", "String", "HGVSc change in HGVS nomenclature"),
        ("HGVSp", "1", "String", "AA change in HGVS nomenclature"),
        ("SPLOC", "1", "Integer", "Distance from the predicted splice site"),
        ("DATE", "1", "String", "Last evaluated date"),
        ("REV", "1", "String", "Review status"),
        ("CLNMETHOD", "1", "String", "Collection methods"),
    )
    for field_def in new_info_fields:
        parser.add_meta_info(*field_def)
    parser.writeheader(ostream)

    for rec in parser:
        parser.parseinfo(rec)

        # flatten '|'-joined accession ids, keeping first occurrences only
        accessions = []
        for joined_ids in rec.info.CLNACC:
            for acc in joined_ids.split('|'):
                if acc not in accessions:
                    accessions.append(acc)

        for acc in accessions:
            # the summary table is keyed by the accession without version
            base_acc = acc.split('.')[0]
            if base_acc not in vars_to_summuary:
                continue
            summary = vars_to_summuary[base_acc]

            rec.info.REFTX = summary.REFTX
            if summary.HGVSc:
                rec.info.HGVSc = summary.HGVSc
                hit = re.search(r'c\.(.*)([\+\-]\d+)\D+', rec.info.HGVSc)
                if hit:
                    offset = hit.group(2)
                    if abs(int(offset)) < 3:
                        rec.info.SPLOC = offset

            # optional fields are copied only when they carry a value
            for field in ('HGVSp', 'DATE', 'REV'):
                value = getattr(summary, field)
                if value:
                    setattr(rec.info, field, value)

            if summary.variation_id in submissions:
                methods = list(
                    set(submissions[summary.variation_id].collection_methods))
                rec.info.CLNMETHOD = '|'.join(methods)

            # only the first accession with a summary is used
            break

        rec.info.CLNACC = accessions
        for pos, disease_name in enumerate(rec.info.CLNDBN):
            unescaped = disease_name.replace('\\x2c_', ',')
            rec.info.CLNDBN[pos] = unescaped.replace('\\x2c', ',')

        parser.write(ostream, rec)

    ostream.close()
    parser.stream.close()
    print('Done.')