Python VCF Examples, classVCF.VCF Python Examples

Example #1

0

Show file

File: 119_vcfParser_all_f.py Project: vikas0633/python

def parseFile(ifile):
    """Print a tab-separated genotype table for selected records of a VCF file.

    A header row is stored in the module-level ``HEADER`` and printed first:
    ``CHROM``, ``POS`` and, for each hard-coded sample name, the three
    columns ``<name>_GC``, ``<name>_GQ`` and ``<name>_DP``.

    Every non-blank line of ``ifile`` that does not start with '#' is parsed
    with ``classVCF.VCF``.  A record is printed when the sample at index 2 is
    not called '0/1' and both ``len(obj.alts())`` and ``len(obj.refs())``
    equal 1 (presumably single-base, biallelic sites -- confirm against
    classVCF).  For each genotype the call, quality and depth are printed;
    a call of 'NONE' is printed with quality and depth '0'.

    Args:
        ifile: path of the input VCF file.
    """
    global HEADER

    # Column order of the samples; index 2 is MG20_genomic_20130609.
    sample_names = (
        'Burtii_20130605', 'Gifu_20130609', 'MG20_genomic_20130609',
        'mg004', 'mg010', 'mg012', 'mg019', 'mg023', 'mg036', 'mg049',
        'mg051', 'mg062', 'mg072', 'mg073', 'mg077', 'mg080', 'mg082',
        'mg083', 'mg086', 'mg089', 'mg093', 'mg095', 'mg097', 'mg101',
        'mg107', 'mg109', 'mg112', 'mg113', 'mg118', 'mg123', 'mg128',
    )
    HEADER = 'CHROM\tPOS\t' + ''.join(
        name + '_GC\t' + name + '_GQ\t' + name + '_DP\t'
        for name in sample_names)
    print(HEADER)

    # The context manager closes the input even if VCF parsing raises.
    with open(ifile, 'r') as vcf:
        for line in vcf:
            if len(line) <= 1 or line.startswith('#'):
                continue
            obj = classVCF.VCF(line)
            genotypes = obj.genotypes()

            # Skip records where sample 2 is heterozygous or where REF/ALT
            # do not both have length 1.
            if (obj.genotype(2) == '0/1'
                    or len(obj.alts()) != 1
                    or len(obj.refs()) != 1):
                continue

            fields = [obj.chroms(), str(obj.poss())]
            for i in range(len(genotypes)):
                call = obj.genotype(i)
                if call != 'NONE':
                    fields += [call,
                               str(obj.genotypeQual(i)),
                               str(obj.genotypeDepth(i))]
                else:
                    # Missing call: report zero quality and zero depth.
                    fields += [call, '0', '0']
            # Every column, including the last, is followed by a tab.
            print('\t'.join(fields) + '\t')

Example #2

0

Show file

File: 21bc_GenotypicDistance.py Project: vikas0633/python

def parse():
    """Accumulate a pairwise genotype distance matrix over a VCF file.

    Reads the module-level ``ifile``.  Lines starting with '##' and blank
    lines are skipped.  A remaining line starting with '#' (the column
    header) supplies the genotype list via ``obj.genotypes()`` and resets
    the matrix to zero for every ``(i, j)`` pair.  For every other line,
    ``calc_dist(genotype_i, genotype_j)`` is added to ``dist_mat[i, j]``.
    The result is passed to ``printOut(dist_mat, genotypes)``.

    A progress message is printed every 10000 processed lines (the header
    line is included in that count).

    Raises:
        ValueError: if a data record appears before the header line, or if
            the file has no header line at all.
    """
    count = 0
    then = time.time()
    dist_mat = None
    genotypes = None
    g_count = 0

    # The context manager closes the input even if parsing raises.
    with open(ifile, 'r') as vcf:
        for line in vcf:
            if line.startswith('##'):
                continue
            line = line.strip()
            if not line:
                # Blank lines carry no record; do not hand them to the parser.
                continue
            obj = classVCF.VCF(line)

            count += 1
            if count % 10000 == 0:
                diff = time.time() - then
                # Floor division keeps the minute count an integer.
                minutes, seconds = int(diff) // 60, diff % 60
                print(' '.join(['Number of markers processed: ',
                                '{:9,.0f}'.format(count)]))
                print('Time taken Min:Sec ==> ' + str(minutes) + ':' +
                      str(round(seconds, 2)))

            if line.startswith('#'):
                genotypes = obj.genotypes()
                g_count = len(genotypes)
                dist_mat = dict(((i, j), 0)
                                for i in range(g_count)
                                for j in range(g_count))
            else:
                if dist_mat is None:
                    raise ValueError(
                        "data record found before the '#' header line in %s"
                        % ifile)
                # Fetch each call once per record instead of once per pair.
                # NOTE(review): assumes obj.genotype(i) is a pure accessor.
                calls = [obj.genotype(i) for i in range(g_count)]
                for i, geno1 in enumerate(calls):
                    for j, geno2 in enumerate(calls):
                        dist_mat[i, j] += calc_dist(geno1, geno2)

    if dist_mat is None:
        raise ValueError("no '#' header line found in %s" % ifile)
    printOut(dist_mat, genotypes)

Example #3

0

Show file

def parseFile(ifile):
    """Copy VCF records whose third sample is called '0/0' to a new file.

    Reads ``ifile`` line by line, skips blank lines and every line starting
    with '#', and writes each remaining record for which
    ``obj.genotype(2) == '0/0'`` to ``ifile + '.MG20filtered'``.  Header
    lines are not copied to the output.

    Args:
        ifile: path of the input VCF file.
    """
    # Context managers close both handles even if VCF parsing raises.
    with open(ifile + '.MG20filtered', 'w') as out, open(ifile, 'r') as vcf:
        for line in vcf:
            if len(line) <= 1 or line.startswith('#'):
                continue
            line = line.strip('\n')
            obj = classVCF.VCF(line)
            # Keep only records where sample index 2 (presumably MG20, going
            # by the output suffix) is reference-homozygous.
            if obj.genotype(2) == '0/0':
                out.write(line + '\n')

Example #4

0

Show file

def makeFasta():
    """Build one sequence per sample from the genotype calls of a VCF file.

    Reads the module-level ``ifile``.  The '#CHROM' line supplies the sample
    names via ``obj.genotypes()``.  For every data record and every sample
    index ``i``, the value appended to ``seqs[i]`` is ``obj.refs()`` for a
    '0/0' call, ``obj.alts()`` for a '1/1' call and 'N' for anything else.
    The result is passed to ``printFasta(seqs, genotypes_names)``.

    A progress message is printed every 100000 records.

    Raises:
        ValueError: if the file contains no '#CHROM' header line.
    """
    seqs = {}
    count = 0
    genotypes_names = None

    # The context manager closes the input even if parsing raises.
    with open(ifile, 'r') as vcf:
        for line in vcf:
            if line.startswith('#CHROM'):
                genotypes_names = classVCF.VCF(line.strip('\n')).genotypes()
                continue
            if len(line) <= 1 or line.startswith('#'):
                continue

            obj = classVCF.VCF(line.strip('\n'))
            genotypes = obj.genotypes()

            count += 1
            if count % 100000 == 0:
                print(' '.join(['Number of markers processed: ',
                                '{:9,.0f}'.format(count)]))

            for i in range(len(genotypes)):
                call = obj.genotype(i)
                if call == '0/0':
                    allele = obj.refs()
                elif call == '1/1':
                    allele = obj.alts()
                else:
                    # Heterozygous, missing or any other call.
                    allele = 'N'
                # The first value is assigned and later ones appended, so the
                # container type is whatever refs()/alts() returned first.
                if i in seqs:
                    seqs[i] += allele
                else:
                    seqs[i] = allele

    if genotypes_names is None:
        raise ValueError("no '#CHROM' header line found in %s" % ifile)
    printFasta(seqs, genotypes_names)

Example #5

0

Show file

def parseFile(ifile):
    """Copy selected VCF records to ``ifile + 'MoreThanMG20coverage'``.

    Blank lines and lines starting with '#' are skipped.  A record is kept
    when ``obj.genotypeDepthSUM()`` is greater than ``obj.genotypeDepth(2)``
    (so sample index 2 is not the only one with coverage) and sample
    index 2 is called '0/0'.  The number of written records is printed at
    the end.

    Args:
        ifile: path of the input VCF file.
    """
    count = 0
    # Context managers close both handles even if VCF parsing raises.
    with open(ifile + 'MoreThanMG20coverage', 'w') as out, \
            open(ifile, 'r') as vcf:
        for line in vcf:
            if len(line) <= 1 or line.startswith('#'):
                continue
            line = line.strip('\n')
            obj = classVCF.VCF(line)
            # NOTE(review): result unused; call kept in case the depth
            # accessors rely on it having run -- confirm against classVCF.
            obj.genotypes()
            has_other_coverage = (
                int(obj.genotypeDepthSUM()) > int(obj.genotypeDepth(2)))
            if has_other_coverage and obj.genotype(2) == '0/0':
                out.write(line + '\n')
                count += 1
    print('Total positions printed: %d' % count)

Example #6

0

Show file

def parse_call(chromosome, callable_file):
    """Return the depth of every record on one chromosome.

    Blank lines and lines starting with '#' are skipped.  Each remaining
    line is split on tabs; when its first column equals ``chromosome`` the
    line is parsed with ``classVCF.VCF`` and ``str(obj.depth())`` is stored
    under the integer position taken from the second column.

    A progress message is printed every 100000 data lines.

    Args:
        chromosome: chromosome name to match against the first column.
        callable_file: path of the tab-separated input file.

    Returns:
        dict mapping position (int) to depth (str).
    """
    depth_by_pos = {}
    count = 0
    # The context manager closes the input even if parsing raises.
    with open(callable_file, 'r') as handle:
        for line in handle:
            if len(line) <= 1 or line.startswith('#'):
                continue

            count += 1
            if count % 100000 == 0:
                print(' '.join(['Lines processed in callable fraction file: ',
                                str(chromosome),
                                '{:9,.0f}'.format(count)]))

            # Split before testing the chromosome: the columns must exist
            # before the first one can be compared.
            line = line.strip()
            tokens = line.split('\t')
            if tokens[0] == chromosome:
                obj = classVCF.VCF(line)
                depth_by_pos[int(tokens[1])] = str(obj.depth())
    return depth_by_pos

Example #7

0

Show file

def parseFile(ifile):
    """Filter a VCF on sample index 2 and report per-sample call counts.

    Lines starting with '##' and blank lines are skipped.  The '#CHROM'
    line is copied to ``ifile + '.MG20filtered'``, stored in the
    module-level ``HEADER`` and used to read the sample names (columns 10
    onward).  Every other record whose sample index 2 is called '0/0' is
    copied to the output, and for each sample a '0/1' or '1/0' call is
    counted as heterozygous and a '0/0' or '1/1' call as homozygous.

    Finally the number of kept records and one row per sample (name, het
    count, hom count, het fraction, hom fraction) are printed.  A sample
    with no counted calls is reported with both fractions 0.0.

    Args:
        ifile: path of the input VCF file.

    Raises:
        ValueError: if a kept record appears before the '#CHROM' line.
    """
    global HEADER
    count = 0
    header_seen = False
    sample_names = []
    samples_het = []
    samples_homo = []

    # Context managers close both handles even if VCF parsing raises.
    with open(ifile + '.MG20filtered', 'w') as out, open(ifile, 'r') as vcf:
        for line in vcf:
            if len(line) <= 1 or line.startswith('##'):
                continue
            line = line.strip('\n')

            if line.startswith('#CHROM'):
                out.write(line + '\n')
                HEADER = line
                header_seen = True
                sample_names = line.split('\t')[9:]
                samples_het = [0] * len(sample_names)
                samples_homo = [0] * len(sample_names)
                continue

            obj = classVCF.VCF(line)
            genotypes = obj.genotypes()
            # Keep only records where sample index 2 is reference-homozygous.
            if obj.genotype(2) != '0/0':
                continue
            if not header_seen:
                raise ValueError(
                    "data record found before the '#CHROM' header in %s"
                    % ifile)
            out.write(line + '\n')
            count += 1
            for i in range(len(genotypes)):
                call = obj.genotype(i)
                if call in ('0/1', '1/0'):
                    samples_het[i] += 1
                elif call in ('0/0', '1/1'):
                    samples_homo[i] += 1

    print(' '.join(['Marksers used: ', str(count)]))
    print('Sample\tHetCount\tHomoCount\tHetPer\tHomoPer')

    for name, het, homo in zip(sample_names, samples_het, samples_homo):
        total = het + homo
        if total:
            het_frac = float(het) / total
            homo_frac = float(homo) / total
        else:
            # No counted calls for this sample: avoid dividing by zero.
            het_frac = homo_frac = 0.0
        print('\t'.join([name, str(het), str(homo),
                         str(het_frac), str(homo_frac)]))

Example #8

0

Show file

def parseFile(ifile):
    """Copy VCF records with a heterozygous call beyond the first three samples.

    Blank lines and lines starting with '#' are skipped.  A record is
    written to ``ifile + '.NoBurttiiGifu'`` when at least one sample with
    index 3 or higher is called '0/1'; samples 0-2 are not inspected.  The
    number of written records is printed at the end.

    Args:
        ifile: path of the input VCF file.
    """
    count = 0
    # Context managers close both handles even if VCF parsing raises.
    with open(ifile + '.NoBurttiiGifu', 'w') as out, open(ifile, 'r') as vcf:
        for line in vcf:
            if len(line) <= 1 or line.startswith('#'):
                continue
            line = line.strip('\n')
            obj = classVCF.VCF(line)
            n_samples = len(obj.genotypes())
            # any() stops at the first heterozygous sample.
            # NOTE(review): assumes obj.genotype(i) is a pure accessor.
            if any(obj.genotype(i) == '0/1' for i in range(3, n_samples)):
                out.write(line + '\n')
                count += 1
    print('Total positions printed: %d' % count)

Example #9

0

Show file

File: 119d_vcfParser.py Project: vikas0633/python

def parseFile(ifile):
    """Write a per-record summary table of a VCF file to ``ifile + '.tbl'``.

    The header row is stored in the module-level ``HEADER`` and written
    first.  For every non-blank line that does not start with '#', one
    tab-separated row is written with: chromosome, position, the constant
    '1' (the 'Callable' column), and the values returned by ``variants()``,
    ``depth()``, ``genotypeCalls()``, ``genotypeCallsHete()``,
    ``genotypeCallsHomo()``, ``InbreedingCoeffs()`` and
    ``HaplotypeScores()`` of the parsed record.

    Args:
        ifile: path of the input VCF file.
    """
    global HEADER
    HEADER = '#Chromosome\tPosition\tCallable\tSNP\tDepth\tgenotypeCalls\tgenotypeCallsHete\tgenotypeCallsHomo\tInbreedingCoeffs\tHaplotypeScores'

    # Context managers close both handles even if VCF parsing raises.
    with open(ifile + '.tbl', 'w') as out, open(ifile, 'r') as vcf:
        out.write(HEADER + '\n')
        for line in vcf:
            if len(line) <= 1 or line.startswith('#'):
                continue
            obj = classVCF.VCF(line.strip('\n'))
            # NOTE(review): result unused; call kept in case the summary
            # accessors rely on it having run -- confirm against classVCF.
            obj.genotypes()
            fields = (
                obj.chroms(),
                obj.poss(),
                '1',
                obj.variants(),
                obj.depth(),
                obj.genotypeCalls(),
                obj.genotypeCallsHete(),
                obj.genotypeCallsHomo(),
                obj.InbreedingCoeffs(),
                obj.HaplotypeScores(),
            )
            out.write('\t'.join(str(field) for field in fields) + '\n')