def parseFile(ifile):
    """Print a tab-separated genotype table for selected records of a VCF file.

    A header row is printed first (and stored in the module-level ``HEADER``):
    ``CHROM``, ``POS`` and then ``<name>_GC``, ``<name>_GQ``, ``<name>_DP`` for
    every name in the hard-coded sample list below.

    A record is printed only when sample index 2 is not called ``'0/1'`` and
    both ``obj.alts()`` and ``obj.refs()`` have length 1.  For each sample the
    row carries the genotype, its quality and its depth; samples whose
    genotype is ``'NONE'`` get ``0`` for quality and depth.

    NOTE(review): the sample list is hard-coded, so the header only matches
    the rows if the VCF's sample columns are in this same order -- confirm.

    Args:
        ifile: path of the VCF file to read.
    """
    global HEADER

    sample_names = (
        'Burtii_20130605 Gifu_20130609 MG20_genomic_20130609 '
        'mg004 mg010 mg012 mg019 '
        'mg023 mg036 mg049 mg051 mg062 mg072 mg073 mg077 mg080 mg082 mg083 '
        'mg086 mg089 '
        'mg093 mg095 mg097 mg101 mg107 mg109 mg112 mg113 mg118 mg123 mg128'
    ).split()

    HEADER = 'CHROM\tPOS\t' + ''.join(
        name + '_GC\t' + name + '_GQ\t' + name + '_DP\t'
        for name in sample_names
    )
    # Single-argument print() emits the same bytes under Python 2 and 3.
    print(HEADER)

    # The context manager closes the input even if a record fails to parse.
    with open(ifile, 'r') as vcf:
        for line in vcf:
            if len(line) <= 1 or line.startswith('#'):
                continue
            obj = classVCF.VCF(line)

            # Skip records where sample 2 is heterozygous, or where REF/ALT
            # are not exactly one character long.
            if (obj.genotype(2) == '0/1'
                    or len(obj.alts()) != 1
                    or len(obj.refs()) != 1):
                continue

            fields = [obj.chroms(), str(obj.poss())]
            for i in range(len(obj.genotypes())):
                call = obj.genotype(i)
                if call != 'NONE':
                    fields.append(call)
                    fields.append(str(obj.genotypeQual(i)))
                    fields.append(str(obj.genotypeDepth(i)))
                else:
                    fields.extend((call, '0', '0'))
            # Every column, including the last, is followed by a tab.
            print('\t'.join(fields) + '\t')
def parse():
    """Accumulate a pairwise genotype distance matrix over a VCF file.

    Reads the module-level ``ifile``.  Lines starting with ``##`` are skipped.
    The first remaining line that starts with ``#`` is treated as the column
    header: its ``genotypes()`` become the sample list and the matrix is
    reset to zero for every ``(i, j)`` sample pair.  For every other line,
    ``calc_dist(genotype(i), genotype(j))`` is added to ``dist_mat[i, j]``.
    The result is handed to ``printOut(dist_mat, genotypes)``.

    Progress (count and elapsed time) is printed every 10,000 parsed lines;
    the count includes the header line.

    Raises:
        ValueError: if a data line appears before the header line, or if the
            file has no header line at all (previously an opaque NameError).
    """
    genotypes = None
    g_count = 0
    dist_mat = {}
    count = 0
    then = time.time()

    # The context manager closes the input even if parsing fails midway.
    with open(ifile, 'r') as vcf:
        for line in vcf:
            if line.startswith('##'):
                continue
            line = line.strip()
            if not line:
                # The original `len(line) > 0` test never filtered anything;
                # blank lines are skipped instead of being fed to the parser.
                continue

            obj = classVCF.VCF(line)
            count += 1
            if count % 10000 == 0:
                diff = time.time() - then
                # Floor division keeps whole minutes on Python 2 and 3.
                minutes, seconds = int(diff) // 60, diff % 60
                # Single-argument print() emits the same bytes on 2 and 3.
                print('%s %s' % ('Number of markers processed: ',
                                 '{:9,.0f}'.format(count)))
                print('Time taken Min:Sec ==> ' + str(minutes) + ':'
                      + str(round(seconds, 2)))

            if line.startswith('#'):
                genotypes = obj.genotypes()
                g_count = len(genotypes)
                dist_mat = {}
                for i in range(g_count):
                    for j in range(g_count):
                        dist_mat[i, j] = 0
                continue

            if genotypes is None:
                raise ValueError(
                    'data line encountered before the VCF header line')

            # Fetch each sample's call once per record instead of once per
            # (i, j) pair.
            calls = [obj.genotype(i) for i in range(g_count)]
            for i, geno1 in enumerate(calls):
                for j, geno2 in enumerate(calls):
                    dist_mat[i, j] += calc_dist(geno1, geno2)

    if genotypes is None:
        raise ValueError('no VCF header line found in input')
    printOut(dist_mat, genotypes)
def parseFile(ifile):
    """Copy records whose sample index 2 is called ``'0/0'`` to a new file.

    The output path is ``ifile + '.MG20filtered'``.  Lines starting with
    ``#`` and empty lines are not copied.  (The original inline comment
    describes sample index 2 as MG20, reference homozygous.)

    Args:
        ifile: path of the VCF file to read.
    """
    # Both handles are closed on exit, including when a record fails to parse.
    with open(ifile, 'r') as vcf, open(ifile + '.MG20filtered', 'w') as out:
        for line in vcf:
            if len(line) <= 1 or line.startswith('#'):
                continue
            record = line.strip('\n')
            if classVCF.VCF(record).genotype(2) == '0/0':
                out.write(record + '\n')
def makeFasta():
    """Build one pseudo-sequence per sample from a VCF and pass it to printFasta.

    Reads the module-level ``ifile``.  The ``#CHROM`` line supplies the sample
    names (via ``genotypes()`` on that line).  For every data record, each
    sample contributes ``obj.refs()`` when called ``'0/0'``, ``obj.alts()``
    when called ``'1/1'``, and ``'N'`` otherwise.  The result is a dict
    mapping sample index to the concatenated string, handed to
    ``printFasta(seqs, genotypes_names)``.

    Progress is printed every 100,000 records.

    Raises:
        ValueError: if the file has no ``#CHROM`` line (previously an opaque
            NameError when calling ``printFasta``).
    """
    pieces = {}  # sample index -> list of per-record fragments
    genotypes_names = None
    count = 0

    # The context manager closes the input even if parsing fails midway.
    with open(ifile, 'r') as vcf:
        for line in vcf:
            if line.startswith('#CHROM'):
                genotypes_names = classVCF.VCF(line.strip('\n')).genotypes()
                continue
            if len(line) <= 1 or line.startswith('#'):
                continue

            obj = classVCF.VCF(line.strip('\n'))
            count += 1
            if count % 100000 == 0:
                # Single-argument print() emits the same bytes on 2 and 3.
                print('%s %s' % ('Number of markers processed: ',
                                 '{:9,.0f}'.format(count)))

            for i in range(len(obj.genotypes())):
                call = obj.genotype(i)
                if call == '0/0':
                    fragment = obj.refs()
                elif call == '1/1':
                    fragment = obj.alts()
                else:
                    fragment = 'N'
                # Collect fragments and join once at the end; repeated
                # `seqs[i] += ...` copies the growing string on every record.
                pieces.setdefault(i, []).append(fragment)

    if genotypes_names is None:
        raise ValueError('no #CHROM header line found in input')

    seqs = {}
    for i, fragments in pieces.items():
        seqs[i] = ''.join(fragments)
    printFasta(seqs, genotypes_names)
def parseFile(ifile):
    """Write records covered by more than sample 2 alone and called 0/0 there.

    A record is copied to ``ifile + 'MoreThanMG20coverage'`` when
    ``int(obj.genotypeDepthSUM())`` is greater than
    ``int(obj.genotypeDepth(2))`` and sample index 2 is called ``'0/0'``.
    (The original inline comments describe sample index 2 as MG20.)
    Lines starting with ``#`` and empty lines are skipped.  The number of
    records written is printed at the end.

    Args:
        ifile: path of the VCF file to read.
    """
    count = 0
    # Both handles are closed on exit, including when a record fails to parse.
    with open(ifile, 'r') as vcf, \
            open(ifile + 'MoreThanMG20coverage', 'w') as out:
        for line in vcf:
            if len(line) <= 1 or line.startswith('#'):
                continue
            record = line.strip('\n')
            obj = classVCF.VCF(record)
            # NOTE(review): the result is unused; the call is kept only in
            # case VCF.genotypes() has side effects the depth accessors rely
            # on -- confirm and drop.
            obj.genotypes()

            # Require coverage from at least one sample other than index 2.
            if int(obj.genotypeDepthSUM()) <= int(obj.genotypeDepth(2)):
                continue
            if obj.genotype(2) == '0/0':
                out.write(record + '\n')
                count += 1

    # Single-argument print() emits the same bytes under Python 2 and 3.
    print('%s %s' % ('Total positions printed:', count))
def parse_call(chromosome, callable_file):
    """Collect per-position depth for one chromosome from a VCF-like file.

    Lines starting with ``#`` and empty lines are skipped.  For every
    remaining line whose first tab-separated field equals ``chromosome``,
    the integer in the second field is mapped to ``str(obj.depth())``.

    Progress is printed every 100,000 non-comment lines.

    Args:
        chromosome: value the first column must equal for a line to be kept.
        callable_file: path of the file to read.

    Returns:
        dict mapping position (int) to depth (str).
    """
    depth_by_pos = {}
    count = 0

    # The context manager closes the input even if parsing fails midway.
    with open(callable_file, 'r') as handle:
        for line in handle:
            if len(line) <= 1 or line.startswith('#'):
                continue
            count += 1
            if count % 100000 == 0:
                # Single-argument print() emits the same bytes on 2 and 3.
                print('%s %s %s' % (
                    'Lines processed in callable fraction file: ',
                    chromosome, '{:9,.0f}'.format(count)))

            # Split BEFORE testing the chromosome: the original read
            # tokens[0] before `tokens` was assigned, which raised
            # UnboundLocalError on the first data line.
            line = line.strip()
            tokens = line.split('\t')
            if tokens[0] != chromosome:
                continue

            obj = classVCF.VCF(line)
            depth_by_pos[int(tokens[1])] = str(obj.depth())

    return depth_by_pos
def parseFile(ifile):
    """Filter a VCF on sample 2 being 0/0 and report per-sample het/homo counts.

    Output goes to ``ifile + '.MG20filtered'``: the ``#CHROM`` header line
    followed by every record whose sample index 2 is called ``'0/0'``.  (The
    original inline comment describes sample index 2 as MG20.)  The header
    line is also stored in the module-level ``HEADER``; sample names are its
    columns from index 9 onward.

    For each written record, every sample's call is tallied as heterozygous
    (``'0/1'`` or ``'1/0'``) or homozygous (``'0/0'`` or ``'1/1'``); other
    calls are ignored.  A summary table with counts and fractions per sample
    is printed at the end.  A sample with no tallied calls reports ``0.0``
    for both fractions instead of raising ZeroDivisionError.

    Args:
        ifile: path of the VCF file to read.

    Raises:
        ValueError: if a record passes the filter before any ``#CHROM`` line
            was seen (previously an opaque NameError).
    """
    global HEADER
    count = 0
    header_seen = False
    sample_names = []
    samples_het = []
    samples_homo = []

    # Both handles are closed on exit, including when a record fails to parse.
    with open(ifile, 'r') as vcf, open(ifile + '.MG20filtered', 'w') as out:
        for line in vcf:
            if len(line) <= 1 or line.startswith('##'):
                continue
            line = line.strip('\n')

            if line.startswith('#CHROM'):
                out.write(line + '\n')
                HEADER = line
                header_seen = True
                sample_names = line.split('\t')[9:]
                samples_het = [0] * len(sample_names)
                samples_homo = [0] * len(sample_names)
                continue

            obj = classVCF.VCF(line)
            if obj.genotype(2) != '0/0':
                continue
            out.write(line + '\n')
            count += 1

            if not header_seen:
                raise ValueError(
                    'data line encountered before the #CHROM header line')
            for i in range(len(obj.genotypes())):
                call = obj.genotype(i)
                if call in ('0/1', '1/0'):
                    samples_het[i] += 1
                elif call in ('0/0', '1/1'):
                    samples_homo[i] += 1

    # Single-argument print() emits the same bytes under Python 2 and 3.
    print('%s %s' % ('Marksers used: ', count))
    print('Sample\tHetCount\tHomoCount\tHetPer\tHomoPer')
    for name, het, homo in zip(sample_names, samples_het, samples_homo):
        total = het + homo
        # Guard against samples with no het/homo calls at all.
        het_per = float(het) / total if total else 0.0
        homo_per = float(homo) / total if total else 0.0
        print('\t'.join(
            (name, str(het), str(homo), str(het_per), str(homo_per))))
def parseFile(ifile):
    """Write records where any sample from index 3 onward is called ``'0/1'``.

    Output goes to ``ifile + '.NoBurttiiGifu'``.  Samples at indices 0-2 are
    not examined.  Lines starting with ``#`` and empty lines are skipped.
    The number of records written is printed at the end.

    Args:
        ifile: path of the VCF file to read.
    """
    count = 0
    # Both handles are closed on exit, including when a record fails to parse.
    with open(ifile, 'r') as vcf, open(ifile + '.NoBurttiiGifu', 'w') as out:
        for line in vcf:
            if len(line) <= 1 or line.startswith('#'):
                continue
            record = line.strip('\n')
            obj = classVCF.VCF(record)
            n_samples = len(obj.genotypes())
            # Keep the record as soon as one heterozygous call is found.
            if any(obj.genotype(i) == '0/1' for i in range(3, n_samples)):
                out.write(record + '\n')
                count += 1

    # Single-argument print() emits the same bytes under Python 2 and 3.
    print('%s %s' % ('Total positions printed:', count))
def parseFile(ifile):
    """Summarise every record of a VCF file as one row of a ``.tbl`` table.

    Output goes to ``ifile + '.tbl'``.  The first line is the column header
    (also stored in the module-level ``HEADER``).  Each record contributes
    one tab-separated row built from the VCF object's accessors, in header
    order; the ``Callable`` column is always the literal ``1``.
    Lines starting with ``#`` and empty lines are skipped.

    Args:
        ifile: path of the VCF file to read.
    """
    global HEADER
    HEADER = '#Chromosome\tPosition\tCallable\tSNP\tDepth\tgenotypeCalls\tgenotypeCallsHete\tgenotypeCallsHomo\tInbreedingCoeffs\tHaplotypeScores'

    # Both handles are closed on exit, including when a record fails to parse.
    with open(ifile, 'r') as vcf, open(ifile + '.tbl', 'w') as out:
        out.write(HEADER + '\n')
        for line in vcf:
            if len(line) <= 1 or line.startswith('#'):
                continue
            obj = classVCF.VCF(line.strip('\n'))
            # NOTE(review): the result is unused; the call is kept only in
            # case VCF.genotypes() has side effects the accessors below rely
            # on -- confirm and drop.
            obj.genotypes()

            columns = (
                obj.chroms(),
                obj.poss(),
                '1',
                obj.variants(),
                obj.depth(),
                obj.genotypeCalls(),
                obj.genotypeCallsHete(),
                obj.genotypeCallsHomo(),
                obj.InbreedingCoeffs(),
                obj.HaplotypeScores(),
            )
            out.write('\t'.join([str(value) for value in columns]) + '\n')