def make(WORK, version, outname, mindepth, names):
    """Write ``<WORK>/outfiles/<outname>.vcf`` from ``<outname>.loci``.

    Every alignment column that has at least one alternative allele is
    written as one VCF record: CHROM is the 1-based locus number, POS the
    1-based column within the locus, QUAL is fixed to "20" and FILTER to
    "PASS".

    Parameters
    ----------
    WORK : str
        Working directory; files are read from / written to
        ``WORK + "/outfiles/"``.
    version : object
        Program version, written (via ``str``) into the ``##source`` line.
    outname : str
        Basename shared by the ``.loci`` input and the ``.vcf`` output.
    mindepth : object
        Written (via ``str``) as the ``DP`` INFO value of every record.
    names : iterable of str
        Sample names; they become the genotype columns, in sorted order.

    Notes
    -----
    * ``alignable.unstruct`` and ``alignable.most_common`` are defined
      outside this block; ``unstruct`` is assumed to return the two
      alleles encoded by a base call -- TODO confirm.
    * Records are newline-terminated and buffered, then flushed every
      1000 loci.  Nothing is written when the buffer is empty, so no
      blank lines appear in the output.
    * ALT alleles are sorted so that the output is deterministic.
    """
    prefix = WORK + "/outfiles/" + outname
    names = sorted(names)

    header = [
        "##fileformat=VCFv4.1",
        "##fileDate=" + time.strftime("%Y%m%d"),
        "##source=pyRAD.v." + str(version),
        "##reference=common_allele_at_each_locus",
        '##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">',
        '##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">',
        '##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">',
        '##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
        '##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">',
        '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">',
    ]
    # The column name is exactly "INFO"; the padded "INFO " written before
    # is not the column name used by the VCF header line.
    columns = ["#CHROM", "POS", "ID", "REF", "ALT",
               "QUAL", "FILTER", "INFO", "FORMAT"]

    # Read the input before truncating the output file.
    with open(prefix + ".loci") as infile:
        loci = infile.read().split("|")[:-1]

    ambiguity_codes = set("RKYSWM")
    not_reference = set("-RKYSWMN")
    alt_alphabet = set("ATGC-N")
    DP = str(mindepth)

    with open(prefix + ".vcf", "w") as outfile:
        outfile.write("\n".join(header) + "\n")
        outfile.write("\t".join(columns + names) + "\n")

        vcflist = []
        for locusnumber, locus in enumerate(loci):
            rows = [i for i in locus.strip().split("\n") if ">" in i]
            samps = [i.split()[0][1:] for i in rows]
            loc = np.array([tuple(i.split()[-1]) for i in rows])
            NS = str(len(loc))

            # first row of each sample, as list.index() would return
            row_of = {}
            for idx, samp in enumerate(samps):
                row_of.setdefault(samp, idx)

            for base, column in enumerate(loc.T):
                called = "".join(column).replace("-", "").replace("N", "")
                if not called:
                    continue

                # expand ambiguity codes into their two alleles
                col = []
                for bb in called:
                    if bb in ambiguity_codes:
                        pair = alignable.unstruct(bb)
                        col += pair[0]
                        col += pair[1]
                    else:
                        col += bb

                REF = alignable.most_common(
                    [i for i in col if i not in not_reference])
                ALT = sorted(set(i for i in col
                                 if i in alt_alphabet and i != REF))
                if not ALT:
                    continue

                GENO = [REF] + ALT
                GENOS = []
                for samp in names:
                    if samp not in row_of:
                        GENOS.append("./.")
                        continue
                    f = alignable.unstruct(column[row_of[samp]])
                    if ('-' in f) or ('N' in f):
                        GENOS.append("./.")
                    else:
                        GENOS.append(str(GENO.index(f[0])) + "|" +
                                     str(GENO.index(f[1])))

                record = [str(locusnumber + 1), str(base + 1), '.', REF,
                          ",".join(ALT), "20", "PASS",
                          ";".join(["NS=" + NS, "DP=" + DP]), "GT"]
                vcflist.append("\t".join(record + GENOS) + "\n")

            # periodic flush keeps memory bounded on large data sets
            if not locusnumber % 1000 and vcflist:
                outfile.writelines(vcflist)
                vcflist = []

        outfile.writelines(vcflist)
def make(WORK, outname, taxadict, minhits):
    """Write ``<outname>.treemix.gz`` from ``<outname>.unlinked_snps``.

    The output starts with one line of group names.  Each following line
    holds, for one retained SNP, a ``count1,count2`` pair per group:
    the number of copies of the two most common alleles of that SNP
    among the group's samples.

    Parameters
    ----------
    WORK : str
        Working directory containing ``outfiles/``.
    outname : str
        Basename of the ``.unlinked_snps`` input and ``.treemix.gz`` output.
    taxadict : mapping
        Group name -> iterable of sample identifiers.  Any leading
        directory and a ``.consens.gz`` suffix are stripped from each
        identifier.
    minhits : sequence
        Minimum number of samples with data (not "N" or "-") required per
        group, paired positionally with the groups of ``taxadict``.

    Raises
    ------
    ValueError
        If the SNP matrix does not match the dimensions in its header.

    Notes
    -----
    * ``alignable.unstruct`` is defined outside this block and is assumed
      to return the two alleles encoded by a base call -- TODO confirm.
    * Samples named in ``taxadict`` that are absent from the SNP matrix
      are ignored instead of raising ``KeyError``.
    * A SNP is skipped when any group has a ``0,0`` count (the group
      carries only a third allele), when no group carries the second
      allele, or when fewer than two alleles were called at the site.
    """
    ## cleanup taxadict to just sample names
    taxa = OrderedDict()
    for group in taxadict:
        taxa[group] = [samp.split("/")[-1].replace(".consens.gz", "")
                       for samp in taxadict[group]]

    print("\t data set reduced for group coverage minimums")
    for group, mins in zip(taxa, minhits):
        print("\t  %s %s minimum= %s" % (group, taxa[group], mins))

    ## read in data from unlinked_snps to sample names
    snpfile = WORK.rstrip("/") + "/outfiles/" + outname + ".unlinked_snps"
    with open(snpfile, 'r') as infile:
        dat = infile.readlines()
    nsamp, nsnps = [int(x) for x in dat[0].split()]

    SNPS = OrderedDict()    ## sample name -> string of base calls
    rows = []
    for line in dat[1:]:
        name, calls = line.split()
        if len(calls) != nsnps:
            raise ValueError("%s: sample %s has %d sites, header says %d"
                             % (snpfile, name, len(calls), nsnps))
        SNPS[name] = calls
        rows.append(calls)
    if len(rows) != nsamp:
        raise ValueError("%s: found %d samples, header says %d"
                         % (snpfile, len(rows), nsamp))

    ## unpack ambiguity bases and find the two most common alleles
    ## at every SNP site (None pads sites with fewer than two alleles)
    ambiguity_codes = set("RKSYWM")
    alleles = []
    for pos in range(nsnps):
        ds = []
        for calls in rows:
            s = calls[pos]
            if s in ambiguity_codes:
                pair = alignable.unstruct(s)
                ds.append(pair[0])
                ds.append(pair[1])
            else:
                ds.append(s)
                ds.append(s)
        called = [s for s in ds if s not in ("N", "-")]
        top = [base for base, _ in Counter(called).most_common(2)]
        alleles.append(top + [None] * (2 - len(top)))

    ## reduce taxa to only samples that are in the unlinked_snps alignment
    for group in taxa:
        taxa[group] = [samp for samp in taxa[group] if samp in SNPS]

    ## keep SNPs that meet the minhits requirement in every group
    keeps = []
    for snp in range(nsnps):
        if all(sum(SNPS[i][snp] not in ("N", "-") for i in taxa[tax])
               >= int(mins) for tax, mins in zip(taxa, minhits)):
            keeps.append(snp)

    ## for each group, the concatenated alleles of its samples per kept SNP
    FREQ = OrderedDict((tax, []) for tax in taxa)
    for keep in keeps:
        for tax in FREQ:
            bunch = []
            for i in taxa[tax]:
                pair = alignable.unstruct(SNPS[i][keep])
                bunch.append(pair[0])
                bunch.append(pair[1])
            FREQ[tax].append("".join(bunch))

    ## header, then one line per retained SNP
    lines = [" ".join(FREQ.keys())]
    for i, j in enumerate(keeps):
        a1, a2 = alleles[j]
        if a1 is None or a2 is None:
            continue
        H = [str(FREQ[tax][i].count(a1)) + "," + str(FREQ[tax][i].count(a2))
             for tax in FREQ]
        ## exclude non-biallelic SNPs; compare whole entries so the first
        ## and last groups are checked too and "10,0" is not mistaken
        if "0,0" in H:
            continue
        ## exclude invariable sites given this sampling
        if all(h.split(",")[1] == "0" for h in H):
            continue
        lines.append(" ".join(H))

    payload = "\n".join(lines) + "\n"
    if not isinstance(payload, bytes):
        payload = payload.encode("utf-8")   # gzip 'w' mode wants bytes
    outpath = WORK + "/outfiles/" + outname + ".treemix.gz"
    with gzip.open(outpath, 'w') as outfile:
        outfile.write(payload)
def _geno_row(calls):
    """Return the .geno row for one site, or None if every call is "N".

    ``calls`` holds one base call per sample.  The reference allele is the
    first allele of the first sample whose first allele is not "N"; each
    sample is then coded as its number of reference copies, or "9" when
    its call is "N".
    """
    for call in calls:
        ref = alignable.unstruct(call)[0]
        if ref != "N":
            break
    else:
        # no usable reference: the row would be all "9" and is never written
        return None
    return "".join("9" if call == "N"
                   else str(alignable.unstruct(call).count(ref))
                   for call in calls)


def make(WORK, outname, names, formats, seed, ploidy):
    """Write SNP-based output files from ``<outname>.loci``.

    Always writes ``.unlinked_snps`` and appends two lines to
    ``stats/<outname>.stats``; writes ``.snps`` if ``'s' in formats``,
    ``.str`` if ``'k' in formats``, and ``.usnps.geno`` plus
    ``.snps.geno`` if ``'g' in formats``.

    Parameters
    ----------
    WORK : str
        Working directory.  Paths are built as ``WORK + "outfiles/..."``,
        so it has to end with a path separator.
    outname : str
        Basename of the input and output files.
    names : iterable of str
        Sample names; the longest one fixes the name-column width used to
        translate SNP-marker columns into sequence positions.
    formats : container
        Output format codes (see above).
    seed : int-like
        Seed for ``np.random``, which picks the unlinked SNP per locus.
    ploidy : number
        Values > 1 write two rows per sample to the ``.str`` file,
        otherwise one.

    Notes
    -----
    ``alignable.unstruct`` is defined outside this block and is assumed to
    return the two alleles encoded by a base call -- TODO confirm.
    """
    np.random.seed(int(seed))
    with open(WORK + "outfiles/" + outname + ".loci") as infile:
        finalfile = infile.read()
    longname = max(map(len, names))

    S = {}      ## all snps per sample, with a spacer entry after each locus
    Si = {}     ## one (unlinked) snp per variable locus per sample
    for name in list(names):
        S[name] = []
        Si[name] = []

    ## number of loci whose sampled snp is not bi-allelic
    nobis = 0

    ## for each locus select out the SNPs
    for loc in finalfile.strip().split("|")[:-1]:
        pis = ""
        ns = []
        ss = []
        cov = {}    ## coverage score of each SNP position
        for line in loc.split("\n"):
            if ">" in line:
                fields = line.split()
                ns.append(fields[0].replace(">", ""))
                ss.append(fields[-1])
            else:
                ## output only potentially informative SNPs ('*' markers)
                pis = [col for col, mark in enumerate(line) if mark == '*']

        ## first sequence of each sample, as ns.index() would select
        seq_of = {}
        for name, seq in zip(ns, ss):
            seq_of.setdefault(name, seq)

        ## assign snps to S, and record coverage for usnps
        for tax in S:
            if tax in seq_of:
                seq = seq_of[tax]
                for snpsite in pis:
                    ## marker column -> position within the sequence
                    snpsite -= (longname + 5)
                    S[tax].append(seq[snpsite])
                    cov[snpsite] = cov.get(snpsite, 0) + 1
                    ## downweight selection of gap sites
                    if seq[snpsite] != '-':
                        cov[snpsite] += 1
            elif pis:
                S[tax].extend("N" for _ in pis)
                Si[tax].append("N")

        ## candidate sites: those with the greatest coverage score
        maxlist = []
        if cov:
            best = max(cov.values())
            maxlist = [site for site, score in cov.items() if score == best]

        ## is the site bi-allelic after resolution of ambiguity codes?
        present = [seq_of[tax] for tax in S if tax in seq_of]
        bisnps = []
        for maxl in maxlist:
            bases = [seq[maxl] for seq in present]
            ambigs = list(chain(*[alignable.unstruct(i)
                                  for i in bases if i in "RSWYMK"]))
            bases = set(bases + ambigs)
            for ambig in "RSWYMKN-":
                bases.discard(ambig)
            if len(bases) <= 2:
                bisnps.append(maxl)

        ## randomly pick the unlinked snp, preferring bi-allelic sites
        rando = None
        if bisnps:
            rando = bisnps[np.random.randint(len(bisnps))]
        elif maxlist:
            rando = maxlist[np.random.randint(len(maxlist))]

        tbi = 0
        for tax in S:
            if tax in seq_of and pis:
                ## none of the candidate sites is bi-allelic
                if not bisnps:
                    tbi = 1
                Si[tax].append(seq_of[tax][rando])
            ## spacer between loci; "_ " marks an invariable locus
            S[tax].append(" " if pis else "_ ")
        nobis += tbi

    ## sample names in output order
    SF = sorted(S)
    first = SF[0]
    nusnps = len(Si[first])

    def padded(name):
        return name + (" " * (longname - len(name) + 3))

    ## .snps file
    if 's' in formats:
        with open(WORK + 'outfiles/' + outname + ".snps", 'w') as snpsout:
            joined = "".join(S[first])
            snpsout.write("## %s taxa, %s loci, %s snps\n" % (
                len(S),
                len(joined.split(" ")) - 1,
                len(joined.replace(" ", ""))))
            for i in SF:
                snpsout.write(padded(i) + "".join(S[i]) + "\n")

    ## .unlinked_snps file
    with open(WORK + 'outfiles/' + outname + ".unlinked_snps", 'w') as snpout:
        snpout.write("%s %s\n" % (len(Si), len("".join(Si[first]))))
        for i in SF:
            snpout.write(padded(i) + "".join(Si[i]) + "\n")

    with open(WORK + "stats/" + outname + ".stats", 'a') as statsout:
        statsout.write("sampled unlinked SNPs= %s\n" % nusnps)
        statsout.write("sampled unlinked bi-allelic SNPs= %s\n"
                       % (nusnps - nobis))

    ## .str (structure) file
    if 'k' in formats:
        B = {'A': '0', 'T': '1', 'G': '2', 'C': '3', 'N': '-9', '-': '-9'}
        with open(WORK + 'outfiles/' + outname + ".str", 'w') as structout:
            for line in SF:
                lead = padded(line) + "\t" * 6
                pairs = [alignable.unstruct(j) for j in Si[line]]
                if ploidy > 1:
                    structout.write(
                        lead + "\t".join(B[p[0]] for p in pairs) + "\n")
                structout.write(
                    lead + "\t".join(B[p[1]] for p in pairs) + "\n")

    ## .geno files; rows without variation are left out
    if 'g' in formats:
        with open(WORK + 'outfiles/' + outname + ".usnps.geno", 'w') as genoout:
            for i in range(nusnps):
                SNProw = _geno_row([Si[j][i] for j in SF])
                if SNProw is not None and len(set(SNProw)) > 1:
                    genoout.write(SNProw + "\n")

        with open(WORK + 'outfiles/' + outname + ".snps.geno", 'w') as genoout:
            for i in range(len(S[first])):
                ## skip the spacer entries that separate loci
                if not S[first][i].strip("_").strip():
                    continue
                SNProw = _geno_row([S[j][i] for j in SF])
                if SNProw is not None and len(set(SNProw)) > 1:
                    genoout.write(SNProw + "\n")
def make(WORK, outname, taxadict, minhits):
    """Convert ``<outname>.unlinked_snps`` into ``<outname>.treemix.gz``.

    Output layout: a first line with the group names, then one line per
    retained SNP with a ``count1,count2`` entry per group, counting the
    copies of the SNP's two most common alleles among the group's samples.

    Parameters
    ----------
    WORK : str
        Working directory containing ``outfiles/``.
    outname : str
        Basename of the ``.unlinked_snps`` input and ``.treemix.gz`` output.
    taxadict : mapping
        Group name -> iterable of sample identifiers; a leading directory
        and a ``.consens.gz`` suffix are removed from each identifier, and
        samples missing from the SNP matrix are dropped.
    minhits : sequence
        Per-group minimum number of samples with data (not "N" or "-"),
        paired positionally with the groups of ``taxadict``.

    Raises
    ------
    ValueError
        If the SNP matrix does not match the dimensions in its header.

    Notes
    -----
    * ``alignable.unstruct`` is defined outside this block and is assumed
      to return the two alleles encoded by a base call -- TODO confirm.
    * A SNP is left out when some group's entry is ``0,0`` (it carries
      neither of the two counted alleles), when the second allele is
      absent from every group, or when fewer than two alleles were called
      at the site.
    """
    ## cleanup taxadict to just sample names
    taxa = OrderedDict()
    for group in taxadict:
        taxa[group] = [samp.split("/")[-1].replace(".consens.gz", "")
                       for samp in taxadict[group]]

    print("\t data set reduced for group coverage minimums")
    for group, mins in zip(taxa, minhits):
        print("\t  %s %s minimum= %s" % (group, taxa[group], mins))

    ## read in data from unlinked_snps to sample names
    snpfile = WORK.rstrip("/") + "/outfiles/" + outname + ".unlinked_snps"
    with open(snpfile, 'r') as infile:
        dat = infile.readlines()
    nsamp, nsnps = [int(x) for x in dat[0].split()]

    SNPS = OrderedDict()    ## sample name -> string of base calls
    matrix = []
    for line in dat[1:]:
        sample, calls = line.split()
        if len(calls) != nsnps:
            raise ValueError("%s: sample %s has %d sites, header says %d"
                             % (snpfile, sample, len(calls), nsnps))
        SNPS[sample] = calls
        matrix.append(calls)
    if len(matrix) != nsamp:
        raise ValueError("%s: found %d samples, header says %d"
                         % (snpfile, len(matrix), nsamp))

    ## unpack ambiguity bases and find the two most common alleles
    ## at every SNP site (None pads sites with fewer than two alleles)
    ambiguity_codes = set("RKSYWM")
    alleles = []
    for pos in range(nsnps):
        ds = []
        for calls in matrix:
            s = calls[pos]
            if s in ambiguity_codes:
                pair = alignable.unstruct(s)
                ds.append(pair[0])
                ds.append(pair[1])
            else:
                ds.append(s)
                ds.append(s)
        called = [s for s in ds if s not in ("N", "-")]
        top = [base for base, _ in Counter(called).most_common(2)]
        alleles.append(top + [None] * (2 - len(top)))

    ## reduce taxa to only samples that are in the unlinked_snps alignment
    for group in taxa:
        taxa[group] = [samp for samp in taxa[group] if samp in SNPS]

    ## keep SNPs that meet the minhits requirement in every group
    keeps = []
    for snp in range(nsnps):
        enough = [sum(SNPS[i][snp] not in ("N", "-") for i in taxa[tax])
                  >= int(mins) for tax, mins in zip(taxa, minhits)]
        if all(enough):
            keeps.append(snp)

    ## for each group, the concatenated alleles of its samples per kept SNP
    FREQ = OrderedDict((tax, []) for tax in taxa)
    for keep in keeps:
        for tax in FREQ:
            bunch = []
            for i in taxa[tax]:
                pair = alignable.unstruct(SNPS[i][keep])
                bunch.append(pair[0])
                bunch.append(pair[1])
            FREQ[tax].append("".join(bunch))

    ## header, then one line per retained SNP
    lines = [" ".join(FREQ.keys())]
    for i, j in enumerate(keeps):
        a1, a2 = alleles[j]
        if a1 is None or a2 is None:
            continue
        H = [str(FREQ[tax][i].count(a1)) + "," + str(FREQ[tax][i].count(a2))
             for tax in FREQ]
        ## exclude non-biallelic SNPs; whole-entry comparison also covers
        ## the first and last groups and cannot confuse "10,0" with "0,0"
        if "0,0" in H:
            continue
        ## exclude invariable sites given this sampling
        if all(h.split(",")[1] == "0" for h in H):
            continue
        lines.append(" ".join(H))

    payload = "\n".join(lines) + "\n"
    if not isinstance(payload, bytes):
        payload = payload.encode("utf-8")   # gzip 'w' mode wants bytes
    outpath = WORK + "/outfiles/" + outname + ".treemix.gz"
    with gzip.open(outpath, 'w') as outfile:
        outfile.write(payload)