Example #1
File: loci2vcf.py  Project: StuntsPT/pyrad
import time
import numpy as np
import alignable   ## pyRAD helper module providing unstruct() and most_common()

def make(WORK, version, outname, mindepth, names):
    outfile = open(WORK+"/outfiles/"+outname+".vcf", 'w')
    inloci = WORK+"/outfiles/"+outname+".loci"
    names = list(names)
    names.sort()

    print >>outfile, "##fileformat=VCFv4.1"
    print >>outfile, "##fileDate="+time.strftime("%Y%m%d")
    print >>outfile, "##source=pyRAD.v."+str(version)
    print >>outfile, "##reference=common_allele_at_each_locus"
    print >>outfile, "##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of Samples With Data\">"
    print >>outfile, "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">"
    print >>outfile, "##INFO=<ID=AF,Number=A,Type=Float,Description=\"Allele Frequency\">"
    print >>outfile, "##INFO=<ID=AA,Number=1,Type=String,Description=\"Ancestral Allele\">"
    print >>outfile, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">"
    print >>outfile, "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">"
    print >>outfile, "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth\">"
    print >>outfile, "\t".join(["#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO    ","FORMAT"]+list(names))

    loci = open(inloci).read().split("|")[:-1]
    snps = 0
    vcflist = []
    for locusnumber in range(len(loci)):
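        ## each locus block lists ">name  sequence" sample lines plus a final SNP-marker line; the ">" filter keeps only samples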
        samps = [i.split()[0][1:] for i in loci[locusnumber].strip().split("\n") if ">" in i]
        loc = np.array([tuple(i.split()[-1]) for i in loci[locusnumber].strip().split("\n") if ">" in i])
        NS = str(len(loc))
        DP = str(mindepth)
        for base in range(len(loc.T)):
            col = []
            site = list(loc.T[base])
            site = list("".join(site).replace("-","").replace("N",""))
            if site:
                for bb in site:
                    if bb in list("RKYSWM"):
                        col += alignable.unstruct(bb)[0]
                        col += alignable.unstruct(bb)[1]
                    else:
                        col += bb
                REF = alignable.most_common([i for i in col if i not in list("-RKYSWMN")])
                ALT = set([i for i in col if (i in list("ATGC-N")) and (i!=REF)])
                if ALT:
                    snps += 1
                    GENO = [REF]+list(ALT)
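                    ## GT fields index into GENO: 0 = REF, 1..n = the ALT alleles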
                    GENOS = []
                    for samp in names:
                        if samp in samps:
                            idx = samps.index(samp)
                            f = alignable.unstruct(loc.T[base][idx])
                            if ('-' in f) or ('N' in f):
                                GENOS.append("./.")
                            else:
                                GENOS.append(str(GENO.index(f[0]))+"|"+str(GENO.index(f[1])))
                        else:
                            GENOS.append("./.")
                    vcflist.append("\t".join([str(locusnumber+1), str(base+1), '.', REF, ",".join(ALT), "20", "PASS",
                                              ";".join(["NS="+NS, "DP="+DP]), "GT"]+GENOS))
        if not locusnumber % 1000:
            outfile.write("\n".join(vcflist)+"\n")
            vcflist = []
    

    outfile.write("\n".join(vcflist)+"\n")
    outfile.close()
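
All four examples call alignable.unstruct (and Example #1 also alignable.most_common) without showing those helpers. A minimal hypothetical sketch of what the call sites appear to require — resolving an IUPAC ambiguity code into its two underlying bases, and picking a modal base — is below; the real pyRAD implementations may differ:

AMBIGS = {"R": ("G", "A"), "K": ("G", "T"), "S": ("G", "C"),
          "Y": ("T", "C"), "W": ("T", "A"), "M": ("C", "A")}

def unstruct(base):
    ## hypothetical stand-in: ambiguity codes resolve to their two bases,
    ## plain bases (and N/-) are simply doubled
    return list(AMBIGS.get(base, (base, base)))

def most_common(bases):
    ## hypothetical stand-in: the most frequent element, ties broken arbitrarily
    return max(set(bases), key=bases.count)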
Example #2
import gzip
import numpy as np
from collections import OrderedDict, Counter
import alignable   ## pyRAD helper module providing unstruct()

def make(WORK, outname, taxadict, minhits):

    ## output files
    outfile = gzip.open(WORK + "/outfiles/" + outname + ".treemix.gz", 'w')

    ## cleanup taxadict to just sample names
    taxa = OrderedDict()
    for group in taxadict:
        taxa[group] = []
        for samp in taxadict[group]:
            a = samp.split("/")[-1].replace(".consens.gz", "")
            taxa[group].append(a)

    print "\t    data set reduced for group coverage minimums"
    for i, j in zip(taxa, minhits):
        print "\t   ", i, taxa[i], 'minimum=', j

    ## read in data from unlinked_snps to sample names
    infile = open(
        WORK.rstrip("/") + "/outfiles/" + outname + ".unlinked_snps", 'r')
    dat = infile.readlines()
    nsamp, nsnps = dat[0].strip().split(" ")
    nsamp = int(nsamp)
    nsnps = int(nsnps)
    NDATA = np.empty([int(nsamp), int(nsnps)], dtype='object')
    excludes = 0

    ## read SNP matrix into a numpy.array
    for row, line in enumerate(dat[1:]):
        _, b = line.split()
        NDATA[row] = list(b)
    sites = np.transpose(NDATA)
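    ## "sites" now holds one row per SNP site and one column per sample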

    ## unpack ambiguity bases and find two most common alleles
    ## at every SNP site, save to a list
    alleles = []
    for site in sites:
        ds = []
        for s in site:
            if s in list("RKSYWM"):
                ds.append(alignable.unstruct(s)[0])
                ds.append(alignable.unstruct(s)[1])
            else:
                ds.append(s)
                ds.append(s)
        snp = [s for s in ds if s not in ["N", '-']]
        a = Counter(snp).most_common(3)
        alleles.append([a[0][0], a[1][0]])

    ## create a dictionary mapping sample names to SNPs
    SNPS = OrderedDict()
    for line in dat[1:]:
        a, b = line.split()
        SNPS[a] = b

    ## create a dictionary with empty lists for each taxon
    FREQ = OrderedDict()
    for tax in taxa:
        FREQ[tax] = []

    ## fill the FREQ dictionary with SNPs for all
    ## samples in that taxon
    keeps = []
    for snp in range(int(nsnps)):
        GG = []
        ## if snp meets minhits requirement
        for tax, mins in zip(taxa, minhits):
            GG.append(
                sum([SNPS[i][snp] not in ["N", "-"]
                     for i in taxa[tax]]) >= int(mins))
        if all(GG):
            keeps.append(snp)

    for keep in keeps:
        for tax in FREQ:
            bunch = []
            for i in taxa[tax]:
                bunch.append(alignable.unstruct(SNPS[i][keep])[0])
                bunch.append(alignable.unstruct(SNPS[i][keep])[1])
                #print tax, i, SNPS[i][keep], bunch
            FREQ[tax].append("".join(bunch))

    ## check that no included taxa have no data
    # for i,j in zip(taxa,minhits):
    #     if not FREQ[i]:
    #         print "taxon/group ",i,"has no data shared across at least",j,"samples, it must be excluded to build treemix output"

    ## header
    print >> outfile, " ".join(FREQ.keys())

    ## data to file
    for i, j in enumerate(keeps):
        a1 = alleles[j][0]
        a2 = alleles[j][1]
        H = [
            str(FREQ[tax][i].count(a1)) + "," + str(FREQ[tax][i].count(a2))
            for tax in FREQ
        ]
        HH = " ".join(H)

        ## exclude non-biallelic SNPs
        if " 0,0 " not in HH:
            ## exclude invariable sites given this sampling
            if not all([zz.split(",")[1] in '0' for zz in H]):
                print >> outfile, " ".join(H)
        else:
            excludes += 1

    outfile.close()
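
A hedged usage sketch for the function above; the work directory, group names, and sample paths are invented, and WORK/outfiles/ is assumed to already hold <outname>.unlinked_snps from an earlier pyRAD run:

from collections import OrderedDict

## hypothetical groupings; the paths mimic pyRAD consensus files, since
## make() strips the directory part and the ".consens.gz" suffix
taxadict = OrderedDict([
    ("pop1", ["clust85/1A.consens.gz", "clust85/1B.consens.gz"]),
    ("pop2", ["clust85/2A.consens.gz", "clust85/2B.consens.gz"]),
])

## keep only SNPs covered in at least 2 pop1 samples and 1 pop2 sample
make("/tmp/analysis", "c85d6m4p3", taxadict, [2, 1])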
Example #3
import numpy as np
from itertools import chain
import alignable   ## pyRAD helper module providing unstruct()

def make(WORK, outname, names, formats, seed, ploidy):
    np.random.seed(int(seed))
    finalfile = open(WORK+"outfiles/"+outname+".loci").read()
    longname = max(map(len,names))

    " output .snps and .unlinked_snps"
    S = {}      ## snp dict
    Si = {}     ## unlinked snp dict
    for name in list(names):
        S[name] = []
        Si[name] = []

    " record bi-allelic snps"
    nobis = 0

    " for each locus select out the SNPs"
    for loc in finalfile.strip().split("|")[:-1]:
        pis = ""
        ns = []
        ss = []
        cov = {}  ## record coverage for each SNP
        for line in loc.split("\n"):
            if ">" in line:
                ns.append(line.split()[0].replace(">",""))
                ss.append(line.split()[-1])
            else:
                ## positions flagged '*' are potentially informative SNPs
                pis = [i[0] for i in enumerate(line) if i[1] == '*']
                
        " assign snps to S, and record coverage for usnps"
        for tax in S:
            if tax in ns:
                if pis:
                    for snpsite in pis:
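                        ## pis indexes the SNP-marker line, which is padded by the name column; shift into sequence coordinates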
                        snpsite -= (longname+5)
                        S[tax].append(ss[ns.index(tax)][snpsite])
                        if snpsite not in cov:
                            cov[snpsite] = 1
                        else:
                            cov[snpsite] += 1
                        "downweight selection of gap sites "
                        if ss[ns.index(tax)][snpsite] != '-':
                           cov[snpsite] += 1
            else:
                if pis:
                    for snpsite in pis:
                        S[tax].append("N")
                    Si[tax].append("N")

        " randomly select among snps w/ greatest coverage for unlinked snp "
        maxlist = []
        for j,k in cov.items():
            if k == max(cov.values()):
                maxlist.append(j)

        " Is bi-allelic after resolution of ambigs? "
        bisnps = []
        for maxl in maxlist:
            bases = [ss[ns.index(tax)][maxl] for tax in S if tax in ns]
            ambigs = list(chain(*[alignable.unstruct(i) for i in bases if i in "RSWYMK"]))
            bases = set(bases+ambigs)
            for ambig in "RSWYMKN-":
                bases.discard(ambig)
            if len(bases) <= 2:
                bisnps.append(maxl)

        #rando = pis[np.random.randint(len(pis))]
        #rando -= (longname+5)
        if bisnps:
            rando = bisnps[np.random.randint(len(bisnps))]
        elif maxlist:
            rando = maxlist[np.random.randint(len(maxlist))]

        ## record how many loci have no bi-allelic SNPs
        tbi = 0
        for tax in S:
            if tax in ns:
                if pis:
                    " if none are bi-allelic "
                    if not bisnps:
                        tbi = 1
                    Si[tax].append(ss[ns.index(tax)][rando])
            if pis:
                " add spacer between loci "                
                S[tax].append(" ")
            else:
                " invariable locus "
                S[tax].append("_ ")
        nobis += tbi
    " names "
    SF = list(S.keys())
    SF.sort()

    " print out .SNP file "
    if 's' in formats:
        snpsout = open(WORK+'outfiles/'+outname+".snps",'w')
        print >>snpsout, "## %s taxa, %s loci, %s snps" % (len(S),
                                                           len("".join(S.values()[0]).split(" "))-1,
                                                           len("".join(S[SF[0]]).replace(" ","")))
        for i in SF:
            print >>snpsout, i+(" "*(longname-len(i)+3))+"".join(S[i])
        snpsout.close()


    " print out .USNP file "
    snpout = open(WORK+'outfiles/'+outname+".unlinked_snps",'w')
    print >>snpout, len(Si),len("".join(Si.values()[0]))
    for i in SF:
        print >>snpout, i+(" "*(longname-len(i)+3))+"".join(Si[i])
    snpout.close()

    statsout = open(WORK+"stats/"+outname+".stats", 'a')
    print >>statsout, "sampled unlinked SNPs=", len(Si.values()[0])
    print >>statsout, "sampled unlinked bi-allelic SNPs=", len(Si.values()[0])-nobis
    statsout.close()

    if 'k' in formats:
        "print out .str (structure) file "
        structout = open(WORK+'outfiles/'+outname+".str", 'w')
        
        B = {'A': '0',
             'T': '1',
             'G': '2',
             'C': '3',
             'N': '-9',
             '-': '-9'}
        if ploidy > 1:
            for line in SF:
                print >>structout, line+(" "*(longname-len(line)+3))+\
                      "\t"*6+"\t".join([B[alignable.unstruct(j)[0]] for j in Si[line]])
                print >>structout, line+(" "*(longname-len(line)+3))+\
                      "\t"*6+"\t".join([B[alignable.unstruct(j)[1]] for j in Si[line]])
        else:
            for line in SF:
                print >>structout, line+(" "*(longname-len(line)+3))+\
                      "\t"*6+"\t".join([B[alignable.unstruct(j)[1]] for j in Si[line]])
        structout.close()


    if 'g' in formats:
        "print out .usnps.geno file "
        genoout = open(WORK+'outfiles/'+outname+".usnps.geno", 'w')
        for i in range(len(Si.values()[0])):
            getref = 0
            ref = "N"
            while ref == "N":
                ref = alignable.unstruct(Si[SF[getref]][i])[0]
                getref += 1
            SNProw = "".join(map(str,[alignable.unstruct(Si[j][i]).count(ref) if Si[j][i] != "N" \
                                      else "9" for j in SF]))
            ## print ref,SNProw
            if len(set(SNProw)) > 1:
                print >>genoout, SNProw 
        genoout.close()

    if 'g' in formats:
        "print out .snps.geno file "
        genoout = open(WORK+'outfiles/'+outname+".snps.geno", 'w')
        for i in range(len(S.values()[0])):
            if S[SF[0]][i].strip("_").strip():
                getref = 0
                ref = "N"
                while ref == "N":
                    #print i, S[SF[0]][i]
                    ref = alignable.unstruct(S[SF[getref]][i])[0]
                    getref += 1
                    SNProw = "".join(map(str,[alignable.unstruct(S[j][i]).count(ref) if \
                                              S[j][i] != "N" else "9" for j in SF]))
                ## print ref,SNProw
                if len(set(SNProw)) > 1:
                    print >>genoout, SNProw 
        genoout.close()
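
A hedged call sketch for Example #3; every value is invented. make() joins WORK directly with "outfiles/", so WORK must end in a slash, and formats is only scanned for the letters 's', 'k', and 'g' (toggling the .snps, .str, and .geno outputs):

## assumes WORK/outfiles/ and WORK/stats/ exist and <outname>.loci was written earlier
make("/tmp/analysis/", "c85d6m4p3",
     names=["1A", "1B", "2A", "2B"],
     formats="s,k,g", seed=112233, ploidy=2)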
Example #4
import gzip
import numpy as np
from collections import OrderedDict, Counter
import alignable   ## pyRAD helper module providing unstruct()

def make(WORK, outname, taxadict, minhits):

    ## output files
    outfile = gzip.open(WORK+"/outfiles/"+outname+".treemix.gz",'w')

    ## cleanup taxadict to just sample names
    taxa = OrderedDict()
    for group in taxadict:
        taxa[group] = []
        for samp in taxadict[group]:
            a = samp.split("/")[-1].replace(".consens.gz","")
            taxa[group].append(a)

    print "\t    data set reduced for group coverage minimums"        
    for i,j in zip(taxa,minhits):
        print "\t   ",i, taxa[i], 'minimum=',j
    
    ## read in data from unlinked_snps to sample names
    infile = open(WORK.rstrip("/")+"/outfiles/"+outname+".unlinked_snps",'r')
    dat = infile.readlines()
    nsamp,nsnps = dat[0].strip().split(" ")
    nsamp = int(nsamp)
    nsnps = int(nsnps)
    NDATA = np.empty([int(nsamp),int(nsnps)],dtype='object')
    excludes = 0

    ## read SNP matrix into a numpy.array
    for row, line in enumerate(dat[1:]):
        _, b = line.split()
        NDATA[row] = list(b)
    sites = np.transpose(NDATA)

    ## unpack ambiguity bases and find two most common alleles
    ## at every SNP site, save to a list
    alleles = []
    for site in sites:
        ds = []
        for s in site:
            if s in list("RKSYWM"):
                ds.append(alignable.unstruct(s)[0])
                ds.append(alignable.unstruct(s)[1])
            else:
                ds.append(s)
                ds.append(s)
        snp = [s for s in ds if s not in ["N",'-']]
        a = Counter(snp).most_common(3)
        alleles.append([a[0][0],a[1][0]])

    ## create a dictionary mapping sample names to SNPs    
    SNPS = OrderedDict()
    for line in dat[1:]:
        a,b = line.split()
        SNPS[a] = b

    ## reduce taxa dict to only samples present in the unlinked_snps alignment
    for key in taxa:
        replacement = []
        for val in taxa[key]:
            if val in SNPS:
                replacement.append(val)
        taxa[key] = replacement

    ## create a dictionary with empty lists for each taxon 
    FREQ = OrderedDict()
    for tax in taxa:
        FREQ[tax] = []

    ## fill the FREQ dictionary with SNPs for all 
    ## samples in that taxon
    keeps = []
    for snp in range(int(nsnps)):
        GG = []
        ## if snp meets minhits requirement
        for tax,mins in zip(taxa,minhits):
            GG.append( sum([SNPS[i][snp] not in ["N","-"] for i in taxa[tax]]) >= int(mins))
        if all(GG):
            keeps.append(snp)


    for keep in keeps:
        for tax in FREQ:
            bunch = []
            for i in taxa[tax]:
                bunch.append(alignable.unstruct(SNPS[i][keep])[0])
                bunch.append(alignable.unstruct(SNPS[i][keep])[1])
                #print tax, i, SNPS[i][keep], bunch
            FREQ[tax].append("".join(bunch))

    ## header
    print >>outfile, " ".join(FREQ.keys())

    ## data to file
    for i,j in enumerate(keeps):
        a1 = alleles[j][0]
        a2 = alleles[j][1]
        H = [str(FREQ[tax][i].count(a1))+","+str(FREQ[tax][i].count(a2)) for tax in FREQ]
        HH = " ".join(H)

        ## exclude non-biallelic SNPs
        if " 0,0 " not in HH:
            ## exclude invariable sites given this sampling
            if not all([zz.split(",")[1] in '0' for zz in H]):
                print >>outfile, " ".join(H)
        else:
            excludes += 1

    outfile.close()
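
Example #4 differs from Example #2 only in the extra block that trims each group down to the samples actually present in the .unlinked_snps alignment. For orientation, a small hedged sketch (the path is carried over from the hypothetical call after Example #2) of reading the table it writes — a header of group names, then one "count_allele1,count_allele2" pair per group per retained SNP:

import gzip

with gzip.open("/tmp/analysis/outfiles/c85d6m4p3.treemix.gz") as tmx:
    groups = tmx.readline().split()            ## e.g. ['pop1', 'pop2']
    for row in tmx:
        print dict(zip(groups, row.split()))   ## e.g. {'pop1': '4,0', 'pop2': '3,1'}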