Example #1
0
 def __init__(self, refaln, type=""):
     """Load the reference alignment and run the truth/label helpers.

     Args:
         refaln: Alignment handed to ``SeqGroup`` as ``sequences``.
         type: ``"fasta"`` parses the alignment without an explicit format;
             any other value (including the default ``""``) parses it as
             ``'phylip_relaxed'``.
     """
     # Only the non-FASTA case needs an explicit format argument.
     options = {"sequences": refaln}
     if type != "fasta":
         options["format"] = 'phylip_relaxed'
     self.aln = SeqGroup(**options)
     self.true_spe = {}
     # Both helpers live elsewhere on the class; presumably they fill
     # ``true_spe`` and the cluster labels from ``self.aln`` -- TODO confirm.
     self._get_truth()
     self._get_cluster_label()
Example #2
0
File: msa.py Project: epruesse/ARB
 def get_hmm_refalignment(self):
     """Trim the reference alignment to the columns listed in the HMM profile.

     Reads the profile at ``self.refprofile``, collects one column number per
     model node, then writes every entry of ``self.refalign`` restricted to
     those columns, in FASTA layout, to ``self.trimed``.

     Returns:
         tuple: ``(self.trimed, number_of_kept_columns)``.

     Raises:
         IndexError: if a node line has fewer than six fields or a column
             number exceeds a sequence's length.
         ValueError: if the sixth field of a node line is not an integer.
     """
     sites = []
     # ``with`` guarantees the profile handle is released even if parsing fails.
     with open(self.refprofile) as hmp:
         l = hmp.readline()
         start = False
         while l != "":
             if l.startswith("//"):
                 # Record terminator: nothing after it is parsed.
                 break
             if start:
                 # Sixth whitespace-separated field is the 1-based alignment
                 # column (presumably the MAP annotation of a 4-letter
                 # HMMER3 profile -- TODO confirm against the profile format).
                 sites.append(int(l.split()[5]))
                 # Each node spans three lines; skip the two that follow.
                 hmp.readline()
                 hmp.readline()
             elif l.startswith("HMM "):
                 start = True
                 # Skip the four lines after the "HMM " header; the read at
                 # the bottom of the loop then lands on the first node line.
                 for _ in range(4):
                     hmp.readline()
             l = hmp.readline()

     align = SeqGroup(self.refalign)
     with open(self.trimed, "w") as fout:
         for entr in align.get_entries():
             fout.write(">" + entr[0] + "\n")
             # Column numbers are 1-based, string indices 0-based.
             fout.write("".join(entr[1][pos - 1] for pos in sites))
             fout.write("\n")
     return self.trimed, len(sites)
Example #3
0
def gentesting(ftaxa, fseq, fout, fold = 10):
    """Split a taxonomy/alignment pair into ``fold`` train/test partitions.

    The taxonomy lines are shuffled with a fixed seed (12345) and cut into
    ``fold`` consecutive chunks.  For chunk number ``k`` (1-based) these
    files are written, all prefixed with ``fout`` + ``k``:

    * ``testing.tax`` / ``testing.fa``  -- the chunk itself; sequences have
      their ``-`` gap characters removed.
    * ``training.tax`` / ``training.afa`` / ``training.fa`` -- every other
      chunk; ``.afa`` keeps the sequence as stored, ``.fa`` has gaps removed.

    Args:
        ftaxa: Path to the taxonomy file; the first whitespace-separated
            token of each line is used as the sequence id.
        fseq: Alignment passed to ``SeqGroup``.
        fout: Output path prefix.
        fold: Number of partitions.

    Note:
        Reseeds the global ``random`` generator as a side effect.
    """
    with open(ftaxa) as ftax:
        lines = ftax.readlines()

    seqs = SeqGroup(fseq)

    numtaxa = len(lines)
    # A real list is required: random.shuffle cannot shuffle a Python 3
    # ``range`` object in place.
    idx = list(range(numtaxa))
    random.seed(12345)
    random.shuffle(idx)

    onefold = int(math.ceil(float(numtaxa) / fold))
    # Slicing clamps at the end of the list, so the last chunk simply
    # receives whatever is left over.
    idx_list = [idx[i * onefold:(i + 1) * onefold] for i in range(fold)]

    for i, test_idx in enumerate(idx_list):
        prefix = fout + repr(i + 1)

        with open(prefix + "testing.tax", "w") as f1, \
                open(prefix + "testing.fa", "w") as f2:
            for index in test_idx:
                tax = lines[index]
                seqid = tax.split()[0]
                seqnogap = seqs.get_seq(seqid).replace("-", "")
                f1.write(tax)
                f2.write(">" + seqid + "\n")
                f2.write(seqnogap + "\n")

        with open(prefix + "training.tax", "w") as f1, \
                open(prefix + "training.afa", "w") as f2, \
                open(prefix + "training.fa", "w") as f3:
            for j, train_idx in enumerate(idx_list):
                if i == j:
                    # The held-out chunk never goes into the training set.
                    continue
                for index in train_idx:
                    tax = lines[index]
                    seqid = tax.split()[0]
                    seq = seqs.get_seq(seqid)
                    f1.write(tax)
                    f2.write(">" + seqid + "\n")
                    f2.write(seq + "\n")
                    f3.write(">" + seqid + "\n")
                    f3.write(seq.replace("-", "") + "\n")
Example #4
0
def curator(refseq, reftax, method, output, testingtax=""):
    """Run ``findmis`` on every test sequence and print each result.

    Args:
        refseq: Reference alignment passed to ``SeqGroup``.
        reftax: Path to the reference taxonomy file; each line holds a
            sequence id followed by a ``;``-separated lineage.
        method: Forwarded unchanged to ``findmis``.
        output: Forwarded to ``findmis`` as ``foutput``.
        testingtax: Optional taxonomy file listing the sequences to test.
            When empty, every reference sequence is tested.
    """
    def load_taxonomy(path):
        # One [seq_id, [rank, rank, ...]] record per line.
        records = []
        with open(path) as handle:
            for row in handle:
                fields = row.split()
                records.append([fields[0], fields[1].split(";")])
        return records

    seqs = SeqGroup(refseq)
    ranks = load_taxonomy(reftax)
    testings = ranks if testingtax == "" else load_taxonomy(testingtax)

    for entry in testings:
        # Only the printable part of the result is used here.
        _, result_string = findmis(refseq=seqs,
                                   reftax=ranks,
                                   name=entry[0],
                                   method=method,
                                   foutput=output)
        print(result_string)
Example #5
0
File: msa.py Project: epruesse/ARB
def merge_alignment(aln1, aln2, fout, numsites):
    """Concatenate two alignments into one FASTA file after a width check.

    Every sequence of ``aln1`` followed by every sequence of ``aln2`` is
    written to ``fout``.  A sequence whose stripped length differs from
    ``numsites`` aborts the run.

    Args:
        aln1: First alignment, passed to ``SeqGroup``.
        aln2: Second alignment, passed to ``SeqGroup``.
        fout: Path of the merged output file.
        numsites: Expected number of columns in every sequence.

    Raises:
        SystemExit: with status 1 if either alignment is empty or a sequence
            has the wrong length.  The output file may already contain the
            entries written before the offending sequence.
    """
    seqs1 = SeqGroup(aln1)
    seqs2 = SeqGroup(aln2)
    if len(seqs1) == 0 or len(seqs2) == 0:
        print("No sequences aligned! ")
        # Non-zero status so calling scripts can detect the failure.
        sys.exit(1)
    with open(fout, "w") as fo:
        for group in (seqs1, seqs2):
            for seq in group.iter_entries():
                if len(seq[1].strip()) != numsites:
                    print("Error in alignment ....")
                    sys.exit(1)
                fo.write(">" + seq[0] + "\n" + seq[1] + "\n")
Example #6
0
 def __write_algn(self, fullpath):
     """Write the sequences of all iterated nodes to ``fullpath`` in PAML format.

     Each node obtained by iterating ``self`` contributes its ``nt_sequence``,
     registered in a fresh ``SeqGroup`` under the node's ``node_id`` and
     ``name``.
     """
     alignment = SeqGroup()
     for node in self:
         # Fill the three lookup tables of the SeqGroup by hand.
         alignment.id2seq[node.node_id] = node.nt_sequence
         alignment.id2name[node.node_id] = node.name
         alignment.name2id[node.name] = node.node_id
     alignment.write(outfile=fullpath, format='paml')
Example #7
0
def pick_otu(spe_out, alignment):
    """Write one representative sequence per species block to ``alignment``.otu.

    For every line of ``spe_out`` that starts with ``"Species"``, the
    following line (stripped) is used as a sequence name; that sequence is
    looked up in ``alignment`` and written in FASTA layout.

    Args:
        spe_out: Path to the species-delimitation output file.
        alignment: Path of the alignment; also the prefix of the output file.

    Raises:
        IndexError: if a ``"Species"`` line is the last line of ``spe_out``.
    """
    with open(spe_out) as fin:
        lines = fin.readlines()

    # ``with`` closes the output even if a sequence lookup fails midway.
    with open(alignment + ".otu", "w") as fout:
        aln = SeqGroup(sequences=alignment)
        for i, line in enumerate(lines):
            if line.startswith("Species"):
                # The name on the line after the header is the one picked.
                nline = lines[i + 1].strip()
                seq = aln.get_seq(nline)
                fout.write(">" + nline + "\n")
                fout.write(seq + "\n")
Example #8
0
def ngssize(fin, start = 0, end = 2428):
    """Keep the first half of every aligned sequence in ``fin``.

    Two FASTA files are written next to the input:

    * ``fin`` + ``".trim.afa"`` -- the first half of each sequence, gaps kept.
    * ``fin`` + ``".trim.fa"``  -- the same halves with ``-`` removed.

    Args:
        fin: Alignment passed to ``SeqGroup``; also the output path prefix.
        start: Unused; kept for interface compatibility.
        end: Unused; kept for interface compatibility.

    Returns:
        str: Path of the gap-free output file (``fin`` + ``".trim.fa"``).
    """
    with open(fin + ".trim.afa", "w") as fout1, \
            open(fin + ".trim.fa", "w") as fout2:
        seqs = SeqGroup(fin)
        for seq in seqs:
            name = seq[0]
            sequence = seq[1]
            # Floor division: a float slice bound is a TypeError on Python 3.
            cut = len(sequence) // 2
            sequence_trim = sequence[0:cut]
            fout1.write(">" + name + "\n")
            fout2.write(">" + name + "\n")
            fout1.write(sequence_trim + "\n")
            fout2.write(sequence_trim.replace("-", "") + "\n")

    return fin + ".trim.fa"
Example #9
0
 def link_to_alignment(self, alignment, alg_format="fasta", **kwargs):
     """Attach the aligned sequence of each node as its ``sequence`` feature.

     Args:
         alignment: A ``SeqGroup`` instance (or subclass), used as is, or
             anything accepted by the ``SeqGroup`` constructor.
         alg_format: Format passed to ``SeqGroup`` when ``alignment`` has to
             be parsed.
         **kwargs: Extra arguments forwarded to ``SeqGroup``.

     Nodes whose name is missing from the alignment are skipped; a warning
     with the number of missing leaves is written to stderr.
     """
     missing_leaves = []
     # NOTE(review): collected but not reported in the visible part of the
     # method.
     missing_internal = []
     # isinstance also accepts SeqGroup subclasses, which the previous exact
     # type comparison wrongly sent through the parser.
     if isinstance(alignment, SeqGroup):
         alg = alignment
     else:
         alg = SeqGroup(alignment, format=alg_format, **kwargs)
     # Set the sequence of every node that has an entry in the alignment.
     for n in self.traverse():
         try:
             n.add_feature("sequence", alg.get_seq(n.name))
         except KeyError:
             if n.is_leaf():
                 missing_leaves.append(n.name)
             else:
                 missing_internal.append(n.name)
     if len(missing_leaves) > 0:
         # write() emits the same bytes on Python 2 and 3, unlike the
         # ``print >>stream`` statement.
         sys.stderr.write(
             "Warnning: [%d] terminal nodes could not be found in the alignment.\n" %
             len(missing_leaves))
Example #10
0
 def get_ref_alignment(self):
     """Build a ``SeqGroup`` from the ``"sequences"`` entries of ``self.jdata``.

     Each entry is indexed positionally: element 0 is used as the sequence
     name and element 1 as the sequence itself.

     Returns:
         SeqGroup: A new group holding one sequence per entry.
     """
     records = self.jdata["sequences"]
     ref = SeqGroup()
     for record in records:
         seq_name = record[0]
         residues = record[1]
         ref.set_seq(seq_name, residues)
     return ref
Example #11
0
def autotest(refseq, reftax, testingtax, tf = "/home/zhangje/GIT/tax_benchmark/script/tmp/"):
    """Benchmark ``findmis`` with the uclust, rdp and blast methods.

    Every sequence listed in ``testingtax`` is classified by all three
    methods.  The raw per-method output goes to ``testingtax`` + ``.uclust``
    / ``.rdp`` / ``.blast``.  Lineages with exactly 8 fields are logged to
    ``.misb`` and scored as "corrected": the last field is read as an
    integer rank index, and a method scores when its result matches the
    truth at that index.  All other lineages are logged to ``.umisb`` and
    scored as "unchanged" when a method returns the truth verbatim.  A
    summary table is printed and written to ``testingtax`` + ``.results``.

    Args:
        refseq: Reference alignment passed to ``SeqGroup``.
        reftax: Path to the reference taxonomy (id, then ``;``-separated
            lineage per line).
        testingtax: Path to the taxonomy of the test sequences, same layout.
        tf: Temporary folder forwarded to ``findmis`` as ``temfolder``.
    """
    def read_taxonomy(path):
        # One [seq_id, [rank, rank, ...]] record per line.
        records = []
        with open(path) as fo:
            for line in fo:
                ll = line.split()
                records.append([ll[0], ll[1].split(";")])
        return records

    testings = read_taxonomy(testingtax)
    seqs = SeqGroup(refseq)
    ranks = read_taxonomy(reftax)

    methods = ("uclust", "rdp", "blast")
    corrected = dict.fromkeys(methods, 0)
    unchanged = dict.fromkeys(methods, 0)

    # ``with`` closes all five outputs even when a classifier call fails.
    with open(testingtax + ".uclust", "w") as f_uclust, \
            open(testingtax + ".rdp", "w") as f_rdp, \
            open(testingtax + ".blast", "w") as f_blast, \
            open(testingtax + ".misb", "w") as f_mis, \
            open(testingtax + ".umisb", "w") as f_umis:
        raw_out = {"uclust": f_uclust, "rdp": f_rdp, "blast": f_blast}

        for test in testings:
            results = {}
            for m in methods:
                raw, results[m] = findmis(refseq = seqs, reftax = ranks, name = test[0], method = m, temfolder = tf)
                raw_out[m].write(raw)

            truth = test[1]
            if len(truth) == 8:
                # The eighth field is the rank index to check; the first
                # seven are logged as the lineage.
                f_mis.write(test[0] + "\t" + rank2string(truth[0:-1]) + "\n")
                rank_nr = int(truth[7])
                for m in methods:
                    res = results[m]
                    if len(res) > rank_nr and res[rank_nr] == truth[rank_nr]:
                        corrected[m] += 1
            else:
                f_umis.write(test[0] + "\t" + rank2string(truth) + "\n")
                for m in methods:
                    # Each method is compared against its own result; the
                    # blast counter used to be driven by the uclust result.
                    if results[m] == truth:
                        unchanged[m] += 1

            print("truth:" + repr(truth))
            for m in methods:
                print(m + ":" + repr(results[m]))

    summary = [m + "   " + repr(corrected[m]) + " " + repr(unchanged[m])
               for m in methods]

    print("method   corrected   unchanged")
    for row in summary:
        print(row)

    with open(testingtax + ".results", "w") as fo:
        fo.write("method   corrected   unchanged \n")
        for row in summary:
            fo.write(row + "\n")
Example #12
0
def curator(refseq, reftax, method, output, testingtax=""):
    """Leave-one-out mislabel screening followed by a final joint check.

    Stage 1 calls ``findmis`` once per test sequence and appends the returned
    text to ``output`` + ``".ass"``.  Ids already present in that file are
    skipped, so an interrupted run resumes where it stopped.  Stage 2 reads
    ``output``, collects every id whose second tab-separated column is not
    ``"Species"``, and passes all of them to one final ``findmis`` call that
    writes to ``output`` + ``".final"``.

    Args:
        refseq: Reference alignment; parsed with ``SeqGroup`` and also
            forwarded to ``findmis`` as ``refseq_fname``.
        reftax: Path to the reference taxonomy (id, then ``;``-separated
            lineage per line).
        method: Forwarded unchanged to ``findmis``.
        output: Base path for result files (presumably filled by ``findmis``
            through ``foutput`` -- TODO confirm).
        testingtax: Selects the sequences to test.  Longer than one
            character: path to a taxonomy file.  One character: a partition
            number 0-9 selecting a tenth of the reference set.  Empty: the
            whole reference set.
    """
    start_time = time.time()

    seqs = SeqGroup(refseq)
    ranks = []
    with open(reftax) as fo:
        for line in fo:
            ll = line.split()
            ranks.append([ll[0], ll[1].split(";")])

    if len(testingtax) > 1:
        testings = []
        with open(testingtax) as fo:
            for line in fo:
                ll = line.split()
                testings.append([ll[0], ll[1].split(";")])
    elif testingtax:
        partno = int(testingtax)
        partsize = len(ranks) // 10
        p_start = partno * partsize
        # The last partition also takes the remainder of the division.
        p_end = len(ranks) if partno == 9 else p_start + partsize
        testings = ranks[p_start:p_end]
    else:
        testings = ranks

    # NOTE(review): scratch location is hard-coded to one user's home.
    basepath = "/home/kozlovay"
    tmpfolder = basepath + "/tmp/"
    tmpprefix = tmpfolder + str(time.time())

    assf = output + ".ass"

    # Assignments left over from an earlier, interrupted run.
    old_tax = {}
    if os.path.isfile(assf):
        with open(assf, "r") as fi:
            for line in fi:
                seqid, rest = line.strip().split("\t", 1)
                old_tax[seqid] = rest
        print("Found old file with %d assignments; will continue from there..." % len(
            old_tax))

    with open(assf, "a") as fo:
        # ``i`` is the 1-based position in ``testings``; skipped entries
        # still consume a number, so temp-file names stay stable on resume.
        for i, test in enumerate(testings, 1):
            if test[0] in old_tax:
                continue
            tmp_fname = "%s.%d" % (tmpprefix, i)
            ru, result_string = findmis(refseq=seqs,
                                        reftax=ranks,
                                        name=[test[0]],
                                        method=method,
                                        foutput=output,
                                        tmpname=tmp_fname,
                                        refseq_fname=refseq)
            fo.write(ru)
            # Flush per sequence so an interrupted run loses no results.
            fo.flush()

    mis_sid = []
    with open(output, "r") as fmis:
        for line in fmis:
            toks = line.split("\t")
            sid = toks[0]
            lvl = toks[1]
            # Currently unused; parsing it keeps malformed lines failing
            # loudly instead of being accepted silently.
            conf = float(toks[4])
            if lvl != "Species":
                mis_sid.append(sid)

    print("Leave-one-out test found %d suspicious sequences; running final test to check them..." % len(
        mis_sid))

    final_output = output + ".final"
    tmp_fname = "%s.%s" % (tmpprefix, "fin")
    findmis(refseq=seqs,
            reftax=ranks,
            name=mis_sid,
            method=method,
            foutput=final_output,
            tmpname=tmp_fname,
            refseq_fname=refseq)

    elapsed_time = time.time() - start_time
    print("\nProcessed %d sequences in %.0f seconds.\n" % (len(testings),
                                                           elapsed_time))
Example #13
0
        regions = []
        for reg in args.target_regions:
            try:
                contig, raw_pos = reg.split(':')
                if not raw_pos:
                    start, end = None, None
                else:
                    start, end = map(int, raw_pos.split('-'))
                regions.append([contig.strip(), start, end])
            except Exception:
                print >>sys.stderr, 'ERROR: Invalid contig region. Use contigid:start-end syntax\n'
                raise
        args.target_regions = regions
        
    if args.target_genes:
        for g in gene_db.find({"sp":int(taxid), "n":{"$in": args.target_genes}}, {"c":1, "s":1, "e":1}):
            print g
            
    if not args.target_regions:
        # If not regions requested, scan all contigs completely 
        args.target_regions = [ [None, None, None] ]

    if args.refseqs:
        from ete2 import SeqGroup
        args.refseqs = SeqGroup(args.refseqs)
               
    main(args)