def __init__(self, refaln, type=""):
    """Load the reference alignment and derive the ground-truth labels.

    Args:
        refaln: alignment source handed to ``SeqGroup`` as ``sequences``.
        type: ``"fasta"`` reads the alignment without an explicit format;
            any other value reads it as ``phylip_relaxed``.
    """
    # ``type`` shadows the builtin, but it is part of the public signature.
    parser_options = {} if type == "fasta" else {"format": "phylip_relaxed"}
    self.aln = SeqGroup(sequences=refaln, **parser_options)
    self.true_spe = {}
    self._get_truth()
    self._get_cluster_label()
def get_hmm_refalignment(self):
    """Trim the reference alignment to the columns listed in the HMM profile.

    Reads ``self.refprofile`` line by line, collects one column number per
    model record, then writes every sequence of ``self.refalign`` to
    ``self.trimed`` in FASTA layout keeping only those columns.

    Returns:
        tuple: ``(self.trimed, number_of_kept_columns)``.

    Raises:
        IndexError, ValueError: if a model record has fewer than six
            whitespace-separated fields or its sixth field is not an integer.
    """
    sites = []
    # Context manager so the profile handle is released even if parsing fails.
    with open(self.refprofile) as hmp:
        in_model = False
        line = hmp.readline()
        while line != "":
            if line.startswith("//"):
                # End-of-record marker: stop reading.
                break
            if in_model:
                # Sixth field is the 1-based alignment column.
                # NOTE(review): presumably the MAP annotation of a 4-letter
                # (nucleotide) HMMER3 profile -- confirm against the profiles used.
                sites.append(int(line.split()[5]))
                # Each record spans three lines; skip the two that follow.
                hmp.readline()
                hmp.readline()
            elif line.startswith("HMM "):
                in_model = True
                # Skip the four lines between the "HMM " header and the
                # first record.
                for _ in range(4):
                    hmp.readline()
            line = hmp.readline()

    align = SeqGroup(self.refalign)
    with open(self.trimed, "w") as fout:
        for entr in align.get_entries():
            fout.write(">" + entr[0] + "\n")
            # Profile columns are 1-based, string indices 0-based.
            fout.write("".join(entr[1][pos - 1] for pos in sites))
            fout.write("\n")
    return self.trimed, len(sites)
def gentesting(ftaxa, fseq, fout, fold = 10):
    """Split a taxonomy/alignment pair into cross-validation folds.

    The taxonomy lines are shuffled with a fixed seed (12345) and cut into
    ``fold`` chunks. For chunk ``k`` (1-based) the following files are
    written, all prefixed with ``fout`` followed by ``k``:

    * ``testing.tax`` / ``testing.fa`` -- the held-out chunk (gaps removed).
    * ``training.tax`` / ``training.afa`` / ``training.fa`` -- every other
      chunk; ``.afa`` keeps the stored sequence, ``.fa`` has gaps removed.

    Args:
        ftaxa: path to the taxonomy file; the first whitespace-separated
            token of each line is used as the sequence id.
        fseq: alignment source handed to ``SeqGroup``.
        fout: output path prefix.
        fold: number of folds.
    """
    with open(ftaxa) as ftax:
        lines = ftax.readlines()
    seqs = SeqGroup(fseq)

    numtaxa = len(lines)
    # Materialise the indices: shuffle() needs a mutable sequence, which a
    # lazy range object is not.
    idx = list(range(numtaxa))
    random.seed(12345)
    random.shuffle(idx)

    onefold = int(math.ceil(float(numtaxa) / fold))
    idx_list = []
    for i in range(fold):
        start = i * onefold
        # The last fold always runs to the end; the others are clipped.
        end = numtaxa if i == fold - 1 else min((i + 1) * onefold, numtaxa)
        idx_list.append(idx[start:end])

    for i, held_out in enumerate(idx_list):
        prefix = fout + repr(i + 1)

        with open(prefix + "testing.tax", "w") as f1, \
                open(prefix + "testing.fa", "w") as f2:
            for index in held_out:
                tax = lines[index]
                seqid = tax.split()[0]
                seqnogap = seqs.get_seq(seqid).replace("-", "")
                f1.write(tax)
                f2.write(">" + seqid + "\n")
                f2.write(seqnogap + "\n")

        with open(prefix + "training.tax", "w") as f1, \
                open(prefix + "training.afa", "w") as f2, \
                open(prefix + "training.fa", "w") as f3:
            for j, chunk in enumerate(idx_list):
                if i == j:
                    continue
                for index in chunk:
                    tax = lines[index]
                    seqid = tax.split()[0]
                    seq = seqs.get_seq(seqid)
                    seqnogap = seq.replace("-", "")
                    f1.write(tax)
                    f2.write(">" + seqid + "\n")
                    f2.write(seq + "\n")
                    f3.write(">" + seqid + "\n")
                    f3.write(seqnogap + "\n")
def curator(refseq, reftax, method, output, testingtax=""):
    """Run ``findmis`` once per test entry and print each result string.

    Args:
        refseq: alignment source handed to ``SeqGroup``.
        reftax: path to the reference taxonomy; each line is
            ``<id> <rank;rank;...>`` separated by whitespace.
        method: forwarded to ``findmis`` as ``method``.
        output: forwarded to ``findmis`` as ``foutput``.
        testingtax: optional path to a taxonomy file with the entries to
            test; when empty, every reference entry is tested.
    """
    def load_taxonomy(path):
        # One [id, [rank, ...]] pair per line.
        with open(path) as handle:
            return [[fields[0], fields[1].split(";")]
                    for fields in (row.split() for row in handle)]

    reference_seqs = SeqGroup(refseq)
    reference_ranks = load_taxonomy(reftax)

    if testingtax == "":
        candidates = reference_ranks
    else:
        candidates = load_taxonomy(testingtax)

    for candidate in candidates:
        _, report = findmis(refseq=reference_seqs, reftax=reference_ranks,
                            name=candidate[0], method=method, foutput=output)
        print(report)
def merge_alignment(aln1, aln2, fout, numsites):
    """Concatenate two alignments into one FASTA file.

    Every sequence of ``aln1`` followed by every sequence of ``aln2`` is
    written to ``fout``. The process exits (via ``sys.exit()``) after
    printing a message when either alignment is empty, or as soon as a
    sequence whose stripped length differs from ``numsites`` is met.
    """
    first = SeqGroup(aln1)
    second = SeqGroup(aln2)
    if not (len(first) and len(second)):
        print("No sequences aligned! ")
        sys.exit()

    with open(fout, "w") as merged:
        for group in (first, second):
            for entry in group.iter_entries():
                if len(entry[1].strip()) != numsites:
                    print("Error in alignment ....")
                    sys.exit()
                merged.write(">" + entry[0] + "\n" + entry[1] + "\n")
def __write_algn(self, fullpath):
    """
    Write the nucleotide sequences of the nodes yielded by iterating over
    ``self`` to ``fullpath`` in paml format.
    """
    algn = SeqGroup()
    for node in self:
        key = node.node_id
        label = node.name
        # Fill the three SeqGroup lookup tables by hand.
        algn.id2seq[key] = node.nt_sequence
        algn.id2name[key] = label
        algn.name2id[label] = key
    algn.write(outfile=fullpath, format='paml')
def pick_otu(spe_out, alignment):
    """Write one representative sequence per ``Species`` record.

    For every line of ``spe_out`` that starts with ``"Species"``, the line
    that follows it is stripped and used as a sequence name; that sequence
    is looked up in ``alignment`` and written in FASTA layout to
    ``alignment + ".otu"``.

    Raises:
        IndexError: if a ``Species`` line is the last line of ``spe_out``.
        KeyError: if a name is missing from the alignment (raised by the
            ``SeqGroup`` lookup).
    """
    with open(spe_out) as fin:
        lines = fin.readlines()

    # Load the alignment before touching the output so a parse failure does
    # not leave a truncated .otu file behind.
    aln = SeqGroup(sequences=alignment)

    with open(alignment + ".otu", "w") as fout:
        for i, line in enumerate(lines):
            if not line.startswith("Species"):
                continue
            nline = lines[i + 1].strip()
            seq = aln.get_seq(nline)
            fout.write(">" + nline + "\n")
            fout.write(seq + "\n")
def ngssize(fin, start = 0, end = 2428):
    """Keep the first half of every sequence in ``fin``.

    Two files are written next to the input:

    * ``fin + ".trim.afa"`` -- the first half of each sequence as stored.
    * ``fin + ".trim.fa"`` -- the same halves with ``-`` removed.

    Args:
        fin: alignment source handed to ``SeqGroup``; also the output prefix.
        start: currently unused; kept for interface compatibility.
        end: currently unused; kept for interface compatibility.

    Returns:
        str: the path of the gap-free output, ``fin + ".trim.fa"``.
    """
    seqs = SeqGroup(fin)
    with open(fin + ".trim.afa", "w") as fout1, \
            open(fin + ".trim.fa", "w") as fout2:
        for seq in seqs:
            name = seq[0]
            sequence = seq[1]
            # Floor division: a float cut point is not a valid slice index.
            cut = len(sequence) // 2
            sequence_trim = sequence[0:cut]
            sequence_trim_nogap = sequence_trim.replace("-", "")
            fout1.write(">" + name + "\n")
            fout2.write(">" + name + "\n")
            fout1.write(sequence_trim + "\n")
            fout2.write(sequence_trim_nogap + "\n")
    return fin + ".trim.fa"
def link_to_alignment(self, alignment, alg_format="fasta", **kwargs):
    """Attach to each node the alignment sequence stored under its name.

    Every node returned by ``self.traverse()`` gets a ``sequence`` feature
    when its name is present in the alignment. A warning with the number of
    leaves that could not be matched is written to stderr.

    Args:
        alignment: a ``SeqGroup`` instance (used as is), or anything the
            ``SeqGroup`` constructor accepts.
        alg_format: format passed to ``SeqGroup`` when ``alignment`` is not
            already a ``SeqGroup``.
        **kwargs: extra options forwarded to the ``SeqGroup`` constructor.
    """
    # isinstance() so SeqGroup subclasses are used directly rather than being
    # fed back into the constructor.
    if isinstance(alignment, SeqGroup):
        alg = alignment
    else:
        alg = SeqGroup(alignment, format=alg_format, **kwargs)

    missing_leaves = []
    for n in self.traverse():
        try:
            n.add_feature("sequence", alg.get_seq(n.name))
        except KeyError:
            # Only unmatched leaves are reported; internal nodes without a
            # sequence are tolerated silently.
            if n.is_leaf():
                missing_leaves.append(n.name)

    if missing_leaves:
        # Explicit write keeps the output identical to the former
        # ``print >>sys.stderr`` statement without relying on that syntax.
        sys.stderr.write(
            "Warnning: [%d] terminal nodes could not be found in the alignment.\n" %
            len(missing_leaves))
def get_ref_alignment(self):
    """Return a ``SeqGroup`` built from ``self.jdata["sequences"]``.

    Each entry supplies the sequence name at index 0 and the sequence
    itself at index 1.
    """
    records = self.jdata["sequences"]
    ref_alignment = SeqGroup()
    for record in records:
        name, residues = record[0], record[1]
        ref_alignment.set_seq(name, residues)
    return ref_alignment
def autotest(refseq, reftax, testingtax, tf = "/home/zhangje/GIT/tax_benchmark/script/tmp/"):
    """Benchmark the uclust, rdp and blast back-ends of ``findmis``.

    For every entry of ``testingtax`` each method is run once and its raw
    report is appended to ``testingtax + ".<method>"``. Entries are then
    scored against their truth:

    * truth with exactly 8 fields: the 8th field is parsed as a rank index;
      a method scores "corrected" when its result matches the truth at that
      index. The entry (without the 8th field) is logged to ``.misb``.
      NOTE(review): presumably the 8th field marks the deliberately
      mislabelled rank -- confirm with whoever generates these files.
    * otherwise: a method scores "unchanged" when its result equals the
      truth. The entry is logged to ``.umisb``.

    The per-method totals are printed and written to
    ``testingtax + ".results"``.

    Args:
        refseq: alignment source handed to ``SeqGroup``.
        reftax: path to the reference taxonomy (``<id> <rank;rank;...>``).
        testingtax: path to the taxonomy file with the entries to test; also
            the prefix of every output file.
        tf: forwarded to ``findmis`` as ``temfolder``.
    """
    testings = []
    with open(testingtax) as fo:
        for line in fo:
            ll = line.split()
            testings.append([ll[0], ll[1].split(";")])

    seqs = SeqGroup(refseq)

    ranks = []
    with open(reftax) as fo:
        for line in fo:
            ll = line.split()
            ranks.append([ll[0], ll[1].split(";")])

    methods = ("uclust", "rdp", "blast")
    corrected = dict.fromkeys(methods, 0)
    unchanged = dict.fromkeys(methods, 0)

    # One ``with`` so every report file is closed even if a run fails midway.
    with open(testingtax + ".uclust", "w") as f_uclust, \
            open(testingtax + ".rdp", "w") as f_rdp, \
            open(testingtax + ".blast", "w") as f_blast, \
            open(testingtax + ".misb", "w") as f_mis, \
            open(testingtax + ".umisb", "w") as f_umis:
        reports = {"uclust": f_uclust, "rdp": f_rdp, "blast": f_blast}

        for test in testings:
            results = {}
            for method in methods:
                raw, results[method] = findmis(refseq = seqs, reftax = ranks,
                                               name = test[0], method = method,
                                               temfolder = tf)
                reports[method].write(raw)

            truth = test[1]
            if len(truth) == 8:
                f_mis.write(test[0] + " " + rank2string(truth[0:-1]) + "\n")
                rank_nr = int(truth[7])
                for method in methods:
                    predicted = results[method]
                    if len(predicted) > rank_nr and predicted[rank_nr] == truth[rank_nr]:
                        corrected[method] += 1
            else:
                f_umis.write(test[0] + " " + rank2string(truth) + "\n")
                for method in methods:
                    # Each method is compared with its OWN result; the blast
                    # counter used to be driven by the uclust result.
                    if results[method] == truth:
                        unchanged[method] += 1

            print("truth:" + repr(truth))
            for method in methods:
                print(method + ":" + repr(results[method]))

    summary = [m + " " + repr(corrected[m]) + " " + repr(unchanged[m])
               for m in methods]

    print("method corrected unchanged")
    for row in summary:
        print(row)

    with open(testingtax + ".results", "w") as fo:
        fo.write("method corrected unchanged \n")
        for row in summary:
            fo.write(row + "\n")
def curator(refseq, reftax, method, output, testingtax="", basepath="/home/kozlovay"):
    """Leave-one-out curation of a reference taxonomy, with resume support.

    Each selected entry is passed to ``findmis`` on its own and the returned
    assignment is appended to ``output + ".ass"``. Entries already present
    in that file from an earlier run are skipped. Afterwards ``output`` is
    scanned for rows whose second tab-separated column is not ``"Species"``,
    and those ids are re-checked in one final ``findmis`` call that reports
    to ``output + ".final"``.

    Args:
        refseq: alignment source handed to ``SeqGroup``; also forwarded to
            ``findmis`` as ``refseq_fname``.
        reftax: path to the reference taxonomy (``<id> <rank;rank;...>``).
        method: forwarded to ``findmis`` as ``method``.
        output: forwarded to ``findmis`` as ``foutput``; also the prefix of
            the ``.ass`` and ``.final`` files.
        testingtax: selects the entries to test. Longer than one character:
            path to a taxonomy file. A single character: partition number
            (the reference is cut into tenths; partition 9 also takes the
            remainder). Empty: the whole reference.
        basepath: directory under which ``tmp/`` holds the temporary file
            prefix handed to ``findmis``.
            NOTE(review): the default is a developer's home directory; pass
            an explicit path on other machines.
    """
    start_time = time.time()
    seqs = SeqGroup(refseq)

    ranks = []
    with open(reftax) as fo:
        for line in fo:
            ll = line.split()
            ranks.append([ll[0], ll[1].split(";")])

    if len(testingtax) > 1:
        testings = []
        with open(testingtax) as fo:
            for line in fo:
                ll = line.split()
                testings.append([ll[0], ll[1].split(";")])
    elif testingtax:
        partno = int(testingtax)
        partsize = len(ranks) // 10
        p_start = partno * partsize
        # The last partition absorbs whatever the division left over.
        p_end = len(ranks) if partno == 9 else p_start + partsize
        testings = ranks[p_start:p_end]
    else:
        testings = ranks

    tmpfolder = basepath + "/tmp/"
    # Timestamp keeps temporary names of separate runs apart.
    tmpprefix = tmpfolder + str(time.time())

    assf = output + ".ass"
    old_tax = {}
    if os.path.isfile(assf):
        with open(assf, "r") as fi:
            for line in fi:
                seqid, rest = line.strip().split("\t", 1)
                old_tax[seqid] = rest
        print("Found old file with %d assignments; will continue from there..." % len(
            old_tax))

    with open(assf, "a") as fo:
        # ``i`` counts every entry, skipped ones included, so temporary file
        # names stay stable across resumed runs.
        for i, test in enumerate(testings, 1):
            if test[0] in old_tax:
                continue
            tmp_fname = "%s.%d" % (tmpprefix, i)
            ru, result_string = findmis(refseq=seqs, reftax=ranks, name=[test[0]],
                                        method=method, foutput=output,
                                        tmpname=tmp_fname, refseq_fname=refseq)
            fo.write(ru)
            # Flush per entry so an interrupted run can be resumed.
            fo.flush()

    mis_sid = []
    with open(output, "r") as fmis:
        for line in fmis:
            toks = line.split("\t")
            sid = toks[0]
            lvl = toks[1]
            # Not used below; the conversion still rejects rows whose fifth
            # column is not numeric.
            conf = float(toks[4])
            if lvl != "Species":
                mis_sid.append(sid)

    print("Leave-one-out test found %d suspicious sequences; running final test to check them..." % len(
        mis_sid))

    final_output = output + ".final"
    tmp_fname = "%s.%s" % (tmpprefix, "fin")
    findmis(refseq=seqs, reftax=ranks, name=mis_sid, method=method,
            foutput=final_output, tmpname=tmp_fname, refseq_fname=refseq)

    elapsed_time = time.time() - start_time
    print("\nProcessed %d sequences in %.0f seconds.\n" % (len(testings), elapsed_time))
# Normalise the requested target regions: each "contig:start-end" string
# becomes a [contig, start, end] triple. An empty range after the colon
# ("contig:") yields start = end = None.
# NOTE(review): `args`, `gene_db`, `taxid` and `main` are defined outside this
# fragment; the nesting below is reconstructed -- confirm against the full script.
regions = []
for reg in args.target_regions:
    try:
        contig, raw_pos = reg.split(':')
        if not raw_pos:
            start, end = None, None
        else:
            start, end = map(int, raw_pos.split('-'))
        regions.append([contig.strip(), start, end])
    except Exception:
        # Tell the user what the expected syntax is, then let the original
        # exception propagate.
        print >>sys.stderr, 'ERROR: Invalid contig region. Use contigid:start-end syntax\n'
        raise
args.target_regions = regions
if args.target_genes:
    # Look up the requested genes for this taxid; the matching records are
    # only printed here, they are not added to the target regions.
    for g in gene_db.find({"sp":int(taxid), "n":{"$in": args.target_genes}}, {"c":1, "s":1, "e":1}):
        print g
if not args.target_regions:
    # If no regions were requested, scan all contigs completely
    args.target_regions = [ [None, None, None] ]
if args.refseqs:
    # Imported lazily: ete2 is only needed when reference sequences are given.
    from ete2 import SeqGroup
    args.refseqs = SeqGroup(args.refseqs)
main(args)