def splitSegKmeans(aligner, seg, mult, all_reads_list):
    """Split the reads of a segment into `mult` groups with k-means clustering.

    The reads are turned into vectors by readsToVectors, clustered with
    KMeans(n_clusters=mult), and the alignments of every cluster are used to
    polish a separate copy of the segment. Every pair of distinct copies is
    then aligned and the pairwise percent identities are written to stdout.

    :param aligner: object providing overlapAlign() and dir_distributor
    :param seg: segment to split; it is accessed through seg.asContig()
    :param mult: number of clusters requested from KMeans
    :param all_reads_list: reads to be clustered
    :return: list of (contig, reads) pairs, one per cluster, when the highest
        percent identity between two distinct copies is below 0.985;
        None otherwise (including when fewer than two copies were produced).
    """
    polisher = Polisher(aligner, aligner.dir_distributor)
    base = seg.asContig()
    rtv = readsToVectors(aligner, all_reads_list, base)
    kmeans = KMeans(n_clusters=mult, precompute_distances=True)
    recs = list(rtv.values())
    result = kmeans.fit_predict(X=[rec.v for rec in recs])
    print(result)

    # Group the alignments of the reads by the cluster label they received.
    clusters = dict()
    for rec, label in zip(recs, result):
        clusters.setdefault(label, []).append(rec.al)
    groups = list(clusters.values())
    for group in groups:
        print("%s : %s" % (str(group), len(group)))

    split_contigs = []
    split_reads = []
    for group in groups:
        polished = polisher.polishSmallSegment(base.asSegment(), group)
        split_contigs.append(
            Contig(polished.seg_from.Seq(), str(len(split_contigs))))
        split_reads.append([al.seg_from.contig for al in group])

    # Iterate over the copies that were actually built: indexing with
    # range(mult) would fail if fewer than `mult` labels were assigned.
    copies = len(split_contigs)
    # Start from 0, not 1: with an initial value of 1 the acceptance test
    # below could never succeed and the function always returned None.
    maxpi = 0
    for i in range(copies):
        for j in range(copies):
            if i == j:
                # A copy is not aligned to itself; the diagonal is reported
                # as 1.0 and excluded from the maximum.
                sys.stdout.write("1.0 ")
                continue
            al = next(aligner.overlapAlign(
                [split_contigs[i]], ContigStorage([split_contigs[j]])))
            identity = al.percentIdentity()
            sys.stdout.write(str(identity) + " ")
            maxpi = max(maxpi, identity)
        print("")
    print("Maxpi: %s" % (maxpi,))

    # With fewer than two copies there is no pair to compare, hence no split.
    if copies >= 2 and maxpi < 0.985:
        return list(zip(split_contigs, split_reads))
    return None
def readsToVectors(aligner, reads_list, base):
    """Build one ReadRecord per read that covers `base` almost completely.

    Reads are aligned to `base`; alignments whose target segment is more than
    100 shorter than `base` are dropped. Each kept read gets a ReadRecord that
    is extended with toVector() of its alignment. Consecutive triples of kept
    alignments are then polished into alternative base contigs; a candidate is
    accepted only when the number of distinct reads covering it equals the
    number of kept alignments, and then every record is extended with the
    vector of the read's alignment to that candidate. The search stops once
    more than 10 bases (the original included) have been collected.

    :param aligner: object providing overlapAlign() and dir_distributor
    :param reads_list: reads to convert
    :param base: contig the reads are aligned to
    :return: dict mapping read id to its ReadRecord
    """
    consensus_builder = Polisher(aligner, aligner.dir_distributor)
    min_cover = len(base) - 100
    kept_als = []
    records = {}
    for al in fixAlDir(
            aligner.overlapAlign(reads_list, ContigStorage([base])), base):
        if len(al.seg_to) >= min_cover:
            kept_als.append(al)
            records[al.seg_from.contig.id] = ReadRecord(al).extend(toVector(al))

    kept_reads = [al.seg_from.contig for al in kept_als]
    accepted_bases = [base]
    # Walk over complete, non-overlapping triples of kept alignments.
    for start in range(0, len(kept_als) - 2, 3):
        triple = kept_als[start:start + 3]
        polished = consensus_builder.polishSmallSegment(base.asSegment(), triple)
        candidate = Contig(polished.seg_from.Seq(), str(len(accepted_bases)))

        candidate_als = []
        covered_ids = set()
        candidate_hits = fixAlDir(
            aligner.overlapAlign(kept_reads, ContigStorage([candidate])),
            candidate)
        for al in candidate_hits:
            if len(al.seg_to) >= len(candidate) - 100:
                candidate_als.append(al)
                covered_ids.add(al.seg_from.contig.id)

        # A rejected candidate leaves the number of bases unchanged, so the
        # stop condition below cannot newly become true for it.
        if len(covered_ids) != len(kept_als):
            continue
        accepted_bases.append(candidate)
        for al in candidate_als:
            records[al.seg_from.contig.id].extend(toVector(al))
        if len(accepted_bases) > 10:
            break

    for record in records.values():
        print("%s %s %s" % (record.read.id, len(record.v), record.v))
    return records
def main(contigs_file, contig_name, reads_file, dir, k, initial_reads1, initial_reads2):
    """Separate the reads of one contig into two copies and write the result.

    Starting from two seed sets of read ids, the contig is polished once per
    set, all reads are aligned to both polished copies, and the reads are
    redistributed by the difference of their two alignment scores: reads
    below the lower quartile of the differences go to copy 1, reads above the
    upper quartile go to copy 2. This is repeated three times. Both copies
    are then passed through prolong() and written to `dir` together with
    their reads (copies.fasta, reads1.fasta, reads2.fasta).

    :param contigs_file: path of the FASTA file with contigs
    :param contig_name: id of the contig to process
    :param reads_file: path of the FASTA file with reads
    :param dir: output directory (also receives the "alignments" subdirectory)
    :param k: not used by this function
    :param initial_reads1: container of read ids seeding the first copy
    :param initial_reads2: container of read ids seeding the second copy
    """
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    # The storages are indexed and iterated repeatedly below, so the input
    # handles are not needed after loading and are closed right away.
    with open(contigs_file, "r") as handle:
        contigs = ContigStorage().loadFromFasta(handle, False)
    contig = contigs[contig_name]
    with open(reads_file, "r") as handle:
        reads = ContigStorage().loadFromFasta(handle, False)

    reads1 = ContigStorage()
    reads2 = ContigStorage()
    for read in reads.unique():
        if read.id in initial_reads1:
            reads1.add(read)
        elif read.id in initial_reads2:
            reads2.add(read)

    polisher = Polisher(aligner, dd)
    contig1 = contig
    contig2 = contig
    scorer = Scorer()
    for iteration in range(3):
        print("Iteration %s" % (iteration,))
        # Polish one copy of the original contig from each current read set.
        als1 = fixAlDir(aligner.overlapAlign(reads1.unique(), ContigStorage([contig])), contig)
        als2 = fixAlDir(aligner.overlapAlign(reads2.unique(), ContigStorage([contig])), contig)
        contig1 = Contig(polisher.polishSmallSegment(contig.asSegment(), als1).seg_from.Seq(), "1")
        contig2 = Contig(polisher.polishSmallSegment(contig.asSegment(), als2).seg_from.Seq(), "2")

        # Align all reads to both copies, keeping only alignments that span
        # the copy up to 100 positions, ordered by read id.
        als1 = sorted(
            [al for al in fixAlDir(aligner.overlapAlign(reads.unique(), ContigStorage([contig1])), contig1)
             if len(al.seg_to) > len(al.seg_to.contig) - 100],
            key=lambda al: al.seg_from.contig.id)
        als2 = sorted(
            [al for al in fixAlDir(aligner.overlapAlign(reads.unique(), ContigStorage([contig2])), contig2)
             if len(al.seg_to) > len(al.seg_to.contig) - 100],
            key=lambda al: al.seg_from.contig.id)

        # Pair up the alignments of the same read to copy 1 and copy 2.
        als_map = dict()
        for al in als1:
            als_map[al.seg_from.contig.id] = [al]
        for al in als2:
            if al.seg_from.contig.id in als_map:
                als_map[al.seg_from.contig.id].append(al)

        com_res = []
        for tmp_als in als_map.values():
            if len(tmp_als) != 2:
                continue
            al1, al2 = tmp_als
            print("%s %s" % (al1, al2))
            assert al1.seg_from.contig == al2.seg_from.contig
            pi1 = scorer.accurateScore(al1.matchingSequence(), 10)
            pi2 = scorer.accurateScore(al2.matchingSequence(), 10)
            com_res.append((al1, al2, pi1 - pi2))

        if not com_res:
            # Without any read aligned to both copies there are no quartiles
            # to compute (this used to fail with an IndexError); stop refining
            # and keep the read sets the copies were just polished from.
            print("Warning: no read aligned to both copies; keeping previous read sets")
            break

        diffs = sorted(score_diff for _, _, score_diff in com_res)
        # Floor division keeps the indices integral on Python 2 and 3 alike.
        th1 = diffs[len(diffs) // 4]
        th2 = diffs[len(diffs) * 3 // 4]
        print("Thresholds: %s %s" % (th1, th2))

        reads1 = ContigStorage()
        reads2 = ContigStorage()
        for al1, al2, score_diff in com_res:
            if score_diff < th1:
                reads1.add(al1.seg_from.contig)
            elif score_diff > th2:
                reads2.add(al2.seg_from.contig)

        # Legacy diagnostic kept for identical output: the first value is
        # derived from the score difference of the last compared read.
        last_diff = com_res[-1][2]
        print("%s %s %s" % (float(last_diff) / len(als1), len(reads1) // 2, len(reads2) // 2))

        al = next(aligner.overlapAlign([contig1], ContigStorage([contig2])))
        print(al)
        print("\n".join(al.asMatchingStrings2()))
        # Report which seed reads are still assigned to their initial copy.
        for read in reads1:
            if read.id in initial_reads1:
                sys.stdout.write(read.id + " ")
        print("")
        for read in reads2:
            if read.id in initial_reads2:
                sys.stdout.write(read.id + " ")
        print("")

    contig1 = prolong(aligner, polisher, contig1, reads1)
    contig2 = prolong(aligner, polisher, contig2, reads2)
    contig1.id = "1"
    contig2.id = "2"
    # `with` closes every output file even if writing a record fails.
    with open(os.path.join(dir, "copies.fasta"), "w") as out:
        SeqIO.write(contig1, out, "fasta")
        SeqIO.write(contig2, out, "fasta")
    with open(os.path.join(dir, "reads1.fasta"), "w") as out:
        for read in reads1.unique():
            SeqIO.write(read, out, "fasta")
    with open(os.path.join(dir, "reads2.fasta"), "w") as out:
        for read in reads2.unique():
            SeqIO.write(read, out, "fasta")
    print("Finished")
def splitSeg(aligner, seg, mult, all_reads_list):
    """Iteratively split the reads of a segment into `mult` groups.

    Reads whose alignment covers the segment up to 100 positions are dealt
    round-robin into `mult` groups. For 10 iterations one copy of the segment
    is polished from every group, and every read is then reassigned to the
    copy it aligns to with the highest percent identity. Finally the pairwise
    percent identities between the copies are written to stdout.

    :param aligner: object providing overlapAlign() and dir_distributor
    :param seg: segment to split; it is accessed through seg.asContig()
    :param mult: number of copies to split into
    :param all_reads_list: candidate reads
    :return: list of (contig, reads) pairs, one per copy, when the highest
        percent identity between two distinct copies is below 0.985;
        None otherwise (including when mult is less than 2).
    """
    base = seg.asContig()
    covering_reads = []
    for al in fixAlDir(
            aligner.overlapAlign(all_reads_list, ContigStorage([base])), base):
        if len(al.seg_to) < len(base) - 100:
            continue
        covering_reads.append(al.seg_from.contig)
    all_reads_list = covering_reads

    # Deal the reads round-robin. The counter used to stay at 0, which put
    # every read into the first group and left the other groups empty.
    split_reads = [[] for _ in range(mult)]
    for cnt, read in enumerate(all_reads_list):
        split_reads[cnt % mult].append(read)

    polisher = Polisher(aligner, aligner.dir_distributor)
    split_contigs = []
    for iteration in range(10):
        print("Iteration %s" % (iteration,))
        # Polish one copy of the base from each current group of reads.
        split_contigs = []
        for reads in split_reads:
            tmp_als = fixAlDir(
                aligner.overlapAlign(reads, ContigStorage([base])), base)
            split_contigs.append(
                Contig(
                    polisher.polishSmallSegment(base.asSegment(), tmp_als).seg_from.Seq(),
                    str(len(split_contigs))))

        # For every read remember its best alignment over all copies.
        bestals = dict()
        for read in all_reads_list:
            bestals[read.id] = None
        for contig in split_contigs:
            for al in fixAlDir(
                    aligner.overlapAlign(all_reads_list, ContigStorage([contig])),
                    contig):
                if len(al.seg_to) < len(base) - 100:
                    continue
                rid = al.seg_from.contig.id
                if rid not in bestals:
                    # Diagnostic output for an unexpected read id; the lookup
                    # below still raises KeyError, as it did before.
                    print(list(bestals))
                    print(al)
                if bestals[rid] is None or al.percentIdentity() > bestals[rid].percentIdentity():
                    bestals[rid] = al

        # Regroup the reads by the copy their best alignment points to.
        for group_index in range(mult):
            split_reads[group_index] = []
        for rid in bestals:
            al = bestals[rid]
            if al is None:
                print("Warning: no alignment for read %s" % (rid,))
            else:
                split_reads[int(al.seg_to.contig.id)].append(al.seg_from.contig)
        print(" ".join(str(len(group)) for group in split_reads))

    maxpi = 0
    print("pi matrix:")
    for i in range(mult):
        for j in range(mult):
            if i == j:
                # A copy is not compared with itself: including the diagonal
                # pushes maxpi to the self-identity, so the test below could
                # not pass. The diagonal is reported as 1.0, the same way
                # splitSegKmeans does.
                sys.stdout.write("1.0 ")
                continue
            al = next(aligner.overlapAlign(
                [split_contigs[i]], ContigStorage([split_contigs[j]])))
            identity = al.percentIdentity()
            sys.stdout.write(str(identity) + " ")
            maxpi = max(maxpi, identity)
        print("")
    print("Maxpi: %s" % (maxpi,))

    # With fewer than two copies there is no pair to compare, hence no split.
    if mult >= 2 and maxpi < 0.985:
        return list(zip(split_contigs, split_reads))
    return None