def main(contig_file, reads_file, sam_file, dir, contig_id):
    # type: (str, str, str, str, str) -> None
    basic.ensure_dir_existance(dir)
    contigs = ContigCollection()
    contigs.loadFromFasta(open(contig_file, "r"))
    print "Contigs loaded"
    contig = contigs[contig_id]
    read_names = set()
    for rec in Samfile(open(sam_file, "r")):
        read_names.add(rec.query_name)
    reads = ReadCollection()
    cnt = 0
    for rec in SeqIO.parse_fasta(open(reads_file, "r")):
        if rec.id in read_names:
            rec.id = "Read" + str(cnt)
            reads.add(AlignedRead(rec))
            cnt += 1
    reads.print_fasta(open(os.path.join(dir, "reads.fasta"), "w"))
    print "Reads loaded", len(reads)
    reads.addAllRC()
    print "RC added", len(reads)
    aligner = Aligner(DirDistributor(os.path.join(dir, "alignments")))
    aligner.alignReadCollection(reads, contigs)
    print "Reads aligned", len(reads)
    reads = reads.inter(contig.asSegment())
    print "Reads filtered", len(reads)
    sorted_reads = sorted(list(reads.reads.values()),
                          key=lambda read: read.alignmentsTo(contig.asSegment()).next().seg_to.left)
    for read in sorted_reads:
        print read
        for al in read.alignmentsTo(contig.asSegment()):
            print "\n".join(al.asMatchingStrings())
def main(args):
    rf = args[2]
    dir = args[3]
    CreateLog(dir)
    disjointigs = ContigCollection().loadFromFasta(open(args[1], "r"), num_names=False)
    basic.ensure_dir_existance(dir)
    aligner = Aligner(DirDistributor(os.path.join(dir, "alignments")))
    clen = 5000000
    reads = ReadCollection().loadFromFasta(open(rf, "r"))
    tlen0 = sum(map(len, reads))
    for i in range(10):
        # Mark reads that are already well explained by the current disjointig set
        good_reads = set()
        for al in aligner.localAlign(reads, disjointigs):
            if not al.contradictingRTC(al.seg_to.contig.asSegment(), 500):
                good_reads.add(al.seg_from.contig.id)
        # Keep only the unexplained reads and assemble them with Flye,
        # scaling the genome size estimate by the remaining read fraction
        rf = os.path.join(dir, "reads" + str(i) + ".fasta")
        reads = reads.filter(lambda read: read.id not in good_reads).cleanCopy()
        tlen = sum(map(len, reads))
        reads.print_fasta(open(rf, "w"))
        l = tlen * clen / tlen0
        assembly_dir = os.path.join(dir, "assembly" + str(i))
        subprocess.check_call(["./bin/flye", "-o", assembly_dir, "-t", "8",
                               "--pacbio-raw", rf, "--genome-size", str(l), "--no-trestle"])
        df = os.path.join(assembly_dir, "10-consensus", "consensus.fasta")
        disjointigs.addAll(ContigCollection().loadFromFasta(open(df, "r"), num_names=False))
        df = os.path.join(dir, "df" + str(i) + ".fasta")
        disjointigs.print_fasta(open(df, "w"))
def recruit(seqs, reads, k, dir):
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    params.k = k
    relevant_reads = ContigStorage()
    disjointigs = seqs
    for i in range(2):
        sys.stdout.info("Recruiting iteration", i)
        als = filter(lambda al: len(al) > k, aligner.localAlign(reads, disjointigs))
        print len(als), "alignments"
        relevant_reads = alsToReads(als)
        l = sum(map(len, seqs.unique()))
        disjointigs = constructDisjointigs(relevant_reads, l, dd.nextDir())
        print len(disjointigs), "disjointigs"
        print disjointigs
    disjointigs.writeToFasta(open(os.path.join(dir, "disjointigs.fasta"), "w"))
    relevant_reads.writeToFasta(open(os.path.join(dir, "reads.fasta"), "w"))
    sys.stdout.info("Aligning repeat sequences to disjointigs")
    als = list(aligner.localAlign(seqs, disjointigs))
    print "\n".join(map(str, als))
    starts = dict()
    for dis in disjointigs:
        starts[dis.id] = len(dis)
    for al in als:
        if len(al) > k:
            starts[al.seg_to.contig.id] = min(starts[al.seg_to.contig.id], al.seg_to.left)
            al = al.rc
            starts[al.seg_to.contig.id] = min(starts[al.seg_to.contig.id], al.seg_to.left)
    print "Starts:"
    for cid, val in starts.items():
        print cid, val
    contigs = ContigStorage()
    cnt = 1
    for dis in disjointigs:
        if starts[dis.id] > k and starts[dis.id] < len(dis):
            print cnt, dis.id, starts[dis.id]
            contigs.add(Contig(dis.prefix(starts[dis.id]).Seq(), str(cnt)))
            cnt += 1
    for dis in disjointigs.unique():
        if len(dis) > k and starts[dis.id] == len(dis):
            print cnt, dis.id
            contigs.add(Contig(dis.seq, str(cnt)))
            cnt += 1
    contigs.writeToFasta(open(os.path.join(dir, "contigs.fasta"), "w"))
    fakeGraph(contigs, open(os.path.join(dir, "graph.gv"), "w"))
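# The helper alsToReads used in recruit above is defined elsewhere in the repo.
# A minimal sketch of its assumed behavior, judging only from how its result is
# used (a storage of the reads that produced the alignments); the real signature
# and deduplication rules may differ.
def alsToReads(als):
    # type: (list) -> ContigStorage
    res = ContigStorage()
    for al in als:
        # al.seg_from.contig is the aligned read
        res.add(al.seg_from.contig)
    return res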
def main(dir, contigs_file1, contigs_file2, unique_contigs_file):
    CreateLog(dir)
    sys.stdout.level = LogPriority.warning
    unique = ContigCollection().loadFromFasta(open(unique_contigs_file, "r"), False).filter(
        lambda contig: len(contig) > 5000)
    aligner = Aligner(DirDistributor(os.path.join(dir, "alignments")))
    contigs1 = ContigCollection().loadFromFasta(open(contigs_file1, "r"), False)
    cals1 = list(aligner.overlapAlign(unique.unique(), contigs1))
    transfers1, term1, all1 = extract_transfers(contigs1, cals1)
    contigs2 = ContigCollection().loadFromFasta(open(contigs_file2, "r"), False)
    cals2 = list(aligner.overlapAlign(unique.unique(), contigs2))
    transfers2, term2, all2 = extract_transfers(contigs2, cals2)
    missing1 = []
    missing2 = []
    different = dict()
    unresolved1 = []
    unresolved2 = []
    same = []
    for ucontig in list(unique) + [contig.rc for contig in unique]:
        uid = ucontig.id
        in1 = uid in all1
        in2 = uid in all2
        if not in1 and not in2:
            continue
        if not in1:
            missing1.append(uid)
        elif not in2:
            missing2.append(uid)
        else:
            if all1[uid][0] == all2[uid][0]:
                same.append(uid)
            elif uid in transfers1 and uid in transfers2:
                different[uid] = (all1[uid][0], all2[uid][0])
            elif uid in transfers1:
                unresolved2.append(uid)
            elif uid in transfers2:
                unresolved1.append(uid)
    out = open(os.path.join(dir, "contigs.txt"), "w")
    out.write("Different: " + str(different) + "\n")
    out.write("Unresolved1: " + str(unresolved1) + "\n")
    out.write("Unresolved2: " + str(unresolved2) + "\n")
    out.write("Same: " + str(same) + "\n")
    out.write("Missing1: " + str(missing1) + "\n")
    out.write("Missing2: " + str(missing2) + "\n")
    out.write("Contig1 transfers: " + str(transfers1) + "\n")
    out.write("Contig1 term: " + str(term1) + "\n")
    out.write("Contig2 transfers: " + str(transfers2) + "\n")
    out.write("Contig2 term: " + str(term2) + "\n")
    out.close()
    print contigs_file1, contigs_file2
    print len(different), len(unresolved1), len(unresolved2), len(missing1), len(missing2), len(same)
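# extract_transfers, used by this comparison script and by the reference-based
# one below, is imported from elsewhere; only its call sites are visible here.
# The sketch below is a rough, hypothetical reconstruction from those call
# sites: transfers maps a unique contig id to the unique contig that follows it
# along an assembly contig, term marks unique contigs after which the assembly
# contig ends, and the third value is consulted per uid when comparing two
# assemblies. The tuple contents and the (None,) end-of-chain sentinel are
# assumptions; the real implementation may differ.
import itertools

def extract_transfers(contigs, als):
    transfers = dict()  # uid -> (successor unique contig,)
    term = dict()       # uid -> (None,): assumed "chain ends here" sentinel
    als = sorted(als, key=lambda al: (al.seg_to.contig.id, al.seg_to.left))
    for cid, it in itertools.groupby(als, key=lambda al: al.seg_to.contig.id):
        chain = list(it)
        for al1, al2 in zip(chain, chain[1:]):
            transfers[al1.seg_from.contig.id] = (al2.seg_from.contig,)
        term[chain[-1].seg_from.contig.id] = (None,)
    all_als = dict(transfers)
    all_als.update(term)
    return transfers, term, all_als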
def main(dir, contigs_files, reference_file, unique_contigs_file):
    CreateLog(dir)
    sys.stdout.level = LogPriority.warning
    ref = ContigCollection().loadFromFasta(open(reference_file, "r"), False)
    unique = ContigCollection().loadFromFasta(open(unique_contigs_file, "r"), False).filter(
        lambda contig: len(contig) > 5000)
    aligner = Aligner(DirDistributor(os.path.join(dir, "alignments")))
    ref_als = list(aligner.overlapAlign(unique.unique(), ref))
    ref_transfers, ref_term, all_ref_als = extract_transfers(ref, ref_als)
    for uid in ref_term:
        ref_transfers[uid] = ref_term[uid]
    print "#", "file", "wrong", "unresolved", "correct", "missing"
    for i, contigs_file in enumerate(contigs_files):
        contigs = ContigCollection().loadFromFasta(open(contigs_file, "r"), False)
        contig_als = list(aligner.overlapAlign(unique.unique(), contigs))
        contig_transfers, contig_term, all_contig_als = extract_transfers(contigs, contig_als)
        missing = []
        wrong = dict()
        unresolved = []
        correct = []
        for uid in ref_transfers:
            if uid not in contig_transfers and uid not in contig_term:
                # print uid, "missing"
                missing.append(uid)
            elif uid in contig_transfers:
                if ref_transfers[uid][0] == contig_transfers[uid][0]:
                    # print uid, "correct"
                    correct.append(uid)
                else:
                    # print uid, "wrong", ref_transfers[uid][0].id, contig_transfers[uid][0].id
                    wrong[uid] = (ref_transfers[uid][0], contig_transfers[uid][0])
            else:
                if ref_transfers[uid][0] == contig_term[uid][0]:
                    # print uid, "correct"
                    correct.append(uid)
                else:
                    # print uid, "unresolved"
                    unresolved.append(uid)
        out = open(os.path.join(dir, "contigs_" + str(i) + ".txt"), "w")
        out.write("Wrong: " + str(wrong) + "\n")
        out.write("Unresolved: " + str(unresolved) + "\n")
        out.write("Correct: " + str(correct) + "\n")
        out.write("Missing: " + str(missing) + "\n")
        out.write("Contig transfers: " + str(contig_transfers) + "\n")
        out.write("Contig term: " + str(contig_term) + "\n")
        out.write("Ref transfers: " + str(ref_transfers) + "\n")
        out.write("Ref als:\n")
        for c in all_ref_als:
            out.write(str(c) + "\n")
        out.write("Contig als:\n")
        for c in all_contig_als:
            out.write(str(c) + "\n")
        out.close()
        print "result", i, contigs_file, len(wrong), len(unresolved), len(correct), len(missing)
def align(dir, contigs_file):
    CreateLog(dir)
    contigs = list(SeqIO.parse_fasta(open(contigs_file, "r")))
    assert len(contigs) == 2
    contigs = [Contig(contigs[0].seq, contigs[0].id), Contig(contigs[1].seq, contigs[1].id)]
    aligner = Aligner(DirDistributor(os.path.join(dir, "alignments")))
    als = iter_align(aligner, contigs[0], contigs[1])
    printVar(os.path.join(dir, "diff.txt"), als)
    for al in als:
        print al
def main(reads_file, ref_file, dir, error_rate):
    sys.stderr.write("Reading reference" + "\n")
    ref = sorted(list(SeqIO.parse_fasta(open(ref_file, "r"))), key=lambda rec: len(rec))[-1]
    ref = Contig(ref.seq, ref.id)
    refs = ContigCollection()
    for i in range(0, len(ref) - 500, 500):
        if random.random() > 0.95:
            # Sample this 500bp window and mutate ~error_rate% of its positions
            tmp = list(ref.segment(i, i + 500).Seq())
            for j in range(error_rate * 500 / 100):
                pos = random.randint(0, 499)
                tmp[pos] = basic.rc[tmp[pos]]
            refs.add(Contig("".join(tmp), ref.id + "(" + str(i) + "," + str(i + 500) + ")"))
    refs.print_names(sys.stderr)
    sys.stderr.write("Reading reads" + "\n")
    reads = ReadCollection()
    reads.loadFromFasta(open(reads_file, "r"))
    sys.stderr.write("Aligning reads" + "\n")
    basic.ensure_dir_existance(dir)
    aligner = Aligner(DirDistributor(dir))
    aligner.alignReadCollection(reads, refs)
    sys.stderr.write("Analysing alignments" + "\n")
    alignments = []
    for read in reads:
        alignments.extend(read.alignments)
    alignments = filter(lambda al: len(al) > 450, alignments)
    alignments = sorted(alignments, key=lambda al: (al.seg_to.contig.id, al.seg_from.contig.id))
    scorer = Scorer()
    scorer.scores.homo_score = 3
    scorer.scores.ins_score = 5
    scorer.scores.del_score = 5
    cnt = 0
    for contig, iter in itertools.groupby(alignments, key=lambda al: al.seg_to.contig):
        iter = list(iter)
        sys.stderr.write(str(contig) + " " + str(len(iter)) + "\n")
        if len(iter) < 150:
            for al in iter:
                print scorer.accurateScore(al.matchingSequence(), params.alignment_correction_radius)
                cnt += 1
                if cnt >= 5000:
                    break
        if cnt >= 5000:
            break
def draw(contigs_file, output_dir, k):
    aligner = Aligner(DirDistributor(os.path.join(output_dir, "alignments")))
    CreateLog(output_dir)
    print "Reading contigs"
    tmp = sorted(SeqIO.parse_fasta(open(contigs_file, "r")), key=lambda contig: len(contig))
    lens = map(len, tmp)[::-1]
    print lens
    contigs = ContigStorage()
    if lens[1::2] == lens[0::2]:
        # Lengths pair up, so the file likely contains every contig twice
        # (e.g. together with its reverse complement): keep one copy of each
        tmp = tmp[0::2]
        print "Removed extra contigs"
    for i, contig in enumerate(tmp):
        print i, contig
        contigs.add(Contig(contig.seq, str(i)))
    print "Constructing components"
    components = ExtractRepeatComponents(contigs, aligner, k)
    print "Components:"
    for comp in components:
        print comp.segments
        print comp.alignments
    for cnt, comp in enumerate(components):
        print "Processing component", cnt
        print comp.segments
        # print comp.alignments
        print "Forming blocks"
        Block.id_cnt = 0
        blocks = CreateBlocks(comp)
        if len(blocks) == 1:
            print "Skipping trivial repeat"
            continue
        for block in blocks:
            print "Block", block.id, ":", block.segs
        for block in blocks:
            for other in block.out:
                print block.id, "->", other.id
        print "Placing blocks on X axis"
        code = placeX(blocks)
        if code == 1:
            print "WARNING: component", cnt, "contains cycle. Aborting visualization."
            continue
        print "Placing blocks on Y axis"
        placeY(blocks, comp.segments)
        print "Printing figure"
        SimplePrinter().printBlocks(blocks, sys.stdout)
        print "Finished printing figure"
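# CreateBlocks, placeX and placeY belong to the visualization module and are not
# shown in this file. Judging from the use above, placeX returns 1 when the block
# graph contains a cycle and otherwise assigns X coordinates; a hypothetical
# sketch under that assumption (Kahn-style layering; the block attribute "x" is
# assumed, only "id" and "out" are taken from the code above):
def placeX(blocks):
    # type: (list) -> int
    indeg = dict((b.id, 0) for b in blocks)
    for b in blocks:
        for other in b.out:
            indeg[other.id] += 1
    queue = [b for b in blocks if indeg[b.id] == 0]
    for b in queue:
        b.x = 0
    processed = 0
    while queue:
        b = queue.pop()
        processed += 1
        for other in b.out:
            # place each block one layer right of its furthest predecessor
            other.x = max(getattr(other, "x", 0), b.x + 1)
            indeg[other.id] -= 1
            if indeg[other.id] == 0:
                queue.append(other)
    if processed < len(blocks):
        return 1  # a cycle prevents topological placement
    return 0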
def main(contigs_file, parts_file, dir):
    contigs = ContigCollection().loadFromFasta(open(contigs_file, "r"))
    parts = ContigCollection().loadFromFasta(open(parts_file, "r"))
    basic.CreateLog(dir)
    aligner = Aligner(DirDistributor(dir))
    res = dict()
    for al in aligner.localAlign(parts, contigs):
        if al.seg_to.contig.id not in res:
            res[al.seg_to.contig.id] = []
            res[al.seg_to.contig.rc.id] = []
        res[al.seg_to.contig.id].append(al)
        res[al.seg_to.contig.rc.id].append(al.rc)
    for cname, arr in res.items():
        print cname
        arr = filter(lambda al: len(al.seg_to) > min(len(al.seg_to.contig) - 1000, 5000), arr)
        arr = sorted(arr, key=lambda al: al.seg_to.left)
        print arr
def evaluatePI(dir, contigs_file, initial_file, ref_file):
    basic.ensure_dir_existance(dir)
    CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    contigs = ContigStorage().loadFromFasta(open(contigs_file, "r"), False)
    initial = ContigStorage().loadFromFasta(open(initial_file, "r"), False)
    ref = ContigStorage().loadFromFasta(open(ref_file, "r"), False)
    segs = []
    for al in aligner.overlapAlign(initial.unique(), contigs):
        if basic.isCanonocal(al.seg_to.contig.id):
            segs.append(al.seg_to)
        else:
            segs.append(al.rc.seg_to)
    segs = sorted(segs, key=lambda seg: basic.Normalize(seg.contig.id))
    interesting = dict()
    print "Interesting segments:"
    for contig in contigs:
        interesting[contig.id] = [contig.asSegment()]
    for contig, segit in itertools.groupby(segs, lambda seg: seg.contig):
        csegs = SegmentStorage().addAll(segit)
        csegs.mergeSegments()
        csegs = csegs.reverse(contig)
        interesting[contig.id] = list(csegs)
        print list(csegs)
    print "Analysis of contigs"
    scorer = Scorer()
    for al in aligner.localAlign(contigs.unique(), ref):
        print al
        for seg in interesting[al.seg_from.contig.id]:
            if al.seg_from.expand(500).contains(seg) or al.seg_from.interSize(seg) > 40000:
                tmp_al = al.reduce(query=al.seg_from.cap(seg))
                # assumption: polyshMatching returns the event list printed below;
                # the original printed "events" without assigning it anywhere
                events = scorer.polyshMatching(tmp_al.matchingSequence(), params.score_counting_radius)
                print tmp_al.seg_from, tmp_al.seg_to, str(events)
    print ""
    print "Analysis of initial"
    for al in aligner.overlapAlign(initial, ref):
        events = scorer.polyshMatching(al.matchingSequence(), params.score_counting_radius)
        print al.seg_from, al.seg_to, str(events)
def main(ref_file, segment, dir):
    ref = ContigCollection().loadFromFasta(open(ref_file, "r"), False)
    chr1 = ref["chr1"]
    if segment[0] < 0:
        segment = (-segment[0], -segment[1])
        chr1 = chr1.rc
    reads = ReadCollection()
    reads_list = []
    for i in range(segment[0], segment[1], 500):
        read = reads.addNewRead(Segment(chr1, i, i + 500).asNamedSequence())
        reads_list.append(read)
    # Mask the sampled region in the reference so the reads must align elsewhere
    chr1.seq = chr1.seq[:segment[0]] + "N" * (segment[1] - segment[0]) + chr1.seq[segment[1]:]
    chr1.rc.seq = basic.RC(chr1.seq)
    basic.ensure_dir_existance(dir)
    aligner = Aligner(DirDistributor(dir))
    aligner.alignReadCollection(reads, ref)
    out = sys.stdout
    for read in reads_list:
        # print read
        out.write(str(len(read.alignments)) + " " +
                  str(max([0] + map(lambda al: al.percentIdentity(), read.alignments))) + "\n")
    out.close()
def main(k, dir, contigs_file, reads_file):
    # type: (int, str, str, str) -> None
    basic.ensure_dir_existance(dir)
    CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    params.k = k
    print "Loading contigs"
    tmp = sorted(ContigStorage().loadFromFasta(open(contigs_file, "r"), False).unique(),
                 key=lambda contig: len(contig))
    cnt = 1
    contigs = ContigStorage()
    for c1, c2 in zip(tmp[::2], tmp[1::2]):
        # if c1.seq == c2.rc.seq:
        contigs.add(Contig(c1.seq, str(cnt)))
        print cnt, c1.id, c2.id
        cnt += 1
        # else:
        #     contigs.add(Contig(c1.seq, str(cnt)))
        #     print cnt, c1.id
        #     cnt += 1
        #     contigs.add(Contig(c2.seq, str(cnt)))
        #     print cnt, c2.id
        #     cnt += 1
    print "Loading reads"
    reads = ReadCollection().loadFromFasta(open(reads_file, "r"))
    print "Aligning reads"
    for al in aligner.localAlign(reads, contigs):
        if len(al) > k:
            read = al.seg_from.contig  # type: AlignedRead
            read.addAlignment(al)
    res = open(os.path.join(dir, "reads.fasta"), "w")
    for read in reads:
        if not basic.isCanonocal(read.id):
            continue
        if len(read.alignments) > 1:
            SeqIO.write(read, res, "fasta")
    res.close()
class FakePolishingArgs:
    def __init__(self):
        self.num_iters = params.num_iters
        self.platform = params.technology
        self.threads = params.threads


if __name__ == "__main__":
    reads_file = sys.argv[2]
    consensus_file = sys.argv[3]
    dir = sys.argv[1]
    extra_params = sys.argv[4:]
    CreateLog(dir)
    dd = DirDistributor(dir)
    aligner = Aligner(dd)
    polisher = Polisher(aligner, dd)
    reads = ContigStorage().loadFromFasta(open(reads_file, "r"), num_names=False)
    ref = ContigStorage().loadFromFasta(open(consensus_file, "r"), num_names=False)
    if "accurate" in extra_params:
        res = []
        als = sorted(aligner.overlapAlign(reads, ref), key=lambda al: al.seg_to.contig.id)
        for rid, rals in itertools.groupby(als, key=lambda al: al.seg_to.contig.id):
            if basic.isCanonocal(rid):
                contig = ref[rid]
                # NOTE: the arguments below are an assumption (the call is cut off
                # in the source); modeled on polishSmallSegment used elsewhere here
                corrected_seq = polisher.polishSegment(contig.asSegment(), list(rals)).seg_from.Seq()
def main(flye_dir, rf, dir, edge_id, k):
    params.technology = "nano"
    params.k = k
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    print "Reading graph"
    graph = SimpleGraph().ReadGFA(os.path.join(flye_dir, "assembly_graph.gfa"))
    print "Parsing edge mapping"
    id_map = parseUPaths(flye_dir)
    edge_ids = edge_id.split(",")
    print "Extracting relevant graph component"
    res = open(os.path.join(dir, "contigs.fasta"), "w")
    unique = dict()
    for eid in edge_ids:
        for e in graph.v[graph.e[eid].start].inc:
            if basic.Normalize(e.id) in edge_ids:
                continue
            if len(e.seq) < 10000:
                if e.id.startswith("-"):
                    unique[e.id[1:]] = NamedSequence(basic.RC(e.seq), e.id[1:])
                else:
                    unique[e.id] = NamedSequence(e.seq, e.id)
            else:
                if e.id.startswith("-"):
                    unique[e.id[1:] + "l"] = NamedSequence(basic.RC(e.seq[:5000]), e.id[1:] + "l")
                else:
                    unique[e.id + "r"] = NamedSequence(e.seq[-5000:], e.id + "r")
        for e in graph.v[graph.e[eid].end].out:
            if basic.Normalize(e.id) in edge_ids:
                continue
            if len(e.seq) < 10000:
                if e.id.startswith("-"):
                    unique[e.id[1:]] = NamedSequence(basic.RC(e.seq), e.id[1:])
                else:
                    unique[e.id] = NamedSequence(e.seq, e.id)
            else:
                if e.id.startswith("-"):
                    unique[e.id[1:] + "r"] = NamedSequence(basic.RC(e.seq[-5000:]), e.id[1:] + "r")
                else:
                    unique[e.id + "l"] = NamedSequence(e.seq[:5000], e.id + "l")
    for c in unique.values():
        print c.id
        SeqIO.write(c, res, "fasta")
    res.close()
    old_ids = []
    for eid in edge_ids:
        for olde in id_map[eid[len("edge_"):]]:
            old_ids.append(basic.Normalize(olde))
    print "Finding reads that align to", edge_ids
    print "Old ids:", old_ids
    relevant_read_ids = set()
    for s in open(os.path.join(flye_dir, "20-repeat", "read_alignment_dump"), "r").readlines():
        s = s.split()
        if s[0] != "Aln":
            continue
        if s[6].split("_")[1] in old_ids:
            relevant_read_ids.add(s[2][1:])
            print s[2][1:], s[6].split("_")[1]
    print "Reading reads"
    res = open(os.path.join(dir, "reads.fasta"), "w")
    for read in SeqIO.parse_fasta(open(rf, "r")):
        if read.id in relevant_read_ids and len(read) > k * 1.2:
            SeqIO.write(read, res, "fasta")
    res.close()
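# The same dump parsing appears as a helper, parseReadDump, in the repeat
# resolution script further below. A sketch consistent with both uses; the field
# layout is taken from the inline parsing above and should be treated as an
# assumption about the dump format.
def parseReadDump(fname):
    # type: (str) -> Generator[Tuple[str, str]]
    for s in open(fname, "r").readlines():
        s = s.split()
        if not s or s[0] != "Aln":
            continue
        # s[2] carries the read id behind one prefix character,
        # s[6] looks like "<...>_<edge id>_..." in the inline code above
        yield s[2][1:], s[6].split("_")[1]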
def main(contigs_file, contig_name, reads_file, dir, k):
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    contigs = ContigStorage().loadFromFasta(open(contigs_file, "r"), False)
    contig = contigs[contig_name]
    contigs = ContigStorage()
    contigs.add(contig)
    reads = ContigStorage().loadFromFasta(open(reads_file, "r"), False)
    als = list(aligner.localAlign(reads.unique(), contigs))
    tmp = []
    for al in als:
        if al.seg_to.contig != contig:
            al = al.rc
        tmp.append(al)
    als = tmp
    als = sorted(als, key=lambda al: al.seg_to.left / 50 * 1000000 + al.seg_to.right - al.seg_to.left)
    counts = dict()
    for al in als:
        counts[al.seg_from.contig.id] = 0
    for al in als:
        if len(al) > k:
            counts[al.seg_from.contig.id] += 1
    # One character per w-bp window of the contig is printed for each read:
    # B/E mark clipped read ends, I/i and D/d insertion- or deletion-rich
    # windows, digits reflect the match density, * means no alignment
    w = 20
    f = open(os.path.join(dir, "reads.fasta"), "w")
    over = set()
    inter = set()
    for al in als:
        if len(al) < k:
            continue
        inter.add(basic.Normalize(al.seg_from.contig.id))
        if not al.contradictingRTC():
            over.add(basic.Normalize(al.seg_from.contig.id))
        m = al.matchingSequence(True)
        tmp = []
        for i in range(len(contig) / w + 1):
            tmp.append([])
        for a, b in m.matches:
            tmp[b / w].append((a, b))
        for i in range(len(contig) / w):
            if i + 1 < len(tmp) and len(tmp[i + 1]) > 0:
                tmp[i].append(tmp[i + 1][0])
        for i in range(len(contig) / w):
            seg = contig.segment(i * w, i * w + w)
            if al.seg_to.inter(seg):
                if al.seg_to.left >= seg.left and al.seg_from.left > params.bad_end_length:
                    sys.stdout.write("B")
                elif al.seg_to.right <= seg.right and al.rc.seg_from.left > params.bad_end_length:
                    sys.stdout.write("E")
                else:
                    if len(tmp[i]) == 0:
                        sys.stdout.write("*")
                    else:
                        a = tmp[i][-1][0] - tmp[i][0][0]
                        b = tmp[i][-1][1] - tmp[i][0][1]
                        if a - b > 30:
                            sys.stdout.write("I")
                        elif a - b > 15:
                            sys.stdout.write("i")
                        elif a - b < -30:
                            sys.stdout.write("D")
                        elif a - b < -15:
                            sys.stdout.write("d")
                        else:
                            sys.stdout.write(str(min(8, max(a, b) + 1 - len(tmp[i]))))
            else:
                sys.stdout.write("*")
        print " ", al.seg_from.contig.id, counts[al.seg_from.contig.id], al.contradictingRTC()
    print inter
    for rid in inter:
        SeqIO.write(reads[rid], f, "fasta")
        print rid, reads[rid]
    f.close()
    f = open(os.path.join(dir, "reads_over.fasta"), "w")
    for rid in over:
        SeqIO.write(reads[rid], f, "fasta")
    f.close()
def assemble(args, bin_path):
    params.bin_path = bin_path
    start = time.time()
    cl_params = Params().parse(args)
    ref = ContigStorage()
    if cl_params.test:
        cl_params.reads_file = os.path.dirname(__file__) + "/../../test_dataset/reads.fasta"
        cl_params.genome_size = 30000
        cl_params.dir = os.path.dirname(__file__) + "/../../test_results"
        ref.loadFromFile(os.path.dirname(__file__) + "/../../test_dataset/axbctbdy.fasta", False)
    if cl_params.debug:
        params.save_alignments = True
    cl_params.check()
    CreateLog(cl_params.dir)
    sys.stdout.info("Command line:", " ".join(cl_params.args))
    sys.stdout.info("Started")
    if cl_params.debug:
        sys.stdout.info("Version:", subprocess.check_output(["git", "rev-parse", "HEAD"]))
        sys.stdout.info("Modifications:")
        print subprocess.check_output(["git", "diff"])
    sys.stdout.info("Preparing initial state")
    if cl_params.debug:
        save_handler = SaveHandler(os.path.join(cl_params.dir, "saves"))
    else:
        save_handler = None
    if cl_params.load_from is not None:
        # tmp = cl_params.focus
        sys.stdout.info("Loading initial state from saves")
        cl_params, aligner, contigs, reads, disjointigs, lines, dot_plot = loadAll(
            TokenReader(open(cl_params.load_from, "r")))
        cl_params.parse(args)
        # cl_params.focus = tmp
        knotter = LineMerger(lines, Polisher(aligner, aligner.dir_distributor), dot_plot)
        extender = LineExtender(aligner, knotter, disjointigs, dot_plot)
        dot_plot.printAll(sys.stdout)
        printState(lines)
    else:
        aligner = Aligner(DirDistributor(cl_params.alignmentDir()))
        polisher = Polisher(aligner, aligner.dir_distributor)
        reads = CreateReadCollection(cl_params.reads_file, cl_params.cut_reads, cl_params.downsample)
        if cl_params.contigs_file is None:
            sys.stdout.info("Running Flye")
            assembly_dir = os.path.join(cl_params.dir, "assembly_initial")
            reads_file = os.path.join(cl_params.dir, "actual_reads.fasta")
            reads.print_fasta(open(reads_file, "w"))
            subprocess.check_call([os.path.join(params.bin_path, "flye"), "--meta",
                                   "-o", assembly_dir, "-t", str(cl_params.threads),
                                   "--" + params.technology + "-raw", reads_file,
                                   "--genome-size", str(cl_params.genome_size),
                                   "--min-overlap", str(params.k)])
            cl_params.set_flye_dir(assembly_dir, cl_params.mode)
        elif len(cl_params.disjointigs_file_list) == 0:
            assembly_dir = os.path.join(cl_params.dir, "assembly_initial")
            reads_file = os.path.join(cl_params.dir, "actual_reads.fasta")
            reads.print_fasta(open(reads_file, "w"))
            disjointigs_file = constructDisjointigs(reads, params.expected_size, assembly_dir)
            # graph_file, contigs_file, disjointigs_file, rep_dir, graph_file_after, contigs_file_after = parseFlyeDir(assembly_dir)
            cl_params.disjointigs_file_list.append(disjointigs_file)
            params.min_contra_for_break = 8
        disjointigs = CreateDisjointigCollection(cl_params.disjointigs_file_list, cl_params.dir, aligner, reads)
        all_unique = cl_params.init_file is not None
        contigs = CreateContigCollection(cl_params.graph_file, cl_params.contigs_file, cl_params.min_cov,
                                         aligner, polisher, reads, cl_params.force_unique, all_unique)
        if cl_params.autoKL:
            adjustKL(aligner, reads, contigs)
        if cl_params.init_file is None:
            ExtendShortContigs(contigs, reads, aligner, polisher, cl_params.read_dump)
            lines = CreateLineCollection(cl_params.dir, aligner, contigs, disjointigs, reads, cl_params.split)
        else:
            lines = LoadLineCollection(cl_params.dir, cl_params.init_file, aligner, contigs, disjointigs, reads, polisher)
        sys.stdout.info("Constructing dot plot")
        dot_plot = LineDotPlot(lines, aligner)
        dot_plot.construct(aligner)
        # dot_plot.printAll(sys.stdout)
        sys.stdout.info("Updating sequences and resolved segments.")
        knotter = LineMerger(lines, Polisher(aligner, aligner.dir_distributor), dot_plot)
        extender = LineExtender(aligner, knotter, disjointigs, dot_plot)
        extender.updateAllStructures(itertools.chain.from_iterable(line.completely_resolved for line in lines))
        for line in list(lines.unique()):  # type: NewLine
            line.completely_resolved.mergeSegments()
            if len(line.completely_resolved) == 0:
                lines.removeLine(line)
        if cl_params.debug:
            sys.stdout.info("Saving initial state")
            try:
                writer = save_handler.getWriter()
                sys.stdout.info("Save details:", writer.info)
                saveAll(writer, cl_params, aligner, contigs, reads, disjointigs, lines, dot_plot)
            except Exception as e:
                _, _, tb = sys.exc_info()
                sys.stdout.warn("Could not write save")
                traceback.print_tb(tb)
                sys.stdout.info("Message:", e.message)
    sys.stdout.trace("Disjointig alignments")
    for line in lines:
        sys.stdout.trace(line.disjointig_alignments)
    sys.stdout.info("Starting expanding alignment-consensus loop")
    EACL(aligner, cl_params, contigs, disjointigs, dot_plot, extender, lines, reads, save_handler)
    dot_plot.printAll(sys.stdout)
    sys.stdout.trace("Final result:")
    lines.printToFasta(open(os.path.join(cl_params.dir, "lines.fasta"), "w"))
    lines.printKnottedToFasta(open(os.path.join(cl_params.dir, "assembly.fasta"), "w"))
    printState(lines)
    sys.stdout.info("Finished")
    secs = int(time.time() - start)
    days = secs / 60 / 60 / 24
    hours = secs / 60 / 60 % 24
    mins = secs / 60 % 60
    sys.stdout.info("Finished in %d days, %d hours, %d minutes" % (days, hours, mins))
    if cl_params.test:
        passed = False
        for al in aligner.dotplotAlign(lines, ref):
            if len(al) > len(al.seg_to.contig) - 3000:
                passed = True
                break
        if passed:
            sys.stdout.info("Test passed")
        else:
            sys.stdout.info("Test failed")
sys.path.append("py") from common import basic, params from common.basic import CreateLog from alignment.align_tools import Aligner, DirDistributor from common.line_align import Scorer from common.sequences import ContigStorage if __name__ == "__main__": basic.ensure_dir_existance(sys.argv[1]) CreateLog(sys.argv[1]) reads = ContigStorage().loadFromFile(sys.argv[2]) contigs = ContigStorage().loadFromFile(sys.argv[3]) scorer = Scorer() dd = DirDistributor(sys.argv[1]) aligner = Aligner(dd) for read in reads.unique(): print "Processing read", read als = [ scorer.polyshAlignment(al, params.alignment_correction_radius) for al in aligner.localAlign([read], contigs) ] for al1 in als: for al2 in als: if al1.seg_to.contig == al2.seg_to.contig: continue print al1, "vs", al2 scorer.scoreInCorrectSegments(al1, al1.seg_to.contig.asSegment(), al2,
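# Expected invocation, judging only from the sys.argv indices above; the script
# name is a placeholder:
#
#   python score_reads.py <work dir> <reads file> <contigs file>
#
# <work dir> receives both the log and the alignment working files.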
def main(contigs_file, contig_name, reads_file, dir, k, initial_reads1, initial_reads2):
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    contigs = ContigStorage().loadFromFasta(open(contigs_file, "r"), False)
    # contig = contigs[contig_name].asSegment().prefix(length=2000).asContig()
    contig = contigs[contig_name]
    reads = ContigStorage().loadFromFasta(open(reads_file, "r"), False)
    reads1 = ContigStorage()
    reads2 = ContigStorage()
    cnt = 0
    for read in reads.unique():
        cnt += 1
        # if cnt % 2 == 0:
        if read.id in initial_reads1:
            reads1.add(read)
        elif read.id in initial_reads2:
            reads2.add(read)
    polisher = Polisher(aligner, dd)
    contig1 = contig
    contig2 = contig
    scorer = Scorer()
    for i in range(3):
        diff = 0
        print "Iteration", i
        als1 = fixAlDir(aligner.overlapAlign(reads1.unique(), ContigStorage([contig])), contig)
        als2 = fixAlDir(aligner.overlapAlign(reads2.unique(), ContigStorage([contig])), contig)
        contig1 = Contig(polisher.polishSmallSegment(contig.asSegment(), als1).seg_from.Seq(), "1")
        contig2 = Contig(polisher.polishSmallSegment(contig.asSegment(), als2).seg_from.Seq(), "2")
        al = aligner.overlapAlign([contig1], ContigStorage([contig2])).next()
        als1 = fixAlDir(aligner.overlapAlign(reads.unique(), ContigStorage([contig1])), contig1)
        als1 = filter(lambda al: len(al.seg_to) > len(al.seg_to.contig) - 100, als1)
        als2 = fixAlDir(aligner.overlapAlign(reads.unique(), ContigStorage([contig2])), contig2)
        als2 = filter(lambda al: len(al.seg_to) > len(al.seg_to.contig) - 100, als2)
        als1 = sorted(als1, key=lambda al: al.seg_from.contig.id)
        als2 = sorted(als2, key=lambda al: al.seg_from.contig.id)
        reads1 = ContigStorage()
        reads2 = ContigStorage()
        dp = scorer.accurateScore(al.matchingSequence(), 10)  # 1 - al.percentIdentity()
        als_map = dict()
        for al in als1:
            als_map[al.seg_from.contig.id] = [al]
        for al in als2:
            if al.seg_from.contig.id in als_map:
                als_map[al.seg_from.contig.id].append(al)
        com_res = []
        diffs = []
        for tmp_als in als_map.values():
            if len(tmp_als) != 2:
                continue
            al1 = tmp_als[0]
            al2 = tmp_als[1]
            print al1, al2
            assert al1.seg_from.contig == al2.seg_from.contig
            pi1 = scorer.accurateScore(al1.matchingSequence(), 10)  # al1.percentIdentity()
            pi2 = scorer.accurateScore(al2.matchingSequence(), 10)  # al2.percentIdentity()
            com_res.append((al1, al2, pi1 - pi2))
            diffs.append(pi1 - pi2)
        diffs = sorted(diffs)
        # Assign each read to the copy it fits better, using score-difference quartiles
        th1 = diffs[len(diffs) / 4]
        th2 = diffs[len(diffs) * 3 / 4]
        print "Thresholds:", th1, th2
        for al1, al2, diff in com_res:
            if diff < th1:
                reads1.add(al1.seg_from.contig)
            elif diff > th2:
                reads2.add(al2.seg_from.contig)
            # if pi1 > pi2 + dp / 4:
            #     reads1.add(al1.seg_from.contig)
            # elif pi2 > pi1 + dp / 4:
            #     reads2.add(al2.seg_from.contig)
            # diff += abs(pi1 - pi2)
        # diff here holds the last pair's score difference; the accumulation is commented out above
        print float(diff) / len(als1), len(reads1) / 2, len(reads2) / 2
    al = aligner.overlapAlign([contig1], ContigStorage([contig2])).next()
    print al
    print "\n".join(al.asMatchingStrings2())
    for read in reads1:
        if read.id in initial_reads1:
            sys.stdout.write(read.id + " ")
    print ""
    for read in reads2:
        if read.id in initial_reads2:
            sys.stdout.write(read.id + " ")
    print ""
    contig1 = prolong(aligner, polisher, contig1, reads1)
    contig2 = prolong(aligner, polisher, contig2, reads2)
    contig1.id = "1"
    contig2.id = "2"
    out = open(os.path.join(dir, "copies.fasta"), "w")
    SeqIO.write(contig1, out, "fasta")
    SeqIO.write(contig2, out, "fasta")
    out.close()
    out = open(os.path.join(dir, "reads1.fasta"), "w")
    for read in reads1.unique():
        SeqIO.write(read, out, "fasta")
    out.close()
    out = open(os.path.join(dir, "reads2.fasta"), "w")
    for read in reads2.unique():
        SeqIO.write(read, out, "fasta")
    out.close()
    print "Finished"
def main(model_file, k, dir, contigs_file, reads_file):
    # type: (str, int, str, str, str) -> None
    basic.ensure_dir_existance(dir)
    CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    params.scores = ComplexScores()
    params.scores.load(open(model_file, "r"))
    params.k = k
    print "Loading contigs"
    tmp = sorted(ContigStorage().loadFromFasta(open(contigs_file, "r"), False).unique(),
                 key=lambda contig: len(contig))
    cnt = 1
    contigs = ContigStorage()
    for c1, c2 in zip(tmp[::2], tmp[1::2]):
        # if c1.seq == c2.rc.seq:
        contigs.add(Contig(c1.seq, str(cnt)))
        print cnt, c1.id, c2.id
        cnt += 1
        # else:
        #     contigs.add(Contig(c1.seq, str(cnt)))
        #     print cnt, c1.id
        #     cnt += 1
        #     contigs.add(Contig(c2.seq, str(cnt)))
        #     print cnt, c2.id
        #     cnt += 1
    print "Loading reads"
    reads = ReadCollection().loadFromFasta(open(reads_file, "r"))
    print "Aligning reads"
    for al in aligner.localAlign(reads, contigs):
        if len(al) > k:
            read = al.seg_from.contig  # type: AlignedRead
            read.addAlignment(al)
    for read in reads:
        if not basic.isCanonocal(read.id):
            continue
        # Keep reads with exactly one consistent alignment plus contradicting ones
        cnt = 0
        al0 = None
        others = []
        for al in read.alignments:
            if not al.contradictingRTC():
                cnt += 1
                al0 = al
            else:
                others.append(al)
        if cnt != 1 or len(others) == 0:
            continue
        print al0
        print others
        # Restrict all alignments to the common read segment
        seg = al0.seg_from
        for al in others:
            if al.seg_from.interSize(seg) < k:
                seg = None
                break
            else:
                seg = al.seg_from.cap(seg)
        print seg
        if seg is None:
            continue
        al0 = al0.reduce(query=seg)
        others = [al.reduce(query=seg) for al in others]
        scorer = Scorer(params.scores)
        for al in others:
            a, b, c = scorer.scoreCommon(al0, al)
            print "win", a, b, c, len(seg)
        if len(seg) > 1000:
            for i in range(len(seg) / 1000):
                seg1 = seg.prefix(length=i * 1000 + 1000).suffix(length=1000)
                for al in others:
                    a, b, c = scorer.scoreCommon(al0.reduce(query=seg1), al.reduce(query=seg1))
                    print "win1000", a, b, c, len(seg1)
        for al1 in others:
            for al2 in others:
                if al1 == al2:
                    continue
                a, b, c = scorer.scoreCommon(al1, al2)
                print "draw", a, b, c, len(seg)
def main(flye_dir, rf, dir, edge_id, to_resolve, min_contig_length):
    params.technology = "nano"
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    print " ".join(sys.argv)
    print "Reading graph"
    graph = SimpleGraph().ReadDot(os.path.join(flye_dir, "20-repeat", "graph_before_rr.gv"))
    graph.FillSeq(os.path.join(flye_dir, "20-repeat", "graph_before_rr.fasta"), True)
    print "Extracting relevant graph component"
    edge_ids = edge_id.split(",")
    to_resolve = to_resolve.split(",")
    to_resolve = [(a, int(b)) for a, b in zip(to_resolve[0::2], to_resolve[1::2])]
    unique = uniqueNeighbours(edge_ids, graph, min_contig_length)
    if rf == "none":
        return
    print "Finding reads that align to", edge_ids
    reads_to_resolve = dict()  # type: Dict[str, List[str]]
    for eid, mult in to_resolve:
        reads_to_resolve[eid] = []
    for unique_edge, initial in unique:
        reads_to_resolve[initial] = []
    relevant_read_ids = set()
    for rid, eid in parseReadDump(os.path.join(flye_dir, "20-repeat", "read_alignment_dump")):
        if eid in edge_ids:
            relevant_read_ids.add(rid)
            print rid, eid
    for rid, eid in parseReadDump(os.path.join(flye_dir, "20-repeat", "read_alignment_dump")):
        if rid in relevant_read_ids and eid in reads_to_resolve:
            reads_to_resolve[eid].append(rid)
    for eid in reads_to_resolve:
        reads_to_resolve[eid] = list(set(reads_to_resolve[eid]))
    print "Reading reads"
    res_reads = ContigStorage()
    res = open(os.path.join(dir, "reads.fasta"), "w")
    for read in SeqIO.parse_by_name(rf):
        if read.id in relevant_read_ids:
            res_reads.add(Contig(read.seq, read.id))
            SeqIO.write(read, res, "fasta")
    res.close()
    random_down = open(os.path.join(dir, "random_down.fasta"), "w")
    cnt = 0
    for read in res_reads:
        if cnt % 5 == 0:
            SeqIO.write(read, random_down, "fasta")
        cnt += 1
    random_down.close()
    res = open(os.path.join(dir, "contigs.fasta"), "w")
    lcf = open(os.path.join(dir, "contigs.lc"), "w")
    for eid, mult in to_resolve:
        repeat_reads = [res_reads[rid] for rid in reads_to_resolve[eid]]
        print reads_to_resolve[eid]
        print map(str, repeat_reads)
        split_contigs = splitRepeat(aligner, graph.e[eid].seq, mult, repeat_reads, min_contig_length)
        if split_contigs is None:
            print "Failed to resolve edge", eid, "Aborting"
            continue  # skip this edge: iterating over None would fail below
        print "Edge", eid, "was split into", mult, "copies"
        for contig, contig_reads in split_contigs:
            print contig.id
            SeqIO.write(contig, res, "fasta")
            lcf.write(contig.id + "\n")
            lcf.write(" ".join([r.id for r in contig_reads]) + "\n")
    # Append the unique neighbour edges to the same contigs.fasta; reopening the
    # file in "w" mode here would discard the split contigs written above
    for unique_edge, initial in unique:
        print unique_edge.id
        SeqIO.write(unique_edge, res, "fasta")
        lcf.write(unique_edge.id + "\n")
        lcf.write(" ".join(reads_to_resolve[initial]) + "\n")
    res.close()
    lcf.close()
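# uniqueNeighbours is imported from elsewhere. A sketch of its assumed behavior,
# modeled on the inline neighbour extraction in the GFA-based script above:
# collect the edges adjacent to the repeat, keeping short ones whole and only a
# 5000bp end of long ones, and return them paired with their original edge id.
# The exact trimming rules and id handling in the real helper may differ.
def uniqueNeighbours(edge_ids, graph, min_contig_length):
    res = []
    for eid in edge_ids:
        start, end = graph.e[eid].start, graph.e[eid].end
        for e in graph.v[start].inc:
            if basic.Normalize(e.id) not in edge_ids:
                # an incoming neighbour touches the repeat with its right end
                seq = e.seq if len(e.seq) < min_contig_length else e.seq[-5000:]
                res.append((Contig(seq, e.id), e.id))
        for e in graph.v[end].out:
            if basic.Normalize(e.id) not in edge_ids:
                # an outgoing neighbour touches the repeat with its left end
                seq = e.seq if len(e.seq) < min_contig_length else e.seq[:5000]
                res.append((Contig(seq, e.id), e.id))
    return res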