def main(contig_file, reads_file, sam_file, dir, contig_id):
    # type: (str, str, str, str, str) -> None
    basic.ensure_dir_existance(dir)
    contigs = ContigCollection()
    contigs.loadFromFasta(open(contig_file, "r"))
    print "Contigs loaded"
    contig = contigs[contig_id]
    read_names = set()
    for rec in Samfile(open(sam_file, "r")):
        read_names.add(rec.query_name)
    reads = ReadCollection()
    cnt = 0
    for rec in SeqIO.parse_fasta(open(reads_file, "r")):
        if rec.id in read_names:
            rec.id = "Read" + str(cnt)
            reads.add(AlignedRead(rec))
            cnt += 1
    reads.print_fasta(open(os.path.join(dir, "reads.fasta"), "w"))
    print "Reads loaded", len(reads)
    reads.addAllRC()
    print "RC added", len(reads)
    aligner = Aligner(DirDistributor(os.path.join(dir, "alignments")))
    aligner.alignReadCollection(reads, contigs)
    print "Reads aligned", len(reads)
    reads = reads.inter(contig.asSegment())
    print "Reads filtered", len(reads)
    sorted_reads = sorted(list(reads.reads.values()),
                          key=lambda read: read.alignmentsTo(contig.asSegment()).next().seg_to.left)
    for read in sorted_reads:
        print read
        for al in read.alignmentsTo(contig.asSegment()):
            print "\n".join(al.asMatchingStrings())
def main(args):
    rf = args[2]
    dir = args[3]
    CreateLog(dir)
    disjointigs = ContigCollection().loadFromFasta(open(args[1], "r"), num_names=False)
    basic.ensure_dir_existance(dir)
    aligner = Aligner(DirDistributor(os.path.join(dir, "alignments")))
    clen = 5000000
    reads = ReadCollection().loadFromFasta(open(rf, "r"))
    tlen0 = sum(map(len, reads))
    for i in range(10):
        good_reads = set()
        for al in aligner.localAlign(reads, disjointigs):
            if not al.contradictingRTC(al.seg_to.contig.asSegment(), 500):
                good_reads.add(al.seg_from.contig.id)
        rf = os.path.join(dir, "reads" + str(i) + ".fasta")
        reads = reads.filter(lambda read: read.id not in good_reads).cleanCopy()
        tlen = sum(map(len, reads))
        reads.print_fasta(open(rf, "w"))
        l = tlen * clen / tlen0
        assembly_dir = os.path.join(dir, "assembly" + str(i))
        subprocess.check_call(["./bin/flye", "-o", assembly_dir, "-t", "8",
                               "--pacbio-raw", rf, "--genome-size", str(l), "--no-trestle"])
        df = os.path.join(assembly_dir, "10-consensus", "consensus.fasta")
        disjointigs.addAll(ContigCollection().loadFromFasta(open(df, "r"), num_names=False))
        df = os.path.join(dir, "df" + str(i) + ".fasta")
        disjointigs.print_fasta(open(df, "w"))
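
# --- Illustrative sketch (not part of the pipeline) ---
# The --genome-size passed to Flye on each iteration above is rescaled by the
# fraction of read mass that is still unexplained by the current disjointigs.
# A minimal standalone sketch of that arithmetic; the values are hypothetical
# and clen = 5000000 mirrors the constant used in main above.
def scaled_genome_size(tlen, tlen0, clen=5000000):
    # integer division, matching the Python 2 expression l = tlen * clen / tlen0
    return tlen * clen / tlen0

# half of the reads remain unexplained -> half the original size estimate
assert scaled_genome_size(400, 800) == 2500000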
def main(ref_file, contig_size, rlen, cov, dir):
    basic.ensure_dir_existance(dir)
    all_contigs = ContigCollection().loadFromFasta(open(ref_file, "r"), False)
    contig_file_name = os.path.join(dir, "contigs.fasta")
    contig_file = open(contig_file_name, "w")
    reads_file_name = os.path.join(dir, "reads.fasta")
    reads_file = open(reads_file_name, "w")
    for ref in all_contigs.unique():
        if len(ref) < contig_size:
            continue
        SeqIO.write(ref, contig_file, "fasta")
        for i in range(0, len(ref), max(1, rlen / cov)):
            read = ref.segment(i, min(i + rlen, len(ref))).asNamedSequence()
            SeqIO.write(read, reads_file, "fasta")
    reads_file.close()
    contig_file.close()
    print "Done"
    print contig_file_name
    print reads_file_name
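
# --- Illustrative sketch (not part of the pipeline) ---
# Sanity check for the read spacing above: starting a read of length rlen every
# rlen / cov bases yields roughly cov-fold coverage of the reference. The
# numbers below are hypothetical.
rlen, cov = 3000, 30
step = max(1, rlen / cov)   # 100 bp between consecutive read starts
assert rlen / step == cov   # each base is covered by about 30 simulated reads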
def main(contigs_file, parts_file, dir):
    contigs = ContigCollection().loadFromFasta(open(contigs_file, "r"))
    parts = ContigCollection().loadFromFasta(open(parts_file, "r"))
    basic.CreateLog(dir)
    aligner = Aligner(DirDistributor(dir))
    res = dict()
    for al in aligner.localAlign(parts, contigs):
        if al.seg_to.contig.id not in res:
            res[al.seg_to.contig.id] = []
            res[al.seg_to.contig.rc.id] = []
        res[al.seg_to.contig.id].append(al)
        res[al.seg_to.contig.rc.id].append(al.rc)
    for cname, arr in res.items():
        print cname
        arr = filter(lambda al: len(al.seg_to) > min(len(al.seg_to.contig) - 1000, 5000), arr)
        arr = sorted(arr, key=lambda al: al.seg_to.left)
        print arr
def alignReadsToSegments(self, reads, segments):
    # type: (ReadCollection, Iterable[Segment]) -> None
    segments = list(segments)
    seg_dict = dict()
    for i, seg in enumerate(segments):
        seg_dict[str(i + 1)] = seg
    contigs = map(lambda (i, seg): Contig(seg.Seq(), str(i + 1)), enumerate(segments))
    read_collection = ReadCollection().extendClean(reads)
    self.alignReadCollection(read_collection, ContigCollection(contigs))
    read_collection.contigsAsSegments(seg_dict)
    reads.mergeAlignments(read_collection)
def alignReadCollection(self, reads_collection, contigs):
    # type: (ReadCollection, Iterable[Contig]) -> None
    contig_collection = ContigCollection(contigs)
    contig_ids = set()
    for contig in contigs:
        if contig.rc.id not in contig_ids:
            contig_ids.add(contig.id)
    read_ids = set()
    for read in reads_collection:
        if read.rc.id not in read_ids:
            read_ids.add(read.id)
    contigs = filter(lambda contig: contig.id in contig_ids, contigs)
    reads = filter(lambda read: read.id in read_ids, reads_collection)
    reads_collection.fillFromSam(self.align(reads, contigs, "local"), contig_collection)
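
# --- Illustrative sketch (not part of the pipeline) ---
# The two loops above keep exactly one strand of every (sequence, reverse
# complement) pair, so each sequence is aligned on a single strand only. A
# minimal standalone model of that deduplication, assuming every record's
# .rc.id is the id of its reverse complement (the _Rec class is hypothetical):
class _Rec(object):
    def __init__(self, id):
        self.id = id
        self.rc = None

a = _Rec("1")
b = _Rec("-1")
a.rc, b.rc = b, a
ids = set()
for rec in [a, b]:
    if rec.rc.id not in ids:  # second member of the pair is skipped
        ids.add(rec.id)
assert ids == set(["1"])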
def loadAll(handler):
    # type: (TokenReader) -> Tuple[Params, Aligner, ContigCollection, ReadCollection, DisjointigCollection, NewLineStorage, LineDotPlot]
    cl_params = Params()
    cl_params.load(handler)
    aligner = Aligner.load(handler)
    sys.stdout.info("Loading contigs")
    contigs = ContigCollection()
    contigs.load(handler)
    sys.stdout.info("Loading reads")
    reads = CreateReadCollection(cl_params.reads_file, cl_params.downsample)
    reads.loadFromFasta(open(cl_params.reads_file, "r"), downsample=cl_params.downsample)
    tmp_reads = reads.copy().addAllRC()
    sys.stdout.info("Loading disjointigs")
    disjointigs = DisjointigCollection()
    disjointigs.load(handler, tmp_reads)
    sys.stdout.info("Loading lines")
    lines = NewLineStorage(disjointigs, aligner)
    lines.load(handler, tmp_reads, contigs)
    sys.stdout.info("Loading dot plot")
    dot_plot = LineDotPlot(lines, aligner)
    dot_plot.load(handler)
    sys.stdout.info("Loading finished")
    return cl_params, aligner, contigs, reads, disjointigs, lines, dot_plot
def main(dir, contigs_file1, contigs_file2, unique_contigs_file):
    CreateLog(dir)
    sys.stdout.level = LogPriority.warning
    unique = ContigCollection().loadFromFasta(open(unique_contigs_file, "r"), False).filter(
        lambda contig: len(contig) > 5000)
    aligner = Aligner(DirDistributor(os.path.join(dir, "alignments")))
    contigs1 = ContigCollection().loadFromFasta(open(contigs_file1, "r"), False)
    cals1 = list(aligner.overlapAlign(unique.unique(), contigs1))
    transfers1, term1, all1 = extract_transfers(contigs1, cals1)
    contigs2 = ContigCollection().loadFromFasta(open(contigs_file2, "r"), False)
    cals2 = list(aligner.overlapAlign(unique.unique(), contigs2))
    transfers2, term2, all2 = extract_transfers(contigs2, cals2)
    missing1 = []
    missing2 = []
    different = dict()
    unresolved1 = []
    unresolved2 = []
    same = []
    for ucontig in list(unique) + [contig.rc for contig in unique]:
        uid = ucontig.id
        in1 = uid in all1
        in2 = uid in all2
        if not in1 and not in2:
            continue
        if not in1:
            missing1.append(uid)
        elif not in2:
            missing2.append(uid)
        else:
            if all1[uid][0] == all2[uid][0]:
                same.append(uid)
            elif uid in transfers1 and uid in transfers2:
                different[uid] = (all1[uid][0], all2[uid][0])
            elif uid in transfers1:
                unresolved2.append(uid)
            elif uid in transfers2:
                unresolved1.append(uid)
    out = open(os.path.join(dir, "contigs.txt"), "w")
    out.write("Different: " + str(different) + "\n")
    out.write("Unresolved1: " + str(unresolved1) + "\n")
    out.write("Unresolved2: " + str(unresolved2) + "\n")
    out.write("Same: " + str(same) + "\n")
    out.write("Missing1: " + str(missing1) + "\n")
    out.write("Missing2: " + str(missing2) + "\n")
    out.write("Contig1 transfers: " + str(transfers1) + "\n")
    out.write("Contig1 term: " + str(term1) + "\n")
    out.write("Contig2 transfers: " + str(transfers2) + "\n")
    out.write("Contig2 term: " + str(term2) + "\n")
    out.close()
    print contigs_file1, contigs_file2
    print len(different), len(unresolved1), len(unresolved2), len(missing1), len(missing2), len(same)
def main(dir, contigs_files, reference_file, unique_contigs_file):
    CreateLog(dir)
    sys.stdout.level = LogPriority.warning
    ref = ContigCollection().loadFromFasta(open(reference_file, "r"), False)
    unique = ContigCollection().loadFromFasta(open(unique_contigs_file, "r"), False).filter(
        lambda contig: len(contig) > 5000)
    aligner = Aligner(DirDistributor(os.path.join(dir, "alignments")))
    ref_als = list(aligner.overlapAlign(unique.unique(), ref))
    ref_transfers, ref_term, all_ref_als = extract_transfers(ref, ref_als)
    for uid in ref_term:
        ref_transfers[uid] = ref_term[uid]
    print "#", "file", "wrong", "unresolved", "correct", "missing"
    for i, contigs_file in enumerate(contigs_files):
        contigs = ContigCollection().loadFromFasta(open(contigs_file, "r"), False)
        contig_als = list(aligner.overlapAlign(unique.unique(), contigs))
        contig_transfers, contig_term, all_contig_als = extract_transfers(contigs, contig_als)
        missing = []
        wrong = dict()
        unresolved = []
        correct = []
        for uid in ref_transfers:
            if uid not in contig_transfers and uid not in contig_term:
                # print uid, "missing"
                missing.append(uid)
            elif uid in contig_transfers:
                if ref_transfers[uid][0] == contig_transfers[uid][0]:
                    # print uid, "correct"
                    correct.append(uid)
                else:
                    # print uid, "wrong", ref_transfers[uid][0].id, contig_transfers[uid][0].id
                    wrong[uid] = (ref_transfers[uid][0], contig_transfers[uid][0])
            else:
                if ref_transfers[uid][0] == contig_term[uid][0]:
                    # print uid, "correct"
                    correct.append(uid)
                else:
                    # print uid, "unresolved"
                    unresolved.append(uid)
        out = open(os.path.join(dir, "contigs_" + str(i) + ".txt"), "w")
        out.write("Wrong: " + str(wrong) + "\n")
        out.write("Unresolved: " + str(unresolved) + "\n")
        out.write("Correct: " + str(correct) + "\n")
        out.write("Missing: " + str(missing) + "\n")
        out.write("Contig transfers: " + str(contig_transfers) + "\n")
        out.write("Contig term: " + str(contig_term) + "\n")
        out.write("Ref transfers: " + str(ref_transfers) + "\n")
        out.write("Ref als:\n")
        for c in all_ref_als:
            out.write(str(c) + "\n")
        out.write("Contig als:\n")
        for c in all_contig_als:
            out.write(str(c) + "\n")
        out.close()
        print "result", i, contigs_file, len(wrong), len(unresolved), len(correct), len(missing)
def main(reads_file, ref_file, dir, error_rate):
    sys.stderr.write("Reading reference" + "\n")
    ref = sorted(list(SeqIO.parse_fasta(open(ref_file, "r"))), key=lambda rec: len(rec))[-1]
    ref = Contig(ref.seq, ref.id)
    refs = ContigCollection()
    for i in range(0, len(ref) - 500, 500):
        if random.random() > 0.95:
            tmp = list(ref.segment(i, i + 500).Seq())
            for j in range(error_rate * 500 / 100):
                pos = random.randint(0, 499)
                tmp[pos] = basic.rc[tmp[pos]]
            refs.add(Contig("".join(tmp), ref.id + "(" + str(i) + "," + str(i + 500) + ")"))
    refs.print_names(sys.stderr)
    sys.stderr.write("Reading reads" + "\n")
    reads = ReadCollection()
    reads.loadFromFasta(open(reads_file, "r"))
    sys.stderr.write("Aligning reads" + "\n")
    basic.ensure_dir_existance(dir)
    aligner = Aligner(DirDistributor(dir))
    aligner.alignReadCollection(reads, refs)
    sys.stderr.write("Analysing alignments" + "\n")
    alignments = []
    for read in reads:
        alignments.extend(read.alignments)
    alignments = filter(lambda al: len(al) > 450, alignments)
    alignments = sorted(alignments, key=lambda al: (al.seg_to.contig.id, al.seg_from.contig.id))
    scorer = Scorer()
    scorer.scores.homo_score = 3
    scorer.scores.ins_score = 5
    scorer.scores.del_score = 5
    cnt = 0
    for contig, iter in itertools.groupby(alignments, key=lambda al: al.seg_to.contig):
        iter = list(iter)
        sys.stderr.write(str(contig) + " " + str(len(iter)) + "\n")
        if len(iter) < 150:
            for al in iter:
                print scorer.accurateScore(al.matchingSequence(), params.alignment_correction_radius)
                cnt += 1
                if cnt >= 5000:
                    break
        if cnt >= 5000:
            break
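
# --- Illustrative sketch (not part of the pipeline) ---
# The mutation loop above guarantees each hit is a real substitution by
# replacing a base with its complement. A standalone model of that error
# injection, assuming basic.rc maps each base to its complement (the local
# rc dict below stands in for it); error_rate = 5 is hypothetical:
import random
rc = {"A": "T", "T": "A", "G": "C", "C": "G"}
tmp = list("ACGT" * 125)         # a 500 bp window
for j in range(5 * 500 / 100):   # error_rate% of 500 -> 25 draws
    pos = random.randint(0, 499)
    tmp[pos] = rc[tmp[pos]]
# positions are drawn with replacement, so repeated draws on the same
# position can cancel out and the realized error rate may be slightly lower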
def main(ref_file, segment, dir):
    ref = ContigCollection().loadFromFasta(open(ref_file, "r"), False)
    chr1 = ref["chr1"]
    if segment[0] < 0:
        segment = (-segment[0], -segment[1])
        chr1 = chr1.rc
    reads = ReadCollection()
    reads_list = []
    for i in range(segment[0], segment[1], 500):
        read = reads.addNewRead(Segment(chr1, i, i + 500).asNamedSequence())
        reads_list.append(read)
    chr1.seq = chr1.seq[:segment[0]] + "N" * (segment[1] - segment[0]) + chr1.seq[segment[1]:]
    chr1.rc.seq = basic.RC(chr1.seq)
    basic.ensure_dir_existance(dir)
    aligner = Aligner(DirDistributor(dir))
    aligner.alignReadCollection(reads, ref)
    out = sys.stdout
    for read in reads_list:
        # print read
        out.write(str(len(read.alignments)) + " " +
                  str(max([0] + map(lambda al: al.percentIdentity(), read.alignments))) + "\n")
    out.close()
def CreateContigCollection(graph_file, contigs_file, min_cov, aligner, polisher, reads, force_unique, all_unique):
    sys.stdout.info("Creating contig collection")
    if force_unique is None and not all_unique:
        graph = SimpleGraph().ReadDot(graph_file)
        graph.FillSeq(contigs_file)
        covs = []
        for e in graph.e.values():
            covs.append((e.len, e.cov))
        tmp_cov = []
        # covs stores (len, cov) pairs; sum lengths to get half the total edge length
        total = sum(l for l, c in covs) / 2
        for l, c in sorted(covs)[::-1]:
            if total < 0:
                break
            tmp_cov.append((l, c))
            total -= l
        avg_cov = float(sum([l * c for l, c in tmp_cov])) / sum(l for l, c in tmp_cov)
        sys.stdout.info("Average coverage determined:", avg_cov)
        nonunique = set()
        for edge in graph.e.values():
            if edge.unique and edge.len < 20000 and len(graph.v[edge.start].out) > 1:
                if edge.cov >= min_cov and (edge.cov < 0.8 * avg_cov or edge.len > 40000):
                    alter = ContigStorage()
                    for e in graph.v[edge.start].out:
                        if e != edge:
                            alter.add(Contig(e.seq, e.id))
                    for al in aligner.localAlign([Contig(edge.seq, edge.id)], alter):  # type: AlignmentPiece
                        if al.percentIdentity() > 0.98 and (al.seg_from.left < 100 and al.seg_to.left < 100 and len(al) > min(500, edge.len)):
                            nonunique.add(edge.id)
                            nonunique.add(basic.Reverse(edge.id))
        contigs = ContigCollection()
        for edge in graph.e.values():
            if basic.isCanonocal(edge.id):
                if edge.unique and (edge.len > params.min_isolated_length or len(graph.v[edge.end].out) > 0 or len(graph.v[edge.start].inc) > 0):
                    if edge.cov >= min_cov and (edge.cov < 1.5 * avg_cov or edge.len > 40000):
                        if edge.id in nonunique:
                            sys.stdout.info("Edge removed based on alignment to alternative:", edge.id, edge.cov, edge.len)
                        else:
                            contigs.add(Contig(edge.seq, edge.id))
                    else:
                        sys.stdout.info("Edge removed based on coverage:", edge.id, edge.cov, edge.len)
                elif (edge.len > 100000 and edge.cov < 1.5 * avg_cov) or (edge.len > 40000 and 1.3 * avg_cov > edge.cov > 0.7 * avg_cov):
                    contigs.add(Contig(edge.seq, edge.id))
                    sys.stdout.info("Edge added based on length and coverage:", edge.id, edge.cov, edge.len)
    elif force_unique is not None:
        sys.stdout.info("Using forced unique edge set")
        sys.stdout.trace(force_unique)
        contigs = ContigCollection().loadFromFile(contigs_file).filter(lambda contig: contig.id in force_unique)
    else:
        sys.stdout.info("Considering all contigs unique")
        contigs = ContigCollection().loadFromFile(contigs_file)
    # contigs.loadFromFasta(open(contigs_file, "r"), num_names=True)
    # contigs = contigs.filter(lambda contig: contig.id not in nonunique and len(contig) > params.k + 20)
    sys.stdout.info("Created", len(contigs), "initial contigs")
    if not all_unique or force_unique is not None:
        sys.stdout.info("Polishing contigs")
        polished_contigs = polisher.polishMany(reads, list(contigs.unique()))
        contigs = ContigCollection().addAll(polished_contigs)
    else:
        sys.stdout.info("Skipping contig polishing step since manual unique contig initialization was used")
    return contigs
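
# --- Illustrative sketch (not part of the pipeline) ---
# The coverage estimate in CreateContigCollection takes the longest edges until
# they account for half the total graph length, then averages their coverage
# weighted by length, so short (often repetitive) edges do not skew the result.
# The same computation on hypothetical numbers:
covs = [(100000, 30.0), (90000, 28.0), (20000, 90.0), (10000, 5.0)]  # (len, cov)
total = sum(l for l, c in covs) / 2   # 110000: half the total edge length
tmp_cov = []
for l, c in sorted(covs)[::-1]:       # longest edges first
    if total < 0:
        break
    tmp_cov.append((l, c))
    total -= l
avg_cov = float(sum(l * c for l, c in tmp_cov)) / sum(l for l, c in tmp_cov)
# the two long edges dominate: avg_cov ~= 29.05, unaffected by the 90x repeat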
def main(argv):
    sys.stdout.write("Started\n")
    dot_file = argv[1]
    edge_sequences = argv[2]
    reference_file = argv[3]
    alignment_file = argv[4]
    edges = ParseVertices(argv[5])
    output_file = argv[6]
    sys.stdout.write("Loading dot\n")
    dot = DotParser(open(dot_file, "r")).parse()
    edge_collection = ContigCollection().loadFromFasta(open(edge_sequences, "r"), True)
    graph = Graph().loadFromDot(edge_collection, dot)
    vertices = [graph.E[id].start.id for id in edges]
    graph.printToFile(sys.stdout)
    print vertices
    ref = ContigCollection().loadFromFasta(open(reference_file, "r"), False)
    print "Looking for relevant"
    pq = PriorityQueue()
    for v in graph.V.values():
        if v.id in vertices:
            pq.push((0, v))
    visited = []
    while not pq.empty():
        d, v = pq.pop()
        if v in visited:
            continue
        visited.append(v)
        for e in v.inc:
            print e.id, e.start.id, e.end.id
            if d + len(e) < dist:  # dist: traversal radius, assumed defined at module scope
                pq.push((d + len(e), e.start))
        for e in v.out:
            print e.id, e.start.id, e.end.id
            if d + len(e) < dist:
                pq.push((d + len(e), e.end))
    print "Visited", len(visited)
    print map(str, list(visited))
    relevant = []
    edge_alignments = ReadCollection().loadFromFasta(open(edge_sequences, "r")).addAllRC()
    for edge in graph.E.values():
        if edge.start in visited or edge.start.rc in visited:
            relevant.append(edge_alignments[edge.id])
    print "Loading sam"
    edge_alignments.fillFromSam(Samfile(open(alignment_file, "r")), ref)
    for rel in relevant:
        print rel.__str__()
    print "Collecting segments"
    segments = []
    chr1 = ref["chr1"]
    for edge in relevant:
        for al in edge.alignments:
            print al
            if al.seg_from.inter(edge.prefix(dist)):
                l = dist - al.seg_from.left
                contig = al.seg_to.contig
                start = al.seg_to.left
                segments.append(Segment(contig, start, min(start + l, len(contig))))
                print segments[-1]
    tmp = []
    print "Rotating"
    for seg in segments:
        if seg.contig != chr1:
            seg = seg.RC()
        if seg.contig != chr1:
            print "WARNING", seg
        tmp.append(seg)
    segments = sorted(tmp, key=lambda seg: seg.left)
    print "All relevant segments"
    print "\n".join(map(str, segments))
    cur_seg = None
    interesting_segments = []
    print "Gluing"
    for seg in segments:
        if cur_seg is None:
            cur_seg = seg.copy()
            continue
        if cur_seg.right + 20000 < seg.left:
            interesting_segments.append(cur_seg.copy())
            cur_seg = seg.copy()
        else:
            cur_seg.right = max(cur_seg.right, seg.right)
    if cur_seg is not None:
        interesting_segments.append(cur_seg.copy())
    alignments = []
    for edge in edge_alignments:
        for al in edge.alignments:
            for seg in interesting_segments:
                if al.seg_to.inter(seg):
                    alignments.append(al)
                    break  # record each alignment once even if it touches several segments
    alignments = sorted(alignments, key=lambda al: al.seg_to.left)
    print "All relevant alignments"
    print "\n".join(map(str, alignments))
    print "Interesting segments:", len(interesting_segments), sum(map(len, interesting_segments))
    for seg in interesting_segments:
        print seg
    f = open(output_file, "w")
    tmp = []
    for seg in interesting_segments:
        SeqIO.write(SeqIO.SeqRecord(seg.Seq(), seg.__str__()), f, "fasta")
        tmp.append(seg.Seq())
    f.close()
    f1 = open(output_file + "1", "w")
    SeqIO.write(SeqIO.SeqRecord(("N" * 20000).join(tmp), "concat"), f1, "fasta")
    f1.close()
    als = filter(lambda al: not al.contradictingRTC(tail_size=params.bad_end_length), als)
    return self.filterLocal(als)


if __name__ == "__main__":
    dir = sys.argv[1]
    query = sys.argv[2]
    target = sys.argv[3]
    extra_params = sys.argv[4:]
    contra = "contra" in extra_params
    over = "over" in extra_params
    long = "long" in extra_params
    start = "start" in extra_params
    forward = "forward" in extra_params
    aln = Aligner(DirDistributor(dir))
    basic.CreateLog(dir)
    contigs = ContigCollection().loadFromFasta(open(target, "r"), False)
    for al in aln.localAlign(ReadCollection().loadFromFile(query), contigs):
        if start:
            if al.seg_to.contig.id.startswith("-"):
                al = al.rc
            if al.seg_to.left > 50:
                continue
        if over and al.contradictingRTC():
            continue
        if forward:
            if al.seg_to.contig.id.startswith("-"):
                al = al.rc
        if contra and (len(al) < 8000 or not al.contradictingRTC()):
            continue
        if long and len(al) < 5000:
            continue
        print al  # assumed final action: the loop body was truncated in the source; emit alignments that pass all requested filters
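
# --- Example invocation (hypothetical paths) ---
# Dump only alignments that are at least 5 kb long and do not contradict the
# read-to-contig layout; any subset of the flags contra/over/long/start/forward
# can be combined:
#
#   python align.py work_dir reads.fasta contigs.fasta over long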