def main(args):
    rf = args[2]
    dir = args[3]
    CreateLog(dir)
    disjointigs = ContigCollection().loadFromFasta(open(args[1], "r"), num_names=False)
    basic.ensure_dir_existance(dir)
    aligner = Aligner(DirDistributor(os.path.join(dir, "alignments")))
    clen = 5000000
    reads = ReadCollection().loadFromFasta(open(rf, "r"))
    tlen0 = sum(map(len, reads))
    for i in range(10):
        good_reads = set()
        for al in aligner.localAlign(reads, disjointigs):
            if not al.contradictingRTC(al.seg_to.contig.asSegment(), 500):
                good_reads.add(al.seg_from.contig.id)
        rf = os.path.join(dir, "reads" + str(i) + ".fasta")
        reads = reads.filter(lambda read: read.id not in good_reads).cleanCopy()
        tlen = sum(map(len, reads))
        reads.print_fasta(open(rf, "w"))
        l = tlen * clen / tlen0
        assembly_dir = os.path.join(dir, "assembly" + str(i))
        subprocess.check_call(["./bin/flye", "-o", assembly_dir, "-t", "8",
                               "--pacbio-raw", rf, "--genome-size", str(l), "--no-trestle"])
        df = os.path.join(assembly_dir, "10-consensus", "consensus.fasta")
        disjointigs.addAll(ContigCollection().loadFromFasta(open(df, "r"), num_names=False))
        df = os.path.join(dir, "df" + str(i) + ".fasta")
        disjointigs.print_fasta(open(df, "w"))

def main(contig_file, reads_file, sam_file, dir, contig_id):
    # type: (str, str, str, str, str) -> None
    basic.ensure_dir_existance(dir)
    contigs = ContigCollection()
    contigs.loadFromFasta(open(contig_file, "r"))
    print "Contigs loaded"
    contig = contigs[contig_id]
    read_names = set()
    for rec in Samfile(open(sam_file, "r")):
        read_names.add(rec.query_name)
    reads = ReadCollection()
    cnt = 0
    for rec in SeqIO.parse_fasta(open(reads_file, "r")):
        if rec.id in read_names:
            rec.id = "Read" + str(cnt)
            reads.add(AlignedRead(rec))
            cnt += 1
    reads.print_fasta(open(os.path.join(dir, "reads.fasta"), "w"))
    print "Reads loaded", len(reads)
    reads.addAllRC()
    print "RC added", len(reads)
    aligner = Aligner(DirDistributor(os.path.join(dir, "alignments")))
    aligner.alignReadCollection(reads, contigs)
    print "Reads aligned", len(reads)
    reads = reads.inter(contig.asSegment())
    print "Reads filtered", len(reads)
    sorted_reads = sorted(list(reads.reads.values()),
                          key=lambda read: read.alignmentsTo(contig.asSegment()).next().seg_to.left)
    for read in sorted_reads:
        print read
        for al in read.alignmentsTo(contig.asSegment()):
            print "\n".join(al.asMatchingStrings())

def alignReadsToSegments(self, reads, segments):
    # type: (ReadCollection, Iterable[Segment]) -> None
    segments = list(segments)
    seg_dict = dict()
    for i, seg in enumerate(segments):
        seg_dict[str(i + 1)] = seg
    contigs = map(lambda (i, seg): Contig(seg.Seq(), str(i + 1)), enumerate(segments))
    read_collection = ReadCollection().extendClean(reads)
    self.alignReadCollection(read_collection, ContigCollection(contigs))
    read_collection.contigsAsSegments(seg_dict)
    reads.mergeAlignments(read_collection)

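# Illustrative usage sketch (not part of the original module): shows how
# alignReadsToSegments could be driven from an Aligner instance, assuming the
# ReadCollection/Contig API used elsewhere in this file. The helper name, the
# file name argument and the 5000bp window size are placeholders.
def _example_align_to_windows(aligner, reads_file, contig):
    # type: (Aligner, str, Contig) -> None
    reads = ReadCollection().loadFromFasta(open(reads_file, "r"))
    # Split the contig into consecutive windows and align reads to them.
    windows = [contig.segment(i, i + 5000) for i in range(0, len(contig) - 5000, 5000)]
    aligner.alignReadsToSegments(reads, windows)
    for read in reads:
        # After mergeAlignments the original reads carry the merged alignments.
        for al in read.alignments:
            print al
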
def main(reads_file, ref_file, dir, error_rate):
    sys.stderr.write("Reading reference" + "\n")
    ref = sorted(list(SeqIO.parse_fasta(open(ref_file, "r"))), key=lambda rec: len(rec))[-1]
    ref = Contig(ref.seq, ref.id)
    refs = ContigCollection()
    for i in range(0, len(ref) - 500, 500):
        if random.random() > 0.95:
            tmp = list(ref.segment(i, i + 500).Seq())
            for j in range(error_rate * 500 / 100):
                pos = random.randint(0, 499)
                tmp[pos] = basic.rc[tmp[pos]]
            refs.add(Contig("".join(tmp), ref.id + "(" + str(i) + "," + str(i + 500) + ")"))
    refs.print_names(sys.stderr)
    sys.stderr.write("Reading reads" + "\n")
    reads = ReadCollection()
    reads.loadFromFasta(open(reads_file, "r"))
    sys.stderr.write("Aligning reads" + "\n")
    basic.ensure_dir_existance(dir)
    aligner = Aligner(DirDistributor(dir))
    aligner.alignReadCollection(reads, refs)
    sys.stderr.write("Analysing alignments" + "\n")
    alignments = []
    for read in reads:
        alignments.extend(read.alignments)
    alignments = filter(lambda al: len(al) > 450, alignments)
    alignments = sorted(alignments, key=lambda al: (al.seg_to.contig.id, al.seg_from.contig.id))
    scorer = Scorer()
    scorer.scores.homo_score = 3
    scorer.scores.ins_score = 5
    scorer.scores.del_score = 5
    cnt = 0
    for contig, iter in itertools.groupby(alignments, key=lambda al: al.seg_to.contig):
        iter = list(iter)
        sys.stderr.write(str(contig) + " " + str(len(iter)) + "\n")
        if len(iter) < 150:
            for al in iter:
                print scorer.accurateScore(al.matchingSequence(), params.alignment_correction_radius)
                cnt += 1
                if cnt >= 5000:
                    break
        if cnt >= 5000:
            break

def CreateReadCollection(reads_file, cut_reads, downsample):
    sys.stdout.info("Creating read collection")
    num = params.downsample
    if downsample < 1:
        sys.stdout.info("Downsampling:", downsample)
        reads = ReadCollection()
        reads.loadFromFile(reads_file)
        num = int(len(reads) * downsample)
    reads = ReadCollection()
    reads.loadFromFile(reads_file, num, cut_reads)
    return reads

def polishAndAnalyse(self, reads, polishing_base, reliable_start=None):
    # type: (ReadCollection, Contig, Optional[int]) -> Consensus
    if reliable_start is None:
        reliable_start = len(polishing_base)
    seq = Contig(self.polish(reads, polishing_base), "contig")
    res = [0] * (len(seq) + 1)
    alignment = ReadCollection().extendClean(reads)
    self.aligner.alignReadCollection(alignment, [seq])
    contra = 0
    ok = 0
    late = 0
    for read in alignment:
        for al in read.alignmentsTo(seq.asSegment()):  # type: AlignmentPiece
            if al.contradicting(seq.asSegment()):
                contra += 1
            elif al.seg_to.left > reliable_start:
                late += 1
            else:
                res[al.seg_to.left] += 1
                res[al.seg_to.right] -= 1
                ok += 1
    for i in range(1, len(res)):
        res[i] += res[i - 1]
    sys.stdout.trace("Polished and analysed using", len(alignment), "reads. Ok:", ok,
                     "late:", late, "contra:", contra)
    # if contra > 10 or contra > ok / 2:
    #     for read in alignment:
    #         print read
    #         for al in read.alignmentsTo(seq.asSegment()):
    #             if al.contradictingRTC(seq.asSegment()):
    #                 print "contra_al:", al
    #             elif al.seg_to.left > reliable_start:
    #                 print "late_al:", al
    #             else:
    #                 print "ok_al:", al
    return Consensus(seq.seq, res)

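# Illustrative sketch (an assumption, not from the original source): polishAndAnalyse
# returns a Consensus built from the polished sequence plus a per-position read
# coverage profile (the second constructor argument above). Assuming the Consensus
# object exposes that profile as `cov`, low-coverage positions could be inspected
# like this; `polisher` and `_example_check_consensus` are hypothetical names.
def _example_check_consensus(polisher, reads, contig, min_cov=4):
    consensus = polisher.polishAndAnalyse(reads, contig)
    for pos, cov in enumerate(consensus.cov):
        if cov < min_cov:
            sys.stdout.trace("Low coverage at position", pos)
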
def main(ref_file, segment, dir):
    ref = ContigCollection().loadFromFasta(open(ref_file, "r"), False)
    chr1 = ref["chr1"]
    if segment[0] < 0:
        segment = (-segment[0], -segment[1])
        chr1 = chr1.rc
    reads = ReadCollection()
    reads_list = []
    for i in range(segment[0], segment[1], 500):
        read = reads.addNewRead(Segment(chr1, i, i + 500).asNamedSequence())
        reads_list.append(read)
    chr1.seq = chr1.seq[:segment[0]] + "N" * (segment[1] - segment[0]) + chr1.seq[segment[1]:]
    chr1.rc.seq = basic.RC(chr1.seq)
    basic.ensure_dir_existance(dir)
    aligner = Aligner(DirDistributor(dir))
    aligner.alignReadCollection(reads, ref)
    out = sys.stdout
    for read in reads_list:
        # print read
        out.write(str(len(read.alignments)) + " " +
                  str(max([0] + map(lambda al: al.percentIdentity(), read.alignments))) + "\n")
    out.close()

def genAll(self, aligner):
    # type: (Aligner) -> Tuple[NewLineStorage, LineDotPlot, ReadCollection]
    disjointigs = DisjointigCollection()
    for dis in self.disjointigs:
        disjointigs.addNew(dis.seq, dis.id)
    from disjointig_resolve.line_storage import NewLineStorage
    lines = NewLineStorage(disjointigs, aligner)
    lines.name_printer = lambda line: line.id + "_" + self.translateBack(line, aligner)
    for line in self.contigs:
        new_line = lines.addNew(line.seq, line.id)
        new_line.initial.add(
            AlignmentPiece.Identical(
                new_line.asSegment().asContig().asSegment(),
                new_line.asSegment()))
    dp = LineDotPlot(lines, aligner)
    dp.construct(aligner)
    lines.alignDisjointigs()
    reads = ReadCollection()
    for read in self.reads:
        reads.addNewRead(read)
    disjointigs.addAlignments(aligner.localAlign(reads, disjointigs))
    return lines, dp, reads

def main(k, dir, contigs_file, reads_file):
    # type: (int, str, str, str) -> None
    basic.ensure_dir_existance(dir)
    CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    params.k = k
    print "Loading contigs"
    tmp = sorted(ContigStorage().loadFromFasta(open(contigs_file, "r"), False).unique(),
                 key=lambda contig: len(contig))
    cnt = 1
    contigs = ContigStorage()
    for c1, c2 in zip(tmp[::2], tmp[1::2]):
        # if c1.seq == c2.rc.seq:
        contigs.add(Contig(c1.seq, str(cnt)))
        print cnt, c1.id, c2.id
        cnt += 1
        # else:
        #     contigs.add(Contig(c1.seq, str(cnt)))
        #     print cnt, c1.id
        #     cnt += 1
        #     contigs.add(Contig(c2.seq, str(cnt)))
        #     print cnt, c2.id
        #     cnt += 1
    print "Loading reads"
    reads = ReadCollection().loadFromFasta(open(reads_file, "r"))
    print "Aligning reads"
    for al in aligner.localAlign(reads, contigs):
        if len(al) > k:
            read = al.seg_from.contig  # type: AlignedRead
            read.addAlignment(al)
    res = open(os.path.join(dir, "reads.fasta"), "w")
    for read in reads:
        if not basic.isCanonocal(read.id):
            continue
        if len(read.alignments) > 1:
            SeqIO.write(read, res, "fasta")
    res.close()

def polishEnd(self, als, min_cov=4, min_cov_frac=0.7, max_extension=None):
    # type: (List[AlignmentPiece], int, int, int) -> Tuple[Contig, List[AlignmentPiece]]
    if max_extension is None:
        max_extension = 10000000000
    scorer = Scorer()
    contig = als[0].seg_to.contig
    max_len = max_extension + len(contig)
    sys.stdout.trace("Polishing end of", als[0].seg_to.contig)
    new_contig = contig.asSegment().asContig()
    relevant_als = [al.changeTargetContig(new_contig) for al in als if al.rc.seg_to.left < 100]
    finished_als = []
    while True:
        tmp = []
        for al in relevant_als:
            if al.seg_to.inter(new_contig.asSegment().suffix(length=100)) and al.rc.seg_from.left > 100:
                tmp.append(al)
            else:
                finished_als.append(al)
        relevant_als = tmp
        if len(relevant_als) < min_cov:
            break
        start = "ACGTTCGA" + basic.randomSequence(params.flanking_size) + \
                new_contig.asSegment().suffix(length=min(params.flanking_size, len(new_contig))).Seq()
        reduced_read_list = [
            AlignedRead.new(start + al.seg_from.contig.asSegment().suffix(pos=al.seg_from.right).Seq(),
                            str(i) + "_" + al.seg_from.contig.id)
            for i, al in enumerate(relevant_als)]
        reduced_reads = ReadCollection(reduced_read_list)
        found = False
        for base_al in relevant_als:
            if base_al.rc.seg_from.left < params.flanking_size:
                continue
            # The base consists of 500 random nucleotides, the last 500 nucleotides of the
            # polished sequence, and a segment of the read of length at most 500
            base_segment = base_al.seg_from.contig.segment(
                base_al.seg_from.right,
                min(len(base_al.seg_from.contig),
                    base_al.seg_from.right + max(params.window_size, params.k)))
            base = Contig(start + base_segment.Seq(), "base")
            for read in reduced_read_list:
                read.clean()
            polished_base = Contig(self.polish(reduced_reads, base), "polished_base")
            for al in self.aligner.localAlign(reduced_reads, ContigStorage().addAll([polished_base])):
                reduced_reads.reads[al.seg_from.contig.id].addAlignment(al)
            candidate_alignments = []
            for read in reduced_read_list:
                candidate_alignments.append(None)
                for al in read.alignmentsTo(polished_base.asSegment()):
                    if al.seg_to.left == 0 and (candidate_alignments[-1] is None or
                                                candidate_alignments[-1].seg_to.right < al.seg_to.right):
                        candidate_alignments[-1] = al
            trimmedAlignments = []
            for i, al in enumerate(candidate_alignments):
                assert al is not None, reduced_read_list[i]
                trimmedAlignments.append(al.trimByQuality(0.4, 100))
            contra_index = 0
            contra = []
            support = len(trimmedAlignments)
            cutoff_pos = len(start)
            for al in sorted(trimmedAlignments, key=lambda al: al.seg_to.right):
                while contra_index < len(contra) and contra[contra_index].seg_to.right < al.seg_to.right - 50:
                    contra_index += 1
                if support >= min_cov and len(contra) - contra_index <= (1 - min_cov_frac) * support:
                    cutoff_pos = al.seg_to.right
                    support -= 1
                    if al.contradictingRTCRight():
                        contra.append(al)
                else:
                    sys.stdout.trace("Stopped at:", support, contra_index, (1 - min_cov_frac) * support)
                    break
            sys.stdout.trace("Positions:", [al.seg_to.right for al in trimmedAlignments])
            sys.stdout.trace("Contra:", contra)
            if cutoff_pos > len(start) + 100:
                sys.stdout.trace("Chose to use read", base_al.__repr__(), "Extended for",
                                 cutoff_pos - len(start), "Alignments:")
                sys.stdout.trace(map(str, reduced_read_list))
                found = True
                new_contig_candidate = Contig(new_contig.seq + polished_base[len(start):cutoff_pos], "candidate")
                embedding = AlignmentPiece.Identical(
                    polished_base.segment(len(start), cutoff_pos),
                    new_contig_candidate.asSegment().suffix(pos=len(new_contig)))
                read_mappings = []
                for al1, al2 in zip(candidate_alignments, relevant_als):
                    seg_from = al2.seg_from.contig.asSegment().suffix(length=len(al1.seg_from.contig) - len(start))
                    seg_to = al1.seg_from.contig.asSegment().suffix(length=len(al1.seg_from.contig) - len(start))
                    read_mappings.append(AlignmentPiece.Identical(seg_from, seg_to))
                embedded_alignments = []
                for al1, al2 in zip(candidate_alignments, read_mappings):
                    if al1.seg_to.right <= len(start) + 10:
                        embedded_alignments.append(None)
                    else:
                        tmp = al2.compose(al1)
                        if tmp.seg_to.left > embedding.seg_from.right - 10:
                            embedded_alignments.append(None)
                        else:
                            embedded_alignments.append(tmp.compose(embedding))
                corrected_relevant_alignments = [
                    al.targetAsSegment(new_contig_candidate.asSegment().prefix(len(new_contig)))
                    for al in relevant_als]
                relevant_als = []
                for al1, al2 in zip(corrected_relevant_alignments, embedded_alignments):
                    if al2 is None:
                        al = al1
                    else:
                        al = al1.mergeDistant(al2)
                        if al is None:
                            al = al1
                        elif al1.seg_from.dist(al2.seg_from) >= 10 or al1.seg_to.dist(al2.seg_to) >= 10:
                            al = scorer.polyshAlignment(al, params.alignment_correction_radius)
                    relevant_als.append(al)
                finished_als = [
                    al.targetAsSegment(new_contig_candidate.asSegment().prefix(len(new_contig)))
                    for al in finished_als]
                new_contig = new_contig_candidate
                break
            else:
                sys.stdout.trace("Could not prolong with read", base_al, "Alignments:")
                sys.stdout.trace(map(str, reduced_read_list))
        if len(new_contig) >= max_len:
            break
        if not found:
            break
    return new_contig, relevant_als + finished_als

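# Illustrative sketch (not in the original file): polishEnd consumes alignments of
# reads to the end of a contig and returns the extended contig together with the
# corresponding updated alignments. Assuming `polisher` is the object the method is
# bound to and `als` was collected from aligner output as in the loops above; the
# helper name and the max_extension value are placeholders.
def _example_extend_contig(polisher, als):
    old_contig = als[0].seg_to.contig
    new_contig, new_als = polisher.polishEnd(als, min_cov=4, min_cov_frac=0.7, max_extension=20000)
    sys.stdout.trace("Extended", old_contig.id, "by", len(new_contig) - len(old_contig),
                     "bp using", len(new_als), "alignments")
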
def main(argv):
    sys.stdout.write("Started\n")
    dot_file = argv[1]
    edge_sequences = argv[2]
    reference_file = argv[3]
    alignment_file = argv[4]
    edges = ParseVertices(argv[5])
    output_file = argv[6]
    sys.stdout.write("Loading dot\n")
    dot = DotParser(open(dot_file, "r")).parse()
    edge_collection = ContigCollection().loadFromFasta(open(edge_sequences, "r"), True)
    graph = Graph().loadFromDot(edge_collection, dot)
    vertices = [graph.E[id].start.id for id in edges]
    graph.printToFile(sys.stdout)
    print vertices
    ref = ContigCollection().loadFromFasta(open(reference_file, "r"), False)
    print "Looking for relevant"
    # NOTE: `dist` is not defined in this function; it is assumed to be a module-level
    # constant bounding the graph search distance.
    pq = PriorityQueue()
    for v in graph.V.values():
        if v.id in vertices:
            pq.push((0, v))
    visited = []
    while not pq.empty():
        d, v = pq.pop()
        if v in visited:
            continue
        visited.append(v)
        for e in v.inc:
            print e.id, e.start.id, e.end.id
            if d + len(e) < dist:
                pq.push((d + len(e), e.start))
        for e in v.out:
            print e.id, e.start.id, e.end.id
            if d + len(e) < dist:
                pq.push((d + len(e), e.end))
    print "Visited", len(visited)
    print map(str, list(visited))
    relevant = []
    edge_alignments = ReadCollection().loadFromFasta(open(edge_sequences, "r")).addAllRC()
    for edge in graph.E.values():
        if edge.start in visited or edge.start.rc in visited:
            relevant.append(edge_alignments[edge.id])
    print "Loading sam"
    edge_alignments.fillFromSam(Samfile(open(alignment_file, "r")), ref)
    for rel in relevant:
        print rel.__str__()
    print "Collecting segments"
    segments = []
    chr1 = ref["chr1"]
    for edge in relevant:
        for al in edge.alignments:
            print al
            if al.seg_from.inter(edge.prefix(dist)):
                l = dist - al.seg_from.left
                contig = al.seg_to.contig
                start = al.seg_to.left
                segments.append(Segment(contig, start, min(start + l, len(contig))))
                print segments[-1]
    tmp = []
    print "Rotating"
    for seg in segments:
        if seg.contig != chr1:
            seg = seg.RC()
        if seg.contig != chr1:
            print "WARNING", seg
        tmp.append(seg)
    segments = sorted(tmp, key=lambda seg: seg.left)
    print "All relevant segments"
    print "\n".join(map(str, segments))
    cur_seg = None
    interesting_segments = []
    print "Gluing"
    for seg in segments:
        if cur_seg is None:
            cur_seg = seg.copy()
            continue
        if cur_seg.right + 20000 < seg.left:
            interesting_segments.append(cur_seg.copy())
            cur_seg = seg.copy()
        else:
            cur_seg.right = max(cur_seg.right, seg.right)
    if cur_seg is not None:
        interesting_segments.append(cur_seg.copy())
    alignments = []
    for edge in edge_alignments:
        for al in edge.alignments:
            for seg in interesting_segments:
                if al.seg_to.inter(seg):
                    alignments.append(al)
    alignments = sorted(alignments, key=lambda al: al.seg_to.left)
    print "All relevant alignments"
    print "\n".join(map(str, alignments))
    print "Interesting segments:", len(interesting_segments), sum(map(len, interesting_segments))
    for seg in interesting_segments:
        print seg
    f = open(output_file, "w")
    tmp = []
    for seg in interesting_segments:
        SeqIO.write(SeqIO.SeqRecord(seg.Seq(), seg.__str__()), f, "fasta")
        tmp.append(seg.Seq())
    f.close()
    f1 = open(output_file + "1", "w")
    SeqIO.write(SeqIO.SeqRecord(("N" * 20000).join(tmp), "concat"), f1, "fasta")

        return self.filterLocal(als)


if __name__ == "__main__":
    dir = sys.argv[1]
    query = sys.argv[2]
    target = sys.argv[3]
    extra_params = sys.argv[4:]
    contra = "contra" in extra_params
    over = "over" in extra_params
    long = "long" in extra_params
    start = "start" in extra_params
    forward = "forward" in extra_params
    aln = Aligner(DirDistributor(dir))
    basic.CreateLog(dir)
    contigs = ContigCollection().loadFromFasta(open(target, "r"), False)
    for al in aln.localAlign(ReadCollection().loadFromFile(query), contigs):
        if start:
            if al.seg_to.contig.id.startswith("-"):
                al = al.rc
            if al.seg_to.left > 50:
                continue
        if over and al.contradictingRTC():
            continue
        if forward:
            if al.seg_to.contig.id.startswith("-"):
                al = al.rc
        if contra and (len(al) < 8000 or not al.contradictingRTC()):
            continue
        if long and len(al) < 5000:
            continue
        sys.stdout.write(str(len(al)) + " ")

def cutReads(rf, cut_len):
    # type: (str, int) -> None
    rc = ReadCollection().loadFromFasta(open(rf, "r"), None, cut_len)
    rc.print_fasta(sys.stdout)

def main(model_file, k, dir, contigs_file, reads_file):
    # type: (str, int, str, str, str) -> None
    basic.ensure_dir_existance(dir)
    CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    params.scores = ComplexScores()
    params.scores.load(open(model_file, "r"))
    params.k = k
    print "Loading contigs"
    tmp = sorted(ContigStorage().loadFromFasta(open(contigs_file, "r"), False).unique(),
                 key=lambda contig: len(contig))
    cnt = 1
    contigs = ContigStorage()
    for c1, c2 in zip(tmp[::2], tmp[1::2]):
        # if c1.seq == c2.rc.seq:
        contigs.add(Contig(c1.seq, str(cnt)))
        print cnt, c1.id, c2.id
        cnt += 1
        # else:
        #     contigs.add(Contig(c1.seq, str(cnt)))
        #     print cnt, c1.id
        #     cnt += 1
        #     contigs.add(Contig(c2.seq, str(cnt)))
        #     print cnt, c2.id
        #     cnt += 1
    print "Loading reads"
    reads = ReadCollection().loadFromFasta(open(reads_file, "r"))
    print "Aligning reads"
    for al in aligner.localAlign(reads, contigs):
        if len(al) > k:
            read = al.seg_from.contig  # type: AlignedRead
            read.addAlignment(al)
    for read in reads:
        if not basic.isCanonocal(read.id):
            continue
        cnt = 0
        al0 = None
        others = []
        for al in read.alignments:
            if not al.contradictingRTC():
                cnt += 1
                al0 = al
            else:
                others.append(al)
        if cnt != 1 or len(others) == 0:
            continue
        print al0
        print others
        seg = al0.seg_from
        for al in others:
            if al.seg_from.interSize(seg) < k:
                seg = None
                break
            else:
                seg = al.seg_from.cap(seg)
        print seg
        if seg is None:
            continue
        al0 = al0.reduce(query=seg)
        others = [al.reduce(query=seg) for al in others]
        scorer = Scorer(params.scores)
        for al in others:
            a, b, c = scorer.scoreCommon(al0, al)
            print "win", a, b, c, len(seg)
        if len(seg) > 1000:
            for i in range(len(seg) / 1000):
                seg1 = seg.prefix(length=i * 1000 + 1000).suffix(length=1000)
                for al in others:
                    a, b, c = scorer.scoreCommon(al0.reduce(query=seg1), al.reduce(query=seg1))
                    print "win1000", a, b, c, len(seg1)
        for al1 in others:
            for al2 in others:
                if al1 == al2:
                    continue
                a, b, c = scorer.scoreCommon(al1, al2)
                print "draw", a, b, c, len(seg)