def analyseSegments(self, segs):
    # type: (List[Segment]) -> None
    """Collect coverage histograms of the given segments and store them in self.recs.

    Segments longer than 5000 are turned into contigs, self.reads are
    overlap-aligned to them, and for each of 100 thresholds
    (500, 600, ..., 10400) a histogram "coverage -> total length" is
    accumulated from self.covSegments.
    """
    long_contigs = ContigStorage()
    long_contigs.addAll([seg.asContig() for seg in segs if len(seg) > 5000])

    # Target segments of all alignments, flipped so that they lie on the
    # canonical contig.
    targets = []  # type: List[Segment]
    for al in self.aligner.overlapAlign(self.reads, long_contigs):
        target = al.seg_to
        if not basic.isCanonocal(target.contig.id):
            target = target.RC()
        targets.append(target)
    # groupby below needs the segments of one contig to be adjacent.
    targets.sort(key=lambda seg: (seg.contig.id, seg.left))

    covs = [[0] * params.maxCoverageThreshold for _ in range(100)]
    for contig, group in itertools.groupby(targets, key=lambda seg: seg.contig):
        contig_segs = list(group)
        shrink = contig.asSegment().shrink(1000)
        # Skip contigs that have any stretch with coverage below 3.
        low_covered = False
        for cov, _ in self.covSegments(shrink, contig_segs, 1):
            low_covered = low_covered or cov < 3
        if low_covered:
            continue
        for i, histogram in enumerate(covs):
            k = 500 + i * 100
            last_bin = len(histogram) - 1
            for cov, slen in self.covSegments(shrink, contig_segs, k):
                # Coverages beyond the table size are pooled in the last bin.
                histogram[min(cov, last_bin)] += slen

    # NOTE(review): the filter compares a list with an int. Under CPython 2
    # this is always True, so every histogram is kept; the intended
    # condition is not visible here and is preserved as written.
    self.recs = [CoverageAnalyser.CoverageRecord(500 + i * 100, histogram)
                 for i, histogram in enumerate(covs) if histogram > 1000]
def __init__(self, disjointigs, aligner):
    # type: (DisjointigCollection, Aligner) -> None
    """Create an empty line storage bound to a disjointig collection and an aligner."""
    ContigStorage.__init__(self, [], True)
    self.aligner = aligner
    self.disjointigs = disjointigs
    # Optional callable; left unset until a caller assigns one.
    self.name_printer = None
    self.listeners = []  # type: List[LineStorageListener]
    self.items = {}  # type: Dict[str, NewLine]
    self.cnt = 1
def alsToReads(als):
    # type: (List[AlignmentPiece]) -> ContigStorage
    """Return a ContigStorage with the query contig of every alignment, each id added once."""
    seen_ids = set()
    storage = ContigStorage()
    for al in als:
        read = al.seg_from.contig
        if read.id not in seen_ids:
            seen_ids.add(read.id)
            storage.add(read)
    return storage
def printAlignments(sam_handler, reference_handler, reads_handler):
    """Print every alignment of a SAM file together with its gapped strings.

    Loads reference and query sequences from the two FASTA handles, converts
    each SAM record whose query and target are both known into an
    AlignmentPiece, and prints the alignments sorted by target start. For
    alignments that splitRead() breaks into more than one piece, a marker
    line is built showing where each piece starts ("[") and ends ("]").
    Finally prints the total number of "-" characters in the two strings.
    """
    print "Loading reference"
    cc = ContigStorage(add_rc=False).loadFromFasta(reference_handler, False)
    print "Loading query"
    reads = ContigStorage().loadFromFasta(reads_handler, False)
    print "Loading result"
    res = []
    for rec in sam_parser.Samfile(sam_handler):
        if rec.query_name in reads.items and cc[rec.tname] is not None:
            al = AlignmentPiece.FromSamRecord(reads[rec.query_name], cc[rec.tname], rec)
            if al is None:
                # Record could not be converted; report the pair and skip it.
                print rec.query_name, rec.tname
                continue
            if al.seg_to.contig not in cc:
                # Keep the orientation whose target contig is stored in cc.
                al = al.rc
            res.append(al)
    print "Printing result", len(res)
    res = sorted(res, key = lambda al: al.seg_to.left)
    # res = sorted(res, key = lambda al: len(al))[::-1]
    up = 0    # total "-" count in the first matching string
    down = 0  # total "-" count in the second matching string
    for al in res:
        print al
        print list(al.splitRead())
        s1, s2 = al.asMatchingStrings()
        up += s1.count("-")
        down += s2.count("-")
        s = []
        if len(list(al.splitRead())) > 1:
            # nums holds, for every piece, its first and last query position,
            # so even indices are piece starts and odd indices are piece ends.
            nums = []
            for al1 in al.splitRead():
                nums.append(al1.seg_from.left)
                nums.append(al1.seg_from.right - 1)
            cur_num = 0
            cur = al.seg_from.left
            for c in s1:
                if cur == nums[cur_num] and c != "-":
                    if cur_num % 2 == 0:
                        s.append("[")
                    else:
                        s.append("]")
                    cur_num += 1
                else:
                    # "-" outside a piece, "+" inside one.
                    if cur_num % 2 == 0:
                        s.append("-")
                    else:
                        s.append("+")
                # Only non-gap characters advance the query position.
                if c != "-":
                    cur += 1
        # NOTE(review): printed for every alignment (empty line when the
        # alignment is not split); nesting inferred from `s` being
        # initialised at loop level - confirm against the original layout.
        print "".join(s)
        print s1
        print s2
    print up, down
def main(args): dir = args[4] basic.ensure_dir_existance(dir) CreateLog(dir) sys.stdout.info("Starting graph-free recruitment") print " ".join(args) sys.stdout.info("Loading repeat sequences") seqs = ContigStorage().loadFromFasta(open(args[1], "r"), False) sys.stdout.info("Loading reads") reads = ContigStorage().loadFromFasta(open(args[2], "r"), False) k = int(args[3]) recruit(seqs, reads, k, dir) sys.stdout.info("Finised graph-free recruitment")
def LoadLineCollection(dir, lc_file, aligner, contigs, disjointigs, reads, polisher):
    # type: (str, str, Aligner, ContigStorage, DisjointigCollection, ReadCollection, Polisher) -> NewLineStorage
    """Build the initial line storage from an init file.

    The init file starts with the number of lines; each entry is a contig id
    followed by the ids of reads to align to that line. Lines shorter than
    params.k + 200 are extended to the right with polisher.polishEnd. The
    resulting lines are written to <dir>/initial_lines.fasta.
    """
    sys.stdout.info("Initializing lines from init file", lc_file)
    lines = NewLineStorage(disjointigs, aligner)
    # The init file is closed deterministically once all entries are read.
    with open(lc_file, "r") as lc_handler:
        f = TokenReader(lc_handler)
        n = f.readInt()
        for i in range(n):
            id = f.readToken()
            contig = contigs[id]
            assert contig.id == id
            line = lines.addNew(contig.seq, contig.id)
            read_ids = f.readTokens()
            for al in aligner.overlapAlign([reads[rid] for rid in read_ids], ContigStorage([line])):
                if len(al.seg_to) >= min(params.k, len(line) - 100):
                    # The alignment may target either orientation of the line.
                    tmp_line = al.seg_to.contig  # type: NewLine
                    tmp_line.addReadAlignment(al)
            if len(line) < params.k + 200:
                new_contig, new_als = polisher.polishEnd(
                    list(line.read_alignments),
                    max_extension=params.k + 100 - len(line))
                line.extendRight(new_contig.suffix(pos=len(line)).Seq(), new_als)
            line.correct_segments.add(line.asSegment().shrink(100))
            line.completely_resolved.add(line.asSegment().shrink(100))
            line.initial.add(AlignmentPiece.Identical(line.asSegment().asContig().asSegment(), line.asSegment()))
    sys.stdout.trace("Final list of lines:")
    for line in lines.unique():
        sys.stdout.trace(line, line.completely_resolved)
    # Close (and thereby flush) the output file explicitly.
    with open(os.path.join(dir, "initial_lines.fasta"), "w") as fasta_handler:
        lines.writeToFasta(fasta_handler)
    lines.alignDisjointigs()
    sys.stdout.info("Constructing line dot plot")
    return lines
def alignAndFilter(self, reads, ref_storage, mode):
    # type: (Iterable[Contig], ContigStorage, str) -> Generator[AlignmentPiece]
    """Align reads to references and yield the alignments that pass the filter of `mode`.

    Records are processed in groups of consecutive records with the same
    query name; each group is passed through self.filters[mode] as a whole.
    """
    als_filter = self.filters[mode]
    read_storage = ContigStorage(reads, False)
    pending = []  # alignments of the read currently being collected
    for rec in self.align(read_storage, list(ref_storage.unique()), mode):
        if rec.is_unmapped:
            continue
        if pending and rec.query_name != pending[0].seg_from.contig.id:
            # A new read started: flush the previous group.
            for al in list(als_filter(pending)):
                yield al
            pending = []
        if pending:
            seq_from = pending[0].seg_from.contig
        else:
            seq_from = read_storage[rec.query_name]
        seq_to = ref_storage[rec.tname]
        piece = AlignmentPiece.FromSamRecord(seq_from, seq_to, rec)
        if piece is None:
            continue
        if mode == "dotplot":
            pending.extend(piece.splitRef())
        elif mode in ("local", "ava-pb"):
            pending.extend(piece.splitRead())
        else:
            pending.append(piece)
    if pending:
        for al in list(als_filter(pending)):
            yield al
def getRelevantAlignments(self, seg, min_overlap):
    # type: (Segment, int) -> Generator[AlignmentPiece]
    """Yield local alignments of related reads that overlap `seg` sufficiently.

    Reads are gathered through self.als for every read already aligned to
    the segment, realigned to the segment's line, and kept when they
    intersect `seg` by more than min_overlap and are longer than params.k.
    """
    sys.stdout.trace("Requesting read alignments for", seg, " using palignments")
    line = seg.contig  # type: NewLine
    relevant_reads = line.read_alignments.allInter(seg, min_overlap)
    sys.stdout.trace("Using reads ", relevant_reads)
    reads = ContigStorage()
    for base_read_al in relevant_reads:
        for read in self.als.getAlignments(base_read_al.seg_from.contig.id, params.k):
            reads.add(read)
    cnt = 0
    for al in self.aligner.localAlign(reads, ContigStorage([seg.contig])):
        if not al.seg_to.interSize(seg) > min_overlap:
            continue
        if not al.__len__() > params.k:
            continue
        yield al
        cnt += 1
    sys.stdout.trace("Request for read alignments for", seg, "yielded", cnt, "alignments")
def printSegs(f, segs):
    """Write the requested segments of a FASTA file to stdout in FASTA format.

    f is the path of the FASTA file. Each element of segs is a mutable
    [contig_id, left, right] triple; a right bound of 0 is replaced in place
    by the contig length.
    """
    # Close the input file once loaded. Assumes loadFromFasta consumes the
    # handle before returning - TODO confirm.
    with open(f, "r") as handler:
        c = ContigStorage().loadFromFasta(handler, False)
    for seg in segs:
        if seg[2] == 0:
            seg[2] = len(c[seg[0]])
        SeqIO.write(c[seg[0]].segment(seg[1], seg[2]).asContig(), sys.stdout, "fasta")
def checkAlignments(self, seg, als): # type: (Segment,List[AlignmentPiece]) -> None rids = set([al.seg_from.contig.id for al in als]) for al in self.aligner.localAlign(self.reads, ContigStorage([seg.contig])): if al.seg_to.interSize( seg) > params.k and al.seg_from.contig.id not in rids: print "Missing alignment", al
def draw(contigs_file, output_dir, k): aligner = Aligner(DirDistributor(os.path.join(output_dir, "alignments"))) CreateLog(output_dir) print "Reading contigs" tmp = sorted(SeqIO.parse_fasta(open(contigs_file, "r")), key=lambda contig: len(contig)) lens = map(len, tmp)[::-1] print lens contigs = ContigStorage() if lens[1::2] == lens[0::2]: tmp = tmp[0::2] print "Removed extra contigs" for i, contig in enumerate(tmp): print i, contig contigs.add(Contig(contig.seq, str(i))) print "Constructing components" componenets = ExtractRepeatComponents(contigs, aligner, k) print "Components:" for comp in componenets: print comp.segments print comp.alignments for cnt, comp in enumerate(componenets): print "Processing component", cnt print comp.segments # print comp.alignments print "Forming blocks" Block.id_cnt = 0 blocks = CreateBlocks(comp) if len(blocks) == 1: print "Skipping trivial repeat" continue for block in blocks: print "Block", block.id, ":", block.segs for block in blocks: for other in block.out: print block.id, "->", other.id print "Placing blocks on X axis" code = placeX(blocks) if code == 1: print "WARNING: component", cnt, "contains cycle. Aborting visualization." continue print "Placing blocks on Y axis" placeY(blocks, comp.segments) print "Printing figure" SimplePrinter().printBlocks(blocks, sys.stdout) print "Finished printing figure"
def splitSegKmeans(aligner, seg, mult, all_reads_list): polisher = Polisher(aligner, aligner.dir_distributor) all_reads = ContigStorage() base = seg.asContig() tmp = [] rtv = readsToVectors(aligner, all_reads_list, base) kmeans = KMeans(n_clusters=mult, precompute_distances=True) recs = list(rtv.values()) result = kmeans.fit_predict(X=[rec.v for rec in recs]) print result clusters = dict() for i, c in enumerate(result): if c not in clusters: clusters[c] = [] clusters[c].append(recs[i].al) for c in clusters.values(): print str(c), ":", len(c) split_contigs = [] split_reads = [] for c in clusters.values(): split_contigs.append( Contig( polisher.polishSmallSegment(base.asSegment(), c).seg_from.Seq(), str(len(split_contigs)))) split_reads.append([al.seg_from.contig for al in c]) maxpi = 1 for i in range(mult): for j in range(mult): if i == j: sys.stdout.write("1.0 ") continue al = aligner.overlapAlign([split_contigs[i]], ContigStorage([split_contigs[j] ])).next() sys.stdout.write(str(al.percentIdentity()) + " ") maxpi = max(maxpi, al.percentIdentity()) print "" print "Maxpi:", maxpi if maxpi < 0.985: return zip(split_contigs, split_reads) else: return None
def polishSmallSegment(self, seg, als):
    # type: (Segment, List[AlignmentPiece]) -> AlignmentPiece
    """Polish a segment with the given read alignments.

    Returns an alignment from the polished sequence to `seg`. If no
    alignment covers the whole segment, an identity alignment of a copy of
    the segment is returned instead. Fails with an assertion when the
    polished sequence cannot be aligned back end-to-end.
    """
    ok = False
    for al in als:
        if al.seg_to.contains(seg):
            ok = True
    if not ok:
        sys.stdout.log(common.log_params.LogPriority.warning, "Warning", seg, "has no covering reads")
        return AlignmentPiece.Identical(seg.asContig().asSegment(), seg)
    reads = []
    # Random 200-letter flanks are attached to both ends of the base and to
    # every read piece that reaches within 20 of the corresponding end.
    start = basic.randomSequence(200)
    end = basic.randomSequence(200)
    for al in als:
        new_seq = ""
        al = al.reduce(target=seg)
        if al.seg_to.left < seg.left + 20:
            new_seq += start
        new_seq += al.seg_from.Seq()
        if al.seg_to.right > seg.right - 20:
            new_seq += end
        reads.append(NamedSequence(new_seq, al.seg_from.contig.id))
    base = Contig(start + seg.Seq() + end, "base")
    polished = None
    try:
        polished = Contig(self.polish(reads, base), "polished")
    except PolishException:
        sys.stdout.log(
            common.log_params.LogPriority.warning, "Warning", seg,
            "has a sequence very different from reads. Using reads to correct."
        )
        # Fallback: use each read that covers the whole segment as the base,
        # stopping at the first one that polishes successfully.
        for al, read in zip(als, reads):
            if al.seg_to.contains(seg):
                try:
                    polished = Contig(
                        self.polish(reads, Contig(read.seq, read.id)),
                        "polished")
                    break
                except PolishException:
                    pass
    if polished is None:
        sys.stdout.log(
            common.log_params.LogPriority.warning, "Warning", seg,
            "could not be corrected even though some reads cover it.")
        # NOTE(review): this fallback contig has no flanks, unlike `base`.
        polished = seg.asContig()
    als = list(self.aligner.overlapAlign([polished], ContigStorage([base])))
    for al in als:
        # Accept only alignments that start within 10 of both query ends.
        if al.seg_from.left < 10 and al.rc.seg_from.left < 10:
            # Map the un-flanked part of `base` back onto the original segment.
            mapping = AlignmentPiece.Identical(
                base.segment(len(start),
                             len(base) - len(end)), seg)
            return al.compose(mapping)
    assert False, "No alignment from polished to base: " + str(als)
def readsToVectors(aligner, reads_list, base): als = [] rtv = dict() polisher = Polisher(aligner, aligner.dir_distributor) for al in fixAlDir(aligner.overlapAlign(reads_list, ContigStorage([base])), base): if len(al.seg_to) < len(base) - 100: continue else: als.append(al) rtv[al.seg_from.contig.id] = ReadRecord(al).extend(toVector(al)) reads_list = [al.seg_from.contig for al in als] bases = [base] for base_al1, base_al2, base_al3 in zip(als[0::3], als[1::3], als[2::3]): base_candidate = Contig( polisher.polishSmallSegment( base.asSegment(), [base_al1, base_al2, base_al3]).seg_from.Seq(), str(len(bases))) rtr_als = [] read_ids = set() # base_candidate = base_al.seg_from.asContig() for al in fixAlDir( aligner.overlapAlign(reads_list, ContigStorage([base_candidate])), base_candidate): if len(al.seg_to) < len(base_candidate) - 100: continue else: rtr_als.append(al) read_ids.add(al.seg_from.contig.id) if len(read_ids) == len(als): bases.append(base_candidate) for al in rtr_als: rtv[al.seg_from.contig.id].extend(toVector(al)) if len(bases) > 10: break for rec in rtv.values(): print rec.read.id, len(rec.v), rec.v return rtv
def main(k, dir, contigs_file, reads_file): # type: (int, str, str, str) -> None basic.ensure_dir_existance(dir) CreateLog(dir) dd = DirDistributor(os.path.join(dir, "alignments")) aligner = Aligner(dd) params.k = k print "Loading contigs" tmp = sorted(ContigStorage().loadFromFasta(open(contigs_file, "r"), False).unique(), key=lambda contig: len(contig)) cnt = 1 contigs = ContigStorage() for c1, c2 in zip(tmp[::2], tmp[1::2]): # if c1.seq == c2.rc.seq: contigs.add(Contig(c1.seq, str(cnt))) print cnt, c1.id, c2.id cnt += 1 # else: # contigs.add(Contig(c1.seq, str(cnt))) # print cnt, c1.id # cnt += 1 # contigs.add(Contig(c2.seq, str(cnt))) # print cnt, c2.id # cnt += 1 print "Loading reads" reads = ReadCollection().loadFromFasta(open(reads_file, "r")) print "Aligning reads" for al in aligner.localAlign(reads, contigs): if len(al) > k: read = al.seg_from.contig # type:AlignedRead read.addAlignment(al) res = open(os.path.join(dir, "reads.fasta"), "w") for read in reads: if not basic.isCanonocal(read.id): continue if len(read.alignments) > 1: SeqIO.write(read, res, "fasta") res.close()
def LoadLineCollection(dir, lc_file, aligner, contigs, disjointigs, reads, polisher):
    # type: (str, str, Aligner, ContigStorage, DisjointigCollection, ReadCollection, Polisher) -> NewLineStorage
    """Build the initial line storage from an init file.

    The init file starts with the number of lines; each entry is a contig id
    followed by the ids of reads to align to that line (all reads are
    realigned when the list is empty). For every read only its best
    alignment is attached to the line. Short lines are extended to the right
    with polisher.polishEnd. The lines are written to
    <dir>/initial_lines.fasta.
    """
    sys.stdout.info("Initializing lines from init file", lc_file)
    lines = NewLineStorage(disjointigs, aligner)
    # The init file is closed deterministically once all entries are read.
    with open(lc_file, "r") as lc_handler:
        f = TokenReader(lc_handler)
        n = f.readInt()
        for i in range(n):
            id = f.readToken()
            contig = contigs[id]
            assert contig.id == id
            line = lines.addNew(contig.seq, contig.id)
            read_ids = f.readTokens()
            als = []
            line_reads = [reads[rid] for rid in read_ids]
            if len(line_reads) == 0:
                sys.stdout.warn("No read alignments in initialization for line", line.id, "Realigning all reads")
                line_reads = reads
            for al in aligner.overlapAlign(line_reads, ContigStorage([line])):
                if len(al.seg_to) >= min(1500, len(line) - 100):
                    als.append(al)
            # Order by read, then by identity (in whole percent, descending),
            # then by length (descending), so the first alignment of every
            # group is the best one for that read.
            als = sorted(als, key=lambda al: (al.seg_from.contig.id,
                                              -int(al.percentIdentity() * 100),
                                              -len(al)))
            for key, read_als in itertools.groupby(als, key=lambda al: al.seg_from.contig.id):
                al = list(read_als)[0]
                # The alignment may target either orientation of the line.
                tmp_line = al.seg_to.contig  # type: NewLine
                tmp_line.addReadAlignment(al)
            # Computed before the extension so that it refers to the original
            # line length.
            correct_seg = line.asSegment().shrink(100)
            if len(line) < params.k + 200:
                new_contig, new_als = polisher.polishEnd(
                    list(line.read_alignments),
                    max_extension=params.k + 100 - len(line))
                line.extendRight(new_contig.suffix(pos=len(line)).Seq(), new_als)
            if len(correct_seg) < params.k:
                correct_seg = correct_seg.expandRight(params.k - len(correct_seg))
            line.correct_segments.add(correct_seg)
            line.completely_resolved.add(correct_seg)
            line.initial.add(
                AlignmentPiece.Identical(line.asSegment().asContig().asSegment(),
                                         line.asSegment()))
    sys.stdout.trace("Final list of lines:")
    for line in lines.unique():
        sys.stdout.trace(line, line.completely_resolved)
    # Close (and thereby flush) the output file explicitly.
    with open(os.path.join(dir, "initial_lines.fasta"), "w") as fasta_handler:
        lines.writeToFasta(fasta_handler)
    lines.alignDisjointigs()
    sys.stdout.info("Constructing line dot plot")
    return lines
def __init__(self, genome="", letter_size=550, error_rate=0.05, mutation_rate=0.005, seed=0):
    """Create the letter alphabet and translate `genome` into a contig.

    Every lowercase letter gets a random sequence of letter_size
    nucleotides; the matching uppercase letter is a mutated copy of it.
    self.matches stores the position pairs returned by mutate for the
    lowercase letter and the swapped pairs for the uppercase one.
    """
    random.seed(seed)
    self.letter_size = letter_size
    self.error_rate = error_rate
    self.mutation_rate = mutation_rate
    self.reads = []  # type: List[NamedSequence]
    self.disjointigs = []  # type: List[NamedSequence]
    self.contigs = []  # type: List[NamedSequence]
    self.alphabet = ContigStorage()
    self.matches = {}
    for lower, upper in zip(ascii_lowercase, ascii_uppercase):
        original = self.generate(self.letter_size)
        self.alphabet.add(Contig(original, lower))
        mutated, pairs = self.mutate(original, self.mutation_rate)
        self.alphabet.add(Contig(mutated, upper))
        self.matches[lower] = pairs
        self.matches[upper] = [(b, a) for a, b in pairs]
    self.genome = Contig(self.translate(genome), genome)
def collectRecords(self, corrected):
    # type: (List[Segment]) -> List[LineExtender.Record]
    """Create one record per completely resolved segment affected by `corrected`.

    Each corrected segment is expanded to the left by params.k. For every
    dot plot alignment intersecting it, the resolved segments of the
    aligned line are visited; reads relevant for a focus region are
    realigned to the line, and the alignments that overlap the focus by at
    least params.k - 100 are passed to self.createRecord.
    """
    sys.stdout.trace("Collecting records", corrected)
    read_bounds = dict()
    records = dict() # type: Dict[Segment, LineExtender.Record]
    good_reads = set()
    for seg in corrected:
        sys.stdout.trace("Oppa initial:", seg)
        seg = seg.expandLeft(params.k)
        sys.stdout.trace("Alignments relevant for", seg,
                         list(self.dot_plot.allInter(seg)))
        for al in self.dot_plot.allInter(seg):
            seg1 = al.matchingSequence().mapSegUp(al.seg_from.contig, seg)
            line = al.seg_from.contig # type:NewLine
            # NOTE: `al` is rebound by the inner loops below; everything that
            # depends on the dot plot alignment is taken from it on these lines.
            for seg_correct in line.correct_segments.allInter(al.seg_from):
                for seg_resolved in line.completely_resolved.allInter(
                        seg_correct):
                    # Each resolved segment gets at most one record.
                    if seg_resolved in records:
                        continue
                    # next_start: start of the following resolved segment,
                    # or the line length when there is none.
                    if seg_resolved.right == len(line):
                        next_start = len(line)
                    else:
                        next = line.completely_resolved.find(
                            line.asSegment().suffix(
                                pos=seg_resolved.right), 1)
                        if next is None:
                            next_start = len(line)
                        else:
                            next_start = next.left
                    next_start = min(next_start, len(line) - 200)
                    focus = line.segment(
                        max(seg_resolved.left,
                            min(seg_resolved.right - params.k, seg1.left)),
                        min(seg_correct.right, next_start + params.k))
                    als = list(line.getRelevantAlignmentsFor(focus))
                    reads = ContigStorage()
                    for al in als:
                        reads.add(al.seg_from.contig)
                    # Realign the collected reads to the line.
                    als = list(
                        self.aligner.localAlign(reads.unique(),
                                                ContigStorage([line])))
                    final_als = []
                    sys.stdout.trace("Focus:", focus, seg_resolved)
                    sys.stdout.trace(als)
                    for al in als:
                        # Bring alignments to the reverse-complement line
                        # back onto `line`.
                        if al.seg_to.contig == line.rc:
                            al = al.rc
                        if al.seg_to.interSize(focus) >= params.k - 100:
                            final_als.append(al)
                    sys.stdout.trace(final_als)
                    sys.stdout.trace("Finished realignment of reads")
                    records[seg_resolved] = self.createRecord(
                        seg_resolved, next_start, seg_correct, final_als,
                        good_reads, read_bounds)
    records = list(records.values()) # type: List[LineExtender.Record]
    return records
def ExtendShortContigs(contigs, reads, aligner, polisher, read_dump):
    # type: (ContigStorage, ReadCollection, Aligner, Polisher, str) -> None
    """Extend contigs shorter than params.k + 500 in both directions, in place.

    Alignments are taken from the read dump when read_dump is not None,
    otherwise all reads are realigned. Contigs that are still not longer
    than params.k + 500 after extension are removed from `contigs`.
    """
    sys.stdout.info("Extending short lines")
    short_contigs = ContigStorage()
    als = dict() # type: Dict[str, List[AlignmentPiece]]
    for contig in contigs.unique():
        if len(contig) < params.k + 500:
            short_contigs.add(contig)
            # Alignment lists are kept for both orientations, index-aligned.
            als[contig.id] = []
            als[contig.rc.id] = []

    if read_dump is not None:
        sys.stdout.trace("Using flye read dump file to extend short contigs")
        relevant_reads = RelevantReadsFromDump(read_dump, short_contigs, reads)
        # NOTE(review): iterates short_contigs directly, not .unique() as in
        # the other loops - confirm that each orientation is not visited twice.
        for contig in short_contigs:
            for al in aligner.overlapAlign(relevant_reads[contig.id], ContigStorage([contig])):
                als[al.seg_to.contig.id].append(al)
                als[al.seg_to.contig.rc.id].append(al.rc)
    else:
        sys.stdout.trace("Realigning all reads to extend short contigs")
        for al in aligner.overlapAlign(reads, short_contigs):
            # Keep only alignments starting within 20 of both target ends.
            if al.seg_to.left <= 20 and al.rc.seg_to.left <= 20:
                added = False
                # One alignment per read: replace the stored one if the new
                # alignment has a higher percent identity.
                for i, al1 in enumerate(als[al.seg_to.contig.id]):
                    if al1.seg_from.contig.id == al.seg_from.contig.id:
                        added = True
                        if al.percentIdentity() > al1.percentIdentity():
                            als[al.seg_to.contig.id][i] = al
                            als[al.seg_to.contig.rc.id][i] = al.rc
                        break
                if not added:
                    als[al.seg_to.contig.id].append(al)
                    als[al.seg_to.contig.rc.id].append(al.rc)
    for contig in short_contigs.unique():
        if len(als[contig.id]) > 0:
            # Extend to the right first, then extend the reverse complement,
            # which extends the original contig to the left.
            tmp_contig, new_als = polisher.polishEnd(als[contig.id], params.reliable_coverage, max_extension=params.l - len(contig))
            r = len(tmp_contig) - len(contig)
            tmp_contig, new_als = polisher.polishEnd([al.rc for al in new_als], params.reliable_coverage, max_extension=params.l - len(contig))
            l = len(tmp_contig) - len(contig) - r
        else:
            tmp_contig, new_als = contig, als[contig.id]
            l = 0
            r = 0
#            if l > params.k / 2 and r > params.k / 2:
#                tmp_contig.seq = tmp_contig.seq[l - params.k / 2:-r + params.k / 2]
#            else:
#                tmp_contig.seq = tmp_contig.seq[max(0, l - params.k):-max(1, r - params.k)]
        if len(tmp_contig) > params.k + 500:
            sys.stdout.info("Prolonged contig", contig.id, "for", l, "and", r, "nucleotides from left and right")
            # tmp_contig is in reverse-complement orientation after the second
            # polishEnd call, so its rc is stored under the original id.
            # NOTE(review): the not-extended branch also stores tmp_contig.rc,
            # although tmp_contig is `contig` itself there - confirm intended.
            contigs.add(Contig(tmp_contig.rc.seq, contig.id))
        else:
            sys.stdout.warn("Could not prolong contig", contig.id, "enough. Removing it.")
            contigs.remove(contig)
def evaluatePI(dir, contigs_file, initial_file, ref_file): basic.ensure_dir_existance(dir) CreateLog(dir) dd = DirDistributor(os.path.join(dir, "alignments")) aligner = Aligner(dd) contigs = ContigStorage().loadFromFasta(open(contigs_file, "r"), False) initial = ContigStorage().loadFromFasta(open(initial_file, "r"), False) ref = ContigStorage().loadFromFasta(open(ref_file, "r"), False) segs = [] for al in aligner.overlapAlign(initial.unique(), contigs): if basic.isCanonocal(al.seg_to.contig.id): segs.append(al.seg_to) else: segs.append(al.rc.seg_to) segs = sorted(segs, key=lambda seg: basic.Normalize(seg.contig.id)) interesting = dict() print "Interesting segments:" for contig in contigs: interesting[contig.id] = [contig.asSegment()] for contig, segit in itertools.groupby(segs, lambda seg: seg.contig): csegs = SegmentStorage().addAll(segit) csegs.mergeSegments() csegs = csegs.reverse(contig) interesting[contig.id] = list(csegs) print list(csegs) print "Analysis of contigs" scorer = Scorer() for al in aligner.localAlign(contigs.unique(), ref): print al for seg in interesting[al.seg_from.contig.id]: if al.seg_from.expand(500).contains( seg) or al.seg_from.interSize(seg) > 40000: tmp_al = al.reduce(query=al.seg_from.cap(seg)) scorer.polyshMatching(tmp_al.matchingSequence(), params.score_counting_radius) print tmp_al.seg_from, tmp_al.seg_to, str(events) print "" print "Analysis of initial" for al in aligner.overlapAlign(initial, ref): scorer.polyshMatching(al.matchingSequence(), params.score_counting_radius) print al.seg_from, al.seg_to, str(events)
def iter_align(aligner, contig1, contig2):
    """Align contig1 to contig2 and recursively realign the gaps between alignments.

    Local alignments are split by splitRef, pieces of length at most 400 or
    involving other contigs are dropped, and largestSubseq picks the chain
    to keep. Whenever two consecutive alignments of the chain are more than
    400 apart on both the query and the target, the two gap regions are
    aligned recursively and the results are mapped back onto the contigs.
    """
    als = sorted(aligner.localAlign([contig1], ContigStorage([contig2])),
                 key=lambda al: al.seg_from.left)
    split = [al.splitRef() for al in als]
    als = [
        al for al in itertools.chain(*split)
        if len(al) > 400 and al.seg_from.contig == contig1
        and al.seg_to.contig == contig2
    ]
    als = largestSubseq(als)
    res = []
    if len(als) > 0:
        for al1, al2 in zip(als[:-1], als[1:]):
            res.append(al1)
            # Both gaps are cut out below, so both must be large enough.
            # The original tested the query distance twice and never looked
            # at the target distance.
            if al1.seg_from.dist(al2.seg_from) > 400 and al1.seg_to.dist(
                    al2.seg_to) > 400:
                seg1 = contig1.segment(al1.seg_from.right, al2.seg_from.left)
                seg2 = contig2.segment(al1.seg_to.right, al2.seg_to.left)
                tmp = iter_align(aligner, seg1.asContig(), seg2.asContig())
                for al in tmp:
                    res.append(al.queryAsSegment(seg1).targetAsSegment(seg2))
        res.append(als[-1])
    return res
def polishEnd(self, als, min_cov=4, min_cov_frac=0.7, max_extension=None):
    # type: (List[AlignmentPiece], int, float, int) -> Tuple[Contig, List[AlignmentPiece]]
    """Extend the right end of the contig targeted by `als` using the reads.

    Works on a copy of als[0].seg_to.contig. In every round the unaligned
    read tails are polished against a base taken from one read, and the
    contig is prolonged up to the position still supported by the reads.
    Stops when fewer than min_cov alignments remain relevant, when no read
    yields an extension of more than 100, or when max_extension is reached.

    Returns the extended contig and the alignments retargeted to it.
    `als` must be non-empty (als[0] is read unconditionally).
    """
    if max_extension is None:
        max_extension = 10000000000
    scorer = Scorer()
    contig = als[0].seg_to.contig
    max_len = max_extension + len(contig)
    sys.stdout.trace("Polishing end of", als[0].seg_to.contig)
    new_contig = contig.asSegment().asContig()
    relevant_als = [
        al.changeTargetContig(new_contig) for al in als
        if al.rc.seg_to.left < 100
    ]
    finished_als = []
    while True:
        # Keep alignments that touch the last 100 positions of the contig
        # and whose read continues for more than 100 beyond the alignment.
        tmp = []
        for al in relevant_als:
            if al.seg_to.inter(new_contig.asSegment().suffix(
                    length=100)) and al.rc.seg_from.left > 100:
                tmp.append(al)
            else:
                finished_als.append(al)
        relevant_als = tmp
        if len(relevant_als) < min_cov:
            break
        # Common prefix of the base and of every reduced read: a fixed tag,
        # a random flank and the current end of the contig.
        start = "ACGTTCGA" + basic.randomSequence(
            params.flanking_size) + new_contig.asSegment().suffix(
                length=min(params.flanking_size, len(new_contig))).Seq()
        # Reduced reads: `start` followed by the read part after its alignment.
        reduced_read_list = [
            AlignedRead.new(
                start + al.seg_from.contig.asSegment().suffix(
                    pos=al.seg_from.right).Seq(),
                str(i) + "_" + al.seg_from.contig.id)
            for i, al in enumerate(relevant_als)
        ]
        reduced_reads = ReadCollection(reduced_read_list)
        found = False
        for base_al in relevant_als:
            if base_al.rc.seg_from.left < params.flanking_size:
                continue
            # Base consists 500 random nucleotides and 500 last nucls from the polished sequence a segment of read of length at most 500
            base_segment = base_al.seg_from.contig.segment(
                base_al.seg_from.right,
                min(
                    len(base_al.seg_from.contig), base_al.seg_from.right +
                    max(params.window_size, params.k)))
            base = Contig(start + base_segment.Seq(), "base")
            for read in reduced_read_list:
                read.clean()
            polished_base = Contig(self.polish(reduced_reads, base),
                                   "polished_base")
            for al in self.aligner.localAlign(
                    reduced_reads,
                    ContigStorage().addAll([polished_base])):
                reduced_reads.reads[al.seg_from.contig.id].addAlignment(al)
            # For every reduced read pick the alignment that starts at
            # position 0 of the polished base and reaches furthest right.
            candidate_alignments = []
            for read in reduced_read_list:
                candidate_alignments.append(None)
                for al in read.alignmentsTo(polished_base.asSegment()):
                    if al.seg_to.left == 0 and (
                        (candidate_alignments[-1] is None
                         or candidate_alignments[-1].seg_to.right <
                         al.seg_to.right)):
                        candidate_alignments[-1] = al
            trimmedAlignments = []
            for i, al in enumerate(candidate_alignments):
                assert al is not None, reduced_read_list[i]
                trimmedAlignments.append(al.trimByQuality(0.4, 100))
            # Sweep alignment ends from left to right. `support` counts
            # alignments not yet passed; `contra` collects passed alignments
            # reported by contradictingRTCRight, and only those ending
            # within 50 of the current end are counted against the cutoff.
            contra_index = 0
            contra = []
            support = len(trimmedAlignments)
            cutoff_pos = len(start)
            for al in sorted(trimmedAlignments,
                             key=lambda al: al.seg_to.right):
                while contra_index < len(contra) and contra[
                        contra_index].seg_to.right < al.seg_to.right - 50:
                    contra_index += 1
                if support >= min_cov and len(contra) - contra_index <= (
                        1 - min_cov_frac) * support:
                    cutoff_pos = al.seg_to.right
                    support -= 1
                    if al.contradictingRTCRight():
                        contra.append(al)
                else:
                    sys.stdout.trace("Stopped at:", support, contra_index,
                                     (1 - min_cov_frac) * support)
                    break
            sys.stdout.trace("Positions:",
                             [al.seg_to.right for al in trimmedAlignments])
            sys.stdout.trace("Contra:", contra)
            if cutoff_pos > len(start) + 100:
                sys.stdout.trace("Chose to use read", base_al.__repr__(),
                                 "Extended for", cutoff_pos - len(start),
                                 "Alignments:")
                sys.stdout.trace(map(str, reduced_read_list))
                found = True
                new_contig_candidate = Contig(
                    new_contig.seq + polished_base[len(start):cutoff_pos],
                    "candidate")
                # Maps the accepted part of the polished base onto the new
                # suffix of the candidate contig.
                embedding = AlignmentPiece.Identical(
                    polished_base.segment(len(start), cutoff_pos),
                    new_contig_candidate.asSegment().suffix(
                        pos=len(new_contig)))
                # Maps the tail of every original read onto the tail of its
                # reduced read (the part after `start`).
                read_mappings = []
                for al1, al2 in zip(candidate_alignments, relevant_als):
                    seg_from = al2.seg_from.contig.asSegment().suffix(
                        length=len(al1.seg_from.contig) - len(start))
                    seg_to = al1.seg_from.contig.asSegment().suffix(
                        length=len(al1.seg_from.contig) - len(start))
                    read_mappings.append(
                        AlignmentPiece.Identical(seg_from, seg_to))
                # Original read -> candidate contig alignments over the new
                # part; None when the read does not reach into it.
                embedded_alignments = []
                for al1, al2 in zip(candidate_alignments, read_mappings):
                    if al1.seg_to.right <= len(start) + 10:
                        embedded_alignments.append(None)
                    else:
                        tmp = al2.compose(al1)
                        if tmp.seg_to.left > embedding.seg_from.right - 10:
                            embedded_alignments.append(None)
                        else:
                            embedded_alignments.append(
                                tmp.compose(embedding))
                corrected_relevant_alignments = [
                    al.targetAsSegment(
                        new_contig_candidate.asSegment().prefix(
                            len(new_contig))) for al in relevant_als
                ]
                # Merge the old alignment with its continuation on the new
                # part; fall back to the old alignment when merging fails.
                relevant_als = []
                for al1, al2 in zip(corrected_relevant_alignments,
                                    embedded_alignments):
                    if al2 is None:
                        al = al1
                    else:
                        al = al1.mergeDistant(al2)
                        if al is None:
                            al = al1
                        elif al1.seg_from.dist(
                                al2.seg_from) >= 10 or al1.seg_to.dist(
                                    al2.seg_to) >= 10:
                            al = scorer.polyshAlignment(
                                al, params.alignment_correction_radius)
                    relevant_als.append(al)
                finished_als = [
                    al.targetAsSegment(
                        new_contig_candidate.asSegment().prefix(
                            len(new_contig))) for al in finished_als
                ]
                new_contig = new_contig_candidate
                break
            else:
                sys.stdout.trace("Could not prolong with read", base_al,
                                 "Alignments:")
                sys.stdout.trace(map(str, reduced_read_list))
        if len(new_contig) >= max_len:
            break
        if not found:
            break
    return new_contig, relevant_als + finished_als
def __init__(self):
    # Copy the polishing settings from the global params.
    self.num_iters = params.num_iters
    self.platform = params.technology
    self.threads = params.threads

# Script entry point. Arguments: <dir> <reads_file> <consensus_file> [extra...]
if __name__ == "__main__":
    reads_file = sys.argv[2]
    consensus_file = sys.argv[3]
    dir = sys.argv[1]
    extra_params = sys.argv[4:]
    CreateLog(dir)
    dd = DirDistributor(dir)
    aligner = Aligner(dd)
    polisher = Polisher(aligner, dd)
    reads = ContigStorage().loadFromFasta(open(reads_file, "r"), num_names=False)
    ref = ContigStorage().loadFromFasta(open(consensus_file, "r"), num_names=False)
    if "accurate" in extra_params:
        # Accurate mode: polish every canonical reference contig with the
        # reads that overlap-align to it.
        res = []
        # groupby needs the alignments sorted by the same key.
        als = sorted(aligner.overlapAlign(reads, ref), key=lambda al: al.seg_to.contig.id)
        for rid, rals in itertools.groupby(als, key=lambda al: al.seg_to.contig.id):
            if basic.isCanonocal(rid):
                contig = ref[rid]
                corrected_seq = polisher.polishSegment(
                    contig.asSegment(), list(rals)).seg_from.Seq()
                res.append(Contig(corrected_seq, rid))
    else:
        res = polisher.polishMany(reads, list(ref.unique()))
    # NOTE(review): `res` is not used within the visible part of the script.
class TestDataset:
    """Synthetic dataset in which a genome is written as a string of letters.

    Every lowercase letter stands for a random nucleotide sequence of
    letter_size characters and the matching uppercase letter for a mutated
    copy of it. Reads, disjointigs and contigs are described by letter
    strings and translated into nucleotide sequences.
    """

    def __init__(self, genome="", letter_size=550, error_rate=0.05, mutation_rate=0.005, seed=0):
        """Seed the random generator, build the alphabet and translate `genome`."""
        random.seed(seed)
        self.reads = []  # type: List[NamedSequence]
        self.disjointigs = []  # type: List[NamedSequence]
        self.contigs = []  # type: List[NamedSequence]
        self.letter_size = letter_size
        self.error_rate = error_rate
        self.mutation_rate = mutation_rate
        self.alphabet = ContigStorage()
        # Letter -> position pairs between the lowercase and uppercase
        # versions of the letter (swapped for the uppercase one).
        self.matches = dict()
        for c1, c2 in zip(ascii_lowercase, ascii_uppercase):
            seq = self.generate(self.letter_size)
            self.alphabet.add(Contig(seq, c1))
            seq, matches = self.mutate(seq, self.mutation_rate)
            self.alphabet.add(Contig(seq, c2))
            self.matches[c1] = matches
            self.matches[c2] = [(b, a) for a, b in matches]
        self.genome = Contig(self.translate(genome), genome)

    def translate(self, seq):
        """Return the nucleotide sequence spelled by the letters of `seq`."""
        return "".join(map(lambda c: self.alphabet[c].seq, seq))

    def addRead(self, read_seq):
        """Add a read for the letter string, mutated with error_rate; return its name."""
        name = "R" + str(len(self.reads)) + "_" + read_seq
        self.reads.append(
            NamedSequence(
                self.mutate(self.translate(read_seq), self.error_rate)[0],
                name))
        return name

    def addDisjointig(self, disjointig_seq):
        # type: (str) -> str
        """Add a disjointig for the letter string, mutated with mutation_rate; return its id."""
        self.disjointigs.append(
            NamedSequence(
                self.mutate(self.translate(disjointig_seq),
                            self.mutation_rate)[0],
                "D" + str(len(self.disjointigs)) + "_" + disjointig_seq))
        return self.disjointigs[-1].id

    def addContig(self, contig_seq):
        # type: (str) -> str
        """Add an unmutated contig for the letter string; return its name."""
        name = "C" + str(len(self.contigs)) + "_" + contig_seq
        self.contigs.append(NamedSequence(self.translate(contig_seq), name))
        return name

    def generateReads(self, length=5, cov=15, circular=False):
        """Add reads of `length` letters from every window of the genome.

        Each window is added ceil(cov / length) times. For a circular genome
        the first length - 1 letters are appended so that windows wrap.
        """
        genome = self.genome.id
        if circular:
            genome = genome + genome[0:length - 1]
        for i in range(0, len(genome) - length + 1):
            # Floor division keeps the count an int on every Python version.
            for j in range((cov + length - 1) // length):
                self.addRead(genome[i:i + length])

    def generate(self, letter_size):
        # type: (int) -> str
        """Return a random nucleotide string of the given length."""
        return "".join(
            [random.choice(["A", "C", "G", "T"]) for i in range(letter_size)])

    def genAll(self, aligner):
        # type: (Aligner) -> Tuple[NewLineStorage, LineDotPlot, ReadCollection]
        """Build line storage, dot plot and read collection from the stored sequences."""
        disjointigs = DisjointigCollection()
        for dis in self.disjointigs:
            disjointigs.addNew(dis.seq, dis.id)
        from disjointig_resolve.line_storage import NewLineStorage
        lines = NewLineStorage(disjointigs, aligner)
        lines.name_printer = lambda line: line.id + "_" + self.translateBack(
            line, aligner)
        for line in self.contigs:
            new_line = lines.addNew(line.seq, line.id)
            new_line.initial.add(
                AlignmentPiece.Identical(
                    new_line.asSegment().asContig().asSegment(),
                    new_line.asSegment()))
        dp = LineDotPlot(lines, aligner)
        dp.construct(aligner)
        lines.alignDisjointigs()
        reads = ReadCollection()
        for read in self.reads:
            reads.addNewRead(read)
        disjointigs.addAlignments(aligner.localAlign(reads, disjointigs))
        return lines, dp, reads

    def mutate(self, seq, rate):
        # type: (str, float) -> Tuple[str, List[Tuple[int, int]]]
        """Return a randomly mutated copy of `seq` and the list of unchanged positions.

        The first and last characters are never mutated. Every other
        character is, with probability `rate`, substituted, deleted or
        duplicated. Each returned pair is
        (position in the mutated string, position in `seq`).
        """
        # Sequences shorter than 2 have no mutable interior. The original
        # code duplicated a single character and failed on an empty string.
        if len(seq) < 2:
            return seq, [(i, i) for i in range(len(seq))]
        res = [seq[0]]
        matches = [(0, 0)]
        cur = 1  # length of the mutated string built so far
        for i, c in enumerate(seq):
            if i == 0 or i == len(seq) - 1:
                continue
            if random.random() < rate:
                options = ["A", "C", "G", "T"]
                options.remove(c)
                res.append(random.choice([random.choice(options), "", c + c]))
                cur += len(res[-1])
            else:
                res.append(c)
                matches.append((cur, i))
                cur += 1
        res.append(seq[-1])
        # Same (mutated, original) order as every other pair; the original
        # code appended this last pair the other way round.
        matches.append((cur, len(seq) - 1))
        return "".join(res), matches

    def saveStructure(self, handler):
        # type: (TokenWriter) -> None
        """Write the letter strings of the genome, reads, disjointigs and contigs."""
        handler.writeToken(self.genome.id)
        handler.writeInt(len(self.reads))
        for read in self.reads:
            handler.writeToken(read.id.split("_")[-1])
        handler.writeInt(len(self.disjointigs))
        for disjointig in self.disjointigs:
            handler.writeToken(disjointig.id.split("_")[-1])
        handler.writeInt(len(self.contigs))
        for contig in self.contigs:
            handler.writeToken(contig.id.split("_")[-1])

    @staticmethod
    def loadStructure(handler):
        # type: (TokenReader) -> TestDataset
        """Recreate a dataset from the letter strings written by saveStructure."""
        random.seed(0)
        res = TestDataset(handler.readToken())
        for i in range(handler.readInt()):
            res.addRead(handler.readToken())
        for i in range(handler.readInt()):
            res.addDisjointig(handler.readToken())
        for i in range(handler.readInt()):
            res.addContig(handler.readToken())
        return res

    def translateBack(self, contig, aligner):
        # type: (Contig, Aligner) -> str
        """Return the letter string whose letters best align along `contig`.

        Alignments to the alphabet are scanned from left to right; when two
        of them overlap on the contig by more than half a letter, only the
        one with the higher percent identity is kept.
        """
        res = []
        for al in sorted(aligner.overlapAlign([contig], self.alphabet),
                         key=lambda al: al.seg_from.left):
            if len(res) > 0 and al.seg_from.interSize(
                    res[-1].seg_from) > self.letter_size // 2:
                if al.percentIdentity() > res[-1].percentIdentity():
                    res[-1] = al
            else:
                res.append(al)
        return "".join([al.seg_to.contig.id for al in res])
def __init__(self):
    """Initialise the collection as an empty ContigStorage.

    The base initialiser is called with an empty list and ``True``; the
    second argument is presumably the ``add_rc`` flag -- confirm against
    ContigStorage.
    """
    ContigStorage.__init__(self, [], True)
    # Self-assignment that changes nothing at runtime; it only carries the
    # type comment that narrows the element type.
    self.items = self.items # type: Dict[str, Disjointig]
    # Counter starting at one; NOTE(review): its use is not visible here.
    self.cnt = 1
def assemble(args, bin_path): params.bin_path = bin_path start = time.time() cl_params = Params().parse(args) ref = ContigStorage() if cl_params.test: cl_params.reads_file = os.path.dirname(__file__) + "/../../test_dataset/reads.fasta" cl_params.genome_size = 30000 cl_params.dir = os.path.dirname(__file__) + "/../../test_results" ref.loadFromFile(os.path.dirname(__file__) + "/../../test_dataset/axbctbdy.fasta", False) if cl_params.debug: params.save_alignments = True cl_params.check() CreateLog(cl_params.dir) sys.stdout.info("Command line:", " ".join(cl_params.args)) sys.stdout.info("Started") if cl_params.debug: sys.stdout.info("Version:", subprocess.check_output(["git", "rev-parse", "HEAD"])) sys.stdout.info("Modifications:") print subprocess.check_output(["git", "diff"]) sys.stdout.info("Preparing initial state") if cl_params.debug: save_handler = SaveHandler(os.path.join(cl_params.dir, "saves")) else: save_handler = None if cl_params.load_from is not None: # tmp = cl_params.focus sys.stdout.info("Loading initial state from saves") cl_params, aligner, contigs, reads, disjointigs, lines, dot_plot = loadAll(TokenReader(open(cl_params.load_from, "r"))) cl_params.parse(args) # cl_params.focus = tmp knotter = LineMerger(lines, Polisher(aligner, aligner.dir_distributor), dot_plot) extender = LineExtender(aligner, knotter, disjointigs, dot_plot) dot_plot.printAll(sys.stdout) printState(lines) else: aligner = Aligner(DirDistributor(cl_params.alignmentDir())) polisher = Polisher(aligner, aligner.dir_distributor) reads = CreateReadCollection(cl_params.reads_file, cl_params.cut_reads, cl_params.downsample) if cl_params.contigs_file is None: sys.stdout.info("Running Flye") assembly_dir = os.path.join(cl_params.dir, "assembly_initial") reads_file = os.path.join(cl_params.dir, "actual_reads.fasta") reads.print_fasta(open(reads_file, "w")) subprocess.check_call([os.path.join(params.bin_path, "flye"), "--meta", "-o", assembly_dir, "-t", str(cl_params.threads), "--" + 
params.technology + "-raw", reads_file, "--genome-size", str(cl_params.genome_size), "--min-overlap", str(params.k)]) cl_params.set_flye_dir(assembly_dir, cl_params.mode) elif len(cl_params.disjointigs_file_list) == 0: assembly_dir = os.path.join(cl_params.dir, "assembly_initial") reads_file = os.path.join(cl_params.dir, "actual_reads.fasta") reads.print_fasta(open(reads_file, "w")) disjointigs_file = constructDisjointigs(reads, params.expected_size, assembly_dir) # graph_file, contigs_file, disjointigs_file, rep_dir, graph_file_after, contigs_file_after = parseFlyeDir(assembly_dir) cl_params.disjointigs_file_list.append(disjointigs_file) params.min_contra_for_break = 8 disjointigs = CreateDisjointigCollection(cl_params.disjointigs_file_list, cl_params.dir, aligner, reads) all_unique = cl_params.init_file is not None contigs = CreateContigCollection(cl_params.graph_file, cl_params.contigs_file, cl_params.min_cov, aligner, polisher, reads, cl_params.force_unique, all_unique) if cl_params.autoKL: adjustKL(aligner, reads, contigs) if cl_params.init_file is None: ExtendShortContigs(contigs, reads, aligner, polisher, cl_params.read_dump) lines = CreateLineCollection(cl_params.dir, aligner, contigs, disjointigs, reads, cl_params.split) else: lines = LoadLineCollection(cl_params.dir, cl_params.init_file, aligner, contigs, disjointigs, reads, polisher) sys.stdout.info("Constructing dot plot") dot_plot = LineDotPlot(lines, aligner) dot_plot.construct(aligner) # dot_plot.printAll(sys.stdout) sys.stdout.info("Updating sequences and resolved segments.") knotter = LineMerger(lines, Polisher(aligner, aligner.dir_distributor), dot_plot) extender = LineExtender(aligner, knotter, disjointigs, dot_plot) extender.updateAllStructures(itertools.chain.from_iterable(line.completely_resolved for line in lines)) for line in list(lines.unique()): # type: NewLine line.completely_resolved.mergeSegments() if len(line.completely_resolved) == 0: lines.removeLine(line) if cl_params.debug: 
sys.stdout.info( "Saving initial state") try: writer = save_handler.getWriter() sys.stdout.info("Save details:", writer.info) saveAll(writer, cl_params, aligner, contigs, reads, disjointigs, lines, dot_plot) except Exception as e: _, _, tb = sys.exc_info() sys.stdout.warn("Could not write save") traceback.print_tb(tb) sys.stdout.INFO( "Message:", e.message) sys.stdout.trace( "Disjointig alignments") for line in lines: sys.stdout.trace( line.disjointig_alignments) sys.stdout.info("Starting expanding alignment-consensus loop") EACL(aligner, cl_params, contigs, disjointigs, dot_plot, extender, lines, reads, save_handler) dot_plot.printAll(sys.stdout) sys.stdout.trace( "Final result:") lines.printToFasta(open(os.path.join(cl_params.dir, "lines.fasta"), "w")) lines.printKnottedToFasta(open(os.path.join(cl_params.dir, "assembly.fasta"), "w")) printState(lines) sys.stdout.info("Finished") secs = int(time.time() - start) days = secs / 60 / 60 / 24 hours = secs / 60 / 60 % 24 mins = secs / 60 % 60 sys.stdout.info("Finished in %d days, %d hours, %d minutes" % (days, hours, mins)) if cl_params.test: passed = False for al in aligner.dotplotAlign(lines, ref): if len(al) > len(al.seg_to.contig) - 3000: passed = True break if passed: sys.stdout.info("Test passed") else: sys.stdout.info("Test failed")
def CreateContigCollection(graph_file, contigs_file, min_cov, aligner, polisher, reads, force_unique, all_unique):
    """Select the initial contigs and polish them when required.

    Three modes are supported:

    * ``force_unique`` is None and ``all_unique`` is false: the assembly
      graph is read from ``graph_file`` and edges are selected by their
      uniqueness flag, length, coverage relative to the estimated average,
      and similarity to alternative edges leaving the same vertex.
    * ``force_unique`` is given: contigs from ``contigs_file`` whose id is in
      ``force_unique`` are used.
    * otherwise every contig from ``contigs_file`` is used.

    The selected contigs are polished with ``polisher`` unless ``all_unique``
    is set and ``force_unique`` is None.  Returns a ContigCollection.
    """
    sys.stdout.info("Creating contig collection")
    if force_unique is None and not all_unique:
        graph = SimpleGraph().ReadDot(graph_file)
        graph.FillSeq(contigs_file)
        covs = [(e.len, e.cov) for e in graph.e.values()]
        # Average coverage is the length-weighted mean over the longest
        # edges, taken until they span half of the total edge length.
        remaining = sum(l for l, c in covs) // 2
        longest = []
        for l, c in sorted(covs)[::-1]:
            if remaining < 0:
                break
            longest.append((l, c))
            remaining -= l
        weight = sum(l for l, c in longest)
        # A graph without edges (or with zero total length) has no coverage.
        avg_cov = float(sum(l * c for l, c in longest)) / weight if weight > 0 else 0.0
        sys.stdout.info("Average coverage determined:", avg_cov)
        nonunique = set()
        for edge in graph.e.values():
            if not (edge.unique and edge.len < 20000 and len(graph.v[edge.start].out) > 1):
                continue
            if not (edge.cov >= min_cov and (edge.cov < 0.8 * avg_cov or edge.len > 40000)):
                continue
            # Compare the edge with the other edges leaving the same vertex:
            # a near-identical alignment starting close to both left ends
            # marks the edge (and its reverse) as non-unique.
            alter = ContigStorage()
            for e in graph.v[edge.start].out:
                if e != edge:
                    alter.add(Contig(e.seq, e.id))
            for al in aligner.localAlign([Contig(edge.seq, edge.id)], alter):  # type: AlignmentPiece
                if al.percentIdentity() > 0.98 and (al.seg_from.left < 100 and al.seg_to.left < 100 and len(al) > min(500, edge.len)):
                    nonunique.add(edge.id)
                    nonunique.add(basic.Reverse(edge.id))
        contigs = ContigCollection()
        for edge in graph.e.values():
            if not basic.isCanonocal(edge.id):
                continue
            if edge.unique and (edge.len > params.min_isolated_length or len(graph.v[edge.end].out) > 0 or len(graph.v[edge.start].inc) > 0):
                if edge.cov >= min_cov and (edge.cov < 1.5 * avg_cov or edge.len > 40000):
                    if edge.id in nonunique:
                        sys.stdout.info("Edge removed based on alignment to alternative:", edge.id, edge.cov, edge.len)
                    else:
                        contigs.add(Contig(edge.seq, edge.id))
                else:
                    sys.stdout.info("Edge removed based on coverage:", edge.id, edge.cov, edge.len)
            elif (edge.len > 100000 and edge.cov < 1.5 * avg_cov) or (edge.len > 40000 and 1.3 * avg_cov > edge.cov > 0.7 * avg_cov):
                contigs.add(Contig(edge.seq, edge.id))
                sys.stdout.info("Edge added based on length and coverage:", edge.id, edge.cov, edge.len)
    elif force_unique is not None:
        sys.stdout.info("Using forced unique edge set")
        sys.stdout.trace(force_unique)
        contigs = ContigCollection().loadFromFile(contigs_file).filter(lambda contig: contig.id in force_unique)
    else:
        sys.stdout.info("Considering all contigs unique")
        contigs = ContigCollection().loadFromFile(contigs_file)
    sys.stdout.info("Created", len(contigs), "initial contigs")
    if not all_unique or force_unique is not None:
        sys.stdout.info("Polishing contigs")
        polished_contigs = polisher.polishMany(reads, list(contigs.unique()))
        contigs = ContigCollection().addAll(polished_contigs)
    else:
        sys.stdout.info("Skipping contig polishing step since manual unique contig initialization was used")
    return contigs
import sys sys.path.append("py") from common import basic, params from common.basic import CreateLog from alignment.align_tools import Aligner, DirDistributor from common.line_align import Scorer from common.sequences import ContigStorage if __name__ == "__main__": basic.ensure_dir_existance(sys.argv[1]) CreateLog(sys.argv[1]) reads = ContigStorage().loadFromFile(sys.argv[2]) contigs = ContigStorage().loadFromFile(sys.argv[3]) scorer = Scorer() dd = DirDistributor(sys.argv[1]) aligner = Aligner(dd) for read in reads.unique(): print "Processing read", read als = [ scorer.polyshAlignment(al, params.alignment_correction_radius) for al in aligner.localAlign([read], contigs) ] for al1 in als: for al2 in als: if al1.seg_to.contig == al2.seg_to.contig: continue print al1, "vs", al2 scorer.scoreInCorrectSegments(al1,
def prolong(aligner, polisher, contig, reads):
    """Polish both ends of ``contig`` using ``reads`` and return the result.

    The reads are aligned to the contig and one end is polished with a
    minimum coverage of 5.  The alignments returned by that step are then
    replaced by their ``rc`` counterparts and the end is polished again, so
    the opposite end is processed.  The contig returned by the second
    polishing step is the result.
    """
    candidates = reads.unique()
    target = ContigStorage([contig])
    overlaps = list(aligner.overlapAlign(candidates, target))
    oriented = fixAlDir(overlaps, contig)
    extended, kept = polisher.polishEnd(oriented, min_cov=5)
    flipped = [piece.rc for piece in kept]
    extended, _ = polisher.polishEnd(flipped, min_cov=5)
    return extended
def main(contigs_file, contig_name, reads_file, dir, k, initial_reads1, initial_reads2): basic.ensure_dir_existance(dir) basic.CreateLog(dir) dd = DirDistributor(os.path.join(dir, "alignments")) aligner = Aligner(dd) contigs = ContigStorage().loadFromFasta(open(contigs_file, "r"), False) # contig = contigs[contig_name].asSegment().prefix(length=2000).asContig() contig = contigs[contig_name] reads = ContigStorage().loadFromFasta(open(reads_file, "r"), False) reads1 = ContigStorage() reads2 = ContigStorage() cnt = 0 for read in reads.unique(): cnt += 1 # if cnt % 2 == 0: if read.id in initial_reads1: reads1.add(read) elif read.id in initial_reads2: reads2.add(read) polisher = Polisher(aligner, dd) contig1 = contig contig2 = contig scorer = Scorer() for i in range(3): diff = 0 print "Iteration", i als1 = fixAlDir(aligner.overlapAlign(reads1.unique(), ContigStorage([contig])), contig) als2 = fixAlDir(aligner.overlapAlign(reads2.unique(), ContigStorage([contig])), contig) contig1 = Contig(polisher.polishSmallSegment(contig.asSegment(), als1).seg_from.Seq(), "1") contig2 = Contig(polisher.polishSmallSegment(contig.asSegment(), als2).seg_from.Seq(), "2") al = aligner.overlapAlign([contig1], ContigStorage([contig2])).next() als1 = fixAlDir(aligner.overlapAlign(reads.unique(), ContigStorage([contig1])), contig1) als1 = filter(lambda al: len(al.seg_to) > len(al.seg_to.contig) - 100, als1) als2 = fixAlDir(aligner.overlapAlign(reads.unique(), ContigStorage([contig2])), contig2) als2 = filter(lambda al: len(al.seg_to) > len(al.seg_to.contig) - 100, als2) als1 = sorted(als1, key = lambda al: al.seg_from.contig.id) als2 = sorted(als2, key = lambda al: al.seg_from.contig.id) reads1 = ContigStorage() reads2 = ContigStorage() dp = scorer.accurateScore(al.matchingSequence(), 10) #1 - al.percentIdentity() als_map = dict() for al in als1: als_map[al.seg_from.contig.id] = [al] for al in als2: if al.seg_from.contig.id in als_map: als_map[al.seg_from.contig.id].append(al) com_res = [] 
diffs = [] for tmp_als in als_map.values(): if len(tmp_als) != 2: continue al1 = tmp_als[0] al2 = tmp_als[1] print al1, al2 assert al1.seg_from.contig == al2.seg_from.contig pi1 = scorer.accurateScore(al1.matchingSequence(), 10) # al1.percentIdentity() pi2 = scorer.accurateScore(al2.matchingSequence(), 10) # al2.percentIdentity() com_res.append((al1, al2, pi1 - pi2)) diffs.append(pi1 - pi2) diffs = sorted(diffs) th1 = diffs[len(diffs) / 4] th2 = diffs[len(diffs) * 3 / 4] print "Thresholds:", th1, th2 for al1, al2, diff in com_res: if diff < th1: reads1.add(al1.seg_from.contig) elif diff > th2: reads2.add(al2.seg_from.contig) # if pi1 > pi2 + dp / 4: # reads1.add(al1.seg_from.contig) # elif pi2 > pi1 + dp / 4: # reads2.add(al2.seg_from.contig) # diff += abs(pi1 - pi2) print float(diff) / len(als1), len(reads1) / 2, len(reads2) / 2 al = aligner.overlapAlign([contig1], ContigStorage([contig2])).next() print al print "\n".join(al.asMatchingStrings2()) for read in reads1: if read.id in initial_reads1: sys.stdout.write(read.id + " ") print "" for read in reads2: if read.id in initial_reads2: sys.stdout.write(read.id + " ") print "" contig1 = prolong(aligner, polisher, contig1, reads1) contig2 = prolong(aligner, polisher, contig2, reads2) contig1.id = "1" contig2.id = "2" out = open(os.path.join(dir, "copies.fasta"), "w") SeqIO.write(contig1, out, "fasta") SeqIO.write(contig2, out, "fasta") out.close() out = open(os.path.join(dir, "reads1.fasta"), "w") for read in reads1.unique(): SeqIO.write(read, out, "fasta") out.close() out = open(os.path.join(dir, "reads2.fasta"), "w") for read in reads2.unique(): SeqIO.write(read, out, "fasta") out.close() print "Finished"