def printSegs(f, segs):
    """Write the requested contig segments to standard output as FASTA.

    :param f: path of the FASTA file loaded into a ``ContigStorage``.
    :param segs: iterable of mutable triples ``[contig_id, left, right]``.
        A ``right`` value of 0 stands for "the contig length"; the triple is
        overwritten in place with that length.
    """
    storage = ContigStorage().loadFromFasta(open(f, "r"), False)
    for seg in segs:
        contig_id = seg[0]
        if seg[2] == 0:
            # Zero is a placeholder; the caller's triple receives the
            # resolved right coordinate.
            seg[2] = len(storage[contig_id])
        piece = storage[contig_id].segment(seg[1], seg[2])
        SeqIO.write(piece.asContig(), sys.stdout, "fasta")
def printKnottedToFasta(self, handler):
    # type: (BinaryIO) -> None
    """Write each chain from ``self.chains()`` to ``handler`` as one FASTA record.

    Records are named ``contig_<n>`` with ``n`` starting at 1. For every
    record a line listing the ids (and knot gaps) it was built from is sent
    to ``sys.stdout.trace``.

    NOTE(review): ``sys.stdout.trace`` is not a standard file method, so
    ``sys.stdout`` must have been replaced by an object providing it.

    :param handler: writable handle passed on to ``SeqIO.write``.
    """
    printed = set()
    cnt = 1
    for chain in self.chains():
        # Skip a chain when the ``rc`` counterpart of its first line was
        # already printed (presumably the reverse-complement chain -- TODO
        # confirm).
        if chain[0].rc.id in printed:
            continue
        for line in chain:
            printed.add(line.id)
        seq = []
        id = []
        # A knot on the last line of the chain is labelled "Circular".
        if chain[-1].knot is not None:
            id.append("Circular")
        for line in chain:
            id.append(line.id)
            if line.knot is not None:
                id.append(str(line.knot.gap))
                if line.knot.gap < 0:
                    # Negative gap: cut that many characters off the line end.
                    seq.append(line.seq[:line.knot.gap])
                else:
                    # Non-negative gap: whole line followed by the gap filler.
                    seq.append(line.seq)
                    seq.append(line.knot.gap_seq)
            else:
                seq.append(line.seq)
        sys.stdout.trace(cnt, ":", ";".join(id))
        SeqIO.write(NamedSequence("".join(seq), "contig_" + str(cnt)), handler, "fasta")
        cnt += 1
def simulate1(dir, mutation, genome): print "Simulating", genome ds = dataset_simulation.TestDataset(genome, 5000, mutation_rate=mutation) genome_seq = ds.mutate(ds.genome, mutation / 2)[0] f = open(os.path.join(dir, genome + str(mutation * 100) + ".fasta"), "w") SeqIO.write(NamedSequence(genome_seq, genome), f, "fasta") f.close()
def get_max_reads_length(reads_file, log, num_checked):
    """Return the maximum length among the first ``num_checked`` reads.

    :param reads_file: path of the reads file; its type is detected with
        ``SeqIO.get_read_file_type``.
    :param log: logger handed to ``error`` and used for the info message.
    :param num_checked: number of leading records to inspect.
    :return: length of the longest inspected record.
    """
    file_type = SeqIO.get_read_file_type(reads_file)
    if not file_type:
        error('Incorrect extension of reads file: ' + reads_file, log)
    records = SeqIO.parse(SeqIO.Open(reads_file, "r"), file_type)
    lengths = [len(rec) for rec in itertools.islice(records, num_checked)]
    max_reads_length = max(lengths)
    log.info(reads_file + ': max reads length: ' + str(max_reads_length))
    return max_reads_length
def prepare_disjointigs_file(disjointigs_file, disjointigs_file_list):
    """Concatenate the records of several FASTA files into one file.

    All input files are read completely before the output file is opened,
    so the output path may safely also appear in the input list.

    :param disjointigs_file: path of the FASTA file to write.
    :param disjointigs_file_list: iterable of FASTA paths to read, in order.
    """
    recs = []
    for fn in disjointigs_file_list:
        # Parse inside the ``with`` so the handle is closed once consumed.
        with open(fn, "r") as handle:
            recs.extend(SeqIO.parse_fasta(handle))
    with open(disjointigs_file, "w") as out:
        for rec in recs:
            SeqIO.write(rec, out, "fasta")
def collect_contigs(dataset, barcodes_dir, output_base, format):
    """Merge the per-barcode ``truseq_long_reads`` files into one file.

    Every contig id is prefixed with ``<barcode.id>-``. Barcodes whose input
    file does not exist are silently skipped.

    :param dataset: iterable of barcodes exposing an ``id`` attribute.
    :param barcodes_dir: directory holding one sub-directory per barcode id.
    :param output_base: output path without extension.
    :param format: sequence format; also used as the file extension.
    """
    with open(output_base + "." + format, "w") as output:
        for barcode in dataset:
            path = os.path.join(barcodes_dir, barcode.id,
                                "truseq_long_reads." + format)
            if not os.path.exists(path):
                continue
            # Close each input as soon as its contigs have been copied.
            with open(path) as handle:
                for contig in SeqIO.parse(handle, format):
                    contig.id = barcode.id + "-" + contig.id
                    SeqIO.write(contig, output, format)
def PrintResults(recs, reference, references_file, coordinates_file):
    """Write alignment records and the reference pieces they cover.

    For each record, ``str(rec)`` goes to ``coordinates_file`` (one per line)
    and the slice ``reference[rec.rname][rec.left:rec.right]`` goes to
    ``references_file`` as a FASTA record named ``<rname>_(<left>-<right>)``.

    :param recs: iterable of records exposing ``rname``, ``left``, ``right``.
    :param reference: mapping from ``rname`` to a sliceable sequence.
    :param references_file: path of the FASTA file to write.
    :param coordinates_file: path of the text file to write.
    """
    # Nested ``with`` blocks guarantee both files are closed on any error.
    with open(coordinates_file, "w") as aln:
        with open(references_file, "w") as fasta:
            for rec in recs:
                aln.write(str(rec) + "\n")
                sequence = reference[rec.rname][rec.left:rec.right]
                rec_id = (str(rec.rname) + "_(" + str(rec.left) + "-" +
                          str(rec.right) + ")")
                SeqIO.write(SeqIO.SeqRecord(sequence, rec_id), fasta, "fasta")
def get_read_file_type(input_filename, log=None):
    """Determine the reads file type of ``input_filename``.

    When the file is registered in ``options_storage.dict_of_prefixes`` the
    stored extension is appended to the dummy name ``"filename"`` and that
    name is used for detection; otherwise the file name itself is used.

    :param input_filename: path of the reads file.
    :param log: logger passed to ``error`` when detection fails.
    :return: the value returned by ``SeqIO.get_read_file_type``.
    """
    known_prefixes = options_storage.dict_of_prefixes
    if input_filename in known_prefixes:
        name_for_detection = "filename" + known_prefixes[input_filename]
    else:
        name_for_detection = input_filename
    file_type = SeqIO.get_read_file_type(name_for_detection)
    if not file_type:
        error("incorrect extension of reads file: %s" % input_filename, log)
    return file_type
def check_file_not_empty(input_filename, message="", log=None):
    """Report an error if the reads file contains no records.

    Files detected as ``'bam'`` are not inspected. Any exception raised while
    opening or parsing is reported through ``error`` together with the
    traceback; ``{FILE}`` placeholders in both texts are replaced by the
    absolute path.

    :param input_filename: path of the reads file (``~`` is expanded).
    :param message: extra text included in the "file is empty" report.
    :param log: logger passed to ``error``.
    """
    full_path = abspath(expanduser(input_filename))
    kind = get_read_file_type(input_filename, log)
    if kind == 'bam':
        return
    try:
        handle = SeqIO.Open(full_path, "r")
        first_record = next(SeqIO.parse(handle, kind), None)
        if first_record is None:
            error("file is empty: %s (%s)" % (full_path, message), log=log)
    except Exception as inst:
        summary = inst.args[0].format(FILE=full_path)
        details = traceback.format_exc().format(FILE=full_path)
        error(summary + "\n\n" + details, log=log)
def loadFromFasta(self, handler, save_names=False, int_ids=False, filter=lambda rec: True):
    # type: (BinaryIO, bool, bool, Callable[[NamedSequence], bool]) -> DisjointigCollection
    """Add the sequences of a FASTA stream to this collection.

    :param handler: readable handle given to ``SeqIO.parse_fasta``; it is
        consumed completely before any record is added.
    :param save_names: when true, record ids must not clash with existing
        items (asserted) and numeric ids advance ``self.cnt``.
    :param int_ids: with ``save_names``, pass the parsed number (as a string)
        to ``addNew`` as the second argument.
    :param filter: predicate; records for which it is false are skipped.
    :return: ``self``.
    """
    recs = list(SeqIO.parse_fasta(handler))
    if save_names:
        # Neither the id nor ``basic.Reverse(id)`` may already be stored.
        # NOTE(review): if ``self.items`` is a dict, ``.keys()`` builds a
        # list under Python 2, so each membership test is linear.
        for rec in recs:
            assert rec.id not in self.items.keys() and basic.Reverse(
                rec.id) not in self.items.keys()
    for rec in recs:
        if not filter(rec):
            continue
        if save_names:
            number = basic.parseNegativeNumber(rec.id)
            if number is not None:
                # Keep the counter ahead of every numeric id seen so far.
                self.cnt = max(self.cnt, int(abs(number)) + 1)
            if int_ids:
                # NOTE(review): ``number`` may be None here, in which case
                # the string "None" is passed -- confirm this is intended.
                self.addNew(rec.seq, str(number))
            else:
                self.addNew(rec.seq)
        else:
            self.addNew(rec.seq)
    return self
def main(contig_file, reads_file, sam_file, dir, contig_id):
    # type: (str, str, str, str, str) -> None
    """Align the reads named in a SAM file and print those hitting one contig.

    Steps: load contigs, collect the query names found in ``sam_file``, keep
    the reads of ``reads_file`` with those names (renamed ``Read<n>``), dump
    them to ``<dir>/reads.fasta``, add the result of ``addAllRC``, align all
    reads to the contigs, restrict them to ``contig_id`` and print each read
    with its alignments to that contig.

    :param contig_file: FASTA file with contigs.
    :param reads_file: FASTA file with reads.
    :param sam_file: SAM file whose query names select the reads.
    :param dir: working directory (created if missing).
    :param contig_id: key of the contig of interest in the collection.
    """
    basic.ensure_dir_existance(dir)
    contigs = ContigCollection()
    contigs.loadFromFasta(open(contig_file, "r"))
    print "Contigs loaded"
    contig = contigs[contig_id]
    # Names of all reads mentioned in the SAM file.
    read_names = set()
    for rec in Samfile(open(sam_file, "r")):
        read_names.add(rec.query_name)
    reads = ReadCollection()
    cnt = 0
    for rec in SeqIO.parse_fasta(open(reads_file, "r")):
        if rec.id in read_names:
            # Selected reads are renamed Read0, Read1, ... in file order.
            rec.id = "Read" + str(cnt)
            reads.add(AlignedRead(rec))
            cnt += 1
    # NOTE(review): this output handle is never closed explicitly.
    reads.print_fasta(open(os.path.join(dir, "reads.fasta"), "w"))
    print "Reads loaded", len(reads)
    reads.addAllRC()
    print "RC added", len(reads)
    aligner = Aligner(DirDistributor(os.path.join(dir, "alignments")))
    aligner.alignReadCollection(reads, contigs)
    print "Reads aligned", len(reads)
    reads = reads.inter(contig.asSegment())
    print "Reads filtered", len(reads)
    # Order reads by the left end of their first alignment to the contig.
    # ``.next()`` is the Python 2 iterator protocol.
    sorted_reads = sorted(list(reads.reads.values()), key = lambda read: read.alignmentsTo(contig.asSegment()).next().seg_to.left)
    for read in sorted_reads:
        print read
        for al in read.alignmentsTo(contig.asSegment()):
            print "\n".join(al.asMatchingStrings())
def get_max_reads_length(reads_file, log, num_checked):
    """Return the maximum length among the first ``num_checked`` reads.

    The file type is detected from the extension stored in
    ``options_storage.dict_of_prefixes`` when the file is registered there,
    otherwise from the file name itself.

    :param reads_file: path of the reads file.
    :param log: logger handed to ``error`` and used for the info message.
    :param num_checked: number of leading records to inspect.
    :return: length of the longest inspected record.
    """
    known_prefixes = options_storage.dict_of_prefixes
    if reads_file in known_prefixes:
        type_source = known_prefixes[reads_file]
    else:
        type_source = reads_file
    file_type = SeqIO.get_read_file_type(type_source)
    if not file_type:
        error("incorrect extension of reads file: %s" % reads_file, log)
    records = SeqIO.parse(SeqIO.Open(reads_file, "r"), file_type)
    lengths = [len(rec) for rec in itertools.islice(records, num_checked)]
    max_reads_length = max(lengths)
    log.info("%s: max reads length: %s" % (reads_file, str(max_reads_length)))
    return max_reads_length
def get_max_reads_length(reads_file, log, num_checked):
    """Return the maximum length among the first ``num_checked`` reads.

    Any exception raised while reading is reported through ``error`` with
    the traceback attached (``{FILE}`` placeholders are replaced by
    ``reads_file``). If ``error`` returns, the result is 0.

    :param reads_file: path of the reads file.
    :param log: logger handed to ``error`` and used for the info message.
    :param num_checked: number of leading records to inspect.
    :return: length of the longest inspected record, or 0 after a failure.
    """
    file_type = get_read_file_type(reads_file, log)
    max_reads_length = 0
    try:
        records = SeqIO.parse(SeqIO.Open(reads_file, "r"), file_type)
        lengths = [len(rec) for rec in itertools.islice(records, num_checked)]
        max_reads_length = max(lengths)
    except Exception as inst:
        summary = inst.args[0].format(FILE=reads_file)
        details = traceback.format_exc().format(FILE=reads_file)
        error(summary + "\n\n" + details, log=log)
    else:
        log.info("%s: max reads length: %s" % (reads_file, str(max_reads_length)))
    return max_reads_length
def readsN50(dir): for fn in os.listdir(dir): tmp = [] f = os.path.join(dir, fn, fn + ".fasta") for rec in SeqIO.parse_fasta(open(f, "r")): if len(tmp) >= 1000: break tmp.append(len(rec)) print fn, sorted(tmp)[len(tmp) / 2]
def simulate2(dir, mutation, error_rate, genome): print "Simulating", genome ds = dataset_simulation.TestDataset(genome, 4000, mutation_rate=mutation) genome_seq = ds.mutate(ds.genome, mutation / 2)[0] total = 0 f = open(os.path.join(dir, genome + ".fasta"), "w") SeqIO.write(NamedSequence(genome_seq, genome), f, "fasta") f.close() f = open(os.path.join(dir, "reads.fasta"), "w") cnt = 0 while total < len(genome_seq) * 30: l = random.randint(3000, 3500) pos = random.randint(0, len(genome_seq) - l) seq = ds.mutate(genome_seq[pos:pos + l], error_rate)[0] SeqIO.write(NamedSequence(seq, str(cnt)), f, "fasta") cnt += 1 total += len(seq) f.close()
def loadFromFasta(self, handler, num_names=True):
    # type: (BinaryIO, bool) -> ContigCollection
    """Add one ``Contig`` per record of a FASTA stream.

    :param handler: readable handle given to ``SeqIO.parse_fasta``.
    :param num_names: when true, the contig id is the string form of
        ``basic.parseNegativeNumberAndMod(rec.id)``; otherwise the record id
        is used unchanged.
    :return: ``self``.
    """
    for rec in SeqIO.parse_fasta(handler):
        if num_names:
            contig_id = str(basic.parseNegativeNumberAndMod(rec.id))
        else:
            contig_id = rec.id
        self.add(Contig(rec.seq, contig_id))
    return self
def main(ref_file, contig_size, rlen, cov, dir): basic.ensure_dir_existance(dir) all_contigs = ContigCollection().loadFromFasta(open(ref_file, "r"), False) contig_file_name = os.path.join(dir, "contigs.fasta") contig_file = open(contig_file_name, "w") reads_file_name = os.path.join(dir, "reads.fasta") reads_file = open(reads_file_name, "w") for ref in all_contigs.unique(): if len(ref) < contig_size: continue SeqIO.write(ref, contig_file, "fasta") for i in range(0, len(ref), max(1, rlen / cov)): read = ref.segment(i, min(i + rlen, len(ref))).asNamedSequence() SeqIO.write(read, reads_file, "fasta") reads_file.close() contig_file.close() print "Done" print contig_file_name print reads_file_name
def loadFromFile(self, fname, num_names=True):
    # type: (str, bool) -> ContigCollection
    """Add one ``Contig`` per record of the sequence file ``fname``.

    :param fname: path given to ``SeqIO.parse_by_name``.
    :param num_names: when true, the contig id is the string form of
        ``basic.parseNegativeNumberAndMod(rec.id)``; otherwise the record id
        is used unchanged.
    :return: ``self``.
    """
    for rec in SeqIO.parse_by_name(fname):
        if num_names:
            contig_id = str(basic.parseNegativeNumberAndMod(rec.id))
        else:
            contig_id = rec.id
        self.add(Contig(rec.seq, contig_id))
    return self
def CheckSequences(self, reads, reads_file):
    # type: (Iterable[NamedSequence], str) -> bool
    """Tell whether ``reads_file`` holds exactly the given reads, in order.

    :param reads: expected sequences (objects with ``id`` and ``seq``).
    :param reads_file: path of the FASTA file to compare against.
    :return: ``True`` only when the file exists and its records match
        ``reads`` one to one by id (compared as strings) and by sequence.
        ``False`` when the file is missing, the number of records differs,
        any record differs, or reading fails.
    """
    if not os.path.exists(reads_file):
        return False
    try:
        with open(reads_file, "r") as handle:
            pairs = itertools.izip_longest(SeqIO.parse_fasta(handle), reads)
            for rec, read in pairs:
                # izip_longest pads the shorter side with None, which means
                # the two sequences have different lengths.
                if rec is None or read is None:
                    return False
                if str(rec.id) != str(read.id) or rec.seq != read.seq:
                    return False
        return True
    except Exception:
        # Best effort: an unreadable or malformed file counts as a mismatch.
        # KeyboardInterrupt and SystemExit are deliberately not swallowed.
        return False
def FillSeq(self, f, numeric=True):
    """Fill edge sequences and lengths from a FASTA file.

    For every record, the entry of ``self.e`` keyed by the record id receives
    the sequence, and the entry keyed by ``"-" + id`` receives
    ``basic.RC`` of the sequence. Ids missing from ``self.e`` are ignored.

    :param f: path of the FASTA file.
    :param numeric: when true, ids are first normalised with
        ``str(basic.parseNumber(id))``.
    :return: ``self``.
    """
    # The context manager closes the input once all records are processed.
    with open(f, "r") as handle:
        for s in SeqIO.parse_fasta(handle):
            if numeric:
                s.id = str(basic.parseNumber(s.id))
            if s.id in self.e:
                self.e[s.id].seq = s.seq
                self.e[s.id].len = len(s.seq)
            rc_id = "-" + s.id
            if rc_id in self.e:
                self.e[rc_id].seq = basic.RC(s.seq)
                self.e[rc_id].len = len(s.seq)
    return self
def main(k, dir, contigs_file, reads_file): # type: (int, str, str, str) -> None basic.ensure_dir_existance(dir) CreateLog(dir) dd = DirDistributor(os.path.join(dir, "alignments")) aligner = Aligner(dd) params.k = k print "Loading contigs" tmp = sorted(ContigStorage().loadFromFasta(open(contigs_file, "r"), False).unique(), key=lambda contig: len(contig)) cnt = 1 contigs = ContigStorage() for c1, c2 in zip(tmp[::2], tmp[1::2]): # if c1.seq == c2.rc.seq: contigs.add(Contig(c1.seq, str(cnt))) print cnt, c1.id, c2.id cnt += 1 # else: # contigs.add(Contig(c1.seq, str(cnt))) # print cnt, c1.id # cnt += 1 # contigs.add(Contig(c2.seq, str(cnt))) # print cnt, c2.id # cnt += 1 print "Loading reads" reads = ReadCollection().loadFromFasta(open(reads_file, "r")) print "Aligning reads" for al in aligner.localAlign(reads, contigs): if len(al) > k: read = al.seg_from.contig # type:AlignedRead read.addAlignment(al) res = open(os.path.join(dir, "reads.fasta"), "w") for read in reads: if not basic.isCanonocal(read.id): continue if len(read.alignments) > 1: SeqIO.write(read, res, "fasta") res.close()
def FillSeq(self, f, numeric=True):
    """Fill edge sequences and lengths from a FASTA file.

    For every record, the entry of ``self.e`` keyed by the record id receives
    the sequence, and the entry keyed by ``basic.Reverse(id)`` receives
    ``basic.RC`` of the sequence. After loading, every edge must have a
    sequence (asserted).

    :param f: path of the FASTA file.
    :param numeric: when true, ids are first normalised with
        ``str(basic.parseNumber(id))``.
    :return: ``self``.
    """
    # The context manager closes the input once all records are processed.
    with open(f, "r") as handle:
        for s in SeqIO.parse_fasta(handle):
            if numeric:
                s.id = str(basic.parseNumber(s.id))
            if s.id in self.e:
                self.e[s.id].seq = s.seq
                self.e[s.id].len = len(s.seq)
            rc_id = basic.Reverse(s.id)
            if rc_id in self.e:
                self.e[rc_id].seq = basic.RC(s.seq)
                self.e[rc_id].len = len(s.seq)
    for edge in self.e.values():
        assert (edge.seq is not None)
    return self
def align(dir, contigs_file): CreateLog(dir) contigs = list(SeqIO.parse_fasta(open(contigs_file, "r"))) assert len(contigs) == 2 contigs = [ Contig(contigs[0].seq, contigs[0].id), Contig(contigs[1].seq, contigs[1].id) ] aligner = Aligner(DirDistributor(os.path.join(dir, "alignments"))) als = iter_align(aligner, contigs[0], contigs[1]) printVar(os.path.join(dir, "diff.txt"), als) for al in als: print al
def main(reads_file, ref_file, dir, error_rate):
    """Score read alignments against randomly corrupted reference windows.

    The longest record of ``ref_file`` is cut into 500-character windows.
    Each window is kept with probability about 0.05 and, when kept, has
    randomly chosen positions replaced through ``basic.rc``. Reads are
    aligned to the kept windows; for windows with fewer than 150 alignments
    longer than 450, the accurate score of each alignment is printed. At most
    5000 scores are printed. Progress messages go to standard error.

    :param reads_file: FASTA file with reads.
    :param ref_file: FASTA file with reference sequences.
    :param dir: working directory for the aligner (created if missing).
    :param error_rate: percentage used to derive the number of substitutions
        per window (``error_rate * 500 / 100``).
    """
    sys.stderr.write("Reading reference" + "\n")
    # Longest reference record only.
    ref = sorted(list(SeqIO.parse_fasta(open(ref_file, "r"))), key=lambda rec: len(rec))[-1]
    ref = Contig(ref.seq, ref.id)
    refs = ContigCollection()
    for i in range(0, len(ref) - 500, 500):
        if random.random() > 0.95:
            tmp = list(ref.segment(i, i + 500).Seq())
            # NOTE(review): positions are drawn independently, so the same
            # position may be hit more than once.
            for j in range(error_rate * 500 / 100):
                pos = random.randint(0, 499)
                tmp[pos] = basic.rc[tmp[pos]]
            refs.add(
                Contig("".join(tmp), ref.id + "(" + str(i) + "," + str(i + 500) + ")"))
    refs.print_names(sys.stderr)
    sys.stderr.write("Reading reads" + "\n")
    reads = ReadCollection()
    reads.loadFromFasta(open(reads_file, "r"))
    sys.stderr.write("Aligning reads" + "\n")
    basic.ensure_dir_existance(dir)
    aligner = Aligner(DirDistributor(dir))
    aligner.alignReadCollection(reads, refs)
    sys.stderr.write("Analysing alignments" + "\n")
    alignments = []
    for read in reads:
        alignments.extend(read.alignments)
    alignments = filter(lambda al: len(al) > 450, alignments)
    # Sorting by target contig id makes the groupby below see each target
    # contig as one run (assuming contig ids are unique -- TODO confirm).
    alignments = sorted(alignments, key=lambda al: (al.seg_to.contig.id, al.seg_from.contig.id))
    scorer = Scorer()
    scorer.scores.homo_score = 3
    scorer.scores.ins_score = 5
    scorer.scores.del_score = 5
    cnt = 0
    for contig, iter in itertools.groupby(alignments, key=lambda al: al.seg_to.contig):
        iter = list(iter)
        sys.stderr.write(str(contig) + " " + str(len(iter)) + "\n")
        if len(iter) < 150:
            for al in iter:
                print scorer.accurateScore(al.matchingSequence(), params.alignment_correction_radius)
                cnt += 1
                if cnt >= 5000:
                    break
        # Propagate the inner break: stop after 5000 printed scores.
        if cnt >= 5000:
            break
def polishMany(self, reads, sequences):
    # type: (Iterable[AlignedRead], List[Contig]) -> List[Contig]
    """Polish several sequences with the given reads.

    Inputs are written to a fresh directory from ``self.dir_distributor``.
    The polishing job is skipped when the distributor reports the same
    inputs as before, ``params.clean`` is false and the polished file exists.

    :param reads: reads written to ``reads.fasta``.
    :param sequences: sequences written to ``ref.fasta``.
    :return: one ``Contig`` per record of the polished file.
    """
    inputs = [(list(sequences), "ref.fasta"), (reads, "reads.fasta")]
    out_dir, new_files, same = self.dir_distributor.fillNextDir(inputs)
    consensus_file_name = new_files[0]
    reads_file_name = new_files[1]
    args = FakePolishingArgs()
    work_dir = os.path.join(out_dir, "work")
    basic.ensure_dir_existance(work_dir)
    job = JobPolishing(args, work_dir, os.path.join(out_dir, "log.info"),
                       [reads_file_name], consensus_file_name, "polish")
    polished_file = job.out_files["contigs"]
    can_reuse = same and not params.clean and os.path.exists(polished_file)
    if can_reuse:
        sys.stdout.trace("Polishing reused:", polished_file)
    else:
        sys.stdout.trace("Running polishing:", polished_file)
        job.run()
    polished = SeqIO.parse_fasta(open(polished_file, "r"))
    return [Contig(rec.seq, rec.id) for rec in polished]
def polish(self, reads, consensus):
    # type: (Iterable[NamedSequence], Contig) -> str
    """Polish a single consensus sequence with the given reads.

    Inputs are written to a fresh directory from ``self.dir_distributor``.
    The polishing job is skipped when the distributor reports the same
    inputs as before, ``params.clean`` is false and the polished file exists.

    :param reads: reads written to ``reads.fasta``.
    :param consensus: sequence written to ``ref.fasta``.
    :return: sequence of the first record of the polished file.
    """
    inputs = [([consensus], "ref.fasta"), (reads, "reads.fasta")]
    out_dir, new_files, same = self.dir_distributor.fillNextDir(inputs)
    consensus_file_name = new_files[0]
    reads_file_name = new_files[1]
    args = FakePolishingArgs()
    work_dir = os.path.join(out_dir, "work")
    basic.ensure_dir_existance(work_dir)
    job = JobPolishing(args, work_dir, os.path.join(out_dir, "log.info"),
                       [reads_file_name], consensus_file_name, "polish")
    polished_file = job.out_files["contigs"]
    can_reuse = same and not params.clean and os.path.exists(polished_file)
    if can_reuse:
        sys.stdout.trace("Polishing reused:", polished_file)
    else:
        sys.stdout.trace("Running polishing:", polished_file)
        job.run()
    polished = list(SeqIO.parse_fasta(open(polished_file, "r")))
    return polished[0].seq
def moleculo_postprocessing(contigs_file, output_file, sam_files, log):
    """Break and filter scaffolds using read alignments, then save them.

    Quality values are generated for the contigs from the alignments. The
    contigs are then passed through ``SplitAndFilter`` with coverage-, N- and
    pattern-based breakers plus length and pattern filters. The result is
    written through ``OutputResults`` as both "fasta" and "fastq".

    ``pattern`` and ``rc_pattern`` are taken from the enclosing module.

    :param contigs_file: FASTA file with the scaffolds to process.
    :param output_file: output path passed to ``OutputResults``.
    :param sam_files: iterable of SAM file paths with read alignments.
    :param log: logger used for progress messages.
    """
    log.info("===== Starting postprocessing based on read alignment")
    log.info("Processing scaffolds from " + contigs_file)
    log.info("Using read alignments to break and filter scaffolds")
    contigs = list(SeqIO.parse(open(contigs_file, "rU"), "fasta"))
    samfiles = []
    for sam_file in sam_files:
        samfiles.append(sam_parser.Samfile(sam_file))
    sam = sam_parser.SamChain(samfiles)
    generate_quality.GenerateQuality(contigs, sam)
    pattern_filter = moleculo_filter_contigs.PatternContigFilter(
        contigs, sam, pattern, rc_pattern)
    length_filter = moleculo_filter_contigs.ContigLengthFilter(1500)
    coverage_breaker = break_by_coverage.ContigBreaker(contigs, sam, 100, 50)
    pattern_breaker = break_by_coverage.PatternBreaker(pattern, rc_pattern, 150)
    n_breaker = break_by_coverage.NBreaker(3)
    result = SplitAndFilter(contigs, coverage_breaker, length_filter,
                            n_breaker, pattern_breaker, pattern_filter)
    for output_format in ("fasta", "fastq"):
        OutputResults(output_file, output_format, result)
    log.info("===== Postprocessing finished. Results can be found in " + output_file + ".fastq")
def draw(contigs_file, output_dir, k):
    """Build repeat components of the contigs and print a figure for each.

    Contigs are read, optionally reduced to every second one, and renumbered.
    Repeat components are then extracted. Each component with more than one
    block has its blocks placed on the X and Y axes and printed to standard
    output. Progress is reported with print statements throughout.

    :param contigs_file: FASTA file with contigs.
    :param output_dir: directory for the log and the alignments.
    :param k: parameter forwarded to ``ExtractRepeatComponents``.
    """
    aligner = Aligner(DirDistributor(os.path.join(output_dir, "alignments")))
    CreateLog(output_dir)
    print "Reading contigs"
    tmp = sorted(SeqIO.parse_fasta(open(contigs_file, "r")), key=lambda contig: len(contig))
    # Lengths in descending order; relies on Python 2 ``map`` returning a list.
    lens = map(len, tmp)[::-1]
    print lens
    contigs = ContigStorage()
    # If the lengths come in equal pairs, keep one contig of each pair
    # (presumably the other is its reverse complement -- TODO confirm).
    if lens[1::2] == lens[0::2]:
        tmp = tmp[0::2]
        print "Removed extra contigs"
    for i, contig in enumerate(tmp):
        print i, contig
        contigs.add(Contig(contig.seq, str(i)))
    print "Constructing components"
    componenets = ExtractRepeatComponents(contigs, aligner, k)
    print "Components:"
    for comp in componenets:
        print comp.segments
        print comp.alignments
    for cnt, comp in enumerate(componenets):
        print "Processing component", cnt
        print comp.segments
        # print comp.alignments
        print "Forming blocks"
        # Block ids restart from zero for every component.
        Block.id_cnt = 0
        blocks = CreateBlocks(comp)
        if len(blocks) == 1:
            print "Skipping trivial repeat"
            continue
        for block in blocks:
            print "Block", block.id, ":", block.segs
        for block in blocks:
            for other in block.out:
                print block.id, "->", other.id
        print "Placing blocks on X axis"
        code = placeX(blocks)
        # A return code of 1 from placeX is reported as a cycle.
        if code == 1:
            print "WARNING: component", cnt, "contains cycle. Aborting visualization."
            continue
        print "Placing blocks on Y axis"
        placeY(blocks, comp.segments)
        print "Printing figure"
        SimplePrinter().printBlocks(blocks, sys.stdout)
        print "Finished printing figure"
# Script body: polish the consensus sequences with the reads and align the
# reads back to the polished result.
# NOTE(review): ``reads_file`` and ``consensus_file`` are not assigned in this
# fragment; they must be defined before it runs.
dir = sys.argv[1]
extra_params = sys.argv[4:]
CreateLog(dir)
dd = DirDistributor(dir)
aligner = Aligner(dd)
polisher = Polisher(aligner, dd)
reads = ContigStorage().loadFromFasta(open(reads_file, "r"), num_names=False)
ref = ContigStorage().loadFromFasta(open(consensus_file, "r"), num_names=False)
if "accurate" in extra_params:
    # Accurate mode: polish each canonical reference contig separately using
    # only the alignments that target it.
    res = []
    # Sorted by target contig id so that groupby sees each contig as one run.
    als = sorted(aligner.overlapAlign(reads, ref), key=lambda al: al.seg_to.contig.id)
    for rid, rals in itertools.groupby(als, key=lambda al: al.seg_to.contig.id):
        if basic.isCanonocal(rid):
            contig = ref[rid]
            corrected_seq = polisher.polishSegment(
                contig.asSegment(), list(rals)).seg_from.Seq()
            res.append(Contig(corrected_seq, rid))
else:
    # Default mode: polish all unique reference contigs in one call.
    res = polisher.polishMany(reads, list(ref.unique()))
res_file = os.path.join(dir, "res.fasta")
rf = open(res_file, "w")
for c in res:
    SeqIO.write(c, rf, "fasta")
# Closed before align_files is given the path of the same file.
rf.close()
aligner.align_files(res_file, [reads_file], 16, "pacbio", "overlap", os.path.join(dir, "res.sam"))
# Script body: select the FASTA records (sys.argv[1]) that are mentioned in a
# dump file (sys.argv[2]) for the repeats of interest, and print them with
# their dump annotations appended to the id.
# NOTE(review): ``interest`` is not assigned in this fragment and ``repeat``
# is only set once a "#Repeat ..." line has been seen; both must be valid
# before the first non-header line is processed.
cur = None
dump = open(sys.argv[2]).readlines()
# Maps a name taken from the dump to a list of (repeat, section, sign).
d = dict()
for s in dump:
    s = s.strip()
    if s == "":
        continue
    if s.startswith("#"):
        # Header line: "#Repeat <id>" or "#All|Input|Output <section>".
        s = s[1:].split()
        if s[0] == "Repeat":
            repeat = s[1]
        if s[0] in ["All", "Input", "Output"]:
            cur = s[1]
    else:
        if repeat not in interest:
            continue
        # Data line: first character is the sign, the rest is the name.
        sign = s[0]
        s = s[1:]
        if s not in d:
            d[s] = []
        d[s].append((repeat, cur, sign))
for rec in SeqIO.parse_fasta(open(sys.argv[1])):
    # Only the first whitespace-separated token of the record id is matched.
    id = rec.id.split()[0]
    if id in d:
        tmp = d[id]
        # Apply RC to the sequence if the name was listed with sign "-" in a
        # section called "reads".
        if ("reads", "-") in [(a[1], a[2]) for a in tmp]:
            rec.seq = RC(rec.seq)
        SeqIO.write(
            common.seq_records.SeqRecord(rec.seq, id + "_" + str(d[id])),
            sys.stdout, "fasta")
        sys.stderr.write(id + "_" + str(d[id]) + "\n")