def main(contig_file, reads_file, sam_file, dir, contig_id):
    # type: (str, str, str, str, str) -> None
    basic.ensure_dir_existance(dir)
    contigs = ContigCollection()
    contigs.loadFromFasta(open(contig_file, "r"))
    print "Contigs loaded"
    contig = contigs[contig_id]
    read_names = set()
    for rec in Samfile(open(sam_file, "r")):
        read_names.add(rec.query_name)
    reads = ReadCollection()
    cnt = 0
    for rec in SeqIO.parse_fasta(open(reads_file, "r")):
        if rec.id in read_names:
            rec.id = "Read" + str(cnt)
            reads.add(AlignedRead(rec))
            cnt += 1
    reads.print_fasta(open(os.path.join(dir, "reads.fasta"), "w"))
    print "Reads loaded", len(reads)
    reads.addAllRC()
    print "RC added", len(reads)
    aligner = Aligner(DirDistributor(os.path.join(dir, "alignments")))
    aligner.alignReadCollection(reads, contigs)
    print "Reads aligned", len(reads)
    reads = reads.inter(contig.asSegment())
    print "Reads filtered", len(reads)
    sorted_reads = sorted(list(reads.reads.values()),
                          key=lambda read: read.alignmentsTo(contig.asSegment()).next().seg_to.left)
    for read in sorted_reads:
        print read
        for al in read.alignmentsTo(contig.asSegment()):
            print "\n".join(al.asMatchingStrings())
def loadFromFasta(self, handler, save_names=False, int_ids=False, filter=lambda rec: True):
    # type: (BinaryIO, bool, bool, Callable[[NamedSequence], bool]) -> DisjointigCollection
    recs = list(SeqIO.parse_fasta(handler))
    if save_names:
        for rec in recs:
            assert rec.id not in self.items.keys() and basic.Reverse(rec.id) not in self.items.keys()
    for rec in recs:
        if not filter(rec):
            continue
        if save_names:
            number = basic.parseNegativeNumber(rec.id)
            if number is not None:
                self.cnt = max(self.cnt, int(abs(number)) + 1)
            if int_ids:
                self.addNew(rec.seq, str(number))
            else:
                self.addNew(rec.seq)
        else:
            self.addNew(rec.seq)
    return self
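# Hedged usage sketch for the loader above: read named disjointigs while
# skipping short records. The no-argument DisjointigCollection constructor is
# an assumption; the keyword flags match the signature above.
# disjointigs = DisjointigCollection()
# disjointigs.loadFromFasta(open("disjointigs.fasta", "r"), save_names=True,
#                           filter=lambda rec: len(rec.seq) >= 10000)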
def readsN50(dir):
    # Note: despite the name, this prints the median length of the first 1000
    # reads of each sample, not the N50.
    for fn in os.listdir(dir):
        tmp = []
        f = os.path.join(dir, fn, fn + ".fasta")
        for rec in SeqIO.parse_fasta(open(f, "r")):
            if len(tmp) >= 1000:
                break
            tmp.append(len(rec))
        print fn, sorted(tmp)[len(tmp) / 2]
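# For contrast, a minimal true N50 over a list of read lengths (a standalone
# sketch, not part of the codebase): the largest length L such that reads of
# length >= L cover at least half of the total bases.
def n50(lengths):
    lengths = sorted(lengths, reverse=True)
    half = sum(lengths) / 2.0
    acc = 0
    for l in lengths:
        acc += l
        if acc >= half:
            return l
    return 0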
def prepare_disjointigs_file(disjointigs_file, disjointigs_file_list):
    recs = []
    for fn in disjointigs_file_list:
        for rec in SeqIO.parse_fasta(open(fn, "r")):
            recs.append(rec)
    h = open(disjointigs_file, "w")
    for rec in recs:
        SeqIO.write(rec, h, "fasta")
    h.close()
def loadFromFasta(self, handler, num_names=True):
    # type: (BinaryIO, bool) -> ContigCollection
    for rec in SeqIO.parse_fasta(handler):
        if num_names:
            self.add(Contig(rec.seq, str(basic.parseNegativeNumberAndMod(rec.id))))
        else:
            self.add(Contig(rec.seq, rec.id))
    return self
def FillSeq(self, f, numeric=True):
    for s in SeqIO.parse_fasta(open(f, "r")):
        if numeric:
            s.id = str(basic.parseNumber(s.id))
        if s.id in self.e:
            self.e[s.id].seq = s.seq
            self.e[s.id].len = len(s.seq)
        if "-" + s.id in self.e:
            self.e["-" + s.id].seq = basic.RC(s.seq)
            self.e["-" + s.id].len = len(s.seq)
    return self
def CheckSequences(self, reads, reads_file):
    # type: (Iterable[NamedSequence], str) -> bool
    if not os.path.exists(reads_file):
        return False
    try:
        for rec, read in itertools.izip_longest(SeqIO.parse_fasta(open(reads_file, "r")), reads):
            if str(rec.id) != str(read.id) or rec.seq != read.seq:
                return False
        return True
    except:
        # izip_longest pads the shorter iterable with None; the resulting
        # AttributeError lands here, so unequal lengths also return False.
        return False
def FillSeq(self, f, numeric=True):
    for s in SeqIO.parse_fasta(open(f, "r")):
        if numeric:
            s.id = str(basic.parseNumber(s.id))
        if s.id in self.e:
            self.e[s.id].seq = s.seq
            self.e[s.id].len = len(s.seq)
        if basic.Reverse(s.id) in self.e:
            self.e[basic.Reverse(s.id)].seq = basic.RC(s.seq)
            self.e[basic.Reverse(s.id)].len = len(s.seq)
    for edge in self.e.values():
        assert edge.seq is not None
    return self
def align(dir, contigs_file):
    CreateLog(dir)
    contigs = list(SeqIO.parse_fasta(open(contigs_file, "r")))
    assert len(contigs) == 2
    contigs = [Contig(contigs[0].seq, contigs[0].id),
               Contig(contigs[1].seq, contigs[1].id)]
    aligner = Aligner(DirDistributor(os.path.join(dir, "alignments")))
    als = iter_align(aligner, contigs[0], contigs[1])
    printVar(os.path.join(dir, "diff.txt"), als)
    for al in als:
        print al
def main(reads_file, ref_file, dir, error_rate):
    sys.stderr.write("Reading reference\n")
    ref = sorted(list(SeqIO.parse_fasta(open(ref_file, "r"))), key=lambda rec: len(rec))[-1]
    ref = Contig(ref.seq, ref.id)
    refs = ContigCollection()
    for i in range(0, len(ref) - 500, 500):
        if random.random() > 0.95:
            tmp = list(ref.segment(i, i + 500).Seq())
            for j in range(error_rate * 500 / 100):
                pos = random.randint(0, 499)
                tmp[pos] = basic.rc[tmp[pos]]
            refs.add(Contig("".join(tmp), ref.id + "(" + str(i) + "," + str(i + 500) + ")"))
    refs.print_names(sys.stderr)
    sys.stderr.write("Reading reads\n")
    reads = ReadCollection()
    reads.loadFromFasta(open(reads_file, "r"))
    sys.stderr.write("Aligning reads\n")
    basic.ensure_dir_existance(dir)
    aligner = Aligner(DirDistributor(dir))
    aligner.alignReadCollection(reads, refs)
    sys.stderr.write("Analysing alignments\n")
    alignments = []
    for read in reads:
        alignments.extend(read.alignments)
    alignments = filter(lambda al: len(al) > 450, alignments)
    alignments = sorted(alignments, key=lambda al: (al.seg_to.contig.id, al.seg_from.contig.id))
    scorer = Scorer()
    scorer.scores.homo_score = 3
    scorer.scores.ins_score = 5
    scorer.scores.del_score = 5
    cnt = 0
    for contig, iter in itertools.groupby(alignments, key=lambda al: al.seg_to.contig):
        iter = list(iter)
        sys.stderr.write(str(contig) + " " + str(len(iter)) + "\n")
        if len(iter) < 150:
            for al in iter:
                print scorer.accurateScore(al.matchingSequence(), params.alignment_correction_radius)
                cnt += 1
                if cnt >= 5000:
                    break
        if cnt >= 5000:
            break
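# A minimal standalone sketch of the error-injection step above, assuming a
# plain complement table instead of basic.rc (the table and the function name
# are illustrative, not part of the codebase):
import random

comp = {"A": "T", "C": "G", "G": "C", "T": "A"}

def mutate_window(seq, error_rate):
    # Substitute roughly error_rate percent of positions with their
    # complement, mirroring the loop over tmp in main() above.
    tmp = list(seq)
    for _ in range(error_rate * len(tmp) // 100):
        pos = random.randint(0, len(tmp) - 1)
        tmp[pos] = comp.get(tmp[pos], tmp[pos])
    return "".join(tmp)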
def polish(self, reads, consensus):
    # type: (Iterable[NamedSequence], Contig) -> str
    dir, new_files, same = self.dir_distributor.fillNextDir([([consensus], "ref.fasta"),
                                                             (reads, "reads.fasta")])
    consensus_file_name = new_files[0]
    reads_file_name = new_files[1]
    args = FakePolishingArgs()
    basic.ensure_dir_existance(os.path.join(dir, "work"))
    job = JobPolishing(args, os.path.join(dir, "work"), os.path.join(dir, "log.info"),
                       [reads_file_name], consensus_file_name, "polish")
    polished_file = job.out_files["contigs"]
    if same and not params.clean and os.path.exists(polished_file):
        sys.stdout.trace("Polishing reused:", polished_file)
    else:
        sys.stdout.trace("Running polishing:", polished_file)
        job.run()
    return list(SeqIO.parse_fasta(open(polished_file, "r")))[0].seq
def polishMany(self, reads, sequences):
    # type: (Iterable[AlignedRead], List[Contig]) -> List[Contig]
    dir, new_files, same = self.dir_distributor.fillNextDir([(list(sequences), "ref.fasta"),
                                                             (reads, "reads.fasta")])
    consensus_file_name = new_files[0]
    reads_file_name = new_files[1]
    args = FakePolishingArgs()
    basic.ensure_dir_existance(os.path.join(dir, "work"))
    job = JobPolishing(args, os.path.join(dir, "work"), os.path.join(dir, "log.info"),
                       [reads_file_name], consensus_file_name, "polish")
    polished_file = job.out_files["contigs"]
    if same and not params.clean and os.path.exists(polished_file):
        sys.stdout.trace("Polishing reused:", polished_file)
    else:
        sys.stdout.trace("Running polishing:", polished_file)
        job.run()
    return map(lambda rec: Contig(rec.seq, rec.id),
               SeqIO.parse_fasta(open(polished_file, "r")))
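# Hedged usage sketch for the two polishing entry points above. The class name
# Polisher and its constructor argument are assumptions; reads and Contig
# follow the classes used elsewhere in this section.
# polisher = Polisher(dir_distributor)
# polished_seq = polisher.polish(reads, Contig(seq, "contig_1"))
# polished_contigs = polisher.polishMany(reads, [contig1, contig2])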
def draw(contigs_file, output_dir, k):
    aligner = Aligner(DirDistributor(os.path.join(output_dir, "alignments")))
    CreateLog(output_dir)
    print "Reading contigs"
    tmp = sorted(SeqIO.parse_fasta(open(contigs_file, "r")), key=lambda contig: len(contig))
    lens = map(len, tmp)[::-1]
    print lens
    contigs = ContigStorage()
    if lens[1::2] == lens[0::2]:
        # Every length appears twice: drop the duplicates (likely RC copies).
        tmp = tmp[0::2]
        print "Removed extra contigs"
    for i, contig in enumerate(tmp):
        print i, contig
        contigs.add(Contig(contig.seq, str(i)))
    print "Constructing components"
    components = ExtractRepeatComponents(contigs, aligner, k)
    print "Components:"
    for comp in components:
        print comp.segments
        print comp.alignments
    for cnt, comp in enumerate(components):
        print "Processing component", cnt
        print comp.segments
        # print comp.alignments
        print "Forming blocks"
        Block.id_cnt = 0
        blocks = CreateBlocks(comp)
        if len(blocks) == 1:
            print "Skipping trivial repeat"
            continue
        for block in blocks:
            print "Block", block.id, ":", block.segs
        for block in blocks:
            for other in block.out:
                print block.id, "->", other.id
        print "Placing blocks on X axis"
        code = placeX(blocks)
        if code == 1:
            print "WARNING: component", cnt, "contains cycle. Aborting visualization."
            continue
        print "Placing blocks on Y axis"
        placeY(blocks, comp.segments)
        print "Printing figure"
        SimplePrinter().printBlocks(blocks, sys.stdout)
        print "Finished printing figure"
def main(flye_dir, rf, dir, edge_id, k):
    params.technology = "nano"
    params.k = k
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    print "Reading graph"
    graph = SimpleGraph().ReadGFA(os.path.join(flye_dir, "assembly_graph.gfa"))
    print "Parsing edge mapping"
    id_map = parseUPaths(flye_dir)
    edge_ids = edge_id.split(",")
    print "Extracting relevant graph component"
    res = open(os.path.join(dir, "contigs.fasta"), "w")
    unique = dict()
    for eid in edge_ids:
        # Edges entering the start vertex: keep short edges whole, otherwise
        # keep only the 5000bp flank adjacent to the vertex.
        for e in graph.v[graph.e[eid].start].inc:
            if basic.Normalize(e.id) in edge_ids:
                continue
            if len(e.seq) < 10000:
                if e.id.startswith("-"):
                    unique[e.id[1:]] = NamedSequence(basic.RC(e.seq), e.id[1:])
                else:
                    unique[e.id] = NamedSequence(e.seq, e.id)
            else:
                if e.id.startswith("-"):
                    unique[e.id[1:] + "l"] = NamedSequence(basic.RC(e.seq[:5000]), e.id[1:] + "l")
                else:
                    unique[e.id + "r"] = NamedSequence(e.seq[-5000:], e.id + "r")
        # Edges leaving the end vertex, handled symmetrically.
        for e in graph.v[graph.e[eid].end].out:
            if basic.Normalize(e.id) in edge_ids:
                continue
            if len(e.seq) < 10000:
                if e.id.startswith("-"):
                    unique[e.id[1:]] = NamedSequence(basic.RC(e.seq), e.id[1:])
                else:
                    unique[e.id] = NamedSequence(e.seq, e.id)
            else:
                if e.id.startswith("-"):
                    unique[e.id[1:] + "r"] = NamedSequence(basic.RC(e.seq[-5000:]), e.id[1:] + "r")
                else:
                    unique[e.id + "l"] = NamedSequence(e.seq[:5000], e.id + "l")
    for c in unique.values():
        print c.id
        SeqIO.write(c, res, "fasta")
    res.close()
    old_ids = []
    for eid in edge_ids:
        for olde in id_map[eid[len("edge_"):]]:
            old_ids.append(basic.Normalize(olde))
    print "Finding reads that align to", edge_ids
    print "Old ids:", old_ids
    relevant_read_ids = set()
    for s in open(os.path.join(flye_dir, "20-repeat", "read_alignment_dump"), "r").readlines():
        s = s.split()
        if s[0] != "Aln":
            continue
        if s[6].split("_")[1] in old_ids:
            relevant_read_ids.add(s[2][1:])
            print s[2][1:], s[6].split("_")[1]
    print "Reading reads"
    res = open(os.path.join(dir, "reads.fasta"), "w")
    for read in SeqIO.parse_fasta(open(rf, "r")):
        if read.id in relevant_read_ids and len(read) > k * 1.2:
            SeqIO.write(read, res, "fasta")
    res.close()
def extract(fname):
    res = dict()
    for read in SeqIO.parse_fasta(open(fname, "r")):
        res[read.id] = read.seq
    return res
import sys
sys.path.append("py")
from common import SeqIO

rf = sys.argv[1]
outf = sys.argv[2]
total = int(sys.argv[3])
print "Reading reads"
reads = list(SeqIO.parse_fasta(open(rf, "r")))
print "Sorting reads"
reads = sorted(reads, key=lambda read: -len(read))
print "Printing reads"
out = open(outf, "w")
for read in reads:
    if total <= 0:
        break
    total -= len(read)
    SeqIO.write(read, out, "fasta")
out.close()
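# Hypothetical invocation of the script above (the script name is an
# assumption): keep the longest reads until roughly 5 Gbp have been written.
#   python select_longest_reads.py reads.fasta longest.fasta 5000000000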
def ReadReference(file):
    result = dict()
    for rec in SeqIO.parse_fasta(open(file, "r")):
        result[rec.id] = rec.seq
    return result
import sys
import os
import common.seq_records
sys.path.append("py")
import common.SeqIO as SeqIO

read_len = int(sys.argv[2])
cov = float(sys.argv[3])
for seq in SeqIO.parse_fasta(open(sys.argv[1], "r")):
    sys.stderr.write(seq.id + " " + str(len(seq)) + " " + str(int(len(seq) * cov / read_len)) + "\n")
    # if len(seq) > 100000000 or len(seq) < 10000000:
    #     continue
    cur = 100000
    for i in range(0, len(seq), int(read_len / cov)):
        if i > cur:
            sys.stderr.write(str(cur) + "\n")
            cur = cur * 3 / 2
        SeqIO.write(common.seq_records.SeqRecord(seq.seq[i:min(len(seq), i + read_len)],
                                                 seq.id + "_" + str(i)), sys.stdout, "fasta")
    break
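# Hypothetical invocation of the script above (the script name is an
# assumption): tile error-free 10kb reads at 1x coverage across the first
# reference sequence, writing fasta to stdout.
#   python cut_reference.py ref.fasta 10000 1.0 > sim_reads.fasta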
# Fragment: tail of the DBG dot-printing method (the enclosing def and the
# "digraph {" header are not part of this snippet). Assign indices to
# branching vertices, then emit one dot edge per unique forward extension.
vert = dict()
cnt = 0
for v in self.v:
    inc = list(self.inc(v))
    out = list(self.out(v))
    if (self.incdeg(v) != 1 or self.outdeg(v) != 1) and self.incdeg(v) + self.outdeg(v) != 0:
        vert[v] = cnt
        cnt += 1
for v in vert:
    for next in self.out(v):
        e = self.unique_forward(v[0], next)
        handler.write(self.ntostr(vert[v]) + " -> " + self.ntostr(vert[e[-self.k:]]) +
                      "[label = \"" + str(len(e) - self.k) + ":" + str(self.cov(e)) +
                      "\", color = \"black\"] ;\n")
handler.write("}\n")

sys.stderr.write("starting\n")
k = int(sys.argv[2])
edges = ConstructEdges(SeqIO.parse_fasta(open(sys.argv[1], "r")), k, int(sys.argv[3]))
sys.stderr.write("constructing dbg\n")
g = DBG(edges, k)
sys.stderr.write("cleaning\n")
g.clean(20)
sys.stderr.write("printing dbg\n")
g.print_dot(sys.stdout)
dir = os.path.join(sys.argv[4])
if not os.path.exists(dir):
    os.makedirs(dir)
names = sys.argv[1].split(";")
tmp = dict()
for s in names:
    if s.endswith("RC"):
        tmp[s[:-2]] = True
    else:
        tmp[s] = False
names = tmp
contigs_file = os.path.join(sys.argv[4], "contigs.fasta")
contigs_handler = open(contigs_file, "w")
for rec in SeqIO.parse_fasta(open(sys.argv[2], "r")):
    if rec.id in names:
        if names[rec.id]:
            rec.seq = RC(rec.seq)
            rec.id += "RC"
        SeqIO.write(rec, contigs_handler, "fasta")
contigs_handler.close()
alignment_dir = os.path.join(sys.argv[4], "alignment")
if not os.path.exists(alignment_dir):
    os.makedirs(alignment_dir)
alignment = os.path.join(sys.argv[4], "alignment.sam")
make_alignment(contigs_file, [sys.argv[3]], 8, alignment_dir, "pacbio", alignment)
cur = None
dump = open(sys.argv[2]).readlines()
d = dict()
for s in dump:
    s = s.strip()
    if s == "":
        continue
    if s.startswith("#"):
        s = s[1:].split()
        if s[0] == "Repeat":
            repeat = s[1]
        if s[0] in ["All", "Input", "Output"]:
            cur = s[1]
    else:
        # `interest` is defined elsewhere in this script.
        if repeat not in interest:
            continue
        sign = s[0]
        s = s[1:]
        if s not in d:
            d[s] = []
        d[s].append((repeat, cur, sign))
for rec in SeqIO.parse_fasta(open(sys.argv[1])):
    id = rec.id.split()[0]
    if id in d:
        tmp = d[id]
        if ("reads", "-") in [(a[1], a[2]) for a in tmp]:
            rec.seq = RC(rec.seq)
        SeqIO.write(common.seq_records.SeqRecord(rec.seq, id + "_" + str(d[id])),
                    sys.stdout, "fasta")
        sys.stderr.write(id + "_" + str(d[id]) + "\n")
import sys
sys.path.append("py")
from common import SeqIO

if __name__ == "__main__":
    contigs = list(SeqIO.parse_fasta(open(sys.argv[1], "r")))
    print "Total:", sum(map(len, contigs)), "nucleotides in", len(contigs), "contigs."
    for contig in contigs:
        print contig.id, len(contig)
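# Hypothetical invocation of the script above (the script name is an
# assumption): print the total assembly size, then one "id length" line per
# contig.
#   python contig_stats.py contigs.fasta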