def printKnottedToFasta(self, handler):
    # type: (BinaryIO) -> None
    # Write each knotted chain of lines as a single FASTA contig, merging consecutive
    # lines and trimming or filling the junctions according to knot.gap.
    printed = set()
    cnt = 1
    for chain in self.chains():
        if chain[0].rc.id in printed:
            continue
        for line in chain:
            printed.add(line.id)
        seq = []
        id = []
        if chain[-1].knot is not None:
            id.append("Circular")
        for line in chain:
            id.append(line.id)
            if line.knot is not None:
                id.append(str(line.knot.gap))
                if line.knot.gap < 0:
                    seq.append(line.seq[:line.knot.gap])
                else:
                    seq.append(line.seq)
                    seq.append(line.knot.gap_seq)
            else:
                seq.append(line.seq)
        sys.stdout.trace(cnt, ":", ";".join(id))
        SeqIO.write(NamedSequence("".join(seq), "contig_" + str(cnt)), handler, "fasta")
        cnt += 1
def addRead(self, read_seq):
    # type: (str) -> str
    # Simulate a read: translate the block pattern, add errors, and record it under a generated name.
    name = "R" + str(len(self.reads)) + "_" + read_seq
    self.reads.append(NamedSequence(self.mutate(self.translate(read_seq), self.error_rate)[0], name))
    return name
def simulate1(dir, mutation, genome):
    # Simulate a single mutated copy of the genome and write it to <genome><mutation*100>.fasta.
    print "Simulating", genome
    ds = dataset_simulation.TestDataset(genome, 5000, mutation_rate=mutation)
    genome_seq = ds.mutate(ds.genome, mutation / 2)[0]
    f = open(os.path.join(dir, genome + str(mutation * 100) + ".fasta"), "w")
    SeqIO.write(NamedSequence(genome_seq, genome), f, "fasta")
    f.close()
def addDisjointig(self, disjointig_seq):
    # type: (str) -> str
    self.disjointigs.append(
        NamedSequence(self.mutate(self.translate(disjointig_seq), self.mutation_rate)[0],
                      "D" + str(len(self.disjointigs)) + "_" + disjointig_seq))
    return self.disjointigs[-1].id
def simulate2(dir, mutation, error_rate, genome):
    # Simulate a mutated genome plus ~30x coverage of 3000-3500bp reads with the given error rate.
    print "Simulating", genome
    ds = dataset_simulation.TestDataset(genome, 4000, mutation_rate=mutation)
    genome_seq = ds.mutate(ds.genome, mutation / 2)[0]
    total = 0
    f = open(os.path.join(dir, genome + ".fasta"), "w")
    SeqIO.write(NamedSequence(genome_seq, genome), f, "fasta")
    f.close()
    f = open(os.path.join(dir, "reads.fasta"), "w")
    cnt = 0
    while total < len(genome_seq) * 30:
        l = random.randint(3000, 3500)
        pos = random.randint(0, len(genome_seq) - l)
        seq = ds.mutate(genome_seq[pos:pos + l], error_rate)[0]
        SeqIO.write(NamedSequence(seq, str(cnt)), f, "fasta")
        cnt += 1
        total += len(seq)
    f.close()
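# A minimal driver sketch for simulate1/simulate2 above. This is an assumption for
# illustration only: the output directory, the genome pattern string "abcabc", and the
# 5%/15% rates are hypothetical values, not taken from the source.
if __name__ == "__main__":
    out_dir = sys.argv[1]
    basic.ensure_dir_existance(out_dir)       # helper used elsewhere in this codebase
    simulate1(out_dir, 0.05, "abcabc")        # writes abcabc5.0.fasta with a lightly mutated genome
    simulate2(out_dir, 0.05, 0.15, "abcabc")  # also writes ~30x of 3000-3500bp reads with 15% errors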
def uniqueNeighbours(edge_ids, graph, min_contig_length):
    # Collect flanking sequences of all graph edges adjacent to the given edges,
    # extending short neighbours along unique paths and truncating long ones.
    unique = []
    for eid in edge_ids:
        print "Finding neighbours of", eid
        for e in graph.v[graph.e[eid].start].inc:
            if basic.Normalize(e.id) in edge_ids:
                continue
            id = basic.Normalize(e.id)
            if len(e.seq) < min_contig_length + params.bad_end_length:
                seq = uniquePathForward(graph, e, min_contig_length + params.bad_end_length)
                id = id + "p"
                # seq = e.seq
            else:
                seq = e.seq[-min_contig_length - params.bad_end_length:]
            if e.id.startswith("-"):
                id = id + "l"
            else:
                id = id + "r"
            if e.id.startswith("-"):
                seq = basic.RC(seq)
            print "Right neighbour", eid, id
            unique.append((NamedSequence(seq, id), basic.Normalize(e.id)))
        for e in graph.v[graph.e[eid].end].out:
            if basic.Normalize(e.id) in edge_ids:
                continue
            id = basic.Normalize(e.id)
            if len(e.seq) < min_contig_length + params.bad_end_length:
                seq = uniquePathBackward(graph, e, min_contig_length + params.bad_end_length)
                id = id + "p"
                # seq = e.seq
            else:
                seq = e.seq[:min_contig_length + params.bad_end_length]
            if e.id.startswith("-"):
                id = id + "r"
            else:
                id = id + "l"
            if e.id.startswith("-"):
                seq = basic.RC(seq)
            print "Left neighbour", eid, id
            unique.append((NamedSequence(seq, id), basic.Normalize(e.id)))
    return unique
def polishSmallSegment(self, seg, als):
    # type: (Segment, List[AlignmentPiece]) -> AlignmentPiece
    # Polish a short segment using the reads aligned to it and return an alignment
    # of the polished sequence back onto the original segment.
    ok = False
    for al in als:
        if al.seg_to.contains(seg):
            ok = True
    if not ok:
        sys.stdout.log(common.log_params.LogPriority.warning, "Warning", seg, "has no covering reads")
        return AlignmentPiece.Identical(seg.asContig().asSegment(), seg)
    reads = []
    start = basic.randomSequence(200)
    end = basic.randomSequence(200)
    for al in als:
        new_seq = ""
        al = al.reduce(target=seg)
        if al.seg_to.left < seg.left + 20:
            new_seq += start
        new_seq += al.seg_from.Seq()
        if al.seg_to.right > seg.right - 20:
            new_seq += end
        reads.append(NamedSequence(new_seq, al.seg_from.contig.id))
    base = Contig(start + seg.Seq() + end, "base")
    polished = None
    try:
        polished = Contig(self.polish(reads, base), "polished")
    except PolishException:
        sys.stdout.log(common.log_params.LogPriority.warning, "Warning", seg,
                       "has a sequence very different from reads. Using reads to correct.")
        for al, read in zip(als, reads):
            if al.seg_to.contains(seg):
                try:
                    polished = Contig(self.polish(reads, Contig(read.seq, read.id)), "polished")
                    break
                except PolishException:
                    pass
    if polished is None:
        sys.stdout.log(common.log_params.LogPriority.warning, "Warning", seg,
                       "could not be corrected even though some reads cover it.")
        polished = seg.asContig()
    als = list(self.aligner.overlapAlign([polished], ContigStorage([base])))
    for al in als:
        if al.seg_from.left < 10 and al.rc.seg_from.left < 10:
            mapping = AlignmentPiece.Identical(base.segment(len(start), len(base) - len(end)), seg)
            return al.compose(mapping)
    assert False, "No alignment from polished to base: " + str(als)
def main(flye_dir, rf, dir, edge_id, k):
    params.technology = "nano"
    params.k = k
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    print "Reading graph"
    graph = SimpleGraph().ReadGFA(os.path.join(flye_dir, "assembly_graph.gfa"))
    print "Parsing edge mapping"
    id_map = parseUPaths(flye_dir)
    edge_ids = edge_id.split(",")
    print "Extracting relevant graph component"
    res = open(os.path.join(dir, "contigs.fasta"), "w")
    unique = dict()
    for eid in edge_ids:
        # Neighbours entering the start of the edge: short edges are taken whole,
        # long edges are truncated to 5000bp.
        for e in graph.v[graph.e[eid].start].inc:
            if basic.Normalize(e.id) in edge_ids:
                continue
            if len(e.seq) < 10000:
                if e.id.startswith("-"):
                    unique[e.id[1:]] = NamedSequence(basic.RC(e.seq), e.id[1:])
                else:
                    unique[e.id] = NamedSequence(e.seq, e.id)
            else:
                if e.id.startswith("-"):
                    unique[e.id[1:] + "l"] = NamedSequence(basic.RC(e.seq[:5000]), e.id[1:] + "l")
                else:
                    unique[e.id + "r"] = NamedSequence(e.seq[-5000:], e.id + "r")
        # Neighbours leaving the end of the edge, handled symmetrically.
        for e in graph.v[graph.e[eid].end].out:
            if basic.Normalize(e.id) in edge_ids:
                continue
            if len(e.seq) < 10000:
                if e.id.startswith("-"):
                    unique[e.id[1:]] = NamedSequence(basic.RC(e.seq), e.id[1:])
                else:
                    unique[e.id] = NamedSequence(e.seq, e.id)
            else:
                if e.id.startswith("-"):
                    unique[e.id[1:] + "r"] = NamedSequence(basic.RC(e.seq[-5000:]), e.id[1:] + "r")
                else:
                    unique[e.id + "l"] = NamedSequence(e.seq[:5000], e.id + "l")
    for c in unique.values():
        print c.id
        SeqIO.write(c, res, "fasta")
    res.close()
    # Map the requested edges back to the edge ids used in the repeat-resolution dump.
    old_ids = []
    for eid in edge_ids:
        for olde in id_map[eid[len("edge_"):]]:
            old_ids.append(basic.Normalize(olde))
    print "Finding reads that align to", edge_ids
    print "Old ids:", old_ids
    relevant_read_ids = set()
    for s in open(os.path.join(flye_dir, "20-repeat", "read_alignment_dump"), "r").readlines():
        s = s.split()
        if s[0] != "Aln":
            continue
        if s[6].split("_")[1] in old_ids:
            relevant_read_ids.add(s[2][1:])
            print s[2][1:], s[6].split("_")[1]
    print "Reading reads"
    res = open(os.path.join(dir, "reads.fasta"), "w")
    for read in SeqIO.parse_fasta(open(rf, "r")):
        if read.id in relevant_read_ids and len(read) > k * 1.2:
            SeqIO.write(read, res, "fasta")
    res.close()
def asNamedSequence(self):
    # type: () -> NamedSequence
    return NamedSequence(self.Seq(),
                         self.contig.id + "[" + str(self.left) + "," + str(self.right) + "]")
def __init__(self, seq, id, rc=None):
    # type: (str, str, Optional[Contig]) -> None
    NamedSequence.__init__(self, seq, id)
    if rc is None:
        rc = Contig(basic.RC(seq), basic.Reverse(id), self)
    self.rc = rc
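# Hypothetical usage sketch of the Contig/Segment helpers above; the sequence and id
# are made up, and segment(left, right) -> Segment is assumed from its use in
# polishSmallSegment.
contig = Contig("ACGTACGTACGTACGT", "c1")  # reverse-complement Contig is attached automatically as contig.rc
seg = contig.segment(2, 10)
print seg.asNamedSequence().id             # expected: "c1[2,10]"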
def addContig(self, contig_seq):
    # type: (str) -> str
    name = "C" + str(len(self.contigs)) + "_" + contig_seq
    self.contigs.append(NamedSequence(self.translate(contig_seq), name))
    return name
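# Hedged sketch of how the TestDataset helpers above (addRead, addDisjointig, addContig)
# might be combined, mirroring the constructor call in simulate1. The pattern strings
# and mutation rate are illustrative assumptions only.
ds = dataset_simulation.TestDataset("abcabc", 5000, mutation_rate=0.01)
read_name = ds.addRead("abc")             # -> "R0_abc": mutated copy of the translated pattern
disjointig_id = ds.addDisjointig("bcab")  # -> "D0_bcab"
contig_name = ds.addContig("cab")         # -> "C0_cab"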
def main(args):
    flye_dir = sys.argv[1]
    repeats, starts, ends = parse(sys.argv[2])
    graph_file, unique_file, disjointigs_file, rep_dir, tmp, their_file = cl_params.parseFlyeDir(flye_dir)
    dump = os.path.join(rep_dir, "read_alignment_dump")
    reads_file = sys.argv[3]
    dir = sys.argv[4]
    CreateLog(dir)
    print " ".join(args)
    print "Printing contigs"
    edges_file = os.path.join(rep_dir, "graph_before_rr.fasta")
    edges = ContigStorage().loadFromFasta(open(edges_file, "r"))
    unique = open(os.path.join(dir, "contigs"), "w")
    # Concatenate the edges of each flanking path and keep at most 15000bp of sequence.
    for l in starts:
        seq = "".join(map(lambda eid: edges[eid].seq, l))
        if len(seq) > 15000:
            seq = seq[-15000:]
        SeqIO.write(NamedSequence(seq, "(" + "_".join(l) + ")"), unique, "fasta")
    for l in ends:
        seq = "".join(map(lambda eid: edges[eid].seq, l))
        if len(seq) > 15000:
            seq = seq[:15000]
        SeqIO.write(NamedSequence(basic.RC(seq), "(" + "_".join(l) + ")"), unique, "fasta")
    unique.close()
    print "Selecting reads"
    reads = set()
    cur_read = None
    als = []
    # Parse the alignment dump chain by chain; the hard-coded read id below enables
    # debug tracing for that specific read.
    for s in open(dump).readlines():
        if s.startswith("Chain"):
            if cur_read == "06dfd536-e258-446f-a019-8d8340ef831e":
                print als
            for al in als:
                if al in repeats:
                    if cur_read == "06dfd536-e258-446f-a019-8d8340ef831e":
                        print "oppa"
                    reads.add(cur_read)
                    break
            als = []
        else:
            s = s.split()
            cur_read = s[2][1:]
            eid = s[6].split("_")[1]
            if s[6][0] == "-":
                eid = "-" + eid
            if cur_read == "06dfd536-e258-446f-a019-8d8340ef831e":
                print eid
            als.append(eid)
    print "Selected", len(reads), "reads"
    print "\n".join(reads)
    print "Reading and printing reads"
    freads = open(os.path.join(dir, "reads.fasta"), "w")
    cnt = 0
    for read in SeqIO.parse_by_name(reads_file):
        cnt += 1
        if cnt % 10000 == 0:
            print cnt
        if read.id in reads:
            SeqIO.write(read, freads, "fasta")
    freads.close()