Exemple #1
0
 def printKnottedToFasta(self, handler):
     # type: (BinaryIO) -> None
     """Write every chain returned by ``self.chains()`` to ``handler`` as FASTA.

     A chain is skipped when the id of the reverse complement of its first
     line is already in the set of printed line ids. Records are named
     ``contig_<n>`` with ``n`` counting from 1. For each record the line ids,
     interleaved with the knot gaps, are sent to ``sys.stdout.trace``.
     """
     seen_ids = set()
     contig_no = 1
     for chain in self.chains():
         if chain[0].rc.id in seen_ids:
             continue
         seen_ids.update(line.id for line in chain)
         # A knot on the last line of the chain is reported as "Circular".
         labels = ["Circular"] if chain[-1].knot is not None else []
         pieces = []
         for line in chain:
             labels.append(line.id)
             knot = line.knot
             if knot is None:
                 pieces.append(line.seq)
                 continue
             labels.append(str(knot.gap))
             if knot.gap < 0:
                 # Negative gap: drop the last -gap characters of this line.
                 pieces.append(line.seq[:knot.gap])
             else:
                 # Non-negative gap: keep the line and append the gap filler.
                 pieces.append(line.seq)
                 pieces.append(knot.gap_seq)
         sys.stdout.trace(contig_no, ":", ";".join(labels))
         record = NamedSequence("".join(pieces), "contig_" + str(contig_no))
         SeqIO.write(record, handler, "fasta")
         contig_no += 1
 def addRead(self, read_seq):
     # type: (str) -> str
     """Append a read derived from ``read_seq`` to ``self.reads``.

     The read is named ``R<index>_<read_seq>``, where ``index`` is the number
     of reads stored before the call. Its sequence is the first element of
     ``self.mutate(self.translate(read_seq), self.error_rate)``.

     Returns the name given to the new read.
     """
     read_name = "_".join(["R" + str(len(self.reads)), read_seq])
     translated = self.translate(read_seq)
     mutated = self.mutate(translated, self.error_rate)[0]
     self.reads.append(NamedSequence(mutated, read_name))
     return read_name
def simulate1(dir, mutation, genome):
    print "Simulating", genome
    ds = dataset_simulation.TestDataset(genome, 5000, mutation_rate=mutation)
    genome_seq = ds.mutate(ds.genome, mutation / 2)[0]
    f = open(os.path.join(dir, genome + str(mutation * 100) + ".fasta"), "w")
    SeqIO.write(NamedSequence(genome_seq, genome), f, "fasta")
    f.close()
 def addDisjointig(self, disjointig_seq):
     # type: (str) -> str
     """Append a disjointig derived from ``disjointig_seq`` to ``self.disjointigs``.

     The stored sequence is the first element of
     ``self.mutate(self.translate(disjointig_seq), self.mutation_rate)``. It
     is named ``D<index>_<disjointig_seq>``, where ``index`` is the number of
     disjointigs stored before the call.

     Returns the ``id`` of the stored record.
     """
     translated = self.translate(disjointig_seq)
     mutated = self.mutate(translated, self.mutation_rate)[0]
     label = "".join(
         ["D", str(len(self.disjointigs)), "_", disjointig_seq])
     self.disjointigs.append(NamedSequence(mutated, label))
     return self.disjointigs[-1].id
def simulate2(dir, mutation, error_rate, genome):
    print "Simulating", genome
    ds = dataset_simulation.TestDataset(genome, 4000, mutation_rate=mutation)
    genome_seq = ds.mutate(ds.genome, mutation / 2)[0]
    total = 0
    f = open(os.path.join(dir, genome + ".fasta"), "w")
    SeqIO.write(NamedSequence(genome_seq, genome), f, "fasta")
    f.close()
    f = open(os.path.join(dir, "reads.fasta"), "w")
    cnt = 0
    while total < len(genome_seq) * 30:
        l = random.randint(3000, 3500)
        pos = random.randint(0, len(genome_seq) - l)
        seq = ds.mutate(genome_seq[pos:pos + l], error_rate)[0]
        SeqIO.write(NamedSequence(seq, str(cnt)), f, "fasta")
        cnt += 1
        total += len(seq)
    f.close()
Exemple #6
0
def uniqueNeighbours(edge_ids, graph, min_contig_length):
    unique = []
    for eid in edge_ids:
        print "Finding neighbours of", eid
        for e in graph.v[graph.e[eid].start].inc:
            if basic.Normalize(e.id) in edge_ids:
                continue
            id = basic.Normalize(e.id)
            if len(e.seq) < min_contig_length + params.bad_end_length:
                seq = uniquePathForward(
                    graph, e, min_contig_length + params.bad_end_length)
                id = id + "p"
                # seq = e.seq
            else:
                seq = e.seq[-min_contig_length - params.bad_end_length:]
                if e.id.startswith("-"):
                    id = id + "l"
                else:
                    id = id + "r"
            if e.id.startswith("-"):
                seq = basic.RC(seq)
            print "Right neighbour", eid, id
            unique.append((NamedSequence(seq, id), basic.Normalize(e.id)))
        for e in graph.v[graph.e[eid].end].out:
            if basic.Normalize(e.id) in edge_ids:
                continue
            id = basic.Normalize(e.id)
            if len(e.seq) < min_contig_length + params.bad_end_length:
                seq = uniquePathBackward(
                    graph, e, min_contig_length + params.bad_end_length)
                id = id + "p"
                # seq = e.seq
            else:
                seq = e.seq[:min_contig_length + params.bad_end_length]
                if e.id.startswith("-"):
                    id = id + "r"
                else:
                    id = id + "l"
            if e.id.startswith("-"):
                seq = basic.RC(seq)
            print "Left neighbour", eid, id
            unique.append((NamedSequence(seq, id), basic.Normalize(e.id)))
    return unique
Exemple #7
0
 def polishSmallSegment(self, seg, als):
     # type: (Segment, List[AlignmentPiece]) -> AlignmentPiece
     """Re-polish ``seg`` from the reads in ``als`` and align the result to it.

     If no alignment in ``als`` has a ``seg_to`` that contains ``seg``, a
     warning is logged and the identity alignment of ``seg`` (as a contig)
     onto itself is returned.

     Otherwise every alignment is reduced to ``seg``. Its read part is
     padded with a shared random flank on each side where the alignment
     reaches within 20 positions of that end of ``seg``. ``self.polish`` is
     then run on these reads against the flanked segment. On
     ``PolishException`` each read whose alignment covers ``seg`` is tried
     as the base in turn. If all attempts fail, the unpolished segment is
     used.

     Raises:
         AssertionError: if no alignment of the polished sequence to the
             flanked base satisfies ``seg_from.left < 10`` for both the
             alignment and its ``rc``. This is raised explicitly rather
             than via ``assert False`` so it still fires under ``python -O``.
     """
     if not any(al.seg_to.contains(seg) for al in als):
         sys.stdout.log(common.log_params.LogPriority.warning, "Warning",
                        seg, "has no covering reads")
         return AlignmentPiece.Identical(seg.asContig().asSegment(), seg)
     # Random flanks shared by the base and by every read that reaches the
     # corresponding end (presumably 200 characters each -- confirm against
     # basic.randomSequence).
     start = basic.randomSequence(200)
     end = basic.randomSequence(200)
     reads = []
     for al in als:
         al = al.reduce(target=seg)
         new_seq = ""
         if al.seg_to.left < seg.left + 20:
             new_seq += start
         new_seq += al.seg_from.Seq()
         if al.seg_to.right > seg.right - 20:
             new_seq += end
         reads.append(NamedSequence(new_seq, al.seg_from.contig.id))
     base = Contig(start + seg.Seq() + end, "base")
     polished = None
     try:
         polished = Contig(self.polish(reads, base), "polished")
     except PolishException:
         sys.stdout.log(
             common.log_params.LogPriority.warning, "Warning", seg,
             "has a sequence very different from reads. Using reads to correct."
         )
         # Fall back to using a covering read as the polishing base.
         for al, read in zip(als, reads):
             if not al.seg_to.contains(seg):
                 continue
             try:
                 polished = Contig(
                     self.polish(reads, Contig(read.seq, read.id)),
                     "polished")
                 break
             except PolishException:
                 # This read failed as a base; try the next covering read.
                 pass
     if polished is None:
         sys.stdout.log(
             common.log_params.LogPriority.warning, "Warning", seg,
             "could not be corrected even though some reads cover it.")
         polished = seg.asContig()
     polished_als = list(
         self.aligner.overlapAlign([polished], ContigStorage([base])))
     for al in polished_als:
         if al.seg_from.left < 10 and al.rc.seg_from.left < 10:
             # Map the unflanked interior of the base back onto ``seg``.
             mapping = AlignmentPiece.Identical(
                 base.segment(len(start),
                              len(base) - len(end)), seg)
             return al.compose(mapping)
     raise AssertionError(
         "No alignment from polished to base: " + str(polished_als))
Exemple #8
0
def main(flye_dir, rf, dir, edge_id, k):
    params.technology = "nano"
    params.k = k
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    print "Reading graph"
    graph = SimpleGraph().ReadGFA(os.path.join(flye_dir, "assembly_graph.gfa"))
    print "Parsing edge mapping"
    id_map = parseUPaths(flye_dir)
    edge_ids = edge_id.split(",")
    print "Extracting relevant graph component"
    res = open(os.path.join(dir, "contigs.fasta"), "w")
    unique = dict()
    for eid in edge_ids:
        for e in graph.v[graph.e[eid].start].inc:
            if basic.Normalize(e.id) in edge_ids:
                continue
            if len(e.seq) < 10000:
                if e.id.startswith("-"):
                    unique[e.id[1:]] = NamedSequence(basic.RC(e.seq), e.id[1:])
                else:
                    unique[e.id] = NamedSequence(e.seq, e.id)
            else:
                if e.id.startswith("-"):
                    unique[e.id[1:] + "l"] = NamedSequence(
                        basic.RC(e.seq[:5000]), e.id[1:] + "l")
                else:
                    unique[e.id + "r"] = NamedSequence(e.seq[-5000:],
                                                       e.id + "r")
        for e in graph.v[graph.e[eid].end].out:
            if basic.Normalize(e.id) in edge_ids:
                continue
            if len(e.seq) < 10000:
                if e.id.startswith("-"):
                    unique[e.id[1:]] = NamedSequence(basic.RC(e.seq), e.id[1:])
                else:
                    unique[e.id] = NamedSequence(e.seq, e.id)
            else:
                if e.id.startswith("-"):
                    unique[e.id[1:] + "r"] = NamedSequence(
                        basic.RC(e.seq[-5000:]), e.id[1:] + "r")
                else:
                    unique[e.id + "l"] = NamedSequence(e.seq[:5000],
                                                       e.id + "l")

    for c in unique.values():
        print c.id
        SeqIO.write(c, res, "fasta")
    res.close()
    old_ids = []
    for eid in edge_ids:
        for olde in id_map[eid[len("edge_"):]]:
            old_ids.append(basic.Normalize(olde))
    print "Finding reads that align to", edge_ids
    print "Old ids:", old_ids
    relevant_read_ids = set()
    for s in open(os.path.join(flye_dir, "20-repeat", "read_alignment_dump"),
                  "r").readlines():
        s = s.split()
        if s[0] != "Aln":
            continue
        if s[6].split("_")[1] in old_ids:
            relevant_read_ids.add(s[2][1:])
            print s[2][1:], s[6].split("_")[1]
    print "Reading reads"
    res = open(os.path.join(dir, "reads.fasta"), "w")
    for read in SeqIO.parse_fasta(open(rf, "r")):
        if read.id in relevant_read_ids and len(read) > k * 1.2:
            SeqIO.write(read, res, "fasta")
    res.close()
Exemple #9
0
 def asNamedSequence(self):
     # type: () -> NamedSequence
     """Return this segment as a ``NamedSequence``.

     The sequence is ``self.Seq()`` and the name is
     ``<contig id>[<left>,<right>]``.
     """
     seq = self.Seq()
     label = "".join(
         [self.contig.id, "[", str(self.left), ",", str(self.right), "]"])
     return NamedSequence(seq, label)
Exemple #10
0
 def __init__(self, seq, id, rc=None):
     # type: (str, str, Optional[Contig]) -> None
     """Initialise the contig and link it with its reverse complement.

     When ``rc`` is not given, the partner contig is created here from
     ``basic.RC(seq)`` and ``basic.Reverse(id)``. It receives ``self`` as
     its own ``rc``, so the pair refer to each other and construction does
     not recurse further.
     """
     NamedSequence.__init__(self, seq, id)
     self.rc = (Contig(basic.RC(seq), basic.Reverse(id), self)
                if rc is None else rc)
 def addContig(self, contig_seq):
     # type: (str) -> str
     """Append a contig derived from ``contig_seq`` to ``self.contigs``.

     The contig is named ``C<index>_<contig_seq>``, where ``index`` is the
     number of contigs stored before the call. Its sequence is
     ``self.translate(contig_seq)``.

     Returns the name given to the new contig.
     """
     label = "".join(["C", str(len(self.contigs)), "_", contig_seq])
     translated = self.translate(contig_seq)
     self.contigs.append(NamedSequence(translated, label))
     return label
def main(args):
    flye_dir = sys.argv[1]
    repeats, starts, ends = parse(sys.argv[2])
    graph_file, unique_file, disjointigs_file, rep_dir, tmp, their_file = cl_params.parseFlyeDir(
        flye_dir)
    dump = os.path.join(rep_dir, "read_alignment_dump")
    reads_file = sys.argv[3]
    dir = sys.argv[4]
    CreateLog(dir)
    print " ".join(args)
    print "Printing contigs"
    edges_file = os.path.join(rep_dir, "graph_before_rr.fasta")
    edges = ContigStorage().loadFromFasta(open(edges_file, "r"))
    unique = open(os.path.join(dir, "contigs"), "w")
    for l in starts:
        seq = "".join(map(lambda eid: edges[eid].seq, l))
        if len(seq) > 15000:
            seq = seq[-15000:]
        SeqIO.write(NamedSequence(seq, "(" + "_".join(l) + ")"), unique,
                    "fasta")
    for l in ends:
        seq = "".join(map(lambda eid: edges[eid].seq, l))
        if len(seq) > 15000:
            seq = seq[:15000]
        SeqIO.write(NamedSequence(basic.RC(seq), "(" + "_".join(l) + ")"),
                    unique, "fasta")
    unique.close()
    print "Selecting reads"
    reads = set()
    cur_read = None
    als = []
    for s in open(dump).readlines():
        if s.startswith("Chain"):
            if cur_read == "06dfd536-e258-446f-a019-8d8340ef831e":
                print als
            for al in als:
                if al in repeats:
                    if cur_read == "06dfd536-e258-446f-a019-8d8340ef831e":
                        print "oppa"
                    reads.add(cur_read)
                    break
            als = []

        else:
            s = s.split()
            cur_read = s[2][1:]
            eid = s[6].split("_")[1]
            if s[6][0] == "-":
                eid = "-" + eid
            if cur_read == "06dfd536-e258-446f-a019-8d8340ef831e":
                print eid
            als.append(eid)
    print "Selected", len(reads), "reads"
    print "\n".join(reads)
    print "Reading and printing reads"
    freads = open(os.path.join(dir, "reads.fasta"), "w")
    cnt = 0
    for read in SeqIO.parse_by_name(reads_file):
        cnt += 1
        if cnt % 10000 == 0:
            print cnt
        if read.id in reads:
            SeqIO.write(read, freads, "fasta")
    freads.close()