Example #1
def printSegs(f, segs):
    c = ContigStorage().loadFromFasta(open(f, "r"), False)
    for seg in segs:
        if seg[2] == 0:
            seg[2] = len(c[seg[0]])
        SeqIO.write(c[seg[0]].segment(seg[1], seg[2]).asContig(), sys.stdout,
                    "fasta")
Example #2
def printKnottedToFasta(self, handler):
    # type: (BinaryIO) -> None
    printed = set()
    cnt = 1
    for chain in self.chains():
        if chain[0].rc.id in printed:
            continue
        for line in chain:
            printed.add(line.id)
        seq = []
        id = []
        if chain[-1].knot is not None:
            id.append("Circular")
        for line in chain:
            id.append(line.id)
            if line.knot is not None:
                id.append(str(line.knot.gap))
                if line.knot.gap < 0:
                    seq.append(line.seq[:line.knot.gap])
                else:
                    seq.append(line.seq)
                    seq.append(line.knot.gap_seq)
            else:
                seq.append(line.seq)
        sys.stdout.trace(cnt, ":", ";".join(id))
        SeqIO.write(NamedSequence("".join(seq), "contig_" + str(cnt)),
                    handler, "fasta")
        cnt += 1
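The knot handling above reduces to one rule per line; restated in isolation as a hypothetical helper (not part of the original class): a negative gap means the next line overlaps this one and the overlap is trimmed, while a non-negative gap keeps the line and appends the stored linking sequence.

def stitch(line_seq, gap, gap_seq):
    if gap < 0:
        return line_seq[:gap]  # trim the overlap with the next line
    return line_seq + gap_seq  # keep the line, then insert the gap filler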
Example #3
def simulate1(dir, mutation, genome):
    print "Simulating", genome
    ds = dataset_simulation.TestDataset(genome, 5000, mutation_rate=mutation)
    genome_seq = ds.mutate(ds.genome, mutation / 2)[0]
    f = open(os.path.join(dir, genome + str(mutation * 100) + ".fasta"), "w")
    SeqIO.write(NamedSequence(genome_seq, genome), f, "fasta")
    f.close()
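A hedged usage note (directory and genome names are made up): the output file is named genome + str(mutation * 100) + ".fasta", so the call below writes sim_out/ecoli5.0.fasta under Python 2's float formatting.

simulate1("sim_out", 0.05, "ecoli")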
Example #4
def prepare_disjointigs_file(disjointigs_file, disjointigs_file_list):
    recs = []
    for fn in disjointigs_file_list:
        for rec in SeqIO.parse_fasta(open(fn, "r")):
            recs.append(rec)
    h = open(disjointigs_file, "w")
    for rec in recs:
        SeqIO.write(rec, h, "fasta")
    h.close()
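A hedged usage sketch (paths are made up): the function simply concatenates the records of several FASTA files into one output file.

prepare_disjointigs_file("all_disjointigs.fasta",
                         ["run1/disjointigs.fasta", "run2/disjointigs.fasta"])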
Example #5
def collect_contigs(dataset, barcodes_dir, output_base, format):
    output = open(output_base + "." + format, "w")
    for barcode in dataset:
        file = os.path.join(barcodes_dir, barcode.id, "truseq_long_reads." + format)
        if os.path.exists(file):
            contigs = SeqIO.parse(open(file), format)
            for contig in contigs:
                contig.id = barcode.id + "-" + contig.id
                SeqIO.write(contig, output, format)
    output.close()
Example #6
def PrintResults(recs, reference, references_file, coordinates_file):
    aln = open(coordinates_file, "w")
    fasta = open(references_file, "w")
    for rec in recs:
        aln.write(str(rec) + "\n")
        sequence = reference[rec.rname][rec.left:rec.right]
        rec_id = str(rec.rname) + "_(" + str(rec.left) + "-" + str(rec.right)+")"
        SeqIO.write(SeqIO.SeqRecord(sequence, rec_id), fasta, "fasta")
    aln.close()
    fasta.close()
Example #7
def simulate2(dir, mutation, error_rate, genome):
    print "Simulating", genome
    ds = dataset_simulation.TestDataset(genome, 4000, mutation_rate=mutation)
    genome_seq = ds.mutate(ds.genome, mutation / 2)[0]
    total = 0
    f = open(os.path.join(dir, genome + ".fasta"), "w")
    SeqIO.write(NamedSequence(genome_seq, genome), f, "fasta")
    f.close()
    f = open(os.path.join(dir, "reads.fasta"), "w")
    cnt = 0
    while total < len(genome_seq) * 30:
        l = random.randint(3000, 3500)
        pos = random.randint(0, len(genome_seq) - l)
        seq = ds.mutate(genome_seq[pos:pos + l], error_rate)[0]
        SeqIO.write(NamedSequence(seq, str(cnt)), f, "fasta")
        cnt += 1
        total += len(seq)
    f.close()
Example #8
def main(ref_file, contig_size, rlen, cov, dir):
    basic.ensure_dir_existance(dir)
    all_contigs = ContigCollection().loadFromFasta(open(ref_file, "r"), False)
    contig_file_name = os.path.join(dir, "contigs.fasta")
    contig_file = open(contig_file_name, "w")
    reads_file_name = os.path.join(dir, "reads.fasta")
    reads_file = open(reads_file_name, "w")
    for ref in all_contigs.unique():
        if len(ref) < contig_size:
            continue
        SeqIO.write(ref, contig_file, "fasta")
        for i in range(0, len(ref), max(1, rlen / cov)):
            read = ref.segment(i, min(i + rlen, len(ref))).asNamedSequence()
            SeqIO.write(read, reads_file, "fasta")
    reads_file.close()
    contig_file.close()
    print "Done"
    print contig_file_name
    print reads_file_name
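A hedged usage note: reads of length rlen are cut with step max(1, rlen / cov), so every base of a kept contig is covered roughly cov times, and contigs shorter than contig_size are skipped. The call below uses made-up paths and numbers.

main("reference.fasta", 10000, 2000, 20, "sim_out")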
Example #9
def main(k, dir, contigs_file, reads_file):
    # type: (int, str, str, str) -> None
    basic.ensure_dir_existance(dir)
    CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    params.k = k
    print "Loading contigs"
    tmp = sorted(ContigStorage().loadFromFasta(open(contigs_file, "r"),
                                               False).unique(),
                 key=lambda contig: len(contig))
    cnt = 1
    contigs = ContigStorage()
    for c1, c2 in zip(tmp[::2], tmp[1::2]):
        # if c1.seq == c2.rc.seq:
        contigs.add(Contig(c1.seq, str(cnt)))
        print cnt, c1.id, c2.id
        cnt += 1
        # else:
        #     contigs.add(Contig(c1.seq, str(cnt)))
        #     print cnt, c1.id
        #     cnt += 1
        #     contigs.add(Contig(c2.seq, str(cnt)))
        #     print cnt, c2.id
        #     cnt += 1
    print "Loading reads"
    reads = ReadCollection().loadFromFasta(open(reads_file, "r"))
    print "Aligning reads"
    for al in aligner.localAlign(reads, contigs):
        if len(al) > k:
            read = al.seg_from.contig  # type:AlignedRead
            read.addAlignment(al)
    res = open(os.path.join(dir, "reads.fasta"), "w")
    for read in reads:
        if not basic.isCanonocal(read.id):
            continue
        if len(read.alignments) > 1:
            SeqIO.write(read, res, "fasta")
    res.close()
Example #10
import sys
import os

import common.seq_records

sys.path.append("py")
import common.SeqIO as SeqIO

read_len = int(sys.argv[2])
cov = float(sys.argv[3])
for seq in SeqIO.parse_fasta(open(sys.argv[1], "r")):
    sys.stderr.write(seq.id + " " + str(len(seq)) + " " + str(int(len(seq) * cov / read_len)) + "\n")
    # if len(seq) > 100000000 or len(seq) < 10000000:
    #      continue
    cur = 100000
    for i in range(0, len(seq), int(read_len / cov)):
        if i > cur:
            sys.stderr.write(str(cur) + "\n")
            cur = cur * 3 / 2
        SeqIO.write(common.seq_records.SeqRecord(seq.seq[i:min(len(seq), i + read_len)], seq.id + "_" + str(i)), sys.stdout, "fasta")
    break

Example #11
def writeToFasta(self, handler):
    for contig in self.unique():
        SeqIO.write(contig, handler, "fasta")
Example #12
def print_fasta(self, handler):
    # type: (BinaryIO) -> None
    SeqIO.write(self, handler, "fasta")
Example #13
def main(flye_dir, rf, dir, edge_id, to_resolve, min_contig_length):
    params.technology = "nano"
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    print " ".join(sys.argv)
    print "Reading graph"
    graph = SimpleGraph().ReadDot(
        os.path.join(flye_dir, "20-repeat", "graph_before_rr.gv"))
    graph.FillSeq(os.path.join(flye_dir, "20-repeat", "graph_before_rr.fasta"),
                  True)
    print "Extracting relevant graph component"
    edge_ids = edge_id.split(",")
    to_resolve = to_resolve.split(",")
    to_resolve = [(a, int(b))
                  for a, b in zip(to_resolve[0::2], to_resolve[1::2])]
    unique = uniqueNeighbours(edge_ids, graph, min_contig_length)

    if rf == "none":
        return
    print "Finding reads that align to", edge_ids
    reads_to_resolve = dict()  # type: Dict[str, List[str]]
    for eid, mult in to_resolve:
        reads_to_resolve[eid] = []
    for unique_edge, initial in unique:
        reads_to_resolve[initial] = []
    relevant_read_ids = set()
    for rid, eid in parseReadDump(
            os.path.join(flye_dir, "20-repeat", "read_alignment_dump")):
        if eid in edge_ids:
            relevant_read_ids.add(rid)
            print rid, eid
    for rid, eid in parseReadDump(
            os.path.join(flye_dir, "20-repeat", "read_alignment_dump")):
        if rid in relevant_read_ids and eid in reads_to_resolve:
            reads_to_resolve[eid].append(rid)
    for eid in reads_to_resolve:
        reads_to_resolve[eid] = list(set(reads_to_resolve[eid]))
    print "Reading reads"
    res_reads = ContigStorage()
    res = open(os.path.join(dir, "reads.fasta"), "w")
    for read in SeqIO.parse_by_name(rf):
        if read.id in relevant_read_ids:
            res_reads.add(Contig(read.seq, read.id))
            SeqIO.write(read, res, "fasta")
    res.close()
    random_down = open(os.path.join(dir, "random_down.fasta"), "w")
    cnt = 0
    for read in res_reads:
        if cnt % 5 == 0:
            SeqIO.write(read, random_down, "fasta")
        cnt += 1
    random_down.close()
    res = open(os.path.join(dir, "contigs.fasta"), "w")
    lcf = open(os.path.join(dir, "contigs.lc"), "w")
    for eid, mult in to_resolve:
        repeat_reads = [res_reads[rid] for rid in reads_to_resolve[eid]]
        print reads_to_resolve[eid]
        print map(str, repeat_reads)
        split_contigs = splitRepeat(aligner, graph.e[eid].seq, mult,
                                    repeat_reads, min_contig_length)
        if split_contigs is None:
            print "Failed to resolve edge", eid, "Skipping it"
            continue
        for contig, contig_reads in split_contigs:
            print contig.id
            SeqIO.write(contig, res, "fasta")
            lcf.write(contig.id + "\n")
            lcf.write(" ".join([r.id for r in contig_reads]) + "\n")
    res = open(os.path.join(dir, "contigs.fasta"), "w")
    for unique_edge, initial in unique:
        print unique_edge.id
        SeqIO.write(unique_edge, res, "fasta")
        lcf.write(unique_edge.id + "\n")
        lcf.write(" ".join(reads_to_resolve[initial]) + "\n")
    res.close()
    lcf.close()
Example #14
def main(contigs_file, contig_name, reads_file, dir, k):
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    contigs = ContigStorage().loadFromFasta(open(contigs_file, "r"), False)
    contig = contigs[contig_name]
    contigs = ContigStorage()
    contigs.add(contig)
    reads = ContigStorage().loadFromFasta(open(reads_file, "r"), False)
    als = list(aligner.localAlign(reads.unique(), contigs))
    tmp = []
    for al in als:
        if al.seg_to.contig != contig:
            al = al.rc
        tmp.append(al)
    als = tmp
    als = sorted(als,
                 key=lambda al: al.seg_to.left / 50 * 1000000 + al.seg_to.right
                 - al.seg_to.left)
    counts = dict()
    for al in als:
        counts[al.seg_from.contig.id] = 0
    for al in als:
        if len(al) > k:
            counts[al.seg_from.contig.id] += 1
    w = 20
    f = open(os.path.join(dir, "reads.fasta"), "w")
    over = set()
    inter = set()
    for al in als:
        if len(al) < k:
            continue
        inter.add(basic.Normalize(al.seg_from.contig.id))
        if not al.contradictingRTC():
            over.add(basic.Normalize(al.seg_from.contig.id))
        m = al.matchingSequence(True)
        tmp = []
        for i in range(len(contig) / w + 1):
            tmp.append([])
        for a, b in m.matches:
            tmp[b / w].append((a, b))
        for i in range(len(contig) / w):
            if i + 1 < len(tmp) and len(tmp[i + 1]) > 0:
                tmp[i].append(tmp[i + 1][0])
        for i in range(len(contig) / w):
            seg = contig.segment(i * w, i * w + w)
            if al.seg_to.inter(seg):
                if al.seg_to.left >= seg.left and al.seg_from.left > params.bad_end_length:
                    sys.stdout.write("B")
                elif al.seg_to.right <= seg.right and al.rc.seg_from.left > params.bad_end_length:
                    sys.stdout.write("E")
                else:
                    if len(tmp[i]) == 0:
                        sys.stdout.write("*")
                    else:
                        a = tmp[i][-1][0] - tmp[i][0][0]
                        b = tmp[i][-1][1] - tmp[i][0][1]
                        if a - b > 30:
                            sys.stdout.write("I")
                        elif a - b > 15:
                            sys.stdout.write("i")
                        elif a - b < -30:
                            sys.stdout.write("D")
                        elif a - b < -15:
                            sys.stdout.write("d")
                        else:
                            sys.stdout.write(
                                str(min(8,
                                        max(a, b) + 1 - len(tmp[i]))))
            else:
                sys.stdout.write("*")
        print " ", al.seg_from.contig.id, counts[
            al.seg_from.contig.id], al.contradictingRTC()
    print inter
    for rid in inter:
        SeqIO.write(reads[rid], f, "fasta")
        print rid, reads[rid]
    f.close()
    f = open(os.path.join(dir, "reads_over.fasta"), "w")
    for rid in over:
        SeqIO.write(reads[rid], f, "fasta")
    f.close()
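For reference, the per-window codes printed above decode as follows (read directly from the branches): B / E mark windows where the read starts / ends inside the contig with an unaligned tail longer than params.bad_end_length; * marks windows outside the alignment or without matches; I / i flag likely insertions of more than 30 / 15 extra bases in the read, and D / d the corresponding deletions; a digit (capped at 8) estimates the number of mismatched positions in the window.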
Example #15
def main(contigs_file, contig_name, reads_file, dir, k, initial_reads1, initial_reads2):
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    contigs = ContigStorage().loadFromFasta(open(contigs_file, "r"), False)
#    contig = contigs[contig_name].asSegment().prefix(length=2000).asContig()
    contig = contigs[contig_name]
    reads = ContigStorage().loadFromFasta(open(reads_file, "r"), False)
    reads1 = ContigStorage()
    reads2 = ContigStorage()
    cnt = 0
    for read in reads.unique():
        cnt += 1
#        if cnt % 2 == 0:
        if read.id in initial_reads1:
            reads1.add(read)
        elif read.id in initial_reads2:
            reads2.add(read)
    polisher = Polisher(aligner, dd)
    contig1 = contig
    contig2 = contig
    scorer = Scorer()
    for i in range(3):
        diff = 0
        print "Iteration", i
        als1 = fixAlDir(aligner.overlapAlign(reads1.unique(), ContigStorage([contig])), contig)
        als2 = fixAlDir(aligner.overlapAlign(reads2.unique(), ContigStorage([contig])), contig)
        contig1 = Contig(polisher.polishSmallSegment(contig.asSegment(), als1).seg_from.Seq(), "1")
        contig2 = Contig(polisher.polishSmallSegment(contig.asSegment(), als2).seg_from.Seq(), "2")
        al = aligner.overlapAlign([contig1], ContigStorage([contig2])).next()
        als1 = fixAlDir(aligner.overlapAlign(reads.unique(), ContigStorage([contig1])), contig1)
        als1 = filter(lambda al: len(al.seg_to) > len(al.seg_to.contig) - 100, als1)
        als2 = fixAlDir(aligner.overlapAlign(reads.unique(), ContigStorage([contig2])), contig2)
        als2 = filter(lambda al: len(al.seg_to) > len(al.seg_to.contig) - 100, als2)
        als1 = sorted(als1, key = lambda al: al.seg_from.contig.id)
        als2 = sorted(als2, key = lambda al: al.seg_from.contig.id)
        reads1 = ContigStorage()
        reads2 = ContigStorage()
        dp = scorer.accurateScore(al.matchingSequence(), 10) #1 - al.percentIdentity()
        als_map = dict()
        for al in als1:
            als_map[al.seg_from.contig.id] = [al]
        for al in als2:
            if al.seg_from.contig.id in als_map:
                als_map[al.seg_from.contig.id].append(al)
        com_res = []
        diffs = []
        for tmp_als in als_map.values():
            if len(tmp_als) != 2:
                continue
            al1 = tmp_als[0]
            al2 = tmp_als[1]
            print al1, al2
            assert al1.seg_from.contig == al2.seg_from.contig
            pi1 = scorer.accurateScore(al1.matchingSequence(), 10) # al1.percentIdentity()
            pi2 = scorer.accurateScore(al2.matchingSequence(), 10) # al2.percentIdentity()
            com_res.append((al1, al2, pi1 - pi2))
            diffs.append(pi1 - pi2)
        diffs = sorted(diffs)
        th1 = diffs[len(diffs) / 4]
        th2 = diffs[len(diffs) * 3 / 4]
        print "Thresholds:", th1, th2
        for al1, al2, diff in com_res:
            if diff < th1:
                reads1.add(al1.seg_from.contig)
            elif diff > th2:
                reads2.add(al2.seg_from.contig)
#           if pi1 > pi2 + dp / 4:
#               reads1.add(al1.seg_from.contig)
#           elif pi2 > pi1 + dp / 4:
#               reads2.add(al2.seg_from.contig)
#           diff += abs(pi1 - pi2)
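        # NOTE: at this point `diff` holds only the last per-read difference
        # from the loop above, not an accumulated total; the accumulating
        # line was commented out along with the old assignment rule.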
        print float(diff) / len(als1), len(reads1) / 2, len(reads2) / 2
    al = aligner.overlapAlign([contig1], ContigStorage([contig2])).next()
    print al
    print "\n".join(al.asMatchingStrings2())
    for read in reads1:
        if read.id in initial_reads1:
            sys.stdout.write(read.id + " ")
    print ""
    for read in reads2:
        if read.id in initial_reads2:
            sys.stdout.write(read.id + " ")
    print ""
    contig1 = prolong(aligner, polisher, contig1, reads1)
    contig2 = prolong(aligner, polisher, contig2, reads2)
    contig1.id = "1"
    contig2.id = "2"
    out = open(os.path.join(dir, "copies.fasta"), "w")
    SeqIO.write(contig1, out, "fasta")
    SeqIO.write(contig2, out, "fasta")
    out.close()
    out = open(os.path.join(dir, "reads1.fasta"), "w")
    for read in reads1.unique():
        SeqIO.write(read, out, "fasta")
    out.close()
    out = open(os.path.join(dir, "reads2.fasta"), "w")
    for read in reads2.unique():
        SeqIO.write(read, out, "fasta")
    out.close()
    print "Finished"
Example #16
def printToFasta(self, handler):
    # type: (BinaryIO) -> None
    for line in UniqueList(self.items.values()):
        SeqIO.write(line, handler, "fasta")
Example #17
def OutputResults(output_file, format, result):
    output = open(output_file + "." + format, "w")
    for contig in result:
        SeqIO.write(contig, output, format)
    output.close()
Example #18
def main(args):
    flye_dir = sys.argv[1]
    repeats, starts, ends = parse(sys.argv[2])
    graph_file, unique_file, disjointigs_file, rep_dir, tmp, their_file = cl_params.parseFlyeDir(
        flye_dir)
    dump = os.path.join(rep_dir, "read_alignment_dump")
    reads_file = sys.argv[3]
    dir = sys.argv[4]
    CreateLog(dir)
    print " ".join(args)
    print "Printing contigs"
    edges_file = os.path.join(rep_dir, "graph_before_rr.fasta")
    edges = ContigStorage().loadFromFasta(open(edges_file, "r"))
    unique = open(os.path.join(dir, "contigs"), "w")
    for l in starts:
        seq = "".join(map(lambda eid: edges[eid].seq, l))
        if len(seq) > 15000:
            seq = seq[-15000:]
        SeqIO.write(NamedSequence(seq, "(" + "_".join(l) + ")"), unique,
                    "fasta")
    for l in ends:
        seq = "".join(map(lambda eid: edges[eid].seq, l))
        if len(seq) > 15000:
            seq = seq[:15000]
        SeqIO.write(NamedSequence(basic.RC(seq), "(" + "_".join(l) + ")"),
                    unique, "fasta")
    unique.close()
    print "Selecting reads"
    reads = set()
    cur_read = None
    als = []
    for s in open(dump).readlines():
        if s.startswith("Chain"):
            if cur_read == "06dfd536-e258-446f-a019-8d8340ef831e":
                print als
            for al in als:
                if al in repeats:
                    if cur_read == "06dfd536-e258-446f-a019-8d8340ef831e":
                        print "oppa"
                    reads.add(cur_read)
                    break
            als = []

        else:
            s = s.split()
            cur_read = s[2][1:]
            eid = s[6].split("_")[1]
            if s[6][0] == "-":
                eid = "-" + eid
            if cur_read == "06dfd536-e258-446f-a019-8d8340ef831e":
                print eid
            als.append(eid)
    print "Selected", len(reads), "reads"
    print "\n".join(reads)
    print "Reading and printing reads"
    freads = open(os.path.join(dir, "reads.fasta"), "w")
    cnt = 0
    for read in SeqIO.parse_by_name(reads_file):
        cnt += 1
        if cnt % 10000 == 0:
            print cnt
        if read.id in reads:
            SeqIO.write(read, freads, "fasta")
    freads.close()
Example #19
import sys
sys.path.append("py")

from common import SeqIO

rf = sys.argv[1]
outf = sys.argv[2]
total = int(sys.argv[3])
print "Reading reads"
reads = list(SeqIO.parse_fasta(open(rf, "r")))
print "Sorting reads"
reads = sorted(reads, key=lambda read: -len(read))
print "Printing reads"
out = open(outf, "w")
for read in reads:
    if total <= 0:
        break
    total -= len(read)
    SeqIO.write(read, out, "fasta")
out.close()
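The selection rule above, restated as a hypothetical generator under the same assumptions; like the script, it checks the budget before subtracting, so the output can overshoot the target total by up to one read.

def take_longest(reads, total):
    # yield the longest reads first until their combined length reaches total
    for read in sorted(reads, key=len, reverse=True):
        if total <= 0:
            break
        total -= len(read)
        yield read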
Example #20
def main(argv):
    sys.stdout.write("Started\n")
    dot_file = argv[1]
    edge_sequences = argv[2]
    reference_file = argv[3]
    alignment_file = argv[4]
    edges = ParseVertices(argv[5])
    output_file = argv[6]
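    # NOTE: `dist` (the traversal radius used below) is not defined in this
    # snippet; it comes from elsewhere in the original script.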
    sys.stdout.write("Loading dot\n")
    dot = DotParser(open(dot_file, "r")).parse()
    edge_collection = ContigCollection().loadFromFasta(
        open(edge_sequences, "r"), True)
    graph = Graph().loadFromDot(edge_collection, dot)
    vertices = [graph.E[id].start.id for id in edges]
    graph.printToFile(sys.stdout)
    print vertices
    ref = ContigCollection().loadFromFasta(open(reference_file, "r"), False)

    print "Looking for relevant"
    pq = PriorityQueue()
    for v in graph.V.values():
        if v.id in vertices:
            pq.push((0, v))
    visited = []
    while not pq.empty():
        d, v = pq.pop()
        if v in visited:
            continue
        visited.append(v)
        for e in v.inc:
            print e.id, e.start.id, e.end.id
            if d + len(e) < dist:
                pq.push((d + len(e), e.start))
        for e in v.out:
            print e.id, e.start.id, e.end.id
            if d + len(e) < dist:
                pq.push((d + len(e), e.end))
    print "Visited", len(visited)
    print map(str, list(visited))
    relevant = []
    edge_alignments = ReadCollection().loadFromFasta(open(edge_sequences,
                                                          "r")).addAllRC()
    for edge in graph.E.values():
        if edge.start in visited or edge.start.rc in visited:
            relevant.append(edge_alignments[edge.id])
    print "Loading sam"
    edge_alignments.fillFromSam(Samfile(open(alignment_file, "r")), ref)
    for rel in relevant:
        print rel.__str__()
    print "Collecting segments"
    segments = []
    chr1 = ref["chr1"]
    for edge in relevant:
        for al in edge.alignments:
            print al
            if al.seg_from.inter(edge.prefix(dist)):
                l = dist - al.seg_from.left
                contig = al.seg_to.contig
                start = al.seg_to.left
                segments.append(
                    Segment(contig, start, min(start + l, len(contig))))
                print segments[-1]
    tmp = []
    print "Rotating"
    for seg in segments:
        if seg.contig != chr1:
            seg = seg.RC()
        if seg.contig != chr1:
            print "WARNING", seg
        tmp.append(seg)
    segments = sorted(tmp, key=lambda seg: seg.left)
    print "All relevant segments"
    print "\n".join(map(str, segments))
    cur_seg = None
    interesting_segments = []
    print "Gluing"
    for seg in segments:
        if cur_seg is None:
            cur_seg = seg.copy()
            continue
        if cur_seg.right + 20000 < seg.left:
            interesting_segments.append(cur_seg.copy())
            cur_seg = seg.copy()
        else:
            cur_seg.right = max(cur_seg.right, seg.right)
    if cur_seg is not None:
        interesting_segments.append(cur_seg.copy())

    alignments = []
    for edge in edge_alignments:
        for al in edge.alignments:
            for seg in interesting_segments:
                if al.seg_to.inter(seg):
                    alignments.append(al)
                    break
    alignments = sorted(alignments, key=lambda al: al.seg_to.left)
    print "All relevant alignments"
    print "\n".join(map(str, alignments))

    print "Interesting segments:", len(interesting_segments), sum(
        map(len, interesting_segments))
    for seg in interesting_segments:
        print seg
    f = open(output_file, "w")
    tmp = []
    for seg in interesting_segments:
        SeqIO.write(SeqIO.SeqRecord(seg.Seq(), seg.__str__()), f, "fasta")
        tmp.append(seg.Seq())
    f.close()
    f1 = open(output_file + "1", "w")
    SeqIO.write(SeqIO.SeqRecord(("N" * 20000).join(tmp), "concat"), f1,
                "fasta")
    f1.close()
Example #21
import sys

sys.path.append("py")
import common.seq_records
import common.SeqIO as SeqIO
from common.basic import RC

# `interest` (the set of repeat ids to keep) is defined elsewhere in the
# original script.
cur = None
repeat = None
dump = open(sys.argv[2]).readlines()
d = dict()
for s in dump:
    s = s.strip()
    if s == "":
        continue
    if s.startswith("#"):
        s = s[1:].split()
        if s[0] == "Repeat":
            repeat = s[1]
        if s[0] in ["All", "Input", "Output"]:
            cur = s[1]
    else:
        if repeat not in interest:
            continue
        sign = s[0]
        s = s[1:]
        if s not in d:
            d[s] = []
        d[s].append((repeat, cur, sign))
for rec in SeqIO.parse_fasta(open(sys.argv[1])):
    id = rec.id.split()[0]
    if id in d:
        tmp = d[id]
        if ("reads", "-") in [(a[1], a[2]) for a in tmp]:
            rec.seq = RC(rec.seq)
        SeqIO.write(
            common.seq_records.SeqRecord(rec.seq, id + "_" + str(d[id])),
            sys.stdout, "fasta")
        sys.stderr.write(id + "_" + str(d[id]) + "\n")
Example #22
def main(flye_dir, rf, dir, edge_id, k):
    params.technology = "nano"
    params.k = k
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    print "Reading graph"
    graph = SimpleGraph().ReadGFA(os.path.join(flye_dir, "assembly_graph.gfa"))
    print "Parsing edge mapping"
    id_map = parseUPaths(flye_dir)
    edge_ids = edge_id.split(",")
    print "Extracting relevant graph component"
    res = open(os.path.join(dir, "contigs.fasta"), "w")
    unique = dict()
    for eid in edge_ids:
        for e in graph.v[graph.e[eid].start].inc:
            if basic.Normalize(e.id) in edge_ids:
                continue
            if len(e.seq) < 10000:
                if e.id.startswith("-"):
                    unique[e.id[1:]] = NamedSequence(basic.RC(e.seq), e.id[1:])
                else:
                    unique[e.id] = NamedSequence(e.seq, e.id)
            else:
                if e.id.startswith("-"):
                    unique[e.id[1:] + "l"] = NamedSequence(
                        basic.RC(e.seq[:5000]), e.id[1:] + "l")
                else:
                    unique[e.id + "r"] = NamedSequence(e.seq[-5000:],
                                                       e.id + "r")
        for e in graph.v[graph.e[eid].end].out:
            if basic.Normalize(e.id) in edge_ids:
                continue
            if len(e.seq) < 10000:
                if e.id.startswith("-"):
                    unique[e.id[1:]] = NamedSequence(basic.RC(e.seq), e.id[1:])
                else:
                    unique[e.id] = NamedSequence(e.seq, e.id)
            else:
                if e.id.startswith("-"):
                    unique[e.id[1:] + "r"] = NamedSequence(
                        basic.RC(e.seq[-5000:]), e.id[1:] + "r")
                else:
                    unique[e.id + "l"] = NamedSequence(e.seq[:5000],
                                                       e.id + "l")

    for c in unique.values():
        print c.id
        SeqIO.write(c, res, "fasta")
    res.close()
    old_ids = []
    for eid in edge_ids:
        for olde in id_map[eid[len("edge_"):]]:
            old_ids.append(basic.Normalize(olde))
    print "Finding reads that align to", edge_ids
    print "Old ids:", old_ids
    relevant_read_ids = set()
    for s in open(os.path.join(flye_dir, "20-repeat", "read_alignment_dump"),
                  "r").readlines():
        s = s.split()
        if s[0] != "Aln":
            continue
        if s[6].split("_")[1] in old_ids:
            relevant_read_ids.add(s[2][1:])
            print s[2][1:], s[6].split("_")[1]
    print "Reading reads"
    res = open(os.path.join(dir, "reads.fasta"), "w")
    for read in SeqIO.parse_fasta(open(rf, "r")):
        if read.id in relevant_read_ids and len(read) > k * 1.2:
            SeqIO.write(read, res, "fasta")
    res.close()
Example #23
def OutputBroken(self, output_file):
    output = open(output_file, "w")
    for contig in self.contigs:
        for subcontig in self.Break(contig):
            SeqIO.write(subcontig, output, "fasta")
    output.close()
Example #24
import sys

sys.path.append("py")
from common import basic, SeqIO
from common.SimpleGraph import SimpleGraph

graph = SimpleGraph().ReadGFA(sys.argv[1])
for e_id in graph.e:
    if basic.isCanonocal(e_id):
        SeqIO.write(graph.e[e_id], sys.stdout, "fasta")
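A hedged usage note (the script name is made up): given a GFA assembly graph, this prints one FASTA record per canonical edge to stdout.

# python gfa_edges_to_fasta.py assembly_graph.gfa > edges.fasta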
Example #25
import sys
import os
sys.path.append("py")
import common.SeqIO as SeqIO
from common.basic import RC

names = sys.argv[2:]
tmp = dict()
for s in names:
    s = s.split(",")
    tmp[s[0]] = s
names = tmp

for rec in SeqIO.parse_fasta(open(sys.argv[1], "r")):
    if rec.id in names:
        s = names[rec.id]
        if "RC" in s:
            rec.seq = RC(rec.seq)
        if "end" in s:
            rec.seq = rec.seq[-1000:]
        if "start" in s:
            rec.seq = rec.seq[:1000]
        SeqIO.write(rec, sys.stdout, "fasta")
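A hedged usage note (the script name is made up): each extra argument is a spec of the form name[,RC][,start|,end], and RC is applied before the 1000 bp slicing. The invocation below reverse-complements contig_1 and keeps the last 1000 bp of the result, and keeps the first 1000 bp of contig_2.

# python extract_flanks.py contigs.fasta contig_1,RC,end contig_2,start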
Example #26
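# Fragment: `names` (ids, optionally suffixed with "RC") as well as
# make_alignment() and the sam_parser module are defined earlier in the
# original script.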
tmp = dict()
for s in names:
    if s.endswith("RC"):
        tmp[s[:-2]] = True
    else:
        tmp[s] = False
names = tmp

contigs_file = os.path.join(sys.argv[4], "contigs.fasta")
contigs_handler = open(contigs_file, "w")
for rec in SeqIO.parse_fasta(open(sys.argv[2], "r")):
    if rec.id in names:
        if names[rec.id]:
            rec.seq = RC(rec.seq)
            rec.id += "RC"
        SeqIO.write(rec, contigs_handler, "fasta")
contigs_handler.close()

alignment_dir = os.path.join(sys.argv[4], "alignment")
if not os.path.exists(alignment_dir):
    os.makedirs(alignment_dir)

alignment = os.path.join(sys.argv[4], "alignment.sam")
make_alignment(contigs_file, [sys.argv[3]], 8, alignment_dir, "pacbio",
               alignment)

aligned = set()
for rec in sam_parser.Samfile(open(alignment, "r")):
    if rec.is_unmapped:
        continue
    aligned.add(rec.query_name)
Example #27
def WriteSequences(self, reads, reads_file):
    # type: (Iterable[NamedSequence], str) -> None
    f = open(reads_file, "w")
    for read in reads:
        SeqIO.write(read, f, "fasta")
    f.close()
Example #28
dir = sys.argv[1]
# `reads_file` and `consensus_file` are defined earlier in the original
# script (presumably from sys.argv[2] and sys.argv[3]).
extra_params = sys.argv[4:]
CreateLog(dir)
dd = DirDistributor(dir)
aligner = Aligner(dd)
polisher = Polisher(aligner, dd)
reads = ContigStorage().loadFromFasta(open(reads_file, "r"),
                                      num_names=False)
ref = ContigStorage().loadFromFasta(open(consensus_file, "r"),
                                    num_names=False)
if "accurate" in extra_params:
    res = []
    als = sorted(aligner.overlapAlign(reads, ref),
                 key=lambda al: al.seg_to.contig.id)
    for rid, rals in itertools.groupby(als,
                                       key=lambda al: al.seg_to.contig.id):
        if basic.isCanonocal(rid):
            contig = ref[rid]
            corrected_seq = polisher.polishSegment(
                contig.asSegment(), list(rals)).seg_from.Seq()
            res.append(Contig(corrected_seq, rid))
else:
    res = polisher.polishMany(reads, list(ref.unique()))
res_file = os.path.join(dir, "res.fasta")
rf = open(res_file, "w")
for c in res:
    SeqIO.write(c, rf, "fasta")
rf.close()
aligner.align_files(res_file, [reads_file], 16, "pacbio", "overlap",
                    os.path.join(dir, "res.sam"))
Example #29
def PrintFasta(self, out):
    for e in self.e.values():
        SeqIO.write(e, out, "fasta")