Example #1
def main(args):
    rf = args[2]
    dir = args[3]
    basic.ensure_dir_existance(dir)
    CreateLog(dir)
    disjointigs = ContigCollection().loadFromFasta(open(args[1], "r"), num_names=False)
    aligner = Aligner(DirDistributor(os.path.join(dir, "alignments")))
    clen = 5000000
    reads = ReadCollection().loadFromFasta(open(rf, "r"))
    tlen0 = sum(map(len, reads))
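    # Iteratively drop reads that already agree with the disjointigs, reassemble
    # the remaining reads with Flye at a proportionally reduced genome size, and
    # add the new consensus sequences to the disjointig collection.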
    for i in range(10):
        good_reads = set()
        for al in aligner.localAlign(reads, disjointigs):
            if not al.contradictingRTC(al.seg_to.contig.asSegment(), 500):
                good_reads.add(al.seg_from.contig.id)
        rf = os.path.join(dir, "reads" + str(i) + ".fasta")
        reads = reads.filter(lambda read: read.id not in good_reads).cleanCopy()
        tlen = sum(map(len, reads))
        reads.print_fasta(open(rf, "w"))
        l = tlen * clen / tlen0
        assembly_dir = os.path.join(dir, "assembly" + str(i))
        subprocess.check_call(["./bin/flye", "-o", assembly_dir, "-t", "8", "--pacbio-raw", rf, "--genome-size", str(l), "--no-trestle"])
        df = os.path.join(assembly_dir, "10-consensus", "consensus.fasta")
        disjointigs.addAll(ContigCollection().loadFromFasta(open(df, "r"), num_names=False))
        df = os.path.join(dir, "df" + str(i) + ".fasta")
        disjointigs.print_fasta(open(df, "w"))
Example #2
def main(contig_file, reads_file, sam_file, dir, contig_id):
    # type: (str, str, str, str, str) -> None
    basic.ensure_dir_existance(dir)
    contigs = ContigCollection()
    contigs.loadFromFasta(open(contig_file, "r"))
    print "Contigs loaded"
    contig = contigs[contig_id]
    read_names = set()
    for rec in Samfile(open(sam_file, "r")):
        read_names.add(rec.query_name)
    reads = ReadCollection()
    cnt = 0
    for rec in SeqIO.parse_fasta(open(reads_file, "r")):
        if rec.id in read_names:
            rec.id = "Read" + str(cnt)
            reads.add(AlignedRead(rec))
            cnt += 1
    reads.print_fasta(open(os.path.join(dir, "reads.fasta"), "w"))
    print "Reads loaded", len(reads)
    reads.addAllRC()
    print "RC added", len(reads)

    aligner = Aligner(DirDistributor(os.path.join(dir, "alignments")))
    aligner.alignReadCollection(reads, contigs)
    print "Reads aligned", len(reads)
    reads = reads.inter(contig.asSegment())
    print "Reads filtered", len(reads)
    sorted_reads = sorted(list(reads.reads.values()), key=lambda read: read.alignmentsTo(contig.asSegment()).next().seg_to.left)
    for read in sorted_reads:
        print read
        for al in read.alignmentsTo(contig.asSegment()):
            print "\n".join(al.asMatchingStrings())
Example #3
 def nextDir(self):
     # type: () -> str
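     # Hand out the next numbered subdirectory; the counter only advances when
     # alignments are being saved, so otherwise the same directory is reused.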
     name = os.path.join(self.dir, str(self.cur_dir))
     if params.save_alignments:
         self.cur_dir += 1
     assert self.cur_dir <= 100000
     basic.ensure_dir_existance(name)
     return name
Example #4
 def __init__(self, dir, lines, dot_plot, reads, aligner):
     # type: (str, NewLineStorage, LineDotPlot, ReadCollection, Aligner) -> None
     self.dir = dir
     basic.ensure_dir_existance(dir)
     self.lines = lines
     self.dot_plot = dot_plot
     self.reads = reads
     self.aligner = aligner
     self.cnt = 0
Example #5
def constructDisjointigs(reads, total_length, dir):
    # type: (ReadCollection, int, str) -> str
    basic.ensure_dir_existance(dir)
    reads_file = os.path.join(dir, "reads.fasta")
    disjointigs_file = os.path.join(dir, "disjointigs.fasta")
    log_file = os.path.join(dir, "log.txt")
    reads.print_fasta(open(reads_file, "w"))
    subprocess.check_call([os.path.join(params.bin_path, "flye-modules"), "assemble", "--reads", reads_file, "--out-asm", disjointigs_file, "--genome-size", str(total_length),
                           "--config", "flye/config/bin_cfg/asm_raw_reads.cfg", "--min-ovlp", "1500", "--threads", str(params.threads), "--log", log_file])
    return disjointigs_file
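# A minimal usage sketch, assuming a populated ReadCollection `reads` and that
# params.bin_path and params.threads are configured for this project (the path
# below is illustrative):
disjointigs_path = constructDisjointigs(reads, 5000000, "work/disjointigs")
print "Disjointigs written to", disjointigs_path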
Example #6
def CreateLog(dir):
    old_logs_dir = os.path.join(dir, "old")
    basic.ensure_dir_existance(old_logs_dir)
    log_file = os.path.join(dir, "log.info")
    if os.path.isfile(log_file):
        num = len(os.listdir(old_logs_dir))
        shutil.copy(log_file, os.path.join(old_logs_dir, str(num) + ".log"))
    log = open(log_file, "w")
    sys.stdout = basic.OStreamWrapper(sys.stdout, log)
    sys.stdout.prefix = lambda s: time.strftime("%I:%M:%S") + "  "
    sys.stderr = sys.stdout
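# A minimal usage sketch: after CreateLog, printed lines go both to the console
# and to dir/log.info with a timestamp prefix, and any previous log.info is
# archived under dir/old first (the directory name below is illustrative):
CreateLog("work")
print "this line is mirrored to work/log.info"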
Example #7
 def __init__(self, dir, clean=False):
     # type: (str, bool) -> None
     self.dir = dir
     if clean:
         basic.recreate(self.dir)
     else:
         basic.ensure_dir_existance(self.dir)
     self.cnt = 0
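     # Resume numbering after the largest numeric entry already present in dir,
     # so new items never overwrite files from a previous (non-clean) run.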
     for name in os.listdir(self.dir):
         num = basic.parseNumber(name)
         if num is not None and num >= self.cnt:
             self.cnt = num + 1
Example #8
 def dump(self, dirname):
     basic.ensure_dir_existance(dirname)
     edge_file = os.path.join(dirname, "edges.txt")
     stats_file = os.path.join(dirname, "stats.txt")
     init_file = os.path.join(dirname, "init.txt")
     reads_file = os.path.join(dirname, "reads.txt")
     contigs_file = os.path.join(dirname, "contigs.fasta")
     self.printStats(stats_file)
     self.printEdges(edge_file)
     self.printInit(init_file)
     self.printReads(reads_file)
     self.printContigs(contigs_file)
Example #9
def main(args):
    dir = args[4]
    basic.ensure_dir_existance(dir)
    CreateLog(dir)
    sys.stdout.info("Starting graph-free recruitment")
    print " ".join(args)
    sys.stdout.info("Loading repeat sequences")
    seqs = ContigStorage().loadFromFasta(open(args[1], "r"), False)
    sys.stdout.info("Loading reads")
    reads = ContigStorage().loadFromFasta(open(args[2], "r"), False)
    k = int(args[3])
    recruit(seqs, reads, k, dir)
    sys.stdout.info("Finished graph-free recruitment")
Example #10
def main(reads_file, ref_file, dir, error_rate):
    sys.stderr.write("Reading reference" + "\n")
    ref = sorted(list(SeqIO.parse_fasta(open(ref_file, "r"))),
                 key=lambda rec: len(rec))[-1]
    ref = Contig(ref.seq, ref.id)
    refs = ContigCollection()
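    # Sample roughly 5% of the 500bp windows of the reference and mutate
    # error_rate% of the positions in each sampled window (each mutated base is
    # replaced by its complement via basic.rc).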
    for i in range(0, len(ref) - 500, 500):
        if random.random() > 0.95:
            tmp = list(ref.segment(i, i + 500).Seq())
            for j in range(error_rate * 500 / 100):
                pos = random.randint(0, 499)
                tmp[pos] = basic.rc[tmp[pos]]
            refs.add(
                Contig("".join(tmp),
                       ref.id + "(" + str(i) + "," + str(i + 500) + ")"))
    refs.print_names(sys.stderr)
    sys.stderr.write("Reading reads" + "\n")
    reads = ReadCollection()
    reads.loadFromFasta(open(reads_file, "r"))

    sys.stderr.write("Aligning reads" + "\n")
    basic.ensure_dir_existance(dir)
    aligner = Aligner(DirDistributor(dir))
    aligner.alignReadCollection(reads, refs)
    sys.stderr.write("Analysing alignments" + "\n")
    alignments = []
    for read in reads:
        alignments.extend(read.alignments)
    alignments = filter(lambda al: len(al) > 450, alignments)
    alignments = sorted(alignments,
                        key=lambda al:
                        (al.seg_to.contig.id, al.seg_from.contig.id))
    scorer = Scorer()
    scorer.scores.homo_score = 3
    scorer.scores.ins_score = 5
    scorer.scores.del_score = 5
    cnt = 0
    for contig, group in itertools.groupby(alignments,
                                           key=lambda al: al.seg_to.contig):
        group = list(group)
        sys.stderr.write(str(contig) + " " + str(len(group)) + "\n")
        if len(group) < 150:
            for al in group:
                print scorer.accurateScore(al.matchingSequence(),
                                           params.alignment_correction_radius)
                cnt += 1
                if cnt >= 5000:
                    break
        if cnt >= 5000:
            break
Example #11
 def polish(self, reads, consensus):
     # type: (Iterable[NamedSequence], Contig) -> str
     dir, new_files, same = self.dir_distributor.fillNextDir([([consensus], "ref.fasta"), (reads, "reads.fasta")])
     consensus_file_name = new_files[0]
     reads_file_name = new_files[1]
     args = FakePolishingArgs()
     basic.ensure_dir_existance(os.path.join(dir, "work"))
     job = JobPolishing(args, os.path.join(dir, "work"), os.path.join(dir, "log.info"), [reads_file_name], consensus_file_name, "polish")
     polished_file = job.out_files["contigs"]
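     # Reuse the previous polishing result when fillNextDir reports identical
     # inputs and cleaning is disabled; otherwise rerun the polishing job.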
     if same and not params.clean and os.path.exists(polished_file):
         sys.stdout.trace("Polishing reused:", polished_file)
     else:
         sys.stdout.trace("Running polishing:", polished_file)
         job.run()
     return list(SeqIO.parse_fasta(open(polished_file, "r")))[0].seq
Example #12
 def polishMany(self, reads, sequences):
     # type: (Iterable[AlignedRead], List[Contig]) -> List[Contig]
     dir, new_files, same = self.dir_distributor.fillNextDir([(list(sequences), "ref.fasta"), (reads, "reads.fasta")])
     consensus_file_name = new_files[0]
     reads_file_name = new_files[1]
     args = FakePolishingArgs()
     basic.ensure_dir_existance(os.path.join(dir, "work"))
     job = JobPolishing(args, os.path.join(dir, "work"), os.path.join(dir, "log.info"), [reads_file_name], consensus_file_name, "polish")
     polished_file = job.out_files["contigs"]
     if same and not params.clean and os.path.exists(polished_file):
         sys.stdout.trace("Polishing reused:", polished_file)
     else:
         sys.stdout.trace("Running polishing:", polished_file)
         job.run()
     return map(lambda rec: Contig(rec.seq, rec.id), SeqIO.parse_fasta(open(polished_file, "r")))
Example #13
 def align(self, reads, reference, mode):
     # type: (Iterable[NamedSequence], Iterable[Contig], str) -> sam_parser.Samfile
     reference = list(reference)
     dir, new_files, same = self.dir_distributor.fillNextDir([(reference, "contigs.fasta"), (list(reads), "reads.fasta")])
     contigs_file = new_files[0]
     reads_file = new_files[1]
     alignment_dir = os.path.join(dir, "alignment")
     alignment_file = os.path.join(dir, "alignment.sam")
     basic.ensure_dir_existance(dir)
     basic.ensure_dir_existance(alignment_dir)
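     # Same caching idea as in polish(): when the inputs are identical and the
     # alignment file already exists, the alignment step is skipped.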
     if same and not params.clean and os.path.exists(alignment_file):
         sys.stdout.log(common.log_params.LogPriority.alignment_files, "Alignment reused:", alignment_file)
     else:
         if os.path.isfile(alignment_file):
             os.remove(alignment_file)
         self.align_files(contigs_file, [reads_file], self.threads, params.technology, mode, alignment_file)
     return sam_parser.Samfile(open(alignment_file, "r"))
Example #14
def extractSubgraph(dir, flye_dir, contigs):
    basic.ensure_dir_existance(dir)
    d = parseUPaths(flye_dir)
    edge_ids = []
    for contig in contigs:
        for s in d[contig]:
            edge_ids.append(s)
    graph = SimpleGraph().ReadDot(
        os.path.join(flye_dir, "20-repeat", "graph_after_rr.gv"))
    vertex_ids = set()
    total_len = 0
    for eid in edge_ids:
        total_len += graph.e[eid].len
        vertex_ids.add(graph.e[eid].start)
        vertex_ids.add(graph.e[eid].end)
        if total_len > 10000:
            break
    # print "{|}|" + "|".join(["id " + r + "\\\\" for r in edge_ids])
    print "{|}|" + "|".join(["\"" + str(r) + "\"" for r in vertex_ids])
Example #15
def main(ref_file, contig_size, rlen, cov, dir):
    basic.ensure_dir_existance(dir)
    all_contigs = ContigCollection().loadFromFasta(open(ref_file, "r"), False)
    contig_file_name = os.path.join(dir, "contigs.fasta")
    contig_file = open(contig_file_name, "w")
    reads_file_name = os.path.join(dir, "reads.fasta")
    reads_file = open(reads_file_name, "w")
    for ref in all_contigs.unique():
        if len(ref) < contig_size:
            continue
        SeqIO.write(ref, contig_file, "fasta")
        for i in range(0, len(ref), max(1, rlen / cov)):
            read = ref.segment(i, min(i + rlen, len(ref))).asNamedSequence()
            SeqIO.write(read, reads_file, "fasta")
    reads_file.close()
    contig_file.close()
    print "Done"
    print contig_file_name
    print reads_file_name
Example #16
def evaluatePI(dir, contigs_file, initial_file, ref_file):
    basic.ensure_dir_existance(dir)
    CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    contigs = ContigStorage().loadFromFasta(open(contigs_file, "r"), False)
    initial = ContigStorage().loadFromFasta(open(initial_file, "r"), False)
    ref = ContigStorage().loadFromFasta(open(ref_file, "r"), False)
    segs = []
    for al in aligner.overlapAlign(initial.unique(), contigs):
        if basic.isCanonocal(al.seg_to.contig.id):
            segs.append(al.seg_to)
        else:
            segs.append(al.rc.seg_to)
    segs = sorted(segs, key=lambda seg: basic.Normalize(seg.contig.id))
    interesting = dict()
    print "Interesting segments:"
    for contig in contigs:
        interesting[contig.id] = [contig.asSegment()]
    for contig, segit in itertools.groupby(segs, lambda seg: seg.contig):
        csegs = SegmentStorage().addAll(segit)
        csegs.mergeSegments()
        csegs = csegs.reverse(contig)
        interesting[contig.id] = list(csegs)
        print list(csegs)
    print "Analysis of contigs"
    scorer = Scorer()
    for al in aligner.localAlign(contigs.unique(), ref):
        print al
        for seg in interesting[al.seg_from.contig.id]:
            if al.seg_from.expand(500).contains(
                    seg) or al.seg_from.interSize(seg) > 40000:
                tmp_al = al.reduce(query=al.seg_from.cap(seg))
                # polyshMatching is assumed to return the events printed below
                events = scorer.polyshMatching(tmp_al.matchingSequence(),
                                               params.score_counting_radius)
                print tmp_al.seg_from, tmp_al.seg_to, str(events)
    print ""
    print "Analysis of initial"
    for al in aligner.overlapAlign(initial, ref):
        events = scorer.polyshMatching(al.matchingSequence(),
                                       params.score_counting_radius)
        print al.seg_from, al.seg_to, str(events)
Example #17
def main(ref_file, segment, dir):
    ref = ContigCollection().loadFromFasta(open(ref_file, "r"), False)
    chr1 = ref["chr1"]
    if segment[0] < 0:
        segment = (-segment[0], -segment[1])
        chr1 = chr1.rc
    reads = ReadCollection()
    reads_list = []
    for i in range(segment[0], segment[1], 500):
        read = reads.addNewRead(Segment(chr1, i, i + 500).asNamedSequence())
        reads_list.append(read)
    chr1.seq = chr1.seq[:segment[0]] + "N" * (segment[1] - segment[0]) + chr1.seq[segment[1]:]
    chr1.rc.seq = basic.RC(chr1.seq)
    basic.ensure_dir_existance(dir)
    aligner = Aligner(DirDistributor(dir))
    aligner.alignReadCollection(reads, ref)
    out = sys.stdout
    for read in reads_list:
        # print read
        out.write(str(len(read.alignments)) + " " + str(max([0] + map(lambda al: al.percentIdentity(), read.alignments))) + "\n")
    out.close()
Example #18
def main(k, dir, contigs_file, reads_file):
    # type: (int, str, str, str) -> None
    basic.ensure_dir_existance(dir)
    CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    params.k = k
    print "Loading contigs"
    tmp = sorted(ContigStorage().loadFromFasta(open(contigs_file, "r"),
                                               False).unique(),
                 key=lambda contig: len(contig))
    cnt = 1
    contigs = ContigStorage()
    for c1, c2 in zip(tmp[::2], tmp[1::2]):
        # if c1.seq == c2.rc.seq:
        contigs.add(Contig(c1.seq, str(cnt)))
        print cnt, c1.id, c2.id
        cnt += 1
        # else:
        #     contigs.add(Contig(c1.seq, str(cnt)))
        #     print cnt, c1.id
        #     cnt += 1
        #     contigs.add(Contig(c2.seq, str(cnt)))
        #     print cnt, c2.id
        #     cnt += 1
    print "Loading reads"
    reads = ReadCollection().loadFromFasta(open(reads_file, "r"))
    print "Aligning reads"
    for al in aligner.localAlign(reads, contigs):
        if len(al) > k:
            read = al.seg_from.contig  # type:AlignedRead
            read.addAlignment(al)
    res = open(os.path.join(dir, "reads.fasta"), "w")
    for read in reads:
        if not basic.isCanonocal(read.id):
            continue
        if len(read.alignments) > 1:
            SeqIO.write(read, res, "fasta")
    res.close()
Example #19
def main(flye_dir, rf, dir, edge_id, k):
    params.technology = "nano"
    params.k = k
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    print "Reading graph"
    graph = SimpleGraph().ReadGFA(os.path.join(flye_dir, "assembly_graph.gfa"))
    print "Parsing edge mapping"
    id_map = parseUPaths(flye_dir)
    edge_ids = edge_id.split(",")
    print "Extracting relevant graph component"
    res = open(os.path.join(dir, "contigs.fasta"), "w")
    unique = dict()
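    # Collect the edges neighbouring each repeat edge; short neighbours are kept
    # whole, long ones are trimmed to the 5kb flank adjacent to the repeat and
    # suffixed "l"/"r" to mark which end of the edge was kept.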
    for eid in edge_ids:
        for e in graph.v[graph.e[eid].start].inc:
            if basic.Normalize(e.id) in edge_ids:
                continue
            if len(e.seq) < 10000:
                if e.id.startswith("-"):
                    unique[e.id[1:]] = NamedSequence(basic.RC(e.seq), e.id[1:])
                else:
                    unique[e.id] = NamedSequence(e.seq, e.id)
            else:
                if e.id.startswith("-"):
                    unique[e.id[1:] + "l"] = NamedSequence(
                        basic.RC(e.seq[:5000]), e.id[1:] + "l")
                else:
                    unique[e.id + "r"] = NamedSequence(e.seq[-5000:],
                                                       e.id + "r")
        for e in graph.v[graph.e[eid].end].out:
            if basic.Normalize(e.id) in edge_ids:
                continue
            if len(e.seq) < 10000:
                if e.id.startswith("-"):
                    unique[e.id[1:]] = NamedSequence(basic.RC(e.seq), e.id[1:])
                else:
                    unique[e.id] = NamedSequence(e.seq, e.id)
            else:
                if e.id.startswith("-"):
                    unique[e.id[1:] + "r"] = NamedSequence(
                        basic.RC(e.seq[-5000:]), e.id[1:] + "r")
                else:
                    unique[e.id + "l"] = NamedSequence(e.seq[:5000],
                                                       e.id + "l")

    for c in unique.values():
        print c.id
        SeqIO.write(c, res, "fasta")
    res.close()
    old_ids = []
    for eid in edge_ids:
        for olde in id_map[eid[len("edge_"):]]:
            old_ids.append(basic.Normalize(olde))
    print "Finding reads that align to", edge_ids
    print "Old ids:", old_ids
    relevant_read_ids = set()
    for s in open(os.path.join(flye_dir, "20-repeat", "read_alignment_dump"),
                  "r").readlines():
        s = s.split()
        if s[0] != "Aln":
            continue
        if s[6].split("_")[1] in old_ids:
            relevant_read_ids.add(s[2][1:])
            print s[2][1:], s[6].split("_")[1]
    print "Reading reads"
    res = open(os.path.join(dir, "reads.fasta"), "w")
    for read in SeqIO.parse_fasta(open(rf, "r")):
        if read.id in relevant_read_ids and len(read) > k * 1.2:
            SeqIO.write(read, res, "fasta")
    res.close()
Example #20
import sys

import os

sys.path.append("py")
from common.SimpleGraph import SimpleGraph
from common import basic

g = SimpleGraph()
g.ReadDot(sys.argv[1])
basic.ensure_dir_existance(sys.argv[2])
args = sys.argv[3:]
if "merge" in args:
    g = g.Merge()
cnt = 0
oppa = []
for comp in g.Split(1000000000):
    if len(comp) < 3:
        if len(g.v[comp[0]].inc) + len(g.v[comp[0]].out) + len(
                g.v[comp[-1]].inc) + len(g.v[comp[-1]].out) <= 2:
            pass
        else:
            oppa.extend(comp)
        if len(oppa) > 30:
            comp = list(oppa)
            oppa = []
        else:
            continue
    print cnt, len(comp)
    f = open(os.path.join(sys.argv[2], str(cnt) + ".dot"), "w")
    g.Draw(comp, f)
    f.close()
    cnt += 1
Example #21
def main(contigs_file, contig_name, reads_file, dir, k):
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    contigs = ContigStorage().loadFromFasta(open(contigs_file, "r"), False)
    contig = contigs[contig_name]
    contigs = ContigStorage()
    contigs.add(contig)
    reads = ContigStorage().loadFromFasta(open(reads_file, "r"), False)
    als = list(aligner.localAlign(reads.unique(), contigs))
    tmp = []
    for al in als:
        if al.seg_to.contig != contig:
            al = al.rc
        tmp.append(al)
    als = tmp
    als = sorted(als,
                 key=lambda al: al.seg_to.left / 50 * 1000000 + al.seg_to.right
                 - al.seg_to.left)
    counts = dict()
    for al in als:
        counts[al.seg_from.contig.id] = 0
    for al in als:
        if len(al) > k:
            counts[al.seg_from.contig.id] += 1
    w = 20
    f = open(os.path.join(dir, "reads.fasta"), "w")
    over = set()
    inter = set()
    for al in als:
        if len(al) < k:
            continue
        inter.add(basic.Normalize(al.seg_from.contig.id))
        if not al.contradictingRTC():
            over.add(basic.Normalize(al.seg_from.contig.id))
        m = al.matchingSequence(True)
        tmp = []
        for i in range(len(contig) / w + 1):
            tmp.append([])
        for a, b in m.matches:
            tmp[b / w].append((a, b))
        for i in range(len(contig) / w):
            if i + 1 < len(tmp) and len(tmp[i + 1]) > 0:
                tmp[i].append(tmp[i + 1][0])
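        # Per-window code printed below: B/E mark windows where the alignment
        # begins/ends with a long unaligned read tail, * means no matched
        # positions, I/i and D/d flag likely insertions/deletions, and a digit
        # counts unmatched positions (capped at 8).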
        for i in range(len(contig) / w):
            seg = contig.segment(i * w, i * w + w)
            if al.seg_to.inter(seg):
                if al.seg_to.left >= seg.left and al.seg_from.left > params.bad_end_length:
                    sys.stdout.write("B")
                elif al.seg_to.right <= seg.right and al.rc.seg_from.left > params.bad_end_length:
                    sys.stdout.write("E")
                else:
                    if len(tmp[i]) == 0:
                        sys.stdout.write("*")
                    else:
                        a = tmp[i][-1][0] - tmp[i][0][0]
                        b = tmp[i][-1][1] - tmp[i][0][1]
                        if a - b > 30:
                            sys.stdout.write("I")
                        elif a - b > 15:
                            sys.stdout.write("i")
                        elif a - b < -30:
                            sys.stdout.write("D")
                        elif a - b < -15:
                            sys.stdout.write("d")
                        else:
                            sys.stdout.write(
                                str(min(8,
                                        max(a, b) + 1 - len(tmp[i]))))
            else:
                sys.stdout.write("*")
        print " ", al.seg_from.contig.id, counts[
            al.seg_from.contig.id], al.contradictingRTC()
    print inter
    for rid in inter:
        SeqIO.write(reads[rid], f, "fasta")
        print rid, reads[rid]
    f.close()
    f = open(os.path.join(dir, "reads_over.fasta"), "w")
    for rid in over:
        SeqIO.write(reads[rid], f, "fasta")
    f.close()
Example #22
def main(flye_dir, output_dir, diploid):
    basic.ensure_dir_existance(output_dir)
    CreateLog(output_dir)
    print "Version:", subprocess.check_output(["git", "rev-parse", "HEAD"])
    print "Modifications:"
    print subprocess.check_output(["git", "diff"])
    graph_file = os.path.join(flye_dir, "20-repeat", "graph_before_rr.gv")
    edge_file = os.path.join(flye_dir, "20-repeat", "graph_before_rr.fasta")
    dump_file = os.path.join(flye_dir, "20-repeat", "read_alignment_dump")
    if diploid:
        calculator = DipolidCalculator(150000)
    else:
        calculator = HaploidCalculator(150000)
    print "Reading graph from", graph_file
    graph = SimpleGraph()
    graph.ReadDot(graph_file)
    print "Reading sequences from", edge_file
    graph.FillSeq(edge_file, True)
    print "Splitting graph", edge_file
    componentRecords, edgecomp = constructComponentRecords(graph, calculator)
    print "Reading alignment dump from", dump_file
    rcnt = 0
    for rid, eids in AlignmentDumpParser(dump_file).parse():
        compids = set()
        eids = map(basic.Normalize, eids)
        for eid in eids:
            for compid in edgecomp[eid]:
                compids.add(compid)
        for compid in compids:
            comp_eids = [
                eid for eid in eids
                if eid in componentRecords[compid].component.e
            ]
            if len(comp_eids) == 0:
                print "GOPA", compid, compids, rid, eids
            componentRecords[compid].addRead(rid, eids)
        rcnt += 1
        if rcnt % 100000 == 0:
            print "Processed", rcnt, "reads"
    print "Filling flye repeat resolution results"
    flye_next = FillFlyeNext(componentRecords,
                             os.path.join(flye_dir, "flye.log"))
    for compRec in componentRecords:
        half = compRec.half()
        for norm_eid in compRec.unique:
            for eid in [norm_eid, basic.Reverse(norm_eid)]:
                if eid not in compRec.component.e:
                    assert not basic.isCanonocal(eid)
                    assert basic.Reverse(eid) in compRec.component.e
                    continue
                if compRec.component.e[eid].end in half:
                    if compRec.component.isBorder(
                            compRec.component.e[eid].end):
                        compRec.out += 1
                    if compRec.component.isBorder(
                            compRec.component.e[eid].start):
                        compRec.inc += 1
                if not compRec.component.isBorder(
                        compRec.component.e[eid].end):
                    if flye_next[eid] is None:
                        compRec.unresolved_connections += 1
                    else:
                        compRec.resolved_connections.append(
                            (eid, flye_next[eid]))
                        if flye_next[eid] not in compRec.component.e:
                            compRec.outside_connections += 1

    basic.ensure_dir_existance(output_dir)
    print "Printing components to disk"
    subdataset_dir = os.path.join(output_dir, "subdatasets")
    basic.ensure_dir_existance(subdataset_dir)
    order = range(len(componentRecords))
    order = sorted(order, key=lambda i: componentRecords[i].score())
    componentRecords = [componentRecords[i] for i in order]
    basic.ensure_dir_existance(os.path.join(output_dir, "pics"))
    for i, component in enumerate(componentRecords):
        comp_dir = os.path.join(subdataset_dir, str(i))
        component.dump(comp_dir)
        fig_name = os.path.join(comp_dir, "graph.dot")
        component.draw(fig_name, calculator)
        if len(component.component) <= 100:
            fig_file = os.path.join(output_dir, "pics", str(i) + ".dot")
            component.draw(fig_file, calculator)

    table_file = os.path.join(output_dir, "table.txt")
    print "Printing table to file", table_file
    f = open(table_file, "w")
    f.write(
        "Id v e unique inc out repeats unresolved resolved outside zero hub badborder overcovered score\n"
    )
    for i, compRec in enumerate(componentRecords):
        comp = compRec.component
        f.write(" ".join([
            str(i),
            str(len(comp.v)),
            str(len(comp.e)),
            str(len(compRec.unique) * 2),
            str(compRec.inc),
            str(compRec.out),
            str(compRec.repeat_edges),
            str(compRec.unresolved_connections),
            str(len(compRec.resolved_connections)),
            str(compRec.outside_connections),
            str(compRec.zero),
            str(compRec.red),
            str(compRec.bad_border),
            str(compRec.overcovered_edges),
            str(compRec.score())
        ]) + "\n")
    f.close()
    table_file = os.path.join(output_dir, "list.txt")
    f = open(table_file, "w")
    for a in range(len(componentRecords)):
        f.write(str(a) + "\n")
    f.close()
Example #23
def main(flye_dir, rf, dir, edge_id, to_resolve, min_contig_length):
    params.technology = "nano"
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    print " ".join(sys.argv)
    print "Reading graph"
    graph = SimpleGraph().ReadDot(
        os.path.join(flye_dir, "20-repeat", "graph_before_rr.gv"))
    graph.FillSeq(os.path.join(flye_dir, "20-repeat", "graph_before_rr.fasta"),
                  True)
    print "Extracting relevant graph component"
    edge_ids = edge_id.split(",")
    to_resolve = to_resolve.split(",")
    to_resolve = [(a, int(b))
                  for a, b in zip(to_resolve[0::2], to_resolve[1::2])]
    unique = uniqueNeighbours(edge_ids, graph, min_contig_length)

    if rf == "none":
        return
    print "Finding reads that align to", edge_ids
    reads_to_resolve = dict()  # type: Dict[str, List[str]]
    for eid, mult in to_resolve:
        reads_to_resolve[eid] = []
    for unique_edge, initial in unique:
        reads_to_resolve[initial] = []
    relevant_read_ids = set()
    for rid, eid in parseReadDump(
            os.path.join(flye_dir, "20-repeat", "read_alignment_dump")):
        if eid in edge_ids:
            relevant_read_ids.add(rid)
            print rid, eid
    for rid, eid in parseReadDump(
            os.path.join(flye_dir, "20-repeat", "read_alignment_dump")):
        if rid in relevant_read_ids and eid in reads_to_resolve:
            reads_to_resolve[eid].append(rid)
    for eid in reads_to_resolve:
        reads_to_resolve[eid] = list(set(reads_to_resolve[eid]))
    print "Reading reads"
    res_reads = ContigStorage()
    res = open(os.path.join(dir, "reads.fasta"), "w")
    for read in SeqIO.parse_by_name(rf):
        if read.id in relevant_read_ids:
            res_reads.add(Contig(read.seq, read.id))
            SeqIO.write(read, res, "fasta")
    res.close()
    random_down = open(os.path.join(dir, "random_down.fasta"), "w")
    cnt = 0
    for read in res_reads:
        if cnt % 5 == 0:
            SeqIO.write(read, random_down, "fasta")
        cnt += 1
    random_down.close()
    res = open(os.path.join(dir, "contigs.fasta"), "w")
    lcf = open(os.path.join(dir, "contigs.lc"), "w")
    for eid, mult in to_resolve:
        repeat_reads = [res_reads[rid] for rid in reads_to_resolve[eid]]
        print reads_to_resolve[eid]
        print map(str, repeat_reads)
        split_contigs = splitRepeat(aligner, graph.e[eid].seq, mult,
                                    repeat_reads, min_contig_length)
        if split_contigs is None:
            print "Failed to resolve edge", eid, "Aborting"
            continue
        print "Edge", eid, "was split into", mult, "copies"
        for contig, contig_reads in split_contigs:
            print contig.id
            SeqIO.write(contig, res, "fasta")
            lcf.write(contig.id + "\n")
            lcf.write(" ".join([r.id for r in contig_reads]) + "\n")
    # keep writing to the handle opened above; reopening contigs.fasta with "w"
    # would truncate the split contigs already written
    for unique_edge, initial in unique:
        print unique_edge.id
        SeqIO.write(unique_edge, res, "fasta")
        lcf.write(unique_edge.id + "\n")
        lcf.write(" ".join(reads_to_resolve[initial]) + "\n")
    res.close()
    lcf.close()
Example #24
def main(model_file, k, dir, contigs_file, reads_file):
    # type: (str, int, str, str, str) -> None
    basic.ensure_dir_existance(dir)
    CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    params.scores = ComplexScores()
    params.scores.load(open(model_file, "r"))
    params.k = k
    print "Loading contigs"
    tmp = sorted(ContigStorage().loadFromFasta(open(contigs_file, "r"),
                                               False).unique(),
                 key=lambda contig: len(contig))
    cnt = 1
    contigs = ContigStorage()
    for c1, c2 in zip(tmp[::2], tmp[1::2]):
        # if c1.seq == c2.rc.seq:
        contigs.add(Contig(c1.seq, str(cnt)))
        print cnt, c1.id, c2.id
        cnt += 1
        # else:
        #     contigs.add(Contig(c1.seq, str(cnt)))
        #     print cnt, c1.id
        #     cnt += 1
        #     contigs.add(Contig(c2.seq, str(cnt)))
        #     print cnt, c2.id
        #     cnt += 1
    print "Loading reads"
    reads = ReadCollection().loadFromFasta(open(reads_file, "r"))
    print "Aligning reads"
    for al in aligner.localAlign(reads, contigs):
        if len(al) > k:
            read = al.seg_from.contig  # type:AlignedRead
            read.addAlignment(al)
    for read in reads:
        if not basic.isCanonocal(read.id):
            continue
        cnt = 0
        al0 = None
        others = []
        for al in read.alignments:
            if not al.contradictingRTC():
                cnt += 1
                al0 = al
            else:
                others.append(al)
        if cnt != 1 or len(others) == 0:
            continue
        print al0
        print others
        seg = al0.seg_from
        for al in others:
            if al.seg_from.interSize(seg) < k:
                seg = None
                break
            else:
                seg = al.seg_from.cap(seg)
        print seg
        if seg is None:
            continue
        al0 = al0.reduce(query=seg)
        others = [al.reduce(query=seg) for al in others]
        scorer = Scorer(params.scores)
        for al in others:
            a, b, c = scorer.scoreCommon(al0, al)
            print "win", a, b, c, len(seg)
        if len(seg) > 1000:
            for i in range(len(seg) / 1000):
                seg1 = seg.prefix(length=i * 1000 + 1000).suffix(length=1000)
                for al in others:
                    a, b, c = scorer.scoreCommon(al0.reduce(query=seg1),
                                                 al.reduce(query=seg1))
                    print "win1000", a, b, c, len(seg1)
        for al1 in others:
            for al2 in others:
                if al1 == al2:
                    continue
                a, b, c = scorer.scoreCommon(al1, al2)
                print "draw", a, b, c, len(seg)
Example #25
 def __init__(self, dir):
     basic.ensure_dir_existance(dir)
     self.dir = dir
     self.cur_dir = 0
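# A minimal usage sketch (illustrative: assumes this is the DirDistributor
# constructor used throughout these examples and that nextDir from Example #3
# belongs to the same class):
dd = DirDistributor("work/alignments")
print dd.nextDir()  # "work/alignments/0"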
Example #26
def main(contigs_file, contig_name, reads_file, dir, k, initial_reads1, initial_reads2):
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    contigs = ContigStorage().loadFromFasta(open(contigs_file, "r"), False)
#    contig = contigs[contig_name].asSegment().prefix(length=2000).asContig()
    contig = contigs[contig_name]
    reads = ContigStorage().loadFromFasta(open(reads_file, "r"), False)
    reads1 = ContigStorage()
    reads2 = ContigStorage()
    cnt = 0
    for read in reads.unique():
        cnt += 1
#        if cnt % 2 == 0:
        if read.id in initial_reads1:
            reads1.add(read)
        elif read.id in initial_reads2:
            reads2.add(read)
    polisher = Polisher(aligner, dd)
    contig1 = contig
    contig2 = contig
    scorer = Scorer()
    for i in range(3):
        diff = 0
        print "Iteration", i
        als1 = fixAlDir(aligner.overlapAlign(reads1.unique(), ContigStorage([contig])), contig)
        als2 = fixAlDir(aligner.overlapAlign(reads2.unique(), ContigStorage([contig])), contig)
        contig1 = Contig(polisher.polishSmallSegment(contig.asSegment(), als1).seg_from.Seq(), "1")
        contig2 = Contig(polisher.polishSmallSegment(contig.asSegment(), als2).seg_from.Seq(), "2")
        al = aligner.overlapAlign([contig1], ContigStorage([contig2])).next()
        als1 = fixAlDir(aligner.overlapAlign(reads.unique(), ContigStorage([contig1])), contig1)
        als1 = filter(lambda al: len(al.seg_to) > len(al.seg_to.contig) - 100, als1)
        als2 = fixAlDir(aligner.overlapAlign(reads.unique(), ContigStorage([contig2])), contig2)
        als2 = filter(lambda al: len(al.seg_to) > len(al.seg_to.contig) - 100, als2)
        als1 = sorted(als1, key = lambda al: al.seg_from.contig.id)
        als2 = sorted(als2, key = lambda al: al.seg_from.contig.id)
        reads1 = ContigStorage()
        reads2 = ContigStorage()
        dp = scorer.accurateScore(al.matchingSequence(), 10) #1 - al.percentIdentity()
        als_map = dict()
        for al in als1:
            als_map[al.seg_from.contig.id] = [al]
        for al in als2:
            if al.seg_from.contig.id in als_map:
                als_map[al.seg_from.contig.id].append(al)
        com_res = []
        diffs = []
        for tmp_als in als_map.values():
            if len(tmp_als) != 2:
                continue
            al1 = tmp_als[0]
            al2 = tmp_als[1]
            print al1, al2
            assert al1.seg_from.contig == al2.seg_from.contig
            pi1 = scorer.accurateScore(al1.matchingSequence(), 10) # al1.percentIdentity()
            pi2 = scorer.accurateScore(al2.matchingSequence(), 10) # al2.percentIdentity()
            com_res.append((al1, al2, pi1 - pi2))
            diffs.append(pi1 - pi2)
        diffs = sorted(diffs)
        th1 = diffs[len(diffs) / 4]
        th2 = diffs[len(diffs) * 3 / 4]
        print "Thresholds:", th1, th2
        for al1, al2, diff in com_res:
            if diff < th1:
                reads1.add(al1.seg_from.contig)
            elif diff > th2:
                reads2.add(al2.seg_from.contig)
#           if pi1 > pi2 + dp / 4:
#               reads1.add(al1.seg_from.contig)
#           elif pi2 > pi1 + dp / 4:
#               reads2.add(al2.seg_from.contig)
#           diff += abs(pi1 - pi2)
        print float(diff) / len(als1), len(reads1) / 2, len(reads2) / 2
    al = aligner.overlapAlign([contig1], ContigStorage([contig2])).next()
    print al
    print "\n".join(al.asMatchingStrings2())
    for read in reads1:
        if read.id in initial_reads1:
            sys.stdout.write(read.id + " ")
    print ""
    for read in reads2:
        if read.id in initial_reads2:
            sys.stdout.write(read.id + " ")
    print ""
    contig1 = prolong(aligner, polisher, contig1, reads1)
    contig2 = prolong(aligner, polisher, contig2, reads2)
    contig1.id = "1"
    contig2.id = "2"
    out = open(os.path.join(dir, "copies.fasta"), "w")
    SeqIO.write(contig1, out, "fasta")
    SeqIO.write(contig2, out, "fasta")
    out.close()
    out = open(os.path.join(dir, "reads1.fasta"), "w")
    for read in reads1.unique():
        SeqIO.write(read, out, "fasta")
    out.close()
    out = open(os.path.join(dir, "reads2.fasta"), "w")
    for read in reads2.unique():
        SeqIO.write(read, out, "fasta")
    out.close()
    print "Finished"
Example #27
import os
import shutil
import sys

from common import basic

in_dir = sys.argv[1]
out_dir = sys.argv[2]
basic.ensure_dir_existance(out_dir)
for f in os.listdir(in_dir):
    fig_dir = os.path.join(in_dir, f, "pictures")
    nums = map(basic.parseNumber, os.listdir(fig_dir))
    last = max(nums)
    f_name = None
    # use a separate loop variable: reusing f here clobbered the outer
    # directory name used for the output file below
    for name in os.listdir(fig_dir):
        if basic.parseNumber(name) == last:
            f_name = name
            break
    shutil.copy(os.path.join(fig_dir, f_name),
                os.path.join(out_dir, f + ".dot"))