def main(contigs_file, parts_file, dir):
    contigs = ContigCollection().loadFromFasta(open(contigs_file, "r"))
    parts = ContigCollection().loadFromFasta(open(parts_file, "r"))
    basic.CreateLog(dir)
    aligner = Aligner(DirDistributor(dir))
    res = dict()
    for al in aligner.localAlign(parts, contigs):
        if al.seg_to.contig.id not in res:
            res[al.seg_to.contig.id] = []
            res[al.seg_to.contig.rc.id] = []
        res[al.seg_to.contig.id].append(al)
        res[al.seg_to.contig.rc.id].append(al.rc)
    for cname, arr in res.items():
        print cname
        arr = filter(
            lambda al: len(al.seg_to) > min(
                len(al.seg_to.contig) - 1000, 5000), arr)
        arr = sorted(arr, key=lambda al: al.seg_to.left)
        print arr
Exemple #2
0
def main(flye_dir, rf, dir, edge_id, k):
    params.technology = "nano"
    params.k = k
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    print "Reading graph"
    graph = SimpleGraph().ReadGFA(os.path.join(flye_dir, "assembly_graph.gfa"))
    print "Parsing edge mapping"
    id_map = parseUPaths(flye_dir)
    edge_ids = edge_id.split(",")
    print "Extracting relevant graph component"
    res = open(os.path.join(dir, "contigs.fasta"), "w")
    unique = dict()
    for eid in edge_ids:
        for e in graph.v[graph.e[eid].start].inc:
            if basic.Normalize(e.id) in edge_ids:
                continue
            if len(e.seq) < 10000:
                if e.id.startswith("-"):
                    unique[e.id[1:]] = NamedSequence(basic.RC(e.seq), e.id[1:])
                else:
                    unique[e.id] = NamedSequence(e.seq, e.id)
            else:
                if e.id.startswith("-"):
                    unique[e.id[1:] + "l"] = NamedSequence(
                        basic.RC(e.seq[:5000]), e.id[1:] + "l")
                else:
                    unique[e.id + "r"] = NamedSequence(e.seq[-5000:],
                                                       e.id + "r")
        for e in graph.v[graph.e[eid].end].out:
            if basic.Normalize(e.id) in edge_ids:
                continue
            if len(e.seq) < 10000:
                if e.id.startswith("-"):
                    unique[e.id[1:]] = NamedSequence(basic.RC(e.seq), e.id[1:])
                else:
                    unique[e.id] = NamedSequence(e.seq, e.id)
            else:
                if e.id.startswith("-"):
                    unique[e.id[1:] + "r"] = NamedSequence(
                        basic.RC(e.seq[-5000:]), e.id[1:] + "r")
                else:
                    unique[e.id + "l"] = NamedSequence(e.seq[:5000],
                                                       e.id + "l")

    for c in unique.values():
        print c.id
        SeqIO.write(c, res, "fasta")
    res.close()
    old_ids = []
    for eid in edge_ids:
        for olde in id_map[eid[len("edge_"):]]:
            old_ids.append(basic.Normalize(olde))
    print "Finding reads that align to", edge_ids
    print "Old ids:", old_ids
    relevant_read_ids = set()
    for s in open(os.path.join(flye_dir, "20-repeat", "read_alignment_dump"),
                  "r").readlines():
        s = s.split()
        if s[0] != "Aln":
            continue
        if s[6].split("_")[1] in old_ids:
            relevant_read_ids.add(s[2][1:])
            print s[2][1:], s[6].split("_")[1]
    print "Reading reads"
    res = open(os.path.join(dir, "reads.fasta"), "w")
    for read in SeqIO.parse_fasta(open(rf, "r")):
        if read.id in relevant_read_ids and len(read) > k * 1.2:
            SeqIO.write(read, res, "fasta")
    res.close()
Exemple #3
0
def main(contigs_file, contig_name, reads_file, dir, k, initial_reads1, initial_reads2):
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    contigs = ContigStorage().loadFromFasta(open(contigs_file, "r"), False)
#    contig = contigs[contig_name].asSegment().prefix(length=2000).asContig()
    contig = contigs[contig_name]
    reads = ContigStorage().loadFromFasta(open(reads_file, "r"), False)
    reads1 = ContigStorage()
    reads2 = ContigStorage()
    cnt = 0
    for read in reads.unique():
        cnt += 1
#        if cnt % 2 == 0:
        if read.id in initial_reads1:
            reads1.add(read)
        elif read.id in initial_reads2:
            reads2.add(read)
    polisher = Polisher(aligner, dd)
    contig1 = contig
    contig2 = contig
    scorer = Scorer()
    for i in range(3):
        diff = 0
        print "Iteration", i
        als1 = fixAlDir(aligner.overlapAlign(reads1.unique(), ContigStorage([contig])), contig)
        als2 = fixAlDir(aligner.overlapAlign(reads2.unique(), ContigStorage([contig])), contig)
        contig1 = Contig(polisher.polishSmallSegment(contig.asSegment(), als1).seg_from.Seq(), "1")
        contig2 = Contig(polisher.polishSmallSegment(contig.asSegment(), als2).seg_from.Seq(), "2")
        al = aligner.overlapAlign([contig1], ContigStorage([contig2])).next()
        als1 = fixAlDir(aligner.overlapAlign(reads.unique(), ContigStorage([contig1])), contig1)
        als1 = filter(lambda al: len(al.seg_to) > len(al.seg_to.contig) - 100, als1)
        als2 = fixAlDir(aligner.overlapAlign(reads.unique(), ContigStorage([contig2])), contig2)
        als2 = filter(lambda al: len(al.seg_to) > len(al.seg_to.contig) - 100, als2)
        als1 = sorted(als1, key = lambda al: al.seg_from.contig.id)
        als2 = sorted(als2, key = lambda al: al.seg_from.contig.id)
        reads1 = ContigStorage()
        reads2 = ContigStorage()
        dp = scorer.accurateScore(al.matchingSequence(), 10) #1 - al.percentIdentity()
        als_map = dict()
        for al in als1:
            als_map[al.seg_from.contig.id] = [al]
        for al in als2:
            if al.seg_from.contig.id in als_map:
                als_map[al.seg_from.contig.id].append(al)
        com_res = []
        diffs = []
        for tmp_als in als_map.values():
            if len(tmp_als) != 2:
                continue
            al1 = tmp_als[0]
            al2 = tmp_als[1]
            print al1, al2
            assert al1.seg_from.contig == al2.seg_from.contig
            pi1 = scorer.accurateScore(al1.matchingSequence(), 10) # al1.percentIdentity()
            pi2 = scorer.accurateScore(al2.matchingSequence(), 10) # al2.percentIdentity()
            com_res.append((al1, al2, pi1 - pi2))
            diffs.append(pi1 - pi2)
        diffs = sorted(diffs)
        th1 = diffs[len(diffs) / 4]
        th2 = diffs[len(diffs) * 3 / 4]
        print "Thresholds:", th1, th2
        for al1, al2, diff in com_res:
            if diff < th1:
                reads1.add(al1.seg_from.contig)
            elif diff > th2:
                reads2.add(al2.seg_from.contig)
#           if pi1 > pi2 + dp / 4:
#               reads1.add(al1.seg_from.contig)
#           elif pi2 > pi1 + dp / 4:
#               reads2.add(al2.seg_from.contig)
#           diff += abs(pi1 - pi2)
        print float(diff) / len(als1), len(reads1) / 2, len(reads2) / 2
    al = aligner.overlapAlign([contig1], ContigStorage([contig2])).next()
    print al
    print "\n".join(al.asMatchingStrings2())
    for read in reads1:
        if read.id in initial_reads1:
            sys.stdout.write(read.id + " ")
    print ""
    for read in reads2:
        if read.id in initial_reads2:
            sys.stdout.write(read.id + " ")
    print ""
    contig1 = prolong(aligner, polisher, contig1, reads1)
    contig2 = prolong(aligner, polisher, contig2, reads2)
    contig1.id = "1"
    contig2.id = "2"
    out = open(os.path.join(dir, "copies.fasta"), "w")
    SeqIO.write(contig1, out, "fasta")
    SeqIO.write(contig2, out, "fasta")
    out.close()
    out = open(os.path.join(dir, "reads1.fasta"), "w")
    for read in reads1.unique():
        SeqIO.write(read, out, "fasta")
    out.close()
    out = open(os.path.join(dir, "reads2.fasta"), "w")
    for read in reads2.unique():
        SeqIO.write(read, out, "fasta")
    out.close()
    print "Finished"
Exemple #4
0
        # type: (List[AlignmentPiece]) -> List[AlignmentPiece]
        als = filter(lambda al: not al.contradictingRTC(tail_size=params.bad_end_length), als)
        return self.filterLocal(als)

if __name__ == "__main__":
    dir = sys.argv[1]
    query = sys.argv[2]
    target = sys.argv[3]
    extra_params = sys.argv[4:]
    contra = "contra" in extra_params
    over = "over" in extra_params
    long = "long" in extra_params
    start = "start" in extra_params
    forward = "forward" in extra_params
    aln = Aligner(DirDistributor(dir))
    basic.CreateLog(dir)
    contigs = ContigCollection().loadFromFasta(open(target, "r"), False)
    for al in aln.localAlign(ReadCollection().loadFromFile(query), contigs):
        if start:
            if al.seg_to.contig.id.startswith("-"):
                al = al.rc
            if al.seg_to.left > 50:
                continue
        if over and al.contradictingRTC():
            continue
        if forward:
            if al.seg_to.contig.id.startswith("-"):
                al = al.rc
        if contra and (len(al) < 8000 or not al.contradictingRTC()):
            continue
        if long and len(al) < 5000:
Exemple #5
0
def main(flye_dir, rf, dir, edge_id, to_resolve, min_contig_length):
    params.technology = "nano"
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    print " ".join(sys.argv)
    print "Reading graph"
    graph = SimpleGraph().ReadDot(
        os.path.join(flye_dir, "20-repeat", "graph_before_rr.gv"))
    graph.FillSeq(os.path.join(flye_dir, "20-repeat", "graph_before_rr.fasta"),
                  True)
    print "Extracting relevant graph component"
    edge_ids = edge_id.split(",")
    to_resolve = to_resolve.split(",")
    to_resolve = [(a, int(b))
                  for a, b in zip(to_resolve[0::2], to_resolve[1::2])]
    unique = uniqueNeighbours(edge_ids, graph, min_contig_length)

    if rf == "none":
        return
    print "Finding reads that align to", edge_ids
    reads_to_resolve = dict()  # type: Dict[str, List[str]]
    for eid, mult in to_resolve:
        reads_to_resolve[eid] = []
    for unique_edge, initial in unique:
        reads_to_resolve[initial] = []
    relevant_read_ids = set()
    for rid, eid in parseReadDump(
            os.path.join(flye_dir, "20-repeat", "read_alignment_dump")):
        if eid in edge_ids:
            relevant_read_ids.add(rid)
            print rid, eid
    for rid, eid in parseReadDump(
            os.path.join(flye_dir, "20-repeat", "read_alignment_dump")):
        if rid in relevant_read_ids and eid in reads_to_resolve:
            reads_to_resolve[eid].append(rid)
    for eid in reads_to_resolve:
        reads_to_resolve[eid] = list(set(reads_to_resolve[eid]))
    print "Reading reads"
    res_reads = ContigStorage()
    res = open(os.path.join(dir, "reads.fasta"), "w")
    for read in SeqIO.parse_by_name(rf):
        if read.id in relevant_read_ids:
            res_reads.add(Contig(read.seq, read.id))
            SeqIO.write(read, res, "fasta")
    res.close()
    random_down = open(os.path.join(dir, "random_down.fasta"), "w")
    cnt = 0
    for read in res_reads:
        if cnt % 5 == 0:
            SeqIO.write(read, random_down, "fasta")
        cnt += 1
    random_down.close()
    res = open(os.path.join(dir, "contigs.fasta"), "w")
    lcf = open(os.path.join(dir, "contigs.lc"), "w")
    for eid, mult in to_resolve:
        repeat_reads = [res_reads[rid] for rid in reads_to_resolve[eid]]
        print reads_to_resolve[eid]
        print map(str, repeat_reads)
        split_contigs = splitRepeat(aligner, graph.e[eid].seq, mult,
                                    repeat_reads, min_contig_length)
        if split_contigs is None:
            print "Failed to resove edge", eid, "Aborting"
        print "Edge", eid, "was split into", mult, "copies"
        for contig, contig_reads in split_contigs:
            print contig.id
            SeqIO.write(contig, res, "fasta")
            lcf.write(contig.id + "\n")
            lcf.write(" ".join([r.id for r in contig_reads]) + "\n")
    res = open(os.path.join(dir, "contigs.fasta"), "w")
    for unique_edge, initial in unique:
        print unique_edge.id
        SeqIO.write(unique_edge, res, "fasta")
        lcf.write(unique_edge.id + "\n")
        lcf.write(" ".join(reads_to_resolve[initial]) + "\n")
    res.close()
Exemple #6
0
def main(contigs_file, contig_name, reads_file, dir, k):
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    contigs = ContigStorage().loadFromFasta(open(contigs_file, "r"), False)
    contig = contigs[contig_name]
    contigs = ContigStorage()
    contigs.add(contig)
    reads = ContigStorage().loadFromFasta(open(reads_file, "r"), False)
    als = list(aligner.localAlign(reads.unique(), contigs))
    tmp = []
    for al in als:
        if al.seg_to.contig != contig:
            al = al.rc
        tmp.append(al)
    als = tmp
    als = sorted(als,
                 key=lambda al: al.seg_to.left / 50 * 1000000 + al.seg_to.right
                 - al.seg_to.left)
    counts = dict()
    for al in als:
        counts[al.seg_from.contig.id] = 0
    for al in als:
        if len(al) > k:
            counts[al.seg_from.contig.id] += 1
    w = 20
    f = open(os.path.join(dir, "reads.fasta"), "w")
    over = set()
    inter = set()
    for al in als:
        if len(al) < k:
            continue
        inter.add(basic.Normalize(al.seg_from.contig.id))
        if not al.contradictingRTC():
            over.add(basic.Normalize(al.seg_from.contig.id))
        m = al.matchingSequence(True)
        tmp = []
        for i in range(len(contig) / w + 1):
            tmp.append([])
        for a, b in m.matches:
            tmp[b / w].append((a, b))
        for i in range(len(contig) / w):
            if i + 1 < len(tmp) and len(tmp[i + 1]) > 0:
                tmp[i].append(tmp[i + 1][0])
        for i in range(len(contig) / w):
            seg = contig.segment(i * w, i * w + w)
            if al.seg_to.inter(seg):
                if al.seg_to.left >= seg.left and al.seg_from.left > params.bad_end_length:
                    sys.stdout.write("B")
                elif al.seg_to.right <= seg.right and al.rc.seg_from.left > params.bad_end_length:
                    sys.stdout.write("E")
                else:
                    if len(tmp[i]) == 0:
                        sys.stdout.write("*")
                    else:
                        a = tmp[i][-1][0] - tmp[i][0][0]
                        b = tmp[i][-1][1] - tmp[i][0][1]
                        if a - b > 30:
                            sys.stdout.write("I")
                        elif a - b > 15:
                            sys.stdout.write("i")
                        elif a - b < -30:
                            sys.stdout.write("D")
                        elif a - b < -15:
                            sys.stdout.write("d")
                        else:
                            sys.stdout.write(
                                str(min(8,
                                        max(a, b) + 1 - len(tmp[i]))))
            else:
                sys.stdout.write("*")
        print " ", al.seg_from.contig.id, counts[
            al.seg_from.contig.id], al.contradictingRTC()
    print inter
    for rid in inter:
        SeqIO.write(reads[rid], f, "fasta")
        print rid, reads[rid]
    f.close()
    f = open(os.path.join(dir, "reads_over.fasta"), "w")
    for rid in over:
        SeqIO.write(reads[rid], f, "fasta")
    f.close()