Example 1
def main(contig_file, reads_file, sam_file, dir, contig_id):
    # type: (str, str, str, str, str) -> None
    basic.ensure_dir_existance(dir)
    contigs = ContigCollection()
    contigs.loadFromFasta(open(contig_file, "r"))
    print "Contigs loaded"
    contig = contigs[contig_id]
    read_names = set()
    for rec in Samfile(open(sam_file, "r")):
        read_names.add(rec.query_name)
    reads = ReadCollection()
    cnt = 0
    for rec in SeqIO.parse_fasta(open(reads_file, "r")):
        if rec.id in read_names:
            rec.id = "Read" + str(cnt)
            reads.add(AlignedRead(rec))
            cnt += 1
    reads.print_fasta(open(os.path.join(dir, "reads.fasta"), "w"))
    print "Reads loaded", len(reads)
    reads.addAllRC()
    print "RC added", len(reads)

    aligner = Aligner(DirDistributor(os.path.join(dir, "alignments")))
    aligner.alignReadCollection(reads, contigs)
    print "Reads aligned", len(reads)
    reads = reads.inter(contig.asSegment())
    print "Reads filtered", len(reads)
    sorted_reads = sorted(list(reads.reads.values()), key=lambda read: read.alignmentsTo(contig.asSegment()).next().seg_to.left)
    for read in sorted_reads:
        print read
        for al in read.alignmentsTo(contig.asSegment()):
            print "\n".join(al.asMatchingStrings())
Example 2
def main(args):
    rf = args[2]
    dir = args[3]
    CreateLog(dir)
    disjointigs = ContigCollection().loadFromFasta(open(args[1], "r"), num_names=False)
    basic.ensure_dir_existance(dir)
    aligner = Aligner(DirDistributor(os.path.join(dir, "alignments")))
    clen = 5000000
    reads = ReadCollection().loadFromFasta(open(rf, "r"))
    tlen0 = sum(map(len, reads))
    for i in range(10):
        good_reads = set()
        for al in aligner.localAlign(reads, disjointigs):
            if not al.contradictingRTC(al.seg_to.contig.asSegment(), 500):
                good_reads.add(al.seg_from.contig.id)
        rf = os.path.join(dir, "reads" + str(i) + ".fasta")
        reads = reads.filter(lambda read: read.id not in good_reads).cleanCopy()
        tlen = sum(map(len, reads))
        reads.print_fasta(open(rf, "w"))
        l = tlen * clen / tlen0
        assembly_dir = os.path.join(dir, "assembly" + str(i))
        subprocess.check_call(["./bin/flye", "-o", assembly_dir, "-t", "8", "--pacbio-raw", rf, "--genome-size", str(l), "--no-trestle"])
        df = os.path.join(assembly_dir, "10-consensus", "consensus.fasta")
        disjointigs.addAll(ContigCollection().loadFromFasta(open(df, "r"), num_names=False))
        df = os.path.join(dir, "df" + str(i) + ".fasta")
        disjointigs.print_fasta(open(df, "w"))
Example 3
def main(ref_file, contig_size, rlen, cov, dir):
    basic.ensure_dir_existance(dir)
    all_contigs = ContigCollection().loadFromFasta(open(ref_file, "r"), False)
    contig_file_name = os.path.join(dir, "contigs.fasta")
    contig_file = open(contig_file_name, "w")
    reads_file_name = os.path.join(dir, "reads.fasta")
    reads_file = open(reads_file_name, "w")
    for ref in all_contigs.unique():
        if len(ref) < contig_size:
            continue
        SeqIO.write(ref, contig_file, "fasta")
        for i in range(0, len(ref), max(1, rlen / cov)):
            read = ref.segment(i, min(i + rlen, len(ref))).asNamedSequence()
            SeqIO.write(read, reads_file, "fasta")
    reads_file.close()
    contig_file.close()
    print "Done"
    print contig_file_name
    print reads_file_name
Example 4
def main(contigs_file, parts_file, dir):
    contigs = ContigCollection().loadFromFasta(open(contigs_file, "r"))
    parts = ContigCollection().loadFromFasta(open(parts_file, "r"))
    basic.CreateLog(dir)
    aligner = Aligner(DirDistributor(dir))
    res = dict()
    for al in aligner.localAlign(parts, contigs):
        if al.seg_to.contig.id not in res:
            res[al.seg_to.contig.id] = []
            res[al.seg_to.contig.rc.id] = []
        res[al.seg_to.contig.id].append(al)
        res[al.seg_to.contig.rc.id].append(al.rc)
    for cname, arr in res.items():
        print cname
        arr = filter(lambda al: len(al.seg_to) > min(len(al.seg_to.contig) - 1000, 5000), arr)
        arr = sorted(arr, key=lambda al: al.seg_to.left)
        print arr
Example 5
def alignReadsToSegments(self, reads, segments):
    # type: (ReadCollection, Iterable[Segment]) -> None
    segments = list(segments)
    seg_dict = dict()
    for i, seg in enumerate(segments):
        seg_dict[str(i + 1)] = seg
    contigs = map(lambda (i, seg): Contig(seg.Seq(), str(i + 1)), enumerate(segments))
    read_collection = ReadCollection().extendClean(reads)
    self.alignReadCollection(read_collection, ContigCollection(contigs))
    read_collection.contigsAsSegments(seg_dict)
    reads.mergeAlignments(read_collection)
Example 6
def alignReadCollection(self, reads_collection, contigs):
    # type: (ReadCollection, Iterable[Contig]) -> None
    contig_collection = ContigCollection(contigs)
    contig_ids = set()
    for contig in contigs:
        if contig.rc.id not in contig_ids:
            contig_ids.add(contig.id)
    read_ids = set()
    for read in reads_collection:
        if read.rc.id not in read_ids:
            read_ids.add(read.id)
    contigs = filter(lambda contig: contig.id in contig_ids, contigs)
    reads = filter(lambda read: read.id in read_ids, reads_collection)
    reads_collection.fillFromSam(self.align(reads, contigs, "local"), contig_collection)
Example 7
def loadAll(handler):
    # type: (TokenReader) -> Tuple[Params, Aligner, ContigCollection, ReadCollection, DisjointigCollection, NewLineStorage, LineDotPlot]
    cl_params = Params()
    cl_params.load(handler)
    aligner = Aligner.load(handler)
    sys.stdout.info("Loading contigs")
    contigs = ContigCollection()
    contigs.load(handler)
    sys.stdout.info("Loading reads")
    # CreateReadCollection is assumed to load and downsample the reads itself
    reads = CreateReadCollection(cl_params.reads_file, cl_params.downsample)
    tmp_reads = reads.copy().addAllRC()
    sys.stdout.info("Loading disjointigs")
    disjointigs = DisjointigCollection()
    disjointigs.load(handler, tmp_reads)
    sys.stdout.info("Loading lines")
    lines = NewLineStorage(disjointigs, aligner)
    lines.load(handler, tmp_reads, contigs)
    sys.stdout.info("Loading dot plot")
    dot_plot = LineDotPlot(lines, aligner)
    dot_plot.load(handler)
    sys.stdout.info("Loading finished")
    return cl_params, aligner, contigs, reads, disjointigs, lines, dot_plot
Example 8
def main(dir, contigs_file1, contigs_file2, unique_contigs_file):
    CreateLog(dir)
    sys.stdout.level = LogPriority.warning
    unique = ContigCollection().loadFromFasta(open(unique_contigs_file, "r"), False).filter(lambda contig: len(contig) > 5000)
    aligner = Aligner(DirDistributor(os.path.join(dir, "alignments")))
    contigs1 = ContigCollection().loadFromFasta(open(contigs_file1, "r"), False)
    cals1 = list(aligner.overlapAlign(unique.unique(), contigs1))
    transfers1, term1, all1 = extract_transfers(contigs1, cals1)
    contigs2 = ContigCollection().loadFromFasta(open(contigs_file2, "r"), False)
    cals2 = list(aligner.overlapAlign(unique.unique(), contigs2))
    transfers2, term2, all2 = extract_transfers(contigs2, cals2)
    missing1 = []
    missing2 = []
    different = dict()
    unresolved1 = []
    unresolved2 = []
    same = []
    for ucontig in list(unique) + [contig.rc for contig in unique]:
        uid = ucontig.id
        in1 = uid in all1
        in2 = uid in all2
        if not in1 and not in2:
            continue
        if not in1:
            missing1.append(uid)
        elif not in2:
            missing2.append(uid)
        else:
            if all1[uid][0] == all2[uid][0]:
                same.append(uid)
            elif uid in transfers1 and uid in transfers2:
                different[uid] = (all1[uid][0], all2[uid][0])
            elif uid in transfers1:
                unresolved2.append(uid)
            elif uid in transfers2:
                unresolved1.append(uid)
    out = open(os.path.join(dir, "contigs.txt"), "w")
    out.write("Different: " + str(different) + "\n")
    out.write("Unresolved1: " + str(unresolved1) + "\n")
    out.write("Unresolved2: " + str(unresolved2) + "\n")
    out.write("Same: " + str(same) + "\n")
    out.write("Missing1: " + str(missing1) + "\n")
    out.write("Missing2: " + str(missing2) + "\n")
    out.write("Contig1 transfers: " + str(transfers1) + "\n")
    out.write("Contig1 term: " + str(term1) + "\n")
    out.write("Contig2 transfers: " + str(transfers2) + "\n")
    out.write("Contig2 term: " + str(term2) + "\n")
    out.close()
    print contigs_file1, contigs_file2
    print len(different), len(unresolved1), len(unresolved2), len(missing1), len(missing2), len(same)
Example 9
def main(dir, contigs_files, reference_file, unique_contigs_file):
    CreateLog(dir)
    sys.stdout.level = LogPriority.warning
    ref = ContigCollection().loadFromFasta(open(reference_file, "r"), False)
    unique = ContigCollection().loadFromFasta(open(unique_contigs_file, "r"), False).filter(lambda contig: len(contig) > 5000)
    aligner = Aligner(DirDistributor(os.path.join(dir, "alignments")))
    ref_als = list(aligner.overlapAlign(unique.unique(), ref))
    ref_transfers, ref_term, all_ref_als = extract_transfers(ref, ref_als)
    for uid in ref_term:
        ref_transfers[uid] = ref_term[uid]
    print "#", "file", "wrong", "unresolved", "correct", "missing"
    for i, contigs_file in enumerate(contigs_files):
        contigs = ContigCollection().loadFromFasta(open(contigs_file, "r"), False)
        contig_als = list(aligner.overlapAlign(unique.unique(), contigs))
        contig_transfers, contig_term, all_contig_als = extract_transfers(contigs, contig_als)
        missing = []
        wrong = dict()
        unresolved = []
        correct = []
        for uid in ref_transfers:
            if uid not in contig_transfers and uid not in contig_term:
                # print uid, "missing"
                missing.append(uid)
            elif uid in contig_transfers:
                if ref_transfers[uid][0] == contig_transfers[uid][0]:
                    # print uid, "correct"
                    correct.append(uid)
                else:
                    # print uid, "wrong", ref_transfers[uid][0].id, contig_transfers[uid][0].id
                    wrong[uid] = (ref_transfers[uid][0], contig_transfers[uid][0])
            else:
                if ref_transfers[uid][0] == contig_term[uid][0]:
                    # print uid, "correct"
                    correct.append(uid)
                else:
                    # print uid, "unresolved"
                    unresolved.append(uid)
        out = open(os.path.join(dir, "contigs_" + str(i) +".txt"), "w")
        out.write("Wrong: " + str(wrong) + "\n")
        out.write("Unresolved: " + str(unresolved) + "\n")
        out.write("Correct: " + str(correct) + "\n")
        out.write("Missing: " + str(missing) + "\n")
        out.write("Contig transfers: " + str(contig_transfers) + "\n")
        out.write("Contig term: " + str(contig_term) + "\n")
        out.write("Ref transfers: " + str(ref_transfers) + "\n")
        out.write("Ref als:\n")
        for c in all_ref_als:
            out.write(str(c) + "\n")
        out.write("Contig als:\n")
        for c in all_contig_als:
            out.write(str(c) + "\n")
        out.close()
        print "result", i, contigs_file, len(wrong), len(unresolved), len(correct), len(missing)
Example 10
def main(reads_file, ref_file, dir, error_rate):
    sys.stderr.write("Reading reference" + "\n")
    ref = sorted(list(SeqIO.parse_fasta(open(ref_file, "r"))),
                 key=lambda rec: len(rec))[-1]
    ref = Contig(ref.seq, ref.id)
    refs = ContigCollection()
    for i in range(0, len(ref) - 500, 500):
        if random.random() > 0.95:
            tmp = list(ref.segment(i, i + 500).Seq())
            for j in range(error_rate * 500 / 100):
                pos = random.randint(0, 499)
                tmp[pos] = basic.rc[tmp[pos]]
            refs.add(
                Contig("".join(tmp),
                       ref.id + "(" + str(i) + "," + str(i + 500) + ")"))
    refs.print_names(sys.stderr)
    sys.stderr.write("Reading reads" + "\n")
    reads = ReadCollection()
    reads.loadFromFasta(open(reads_file, "r"))

    sys.stderr.write("Aligning reads" + "\n")
    basic.ensure_dir_existance(dir)
    aligner = Aligner(DirDistributor(dir))
    aligner.alignReadCollection(reads, refs)
    sys.stderr.write("Analysing alignments" + "\n")
    alignments = []
    for read in reads:
        alignments.extend(read.alignments)
    alignments = filter(lambda al: len(al) > 450, alignments)
    alignments = sorted(alignments,
                        key=lambda al:
                        (al.seg_to.contig.id, al.seg_from.contig.id))
    scorer = Scorer()
    scorer.scores.homo_score = 3
    scorer.scores.ins_score = 5
    scorer.scores.del_score = 5
    cnt = 0
    for contig, group in itertools.groupby(alignments,
                                           key=lambda al: al.seg_to.contig):
        group = list(group)
        sys.stderr.write(str(contig) + " " + str(len(group)) + "\n")
        if len(group) < 150:
            for al in group:
                print scorer.accurateScore(al.matchingSequence(),
                                           params.alignment_correction_radius)
                cnt += 1
                if cnt >= 5000:
                    break
        if cnt >= 5000:
            break
Example 11
def main(ref_file, segment, dir):
    ref = ContigCollection().loadFromFasta(open(ref_file, "r"), False)
    chr1 = ref["chr1"]
    if segment[0] < 0:
        segment = (-segment[0], -segment[1])
        chr1 = chr1.rc
    reads = ReadCollection()
    reads_list = []
    for i in range(segment[0], segment[1], 500):
        read = reads.addNewRead(Segment(chr1, i, i + 500).asNamedSequence())
        reads_list.append(read)
    chr1.seq = chr1.seq[:segment[0]] + "N" * (segment[1] - segment[0]) + chr1.seq[segment[1]:]
    chr1.rc.seq = basic.RC(chr1.seq)
    basic.ensure_dir_existance(dir)
    aligner = Aligner(DirDistributor(dir))
    aligner.alignReadCollection(reads, ref)
    out = sys.stdout
    for read in reads_list:
        # print read
        out.write(str(len(read.alignments)) + " " + str(max([0] + map(lambda al: al.percentIdentity(), read.alignments))) + "\n")
    out.close()
Example 12
def CreateContigCollection(graph_file, contigs_file, min_cov, aligner, polisher, reads, force_unique, all_unique):
    sys.stdout.info("Creating contig collection")
    if force_unique is None and not all_unique:
        graph = SimpleGraph().ReadDot(graph_file)
        graph.FillSeq(contigs_file)
        covs = []
        for e in graph.e.values():
            covs.append((e.len, e.cov))
        tmp_cov = []
        # covs holds (length, coverage) pairs; take half of the total length
        total = sum(l for l, c in covs) / 2
        for l, c in sorted(covs)[::-1]:
            if total < 0:
                break
            tmp_cov.append((l, c))
            total -= l
        avg_cov = float(sum([l * c for l, c in tmp_cov])) / sum(l for l, c in tmp_cov)
        sys.stdout.info("Average coverage determined:", avg_cov)
        nonunique = set()
        for edge in graph.e.values():
            if edge.unique and edge.len < 20000 and len(graph.v[edge.start].out) > 1:
                if edge.cov >= min_cov and (edge.cov < 0.8 * avg_cov or edge.len > 40000):
                    alter = ContigStorage()
                    for e in graph.v[edge.start].out:
                        if e != edge:
                            alter.add(Contig(e.seq, e.id))
                    for al in aligner.localAlign([Contig(edge.seq, edge.id)], alter):  # type: AlignmentPiece
                        if al.percentIdentity() > 0.98 and (al.seg_from.left < 100 and al.seg_to.left < 100 and len(al) > min(500, edge.len)):
                            nonunique.add(edge.id)
                            nonunique.add(basic.Reverse(edge.id))
        contigs = ContigCollection()
        for edge in graph.e.values():
            if basic.isCanonocal(edge.id):
                if edge.unique and (edge.len > params.min_isolated_length or len(graph.v[edge.end].out) > 0 or len(graph.v[edge.start].inc) > 0):
                    if edge.cov >= min_cov and (edge.cov < 1.5 * avg_cov or edge.len > 40000):
                        if edge.id in nonunique:
                            sys.stdout.info("Edge removed based on alignment to alternative:", edge.id, edge.cov, edge.len)
                        else:
                            contigs.add(Contig(edge.seq, edge.id))
                    else:
                        sys.stdout.info("Edge removed based on coverage:", edge.id, edge.cov, edge.len)
                elif (edge.len > 100000 and edge.cov < 1.5 * avg_cov) or (edge.len > 40000 and 1.3 * avg_cov > edge.cov > 0.7 * avg_cov):
                    contigs.add(Contig(edge.seq, edge.id))
                    sys.stdout.info("Edge added based on length and coverage:", edge.id, edge.cov, edge.len)

    elif force_unique is not None:
        sys.stdout.info("Using forced unique edge set")
        sys.stdout.trace(force_unique)
        contigs = ContigCollection().loadFromFile(contigs_file).filter(lambda contig: contig.id in force_unique)
    else:
        sys.stdout.info("Considering all contigs unique")
        contigs = ContigCollection().loadFromFile(contigs_file)
    # contigs.loadFromFasta(open(contigs_file, "r"), num_names=True)
    # contigs = contigs.filter(lambda contig: contig.id not in nonunique and len(contig) > params.k + 20)
    sys.stdout.info("Created", len(contigs), "initial contigs")
    if not all_unique or force_unique is not None:
        sys.stdout.info("Polishing contigs")
        polished_contigs = polisher.polishMany(reads, list(contigs.unique()))
        contigs = ContigCollection().addAll(polished_contigs)
    else:
        sys.stdout.info("Skipping contig polishing step since manual unique contig initialization was used")
    return contigs
Example 13
def main(argv):
    sys.stdout.write("Started\n")
    dot_file = argv[1]
    edge_sequences = argv[2]
    reference_file = argv[3]
    alignment_file = argv[4]
    edges = ParseVertices(argv[5])
    output_file = argv[6]
    sys.stdout.write("Loading dot\n")
    dot = DotParser(open(dot_file, "r")).parse()
    edge_collection = ContigCollection().loadFromFasta(
        open(edge_sequences, "r"), True)
    graph = Graph().loadFromDot(edge_collection, dot)
    vertices = [graph.E[id].start.id for id in edges]
    graph.printToFile(sys.stdout)
    print vertices
    ref = ContigCollection().loadFromFasta(open(reference_file, "r"), False)

    print "Looking for relevant"
    # NOTE: dist (the search radius in bases) is used below but never defined
    # in this snippet; it is presumably set earlier in the original script.
    pq = PriorityQueue()
    for v in graph.V.values():
        if v.id in vertices:
            pq.push((0, v))
    visited = []
    while not pq.empty():
        d, v = pq.pop()
        if v in visited:
            continue
        visited.append(v)
        for e in v.inc:
            print e.id, e.start.id, e.end.id
            if d + len(e) < dist:
                pq.push((d + len(e), e.start))
        for e in v.out:
            print e.id, e.start.id, e.end.id
            if d + len(e) < dist:
                pq.push((d + len(e), e.end))
    print "Visited", len(visited)
    print map(str, list(visited))
    relevant = []
    edge_alignments = ReadCollection().loadFromFasta(open(edge_sequences,
                                                          "r")).addAllRC()
    for edge in graph.E.values():
        if edge.start in visited or edge.start.rc in visited:
            relevant.append(edge_alignments[edge.id])
    print "Loading sam"
    edge_alignments.fillFromSam(Samfile(open(alignment_file, "r")), ref)
    for rel in relevant:
        print rel
    print "Collecting segments"
    segments = []
    chr1 = ref["chr1"]
    for edge in relevant:
        for al in edge.alignments:
            print al
            if al.seg_from.inter(edge.prefix(dist)):
                l = dist - al.seg_from.left
                contig = al.seg_to.contig
                start = al.seg_to.left
                segments.append(
                    Segment(contig, start, min(start + l, len(contig))))
                print segments[-1]
    tmp = []
    print "Rotating"
    for seg in segments:
        if seg.contig != chr1:
            seg = seg.RC()
        if seg.contig != chr1:
            print "WARNING", seg
        tmp.append(seg)
    segments = sorted(tmp, key=lambda seg: seg.left)
    print "All relevant segments"
    print "\n".join(map(str, segments))
    cur_seg = None
    interesting_segments = []
    print "Gluing"
    for seg in segments:
        if cur_seg is None:
            cur_seg = seg.copy()
            continue
        if cur_seg.right + 20000 < seg.left:
            interesting_segments.append(cur_seg.copy())
            cur_seg = seg.copy()
        else:
            cur_seg.right = max(cur_seg.right, seg.right)
    if cur_seg is not None:
        interesting_segments.append(cur_seg.copy())

    alignments = []
    for edge in edge_alignments:
        for al in edge.alignments:
            ok = False
            for seg in interesting_segments:
                if al.seg_to.inter(seg):
                    ok = True
                    break
            if ok:
                alignments.append(al)
    alignments = sorted(alignments, key=lambda al: al.seg_to.left)
    print "All relevant alignments"
    print "\n".join(map(str, alignments))

    print "Interesting segments:", len(interesting_segments), sum(
        map(len, interesting_segments))
    for seg in interesting_segments:
        print seg
    f = open(output_file, "w")
    tmp = []
    for seg in interesting_segments:
        SeqIO.write(SeqIO.SeqRecord(seg.Seq(), str(seg)), f, "fasta")
        tmp.append(seg.Seq())
    f.close()
    f1 = open(output_file + "1", "w")
    SeqIO.write(SeqIO.SeqRecord(("N" * 20000).join(tmp), "concat"), f1,
                "fasta")
Example 14
        # tail of a filtering method; the enclosing definition is not part of
        # this snippet
        als = filter(lambda al: not al.contradictingRTC(tail_size=params.bad_end_length), als)
        return self.filterLocal(als)

if __name__ == "__main__":
    dir = sys.argv[1]
    query = sys.argv[2]
    target = sys.argv[3]
    extra_params = sys.argv[4:]
    contra = "contra" in extra_params
    over = "over" in extra_params
    long = "long" in extra_params
    start = "start" in extra_params
    forward = "forward" in extra_params
    aln = Aligner(DirDistributor(dir))
    basic.CreateLog(dir)
    contigs = ContigCollection().loadFromFasta(open(target, "r"), False)
    for al in aln.localAlign(ReadCollection().loadFromFile(query), contigs):
        if start:
            if al.seg_to.contig.id.startswith("-"):
                al = al.rc
            if al.seg_to.left > 50:
                continue
        if over and al.contradictingRTC():
            continue
        if forward:
            if al.seg_to.contig.id.startswith("-"):
                al = al.rc
        if contra and (len(al) < 8000 or not al.contradictingRTC()):
            continue
        if long and len(al) < 5000:
            continue
        # the snippet ends without an action on surviving alignments;
        # presumably each one is reported:
        print al