Example #1
0
def main(dir, contigs_file1, contigs_file2, unique_contigs_file):
    CreateLog(dir)
    sys.stdout.level = LogPriority.warning
    unique = ContigCollection().loadFromFasta(
        open(unique_contigs_file,
             "r"), False).filter(lambda contig: len(contig) > 5000)
    aligner = Aligner(DirDistributor(os.path.join(dir, "alignments")))
    contigs1 = ContigCollection().loadFromFasta(open(contigs_file1, "r"),
                                                False)
    cals1 = list(aligner.overlapAlign(unique.unique(), contigs1))
    transfers1, term1, all1 = extract_transfers(contigs1, cals1)
    contigs2 = ContigCollection().loadFromFasta(open(contigs_file2, "r"),
                                                False)
    cals2 = list(aligner.overlapAlign(unique.unique(), contigs2))
    transfers2, term2, all2 = extract_transfers(contigs2, cals2)
    missing1 = []
    missing2 = []
    different = dict()
    unresolved1 = []
    unresolved2 = []
    same = []
    for ucontig in list(unique) + [contig.rc for contig in unique]:
        uid = ucontig.id
        in1 = uid in all1
        in2 = uid in all2
        if not in1 and not in2:
            continue
        if not in1:
            missing1.append(uid)
        elif not in2:
            missing2.append(uid)
        else:
            if all1[uid][0] == all2[uid][0]:
                same.append(uid)
            elif uid in transfers1 and uid in transfers2:
                different[uid] = (all1[uid][0], all2[uid][0])
            elif uid in transfers1:
                unresolved2.append(uid)
            elif uid in transfers2:
                unresolved1.append(uid)
    out = open(os.path.join(dir, "contigs.txt"), "w")
    out.write("Different: " + str(different) + "\n")
    out.write("Unresolved1: " + str(unresolved1) + "\n")
    out.write("Unresolved2: " + str(unresolved2) + "\n")
    out.write("Same: " + str(same) + "\n")
    out.write("Missing1: " + str(missing1) + "\n")
    out.write("Missing2: " + str(missing2) + "\n")
    out.write("Contig1 transfers: " + str(transfers1) + "\n")
    out.write("Contig1 term: " + str(term1) + "\n")
    out.write("Contig2 transfers: " + str(transfers2) + "\n")
    out.write("Contig2 term: " + str(term2) + "\n")
    out.close()
    print contigs_file1, contigs_file2
    print len(different), len(unresolved1), len(unresolved2), len(
        missing1), len(missing2), len(same)
def main(dir, contigs_files, reference_file, unique_contigs_file):
    """Grade several assemblies against a reference via unique-contig transfers.

    Computes, for each assembly, how many unique-contig transfers are
    wrong / unresolved / correct / missing relative to the reference.
    Per-assembly details go to dir/contigs_<i>.txt; a summary line is
    printed per assembly.

    :param dir: output directory, also used for alignment work files
    :param contigs_files: list of fasta files, one per assembly to grade
    :param reference_file: fasta file with the reference sequences
    :param unique_contigs_file: fasta file with candidate unique contigs
    """
    CreateLog(dir)
    sys.stdout.level = LogPriority.warning
    with open(reference_file, "r") as f:
        ref = ContigCollection().loadFromFasta(f, False)
    # Only contigs longer than 5kb are treated as reliably unique.
    with open(unique_contigs_file, "r") as f:
        unique = ContigCollection().loadFromFasta(f, False).filter(
            lambda contig: len(contig) > 5000)
    aligner = Aligner(DirDistributor(os.path.join(dir, "alignments")))
    ref_als = list(aligner.overlapAlign(unique.unique(), ref))
    ref_transfers, ref_term, all_ref_als = extract_transfers(ref, ref_als)
    # Terminal placements in the reference count as resolved transfers.
    for uid in ref_term:
        ref_transfers[uid] = ref_term[uid]
    print "#", "file", "wrong", "unresolved", "correct", "missing"
    for i, contigs_file in enumerate(contigs_files):
        with open(contigs_file, "r") as f:
            contigs = ContigCollection().loadFromFasta(f, False)
        contig_als = list(aligner.overlapAlign(unique.unique(), contigs))
        contig_transfers, contig_term, all_contig_als = extract_transfers(contigs, contig_als)
        missing = []
        wrong = dict()
        unresolved = []
        correct = []
        for uid in ref_transfers:
            if uid not in contig_transfers and uid not in contig_term:
                # Unique contig not placed at all in this assembly.
                missing.append(uid)
            elif uid in contig_transfers:
                if ref_transfers[uid][0] == contig_transfers[uid][0]:
                    correct.append(uid)
                else:
                    # Assembly resolves the transfer differently than the reference.
                    wrong[uid] = (ref_transfers[uid][0], contig_transfers[uid][0])
            else:
                # Contig terminates here; correct only if the reference agrees.
                if ref_transfers[uid][0] == contig_term[uid][0]:
                    correct.append(uid)
                else:
                    unresolved.append(uid)
        # `with` guarantees each report is closed even if a write fails.
        with open(os.path.join(dir, "contigs_" + str(i) + ".txt"), "w") as out:
            out.write("Wrong: " + str(wrong) + "\n")
            out.write("Unresolved: " + str(unresolved) + "\n")
            out.write("Correct: " + str(correct) + "\n")
            out.write("Missing: " + str(missing) + "\n")
            out.write("Contig transfers: " + str(contig_transfers) + "\n")
            out.write("Contig term: " + str(contig_term) + "\n")
            out.write("Ref transfers: " + str(ref_transfers) + "\n")
            out.write("Ref als:\n")
            for c in all_ref_als:
                out.write(str(c) + "\n")
            out.write("Contig als:\n")
            for c in all_contig_als:
                out.write(str(c) + "\n")
        print "result", i, contigs_file, len(wrong), len(unresolved), len(correct), len(missing)
def main(ref_file, contig_size, rlen, cov, dir):
    basic.ensure_dir_existance(dir)
    all_contigs = ContigCollection().loadFromFasta(open(ref_file, "r"), False)
    contig_file_name = os.path.join(dir, "contigs.fasta")
    contig_file = open(contig_file_name, "w")
    reads_file_name = os.path.join(dir, "reads.fasta")
    reads_file = open(reads_file_name, "w")
    for ref in all_contigs.unique():
        if len(ref) < contig_size:
            continue
        SeqIO.write(ref, contig_file, "fasta")
        for i in range(0, len(ref), max(1, rlen / cov)):
            read = ref.segment(i, min(i + rlen, len(ref))).asNamedSequence()
            SeqIO.write(read, reads_file, "fasta")
    reads_file.close()
    contig_file.close()
    print "Done"
    print contig_file_name
    print reads_file_name
def CreateContigCollection(graph_file, contigs_file, min_cov, aligner, polisher, reads, force_unique, all_unique):
    """Select the initial collection of unique contigs for assembly.

    Three modes:
      * default: read the assembly graph, estimate average coverage from the
        longest edges covering half the total edge length, then keep edges
        that look unique by coverage/length and do not align well to an
        alternative out-edge of the same vertex;
      * force_unique given: take exactly the listed contigs from contigs_file;
      * all_unique: take every contig from contigs_file.

    Unless all contigs were manually declared unique, the selected contigs
    are polished with the given reads before being returned.

    :param graph_file: assembly graph in dot format
    :param contigs_file: file with contig sequences
    :param min_cov: minimal edge coverage for an edge to be kept
    :param aligner: aligner used to compare edges to alternative out-edges
    :param polisher: polisher applied to the selected contigs
    :param reads: reads used for polishing
    :param force_unique: optional explicit collection of unique contig ids
    :param all_unique: when True, treat every contig as unique
    :return: ContigCollection with the selected (possibly polished) contigs
    """
    sys.stdout.info("Creating contig collection")
    if force_unique is None and not all_unique:
        graph = SimpleGraph().ReadDot(graph_file)
        graph.FillSeq(contigs_file)
        covs = []
        for e in graph.e.values():
            covs.append((e.len, e.cov))
        tmp_cov = []
        # Average coverage is estimated over the longest edges that together
        # span half of the total edge length (an N50-style selection).
        # covs holds (length, coverage) pairs, so the budget must sum the
        # first element (length) — the original `for c, l in covs` unpacked
        # the pair reversed and summed coverage instead.
        total = sum(l for l, c in covs) / 2
        for l, c in sorted(covs)[::-1]:
            if total < 0:
                break
            tmp_cov.append((l, c))
            total -= l
        # Length-weighted mean coverage of the selected long edges.
        avg_cov = float(sum([l * c for l, c in tmp_cov])) / sum(l for l, c in tmp_cov)
        sys.stdout.info("Average coverage determined:", avg_cov)
        # Mark short "unique" edges as non-unique when they align almost
        # perfectly to an alternative out-edge of the same start vertex.
        nonunique = set()
        for edge in graph.e.values():
            if edge.unique and edge.len < 20000 and len(graph.v[edge.start].out) > 1:
                if edge.cov >= min_cov and (edge.cov < 0.8 * avg_cov or edge.len > 40000):
                    alter = ContigStorage()
                    for e in graph.v[edge.start].out:
                        if e != edge:
                            alter.add(Contig(e.seq, e.id))
                    for al in aligner.localAlign([Contig(edge.seq, edge.id)], alter):#type: AlignmentPiece
                        # Near-identical prefix alignment to an alternative edge
                        # means this edge cannot be trusted as unique.
                        if al.percentIdentity() > 0.98 and (al.seg_from.left < 100 and al.seg_to.left < 100 and len(al) > min(500, edge.len)):
                            nonunique.add(edge.id)
                            nonunique.add(basic.Reverse(edge.id))
        contigs = ContigCollection()
        for edge in graph.e.values():
            # Process each edge once, in its canonical orientation.
            if basic.isCanonocal(edge.id):
                if edge.unique and (edge.len > params.min_isolated_length or len(graph.v[edge.end].out) > 0 or len(graph.v[edge.start].inc) > 0):
                    if edge.cov >= min_cov and (edge.cov < 1.5 * avg_cov or edge.len > 40000):
                        if edge.id in nonunique:
                            sys.stdout.info("Edge removed based on alignment to alternative:", edge.id, edge.cov, edge.len)
                        else:
                            contigs.add(Contig(edge.seq, edge.id))
                    else:
                        sys.stdout.info("Edge removed based on coverage:", edge.id, edge.cov, edge.len)
                elif (edge.len > 100000 and edge.cov < 1.5 * avg_cov) or (edge.len > 40000 and 1.3 * avg_cov > edge.cov > 0.7 * avg_cov):
                    # Long edges with plausible coverage are kept even if not
                    # flagged unique by the graph.
                    contigs.add(Contig(edge.seq, edge.id))
                    sys.stdout.info("Edge added based on length and coverage:", edge.id, edge.cov, edge.len)

    elif force_unique is not None:
        sys.stdout.info("Using forced unique edge set")
        sys.stdout.trace(force_unique)
        contigs = ContigCollection().loadFromFile(contigs_file).filter(lambda contig: contig.id in force_unique)
    else:
        sys.stdout.info("Considering all contigs unique")
        contigs = ContigCollection().loadFromFile(contigs_file)
    sys.stdout.info("Created", len(contigs), "initial contigs")
    if not all_unique or force_unique is not None:
        sys.stdout.info("Polishing contigs")
        polished_contigs = polisher.polishMany(reads, list(contigs.unique()))
        contigs = ContigCollection().addAll(polished_contigs)
    else:
        sys.stdout.info("Skipping contig polishing step since manual unique contig initialization was used")
    return contigs