Ejemplo n.º 1
0
# Report k-mer counts per copy number: one table row per entry of cn, in
# ascending order, with running (cumulative) percentages.
# NOTE(review): cn, by_count, total, distinct and options are defined outside
# this excerpt; presumably cn holds the keys of by_count -- confirm.
cn.sort()

# Label built from options.k (e.g. "21-mers"), reused in both summary lines.
mers = "%d-mers" % options.k
print '#', total, 'total', mers
print '#', distinct, 'distinct', mers

table = SimpleTable(
    ['CopyNumber', 'Count', 'BasePct', 'BaseTotal', 'KmerPct', 'KmerTotal'],
    [], 'Kmer Copy Number (k=' + str(options.k) + ')')

btotal = 0.0  # running sum of bpct over the rows emitted so far
ktotal = 0.0  # running sum of kpct over the rows emitted so far
for n in cn:
    c = by_count[n]
    # n * c expressed as a percentage of total; the 100.0 literal keeps the
    # division in floating point.
    bpct = n * c * 100.0 / total
    btotal += bpct
    # c expressed as a percentage of distinct.
    kpct = c * 100.0 / distinct
    ktotal += kpct
    table.add_row([
        n, c,
        "%.2f%%" % bpct,
        "%.2f%%" % btotal,
        "%.2f%%" % kpct,
        "%.2f%%" % ktotal
    ])

# With options.o set, hand the table to print_output under the name
# options.o + ".kmer_copy_number" (options.html is passed through);
# otherwise print the rendered table to stdout.
if options.o:
    table.print_output(options.o + ".kmer_copy_number", options.html)
else:
    print table.to_table()
Ejemplo n.º 2
0
def __write_output_data(agp_file=None,
                        flank_kmer=None,
                        asm_kmer=None,
                        contig_seqs=None,
                        asm_counts=None,
                        cg_ss_counts=None,
                        ue_ss_counts=None):
    #captured gap stats
    cg_seqs = []
    cg_sizes = []
    cg_dist = []
    cg_cn = []
    cg_gc = []

    #uncaptured end stats
    ue_seqs = []
    ue_dist = []
    ue_cn = []
    ue_gc = []

    print "Pulling gap end sequence..."
    agp = AgpFile(agp_file)
    scaffolds = agp.get_agp_scaffolds()

    for scaffold in scaffolds:
        print "Scaffold:", scaffold
        for record in agp.get_agp_file_record(scaffold):
            if (not agp.is_gap(scaffold, record)):
                ctg_seq = contig_seqs[agp.get_contig_id(scaffold, record)]
                if (len(ctg_seq) < options.e):
                    print "WARNING:", agp.get_contig_id(
                        scaffold,
                        record), " is less than ", options.e, ". Ignoring."
                    continue
            if (record == 1):
                #print "\tFirst contig..."
                #print "SEQ: ", contig_seqs[agp.get_contig_id(scaffold,record)][:options.e]
                (gc, dist, copy) = analyze_seq(
                    contig_seqs[agp.get_contig_id(scaffold,
                                                  record)][:options.e],
                    flank_kmer, asm_kmer, asm_counts)
                ue_dist.append(dist)
                ue_cn.append(copy)
                ue_gc.append(gc)
                ue_ss_counts = get_simple_sequences(
                    ue_ss_counts,
                    contig_seqs[agp.get_contig_id(scaffold,
                                                  record)][:options.e])
                #print "UE", ue_ss_counts
            if (record == len(agp.get_agp_file_record(scaffold))):
                #print "\tLast contig..."
                #print "SEQ: ", contig_seqs[agp.get_contig_id(scaffold,record)][-options.e:]

                (gc, dist, copy) = analyze_seq(
                    contig_seqs[agp.get_contig_id(scaffold,
                                                  record)][-options.e:],
                    flank_kmer, asm_kmer, asm_counts)
                ue_dist.append(dist)
                ue_cn.append(copy)
                ue_gc.append(gc)
                ue_ss_counts = get_simple_sequences(
                    ue_ss_counts,
                    contig_seqs[agp.get_contig_id(scaffold,
                                                  record)][-options.e:])
                #print "UE", ue_ss_counts
            if (agp.is_gap(scaffold, record)):
                #print "Gap..."
                left_ctg_record = record - 1
                right_ctg_record = record + 1

                if (len(contig_seqs[agp.get_contig_id(
                        scaffold, left_ctg_record)]) < options.e) or (len(
                            contig_seqs[agp.get_contig_id(
                                scaffold, right_ctg_record)]) < options.e):
                    print "Warning: This gap is flanked by a contig less than", options.e, "long. Skipping analysis."
                    continue
                left_seq = contig_seqs[agp.get_contig_id(
                    scaffold, left_ctg_record)][-options.e:]
                #left_seq = contig_seqs[agp.get_contig_id(scaffold,left_ctg_record)][:options.e]
                #print "LEFT: ", left_seq
                (left_gc, left_dist,
                 left_copy) = analyze_seq(left_seq, flank_kmer, asm_kmer,
                                          asm_counts)
                cg_ss_counts = get_simple_sequences(cg_ss_counts, left_seq)
                #print "CG", cg_ss_counts
                #print left_seq
                #print left_gc,left_dist,left_copy

                right_seq = contig_seqs[agp.get_contig_id(
                    scaffold, right_ctg_record)][:options.e]
                #right_seq = contig_seqs[agp.get_contig_id(scaffold,right_ctg_record)][-options.e:]
                #print "RIGHT SEQ: ", right_seq
                (right_gc, right_dist,
                 right_copy) = analyze_seq(right_seq, flank_kmer, asm_kmer,
                                           asm_counts)
                cg_ss_counts = get_simple_sequences(cg_ss_counts, right_seq)
                #print "CG", cg_ss_counts
                #print right_gc,right_dist,right_copy

                cg_sizes.append(agp.get_feature_length(scaffold, record))
                cg_dist.append((left_dist + right_dist) / 2)
                cg_gc.append((left_gc + right_gc) / 2)
                cg_cn.append((left_copy + right_copy) / 2)

    headers = ["Metric", "Uncaptured Ends", "Captured Gaps"]
    table = SimpleTable(headers, [], "Gap Analysis Metrics")

    ss_table = SimpleTable([
        "Sequence", "Uncaptured Ends Bases", "Uncaptured Ends Percent",
        "Captured Gap Bases", "Captured Gap Percent"
    ], [], "Gap Simple Sequence Analysis")

    if (len(cg_sizes) > 0):
        chart_tools.gen_histogram(cg_sizes, "Gap Sizes", "Number of Gaps",
                                  "Histogram of Captured Gap Sizes",
                                  options.header + ".cg_sizes")
        chart_tools.gen_histogram(cg_dist, "Gap Flank Complexity",
                                  "Number of Gaps",
                                  "Histogram of Gap Flank Complexity",
                                  options.header + ".cg_distinctness")
        chart_tools.gen_histogram(cg_cn, "Gap Flank Copy Number",
                                  "Number of Gaps",
                                  "Histogram of Gap Flank Copy Number",
                                  options.header + ".cg_copy_number")
        chart_tools.gen_histogram(cg_gc, "Gap Flank GC", "Number of Gaps",
                                  "Histogram of Gap Flank GC",
                                  options.header + ".cg_gc")

        table.add_row(["Number", len(ue_dist), len(cg_dist)])
        table.add_row([
            "Average Complexity",
            "%.0f" % (sum(ue_dist) / len(ue_dist)),
            "%.0f" % (sum(cg_dist) / len(cg_dist))
        ])
        table.add_row([
            "Less than " + str(options.l) + "% Complex",
            len(filter(lambda x: x < options.l, ue_dist)),
            len(filter(lambda x: x < options.l, cg_dist))
        ])
        table.add_row([
            "Average GC",
            "%.0f" % (sum(ue_gc) / len(ue_gc)),
            "%.0f" % (sum(cg_gc) / len(cg_gc))
        ])
        table.add_row([
            "Less than 30% GC",
            len(filter(lambda x: x < 30, ue_gc)),
            len(filter(lambda x: x < 30, cg_gc))
        ])
        table.add_row([
            "Greater than 70% GC",
            len(filter(lambda x: x > 70, ue_gc)),
            len(filter(lambda x: x > 70, cg_gc))
        ])
        table.add_row([
            "Average Copy Number",
            "%.0f" % (sum(ue_cn) / len(ue_cn)),
            "%.0f" % (sum(cg_cn) / len(cg_cn))
        ])
        ss_table.add_row([
            "End Bases",
            len(ue_dist) * options.e, "",
            len(cg_dist) * options.e, ""
        ])
        ss_table.add_row([
            "Total SS",
            sum(ue_ss_counts.itervalues()),
            "%.2f%%" % (float(sum(ue_ss_counts.itervalues()) * 100) /
                        (len(ue_dist) * options.e)),
            sum(cg_ss_counts.itervalues()),
            "%.2f%%" % (float(sum(cg_ss_counts.itervalues()) * 100) /
                        (len(cg_dist) * options.e))
        ])
        for n in cg_ss_counts:
            ss_table.add_row([
                n, ue_ss_counts[n],
                "%.2f%%" % (float(ue_ss_counts[n] * 100) /
                            (len(ue_dist) * options.e)), cg_ss_counts[n],
                "%.2f%%" % (float(cg_ss_counts[n] * 100) /
                            (len(cg_dist) * options.e))
            ])

    else:
        print "No captured gaps."
        table.add_row(["Number", len(ue_dist), len(cg_dist)])
        table.add_row([
            "Average Uniqueness",
            "%.0f" % (sum(ue_dist) / len(ue_dist)), "N/A"
        ])
        table.add_row([
            "Less than " + str(options.l) + "% distinct",
            "%.2f" % (len(filter(lambda x: x < options.l, ue_dist))),
            "%.2f" % (len(filter(lambda x: x < options.l, cg_dist)))
        ])
        table.add_row(
            ["Average GC",
             "%.0f" % (sum(ue_gc) / len(ue_gc)), "N/A"])
        table.add_row([
            "Less than 30% GC",
            len(filter(lambda x: x < 30, ue_gc)),
            len(filter(lambda x: x < 30, cg_gc))
        ])
        table.add_row([
            "Greater than 70% GC",
            len(filter(lambda x: x > 70, ue_gc)),
            len(filter(lambda x: x > 70, cg_gc))
        ])
        table.add_row(
            ["Average Copy Number",
             "%.0f" % (sum(ue_cn) / len(ue_cn)), "N/A"])
        ss_table.add_row(["End Bases", len(ue_dist) * options.e, "", "NA", ""])
        ss_table.add_row([
            "Total SS",
            sum(ue_ss_counts.itervalues()),
            "%.2f%%" % (float(sum(ue_ss_counts.itervalues()) * 100) /
                        (len(ue_dist) * options.e)), "NA", "NA"
        ])
        for n in cg_ss_counts:
            ss_table.add_row([
                n, ue_ss_counts[n],
                "%.2f%%" % (float(ue_ss_counts[n] * 100) /
                            (len(ue_dist) * options.e)), "NA", "NA"
            ])

    chart_tools.gen_histogram(ue_dist, "End Distinctness",
                              "Number of Uncaptured Ends",
                              "Histogram of End Complexity",
                              options.header + ".ue_distinctness")
    chart_tools.gen_histogram(ue_cn, "End Copy Number",
                              "Number of Uncaptured Ends",
                              "Histogram of End Copy Number",
                              options.header + ".ue_copy_number")
    chart_tools.gen_histogram(ue_gc, "End GC", "Number of Uncaptured Ends",
                              "Histogram of End GC", options.header + ".ue_gc")

    table.print_output(options.t_header + ".gap_analysis", options.html)
    ss_table.print_output(options.t_header + ".gap_ss_analysis", options.html)