Example #1
def _offline_counter(args):
    """ Offline counting from SAM/BAM file. """
    counts = read_counter.count_reads(args.bam.name,
                                      in_format=args.f,
                                      min_aln_qual=args.a,
                                      verbose=not args.Q)
    counts = OrderedDict(six.iteritems(counts))

    # Word sizes for k-mer frequency calculation (guard against args.k being
    # None, which the checks below allow):
    calc_words = ([int(k) for k in args.k.split(",")]
                  if args.k is not None else [])

    data = OrderedDict()

    # Calculate sequence properties:
    if args.z is not None:
        lengths, gc_contents, word_freqs = {}, {}, defaultdict(
            lambda: defaultdict(dict))
        ref_iter = seq_util.read_seq_records(args.z)
        if not args.Q:
            sys.stderr.write("Calculating sequence features:\n")
            ref_iter = tqdm.tqdm(ref_iter)

        for ref in ref_iter:
            # Augment counts dictionary with missing reference entries:
            if ref.id not in counts:
                counts[ref.id] = 0
            lengths[ref.id] = len(ref)
            gc_contents[ref.id] = seq_util.gc_content(str(ref.seq))
            if args.k is not None:
                for word_size in calc_words:
                    bf = seq_util.word_composition(ref.seq, word_size)
                    for word, count in six.iteritems(bf):
                        word_freqs[word_size][
                            ref.id][word] = float(count) / len(ref)

        data['Length'] = [lengths[tr] for tr in six.iterkeys(counts)]
        data['GC_content'] = [gc_contents[tr] for tr in six.iterkeys(counts)]

    data['Reference'] = list(counts.keys())
    data['Count'] = list(counts.values())

    # Calculate word frequencies:
    if args.k is not None and args.z:
        for ks in calc_words:
            # All references share the same word vocabulary; take it from
            # the first entry:
            for word in next(iter(word_freqs[ks].values())).keys():
                tmp = []
                for ref in counts.keys():
                    tmp.append(word_freqs[ks][ref][word])
                data[word] = tmp

    data_frame = pd.DataFrame(data)
    data_frame = data_frame.sort_values(['Count', 'Reference'],
                                        ascending=False)

    if args.t is not None:
        data_frame.to_csv(args.t, sep='\t', index=False)

    if args.p is not None:
        misc.pickle_dump(data, args.p)
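The pattern above reduces to: counts dict -> OrderedDict -> DataFrame -> sort -> TSV. A minimal self-contained sketch of that pipeline, using only pandas and made-up reference names (no wub dependency):

from collections import OrderedDict

import pandas as pd

# Hypothetical per-reference read counts:
counts = OrderedDict([('chr1', 120), ('chr2', 45), ('chrM', 45)])

data = OrderedDict()
data['Reference'] = list(counts.keys())
data['Count'] = list(counts.values())

data_frame = pd.DataFrame(data)
# Highest counts first; reference name breaks ties:
data_frame = data_frame.sort_values(['Count', 'Reference'], ascending=False)
data_frame.to_csv('counts.tsv', sep='\t', index=False)
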
Example #2
def _online_counter(args):
    """ Online counting from SAM stream. """
    # Open counts stream:
    counts_iter = read_counter.count_reads_realtime(alignment_file='-',
                                                    in_format=args.f,
                                                    min_aln_qual=args.a,
                                                    verbose=not args.Q,
                                                    yield_freq=args.F)

    for counts in counts_iter:
        data_frame = pd.DataFrame(
            OrderedDict([('Reference', list(counts.keys())),
                         ('Count', list(counts.values()))]))
        data_frame = data_frame.sort_values(['Count', 'Reference'],
                                            ascending=False)

        if args.t is not None:
            data_frame.to_csv(args.t, sep='\t', index=False)
        if args.p is not None:
            misc.pickle_dump(counts, args.p)
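The yield_freq mechanism above amounts to a generator that emits a snapshot of the running counts every N records, so the loop can rewrite its report as data arrives. An illustrative stand-in (not the wub API; all names here are made up):

from collections import OrderedDict

def count_stream(ref_names, yield_freq=100):
    """Yield a snapshot of per-reference counts every yield_freq records."""
    counts = OrderedDict()
    for i, name in enumerate(ref_names, start=1):
        counts[name] = counts.get(name, 0) + 1
        if i % yield_freq == 0:
            yield OrderedDict(counts)
    yield OrderedDict(counts)  # final snapshot

for snapshot in count_stream(['chr1', 'chr2', 'chr1'], yield_freq=2):
    print(snapshot)
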
Example #3
# The excerpt opens mid-call; judging from the args.p usage below, the
# truncated statement is presumably the '-p' option definition:
parser.add_argument('-p',
                    type=str,
                    help="Save pickled results in this file.",
                    default=None)
parser.add_argument('input_fastx',
                    nargs='?',
                    help='Input (default: stdin).',
                    type=argparse.FileType('r'),
                    default=sys.stdin)

if __name__ == '__main__':
    args = parser.parse_args()

    in_format = args.f
    input_iterator = seq_util.read_seq_records(args.input_fastx,
                                               format=in_format)

    # Sum record lengths to get the total number of bases:
    total_bases = 0
    for record in input_iterator:
        total_bases += len(record)
    results = {'total_bases': total_bases}
    print("Total bases\t{}".format(total_bases))

    if args.s is not None:
        results['genome_size'] = args.s
        results['coverage'] = float(total_bases) / args.s
        print("Genome size\t{}".format(results['genome_size']))
        print("Coverage\t{}".format(results['coverage']))

    if args.p is not None:
        misc.pickle_dump(results, args.p)
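The coverage arithmetic is simply total sequenced bases divided by genome size; with toy numbers:

# Coverage = total bases sequenced / genome size:
total_bases = 48000000
genome_size = 4600000  # roughly an E. coli-sized genome
coverage = float(total_bases) / genome_size
print("Coverage\t{0:.2f}".format(coverage))  # Coverage    10.43
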
Example #4
        # NOTE: the excerpt opens mid-call; the statement producing
        # err_read_stats is truncated in the original listing.
        verbose=verbose)
    read_stats = err_read_stats['read_stats']
    error_stats = err_read_stats['events']
    base_stats = err_read_stats['base_stats']
    indel_stats = err_read_stats['indel_dists']

    read_qual_qc(read_stats, plotter, args.i)
    base_stats_qc(base_stats, plotter)
    error_stat_qc(error_stats, plotter, context_sizes, ommit_diagonal=True)
    indel_dist_qc(indel_stats, plotter)

    pileup_stats = None
    if not args.x:
        pileup_stats = bam_stats.pileup_stats(args.bam,
                                              region=args.c,
                                              verbose=verbose)
        ref_qual_qc(pileup_stats, plotter, verbose)

    plotter.close()

    # Dump results of parsing into output pickle:
    rd = {
        'tag': tag,
        'read_stats': read_stats,
        'error_stats': error_stats,
        'indel_stats': indel_stats,
        'pileup_stats': pileup_stats,
        'base_stats': base_stats
    }
    misc.pickle_dump(rd, args.p)
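misc.pickle_dump presumably wraps the standard pickle module; under that assumption, a minimal stand-in with a round-trip check:

import pickle

def pickle_dump(obj, path):
    """Serialize obj to path (assumed equivalent of misc.pickle_dump)."""
    with open(path, 'wb') as fh:
        pickle.dump(obj, fh)

def pickle_load(path):
    """Load a previously pickled object from path."""
    with open(path, 'rb') as fh:
        return pickle.load(fh)

rd = {'tag': 'sample1', 'read_stats': {'mapped': 950, 'unmapped': 50}}
pickle_dump(rd, 'results.pk')
assert pickle_load('results.pk')['tag'] == 'sample1'
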
Example #5
    # (The excerpt opens mid-function; the original listing truncates a
    # preceding plot call, presumably total.plot(...), before this point.)
    plt.tight_layout()
    plotter.pages.savefig()

    stats.plot(kind='barh', subplots=True, legend=False, sharex=False)
    plt.tight_layout()
    plotter.pages.savefig()

    match.plot(kind='barh', subplots=True, legend=False)
    plt.tight_layout()
    plotter.pages.savefig()

    miss.plot(kind='barh', subplots=True, legend=False, sharex=False)
    plt.tight_layout()
    plotter.pages.savefig()

    novel.plot(kind='barh', subplots=True, legend=False, sharex=False)
    plt.tight_layout()
    plotter.pages.savefig()

    plotter.close()

    if args.p is not None:
        p = {
            'total': total,
            'stats': stats,
            'match': match,
            'miss': miss,
            'novel': novel
        }
        misc.pickle_dump(p, args.p)
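The repeated plt.tight_layout() / plotter.pages.savefig() pairs suggest a multi-page PDF report built with matplotlib's PdfPages, one figure per page. A self-contained sketch of that pattern with made-up data:

import matplotlib
matplotlib.use('Agg')  # headless backend
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib.backends.backend_pdf import PdfPages

stats = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}, index=['x', 'y'])
match = pd.DataFrame({'A': [5, 6], 'B': [7, 8]}, index=['x', 'y'])

with PdfPages('report.pdf') as pages:
    for df in (stats, match):
        df.plot(kind='barh', subplots=True, legend=False, sharex=False)
        plt.tight_layout()
        pages.savefig()  # saves the current figure as a new page
        plt.close()
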
Example #6
    # NOTE: the excerpt opens mid-call; the statement producing read_stats
    # is truncated in the original listing.
                                  verbose=not args.Q)
    read_stats['tag'] = tag
    base_stats = read_stats['base_stats']
    precision_stats = read_stats['read_stats']

    base_stats_qc(base_stats, plotter)
    modes = read_precision_qc(precision_stats, plotter)

    plotter.close()

    global_stats = OrderedDict([
        ('Accuracy', [read_stats['base_stats']['accuracy']]),
        ('AccuracyMode', modes['accuracy_mode']),
        ('Identity', [read_stats['base_stats']['identity']]),
        ('IdentityMode', modes['identity_mode']),
        ('Mapped', [read_stats['mapped']]),
        ('Unmapped', [read_stats['unmapped']]),
        ('Tag', [read_stats['tag']]),
    ])
    global_stats = pd.DataFrame(global_stats)

    if args.g is not None:
        global_stats.to_csv(args.g, sep="\t", index=False)

    if args.l is not None:
        read_df = pd.DataFrame(precision_stats)
        read_df.to_csv(args.l, sep="\t", index=False)

    if args.p is not None:
        misc.pickle_dump(read_stats, args.p)
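Note the trick in global_stats above: each scalar is wrapped in a one-element list, so pandas builds a single-row summary table. Compactly:

from collections import OrderedDict

import pandas as pd

global_stats = pd.DataFrame(OrderedDict([
    ('Accuracy', [0.93]),  # one-element lists -> one-row DataFrame
    ('Mapped', [950]),
    ('Unmapped', [50]),
    ('Tag', ['sample1']),
]))
global_stats.to_csv('global_stats.tsv', sep='\t', index=False)
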
Example #7
    # The opening of this call is truncated in the original listing; by
    # analogy with the parallel call below, it is presumably:
    plotter.plot_histograms(
        {'PerQuerySim': stats['PerQueryBaseSim']},
        title="Distribution of percent bases with matched alignment",
        xlab="Percent bases with matched alignment",
        ylab="Count",
        legend=False)

    plotter.plot_histograms(
        {'PerQuerySimClipped': stats['PerQueryBaseSimClipped']},
        title="Distribution of percent bases with matched alignment "
              "(with clipping)",
        xlab="Percent bases with matched alignment",
        ylab="Count",
        legend=False)

    plotter.close()

    if args.p is not None:
        misc.pickle_dump(dict(stats), args.p)

    if args.t is not None:
        data_map = stats.copy()
        # Drop the per-read distributions so only scalar summary fields
        # remain for the one-row table:
        del data_map['PerQueryBaseSim']
        del data_map['PerQueryBaseSimClipped']
        for bam in data_map['BamFiles']:
            del data_map[bam]
        del data_map['BamFiles']
        data_map = OrderedDict(
            (key, [value]) for key, value in six.iteritems(data_map))
        data_frame = pd.DataFrame(data_map)
        data_frame.to_csv(args.t, sep="\t", index=False)
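plot_histograms above takes a dict mapping labels to value lists plus title/axis keywords. A minimal hedged equivalent built directly on matplotlib (illustrative only, not wub's Report API; the data are made up):

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

def plot_histograms(data_map, title='', xlab='', ylab='', bins=50, legend=True):
    """Overlay one histogram per entry of data_map and save as a PNG."""
    fig, ax = plt.subplots()
    for label, values in data_map.items():
        ax.hist(values, bins=bins, alpha=0.7, label=label)
    ax.set(title=title, xlabel=xlab, ylabel=ylab)
    if legend:
        ax.legend()
    fig.savefig('histograms.png')
    plt.close(fig)

plot_histograms({'PerQuerySim': [88.5, 92.0, 95.5, 99.0]},
                title="Distribution of percent bases with matched alignment",
                xlab="Percent bases with matched alignment",
                ylab="Count",
                legend=False)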