Beispiel #1
0
def write_all_wiggles(files1, files2, corrected_group, basecall_subgroups,
                      obs_filter, test_type, min_test_vals, stats_fn,
                      fishers_method_offset, wig_base, wig_types):
    """Write every wiggle track requested in ``wig_types``.

    Args:
        files1: FAST5 files of the first group, handed to
            ``nh.parse_fast5s``.
        files2: FAST5 files of the second group, or ``None`` when only one
            group is being processed.
        corrected_group: Forwarded to ``nh.parse_fast5s``.
        basecall_subgroups: Forwarded to ``nh.parse_fast5s``.
        obs_filter: Forwarded to ``nh.filter_reads``.
        test_type: Forwarded to ``ns.get_all_significance``.
        min_test_vals: Forwarded to ``ns.get_all_significance``.
        stats_fn: Path of a statistics file, or ``None``.  When it names an
            existing file the statistics are loaded from it with
            ``ns.parse_stats``; otherwise it is forwarded to
            ``ns.get_all_significance``.
        fishers_method_offset: Forwarded to ``ns.get_all_significance``.
        wig_base: Forwarded to every wiggle writer.
        wig_types: Container of requested track names.  The names checked
            here are 'coverage', 'signal_sd', 'length', 'signal',
            'difference', 'pvals' and 'qvals'.

    Raises:
        ValueError: If 'pvals' or 'qvals' is requested but there is neither
            an existing statistics file nor a second group of files from
            which statistics could be computed.
    """
    stats_file_exists = stats_fn is not None and os.path.isfile(stats_fn)
    include_stats = 'pvals' in wig_types or 'qvals' in wig_types
    have_group2 = files2 is not None

    # Statistics can only come from an existing file or from comparing two
    # groups.  Fail before the expensive parsing instead of crashing on an
    # unassigned variable after most of the output has been written.
    if include_stats and not stats_file_exists and not have_group2:
        raise ValueError(
            'p-value/q-value wiggles require either an existing statistics '
            'file (stats_fn) or a second set of FAST5 files (files2) to '
            'compute statistics from.')

    all_stats = None
    if include_stats and stats_file_exists:
        if VERBOSE: sys.stderr.write('Loading statistics from file.\n')
        all_stats = ns.parse_stats(stats_fn)

    if VERBOSE: sys.stderr.write('Parsing FAST5 files.\n')
    raw_read_coverage1 = nh.parse_fast5s(files1, corrected_group,
                                         basecall_subgroups)
    raw_read_coverage1 = nh.filter_reads(raw_read_coverage1, obs_filter)

    # The first group is only labelled when there is a second one to tell it
    # apart from.
    group1_name = 'group1' if have_group2 else ''
    raw_read_coverage2 = None
    if have_group2:
        raw_read_coverage2 = nh.parse_fast5s(files2, corrected_group,
                                             basecall_subgroups)
        raw_read_coverage2 = nh.filter_reads(raw_read_coverage2, obs_filter)
        if include_stats and not stats_file_exists:
            if VERBOSE: sys.stderr.write('Calculating statistics.\n')
            all_stats = ns.get_all_significance(raw_read_coverage1,
                                                raw_read_coverage2, test_type,
                                                min_test_vals, stats_fn,
                                                fishers_method_offset)

    if VERBOSE: sys.stderr.write('Writing wiggles.\n')
    if have_group2:
        if 'coverage' in wig_types:
            write_cov_wig(raw_read_coverage2, wig_base, GROUP2_NAME)
        if 'signal_sd' in wig_types:
            write_signal_sd_wig(raw_read_coverage2, wig_base, GROUP2_NAME)
        if 'length' in wig_types:
            write_length_wig(raw_read_coverage2, wig_base, GROUP2_NAME)

    # Signal and difference tracks are written by a single call that takes
    # both groups (the second may be None), unlike coverage, signal SD and
    # length, which are written once per group.
    if 'signal' in wig_types or 'difference' in wig_types:
        write_signal_and_diff_wigs(raw_read_coverage1, raw_read_coverage2,
                                   wig_base, group1_name,
                                   'signal' in wig_types,
                                   'difference' in wig_types)

    if 'coverage' in wig_types:
        write_cov_wig(raw_read_coverage1, wig_base, group1_name)
    if 'signal_sd' in wig_types:
        write_signal_sd_wig(raw_read_coverage1, wig_base, group1_name)
    if 'length' in wig_types:
        write_length_wig(raw_read_coverage1, wig_base, group1_name)
    if include_stats:
        write_pvals_and_qvals_wig(all_stats, wig_base, 'pvals' in wig_types,
                                  'qvals' in wig_types)
Beispiel #2
0
def write_most_signif(files1, files2, num_regions, qval_thresh,
                      corrected_group, basecall_subgroups, seqs_fn, num_bases,
                      test_type, obs_filter, min_test_vals, stats_fn, fasta_fn,
                      fishers_method_offset):
    """Write the sequences of the most significant regions to ``seqs_fn``.

    Statistics are loaded from ``stats_fn`` when that file exists and are
    otherwise computed from the two groups of FAST5 files.  The regions are
    selected by ``ns.get_most_signif_regions`` and written as FASTA-style
    records with the header ``>chrm::start::strand stat``.

    Args:
        files1: FAST5 files of the first group.
        files2: FAST5 files of the second group.
        num_regions: Forwarded to ``ns.get_most_signif_regions``.
        qval_thresh: Forwarded to ``ns.get_most_signif_regions``.
        corrected_group: Forwarded to ``nh.parse_fast5s`` and
            ``get_region_sequences``.
        basecall_subgroups: Forwarded to ``nh.parse_fast5s``.
        seqs_fn: Path of the output file; it is opened for writing.
        num_bases: Number of bases taken per region.
        test_type: Forwarded to ``ns.get_all_significance``.
        obs_filter: Forwarded to ``nh.filter_reads``.
        min_test_vals: Forwarded to ``ns.get_all_significance``.
        stats_fn: Path of a statistics file, or ``None``.
        fasta_fn: Path of a FASTA file to take region sequences from, or
            ``None`` to derive them from the reads instead.
        fishers_method_offset: Forwarded to ``ns.get_all_significance``.
    """
    calc_stats = stats_fn is None or not os.path.isfile(stats_fn)
    if not calc_stats:
        if VERBOSE: sys.stderr.write('Loading statistics from file.\n')
        all_stats = ns.parse_stats(stats_fn)

    # The reads are needed either to compute statistics or to derive the
    # region sequences when no FASTA file was supplied.
    if calc_stats or fasta_fn is None:
        if VERBOSE: sys.stderr.write('Parsing files.\n')
        raw_read_coverage1 = nh.parse_fast5s(files1, corrected_group,
                                             basecall_subgroups)
        raw_read_coverage2 = nh.parse_fast5s(files2, corrected_group,
                                             basecall_subgroups)
        raw_read_coverage1 = nh.filter_reads(raw_read_coverage1, obs_filter)
        raw_read_coverage2 = nh.filter_reads(raw_read_coverage2, obs_filter)

    if calc_stats:
        if VERBOSE: sys.stderr.write('Calculating statistics.\n')
        all_stats = ns.get_all_significance(raw_read_coverage1,
                                            raw_read_coverage2, test_type,
                                            min_test_vals, stats_fn,
                                            fishers_method_offset)

    plot_intervals = ns.get_most_signif_regions(all_stats, num_bases,
                                                num_regions, qval_thresh)
    if fasta_fn is None:
        reg_seqs = get_region_sequences(plot_intervals, raw_read_coverage1,
                                        raw_read_coverage2, num_bases,
                                        corrected_group)
    else:
        fasta_records = nh.parse_fasta(fasta_fn)
        # Regions on chromosomes missing from the FASTA file are dropped.
        reg_seqs = [(p_int, fasta_records[chrm][start:start + num_bases])
                    for p_int, (chrm, start, strand,
                                reg_name) in plot_intervals
                    if chrm in fasta_records]

    # Index the intervals by region id once instead of re-scanning the whole
    # list for every output record.  setdefault keeps the first entry for a
    # given id, matching a front-to-back scan.
    # NOTE(review): assumes region ids are hashable -- confirm against
    # ns.get_most_signif_regions.
    interval_by_id = {}
    for p_reg_i, p_int in plot_intervals:
        interval_by_id.setdefault(p_reg_i, p_int)

    if VERBOSE: sys.stderr.write('Outputting region sequences.\n')
    with open(seqs_fn, 'w') as seqs_fp:
        for reg_i, reg_seq in reg_seqs:
            chrm, start, strand, stat = interval_by_id[reg_i]
            if strand == '-':
                reg_seq = nh.rev_comp(reg_seq)
            seqs_fp.write('>{0}::{1:d}::{2} {3}\n{4}\n'.format(
                chrm, start, strand, stat, ''.join(reg_seq)))
Beispiel #3
0
def write_wiggle(files, corrected_group, wiggle_fn, basecall_subgroups,
                 obs_filter):
    """Write a read-coverage wiggle track for the given files.

    The files are parsed with ``parse_fast5s`` and filtered with
    ``filter_reads``.  For each chromosome the number of reads spanning
    every position is counted, and each position with non-zero coverage is
    written to ``wiggle_fn`` as a 1-based ``position count`` line under a
    ``variableStep`` header.

    Args:
        files: Forwarded to ``parse_fast5s``.
        corrected_group: Forwarded to ``parse_fast5s``.
        wiggle_fn: Output path; also used as the track name and description.
        basecall_subgroups: Forwarded to ``parse_fast5s``.
        obs_filter: Forwarded to ``filter_reads``.
    """
    if VERBOSE:
        sys.stderr.write('Parsing files.\n')
    reads_by_chrom = filter_reads(
        parse_fast5s(files, corrected_group, basecall_subgroups), obs_filter)

    if VERBOSE:
        sys.stderr.write('Calculating read coverage.\n')
    coverage_tracks = []
    for chrom_name, chrom_reads in reads_by_chrom.items():
        # The array only needs to reach the right-most read end.
        depth = np.zeros(max(read.end for read in chrom_reads),
                         dtype=np.int_)
        for read in chrom_reads:
            depth[read.start:read.end] += 1
        coverage_tracks.append((chrom_name, depth))

    if VERBOSE:
        sys.stderr.write('Writing wiggle.\n')
    track_header = 'track type=wiggle_0 name={0} description={0}\n'.format(
        wiggle_fn)
    with open(wiggle_fn, 'w') as wig_fp:
        wig_fp.write(track_header)
        for chrom_name, depth in coverage_tracks:
            wig_fp.write("variableStep chrom={} span=1\n".format(chrom_name))
            covered_lines = []
            for idx, count in enumerate(depth):
                if count > 0:
                    # Array indices are 0-based; the written positions are
                    # shifted to 1-based.
                    covered_lines.append(
                        str(idx + 1) + " " + str(int(count)))
            wig_fp.write('\n'.join(covered_lines) + '\n')