Exemple #1
0
def _exit_with_wiggle_error(message):
    """Write ``message`` to stderr inside a starred banner and exit(1)."""
    banner = '*' * 60
    sys.stderr.write(banner + '\nERROR: ' + message + '\n' + banner + '\n')
    # non-zero status so calling shells/pipelines can detect the failure
    sys.exit(1)


def write_all_wiggles(f5_dirs1, f5_dirs2, corrected_group, basecall_subgroups,
                      stats_fn, wig_base, wig_types):
    """Write every wiggle track named in ``wig_types``.

    Read-derived tracks ('coverage', 'signal_sd', 'length', 'signal',
    'difference') are computed from the reads parsed out of ``f5_dirs1``
    (and ``f5_dirs2`` when a second group is supplied); statistic tracks
    ('stat', 'mt_stat', 'fraction') are computed from ``stats_fn``.

    Args:
        f5_dirs1: Location(s) of the first group of reads, passed through
            to ``th.parse_fast5s``; ``None`` when only statistic tracks
            are wanted.
        f5_dirs2: Location(s) of the optional second group of reads, or
            ``None``. Requires ``f5_dirs1``.
        corrected_group: Passed through to ``th.parse_fast5s``.
        basecall_subgroups: Passed through to ``th.parse_fast5s``.
        stats_fn: Statistics file handed to ``ts.parse_stats``; only read
            when a statistic track is requested.
        wig_base: Output base name forwarded to every wiggle writer.
        wig_types: Container of requested track names.

    Exits (``SystemExit`` with status 1) when:
        * a second group of reads is given without a first group,
        * a read-derived track is requested without ``f5_dirs1``,
        * no reads are found for ``f5_dirs1``.

    Note: 'difference' is only written when ``f5_dirs2`` is provided; with
    a single group it is ignored, as before.
    """
    read_based_types = ('coverage', 'signal_sd', 'length', 'signal',
                        'difference')
    stat_based_types = ('stat', 'mt_stat', 'fraction')
    write_stats = any(
        stat_name in wig_types for stat_name in stat_based_types)

    # Validate before any (potentially slow) parsing. Previously these
    # combinations hit an unbound local variable or were silently skipped.
    if f5_dirs1 is None:
        if f5_dirs2 is not None:
            _exit_with_wiggle_error(
                'A second set of reads was provided without a first set ' +
                '(--fast5-basedirs).')
        if any(wig_type in wig_types for wig_type in read_based_types):
            _exit_with_wiggle_error(
                'Must provide --fast5-basedirs to write coverage, ' +
                'signal_sd, length, signal or difference wiggles.')

    if write_stats:
        if VERBOSE: sys.stderr.write('Loading statistics from file.\n')
        all_stats, stat_type = ts.parse_stats(stats_fn)

    if f5_dirs1 is not None:
        raw_read_coverage1 = th.parse_fast5s(f5_dirs1, corrected_group,
                                             basecall_subgroups)
        if len(raw_read_coverage1) == 0:
            _exit_with_wiggle_error('No reads present in --fast5-basedirs.')

    # group 1 tracks are only labelled when there is a second group to
    # distinguish them from
    group1_name = '' if f5_dirs2 is None else GROUP1_NAME
    if f5_dirs2 is not None:
        raw_read_coverage2 = th.parse_fast5s(f5_dirs2, corrected_group,
                                             basecall_subgroups)
        chrm_sizes = th.get_chrm_sizes(raw_read_coverage1, raw_read_coverage2)

        if VERBOSE: sys.stderr.write('Writing wiggles.\n')
        if 'coverage' in wig_types:
            write_cov_wig(raw_read_coverage2, wig_base, GROUP2_NAME)
        if 'signal_sd' in wig_types:
            write_signal_sd_wig(raw_read_coverage2, chrm_sizes, wig_base,
                                GROUP2_NAME)
        if 'length' in wig_types:
            write_length_wig(raw_read_coverage2, chrm_sizes, wig_base,
                             GROUP2_NAME)

        # signal and difference are produced by a single call, either with
        # or without the second set of files (unlike coverage, signal_sd
        # and length, which are written once per group)
        if 'signal' in wig_types or 'difference' in wig_types:
            write_signal_and_diff_wigs(raw_read_coverage1, raw_read_coverage2,
                                       chrm_sizes, wig_base, group1_name,
                                       'signal' in wig_types,
                                       'difference' in wig_types)
    elif f5_dirs1 is not None:
        chrm_sizes = th.get_chrm_sizes(raw_read_coverage1)
        if VERBOSE: sys.stderr.write('Writing wiggles.\n')
        if 'signal' in wig_types:
            # single group: there is nothing to take a difference against
            write_signal_and_diff_wigs(raw_read_coverage1, None, chrm_sizes,
                                       wig_base, group1_name, True, False)

    # per-group tracks for the first group; guarded so that a stats-only
    # invocation never touches the (unbound) read data
    if f5_dirs1 is not None:
        if 'coverage' in wig_types:
            write_cov_wig(raw_read_coverage1, wig_base, group1_name)
        if 'signal_sd' in wig_types:
            write_signal_sd_wig(raw_read_coverage1, chrm_sizes, wig_base,
                                group1_name)
        if 'length' in wig_types:
            write_length_wig(raw_read_coverage1, chrm_sizes, wig_base,
                             group1_name)

    if write_stats:
        write_stat_wigs(all_stats, wig_base, 'stat' in wig_types,
                        'mt_stat' in wig_types, 'fraction' in wig_types,
                        stat_type)

    return
def write_most_signif(f5_dirs, fasta_fn, num_regions, qval_thresh,
                      corrected_group, basecall_subgroups, seqs_fn, num_bases,
                      stat_order, stats_fn):
    """Write the sequences of the most significant regions to ``seqs_fn``.

    Statistics are loaded from ``stats_fn`` and the top regions are chosen
    by ``ts.get_most_signif_regions``. Each region's sequence is taken from
    the FASTA file when ``fasta_fn`` is given, otherwise from the reads
    parsed out of ``f5_dirs``. Regions on the '-' strand are reverse
    complemented before being written.

    Output records have the form::

        >chrm:centre_position:strand region_text
        SEQUENCE

    where ``centre_position`` is the region start plus half of
    ``num_bases``. When a FASTA file is used, regions whose chromosome is
    absent from it are left out of the output.
    """
    if VERBOSE:
        sys.stderr.write('Loading statistics from file.\n')
    all_stats, _stat_type = ts.parse_stats(stats_fn)
    top_regions = ts.get_most_signif_regions(
        all_stats, num_bases, num_regions, qval_thresh,
        fraction_order=not stat_order)

    # attach a sequence to every region: FASTA index if available,
    # otherwise fall back to the reads themselves
    if fasta_fn is not None:
        genome = th.parse_fasta(fasta_fn)
        regions_with_seq = []
        for region in top_regions:
            if region.chrm not in genome:
                continue
            region_seq = genome[region.chrm][region.start:region.end]
            regions_with_seq.append(th.intervalData(
                region.reg_id, region.chrm, region.start, region.end,
                region.strand, region.reg_text, region.reads, region_seq))
    else:
        read_coverage = th.parse_fast5s(
            f5_dirs, corrected_group, basecall_subgroups)
        regions_with_seq = th.get_region_sequences(top_regions, read_coverage)

    if VERBOSE:
        sys.stderr.write('Outputting region seqeuences.\n')
    record_template = '>{0}:{1:d}:{2} {3}\n{4}\n'
    with open(seqs_fn, 'w') as out_fp:
        for region in regions_with_seq:
            if region.strand == '-':
                bases = th.rev_comp(region.seq)
            else:
                bases = region.seq
            centre = int(region.start + (num_bases / 2))
            out_fp.write(record_template.format(
                region.chrm, centre, region.strand, region.reg_text,
                ''.join(bases)))