def main(key, remade=True):
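    # Annotates the variant table for `key` with BAD segment calls taken from its
    # BAD map (looked up through the reversed merged-badmaps dict) and writes one
    # row per rsID variant that intersects a segment.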
    table_annotated = key + get_ending("annotation")
    output = get_results_file(key, 'BAD')

    with open(get_merged_badmaps_dict_path(remade=remade), "r") as read_file:
        d = json.load(read_file)
        rev_d = make_reverse_dict(d)

    badmap_file_name = rev_d[key]

    print('Now doing {} \n with BAD map file {}'.format(table_annotated, badmap_file_name))
    badmap_file_path = create_badmaps_path_function(badmap_file_name, valid=remade)
    with open(badmap_file_path, 'r') as badmap_file, open(output, 'w') as out, open(table_annotated, 'r') as table_file:
        out.write(pack(['#chr', 'pos', 'ID', 'ref', 'alt', 'ref_read_counts', 'alt_read_counts',
                        'repeat_type'] + callers_names + ['BAD'] + ["Q{:.2f}".format(x) for x in segmentation_states] +
                       ['SNP_count', 'sum_cover']))

        u = UnpackBadSegments(None)
        for chr, pos, ID, ref, alt, ref_c, alt_c, repeat_type, in_callers, \
            in_intersection, segment_BAD, segment_snps, segment_snp_ids,\
                    segment_sumcov, Qual in \
                Intersection(table_file, badmap_file, write_segment_args=True, write_intersect=True,
                             unpack_snp_function=lambda x: unpack(x, use_in='Pcounter'),
                             unpack_segments_function=lambda x: u.unpack_bad_segments(x, segmentation_states)):
            if in_intersection and ID.startswith('rs'):
                out.write(pack([chr, pos, ID, ref, alt, ref_c, alt_c, repeat_type] +
                               [in_callers[name] for name in callers_names] +
                               [segment_BAD] + [Qual[x] for x in Qual] + [segment_snp_ids, segment_sumcov]))
def main(remake=False):
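    # Two-pass BAD map filtering: on the first pass (remake=False) the new and
    # merged badmaps dicts are rebuilt from the filtered datasets and the good BAD
    # maps are copied; on the second pass (remake=True) the bad BAD maps are deleted.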
    bad_dataset_list = get_bad_dataset_list(remake=remake)
    print('Filtered {} datasets'.format(len(bad_dataset_list)))
    if not remake:
        print('iteration 1')
        new_dict, merged_dict = remake_badmaps_dict(bad_dataset_list)
        with open(get_new_badmaps_dict_path(), 'w') as f:
            json.dump(new_dict, f)
        with open(get_merged_badmaps_dict_path(), 'w') as f:
            json.dump(merged_dict, f)
        copy_good_badmaps(bad_dataset_list)
    else:
        print('iteration 2')
        delete_bad_badmaps(bad_dataset_list)
def collect_fixed_alt_statistics(master_df,
                                 key_name=None,
                                 BAD=None,
                                 suffix='',
                                 remade=True):
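    # Builds a (ref_read_counts, alt_read_counts) -> SNP count frequency table over
    # all valid datasets in the master list, optionally restricted to the cell lines
    # in key_name and/or to a single BAD value, and writes it to the
    # negative-binomial statistics file for the given BAD/suffix.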
    out_t = None
    with open(get_merged_badmaps_dict_path(remade=remade), "r") as read_file:
        d = json.load(read_file)
        rev_d = make_reverse_dict(d)
    for index, row in master_df.iterrows():
        if key_name is not None:
            if row['CELLS'] not in key_name:
                continue
        base_path = create_path_from_master_list_df(row)
        if not is_valid(base_path, rev_d, remade=remade):
            continue
        bad_table_path = create_path_from_master_list_df(row, 'BAD')
        if not os.path.isfile(bad_table_path):
            continue
        df = pd.read_table(bad_table_path)
        if df.empty:
            continue
        if BAD is not None:
            sum_df = df[df['BAD'] == BAD][['ref_read_counts', 'alt_read_counts']]
        else:
            sum_df = df[['ref_read_counts', 'alt_read_counts']]

        if out_t is None:
            out_t = pd.DataFrame()
            out_t['alt_counts'] = sum_df['alt_read_counts']
            out_t['ref_counts'] = sum_df['ref_read_counts']
            out_t = out_t.groupby(['alt_counts', 'ref_counts'
                                   ]).size().reset_index(name='counts')
            out_t.fillna(0, inplace=True)
        else:
            tmp_df = pd.DataFrame()
            tmp_df['alt_counts'] = sum_df['alt_read_counts']
            tmp_df['ref_counts'] = sum_df['ref_read_counts']
            tmp_df = tmp_df.groupby(['alt_counts', 'ref_counts'
                                     ]).size().reset_index(name='counts')
            tmp_df.fillna(0, inplace=True)
            out_t = pd.concat([out_t, tmp_df]).groupby(
                ['alt_counts', 'ref_counts'], as_index=False).sum()
    if out_t is None:
        return
    out_t.to_csv(create_neg_bin_stats_path_function(BAD, suffix),
                 sep="\t",
                 index=False)
def main(what_for, remade=True):
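    # Writes (to out_path) every aggregation key of the given type that has at
    # least one existing, valid p-value results file among its tables.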
    check_if_in_expected_args(what_for)
    aggregation_dict_path = get_aggregation_dict_path(what_for)
    with open(get_merged_badmaps_dict_path(remade=remade), "r") as read_file:
        old_rev_d = make_reverse_dict(json.load(read_file))
        rev_d = {
            get_results_file(k, 'p-value', False): v
            for k, v in old_rev_d.items()
        }
    with open(aggregation_dict_path, 'r') as read_file:
        d = json.loads(read_file.readline())
    with open(out_path, 'w') as file:
        for key in sorted(d.keys()):
            is_empty = True
            for value in d[key]:
                if os.path.isfile(value) and is_valid(
                        split_ext_recursive(value), rev_d, remade=remade):
                    is_empty = False
            if is_empty:
                continue
            file.write(key + '\n')
def main(for_what, remade=True):
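    # Writes the dataset list for the next pipeline stage: 'badmaps' keeps only
    # valid datasets that have both a VCF and an annotation file and writes their
    # paths; 'annotation' writes path and PEAKS for every dataset with a VCF.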
    master_df = pd.read_table(master_list_path, dtype=dtype_dict)
    master_df = master_df[~master_df['EXP_TYPE'].
                          isin(['chip_control', 'chipexo_control'])]
    master_df['path'] = master_df.apply(create_path_from_master_list_df,
                                        axis=1)
    master_df = master_df[master_df['path'].apply(
        lambda x: os.path.isfile(x + get_ending('vcf')))]
    if for_what == 'badmaps':
        with open(get_merged_badmaps_dict_path(remade=remade),
                  "r") as read_file:
            d = json.load(read_file)
            rev_d = make_reverse_dict(d)
        master_df = master_df[master_df.apply(
            lambda row: os.path.isfile(row['path'] + get_ending(
                'annotation')) and is_valid(row['path'], rev_d, remade=remade),
            axis=1)]
        master_df['path'].to_csv(out_path, sep='\t', index=False, header=False)
    elif for_what == 'annotation':
        master_df[['path', 'PEAKS']].to_csv(out_path,
                                            sep='\t',
                                            index=False,
                                            header=False)
def main(what_for, key_name, remade=True):
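    # Aggregates per-dataset SNP tables for one TF or cell line: collects per-SNP
    # records, combines p-values with the logit method, summarizes effect sizes and
    # coverage, applies BH FDR correction, and saves the aggregated table plus a
    # per-SNP origin dict as JSON.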
    check_if_in_expected_args(what_for)

    table_path = get_result_table_path(what_for, key_name)

    with open(get_merged_badmaps_dict_path(remade=remade), "r") as read_file:
        old_rev_d = make_reverse_dict(json.load(read_file))
        rev_d = {
            get_results_file(k, 'p-value', False): v
            for k, v in old_rev_d.items()
        }

    tables = []
    if what_for == "CL":
        tables = cell_lines_dict[key_name]
    if what_for == "TF":
        tables = tf_dict[key_name]
    print('Reading datasets for {} {}'.format(what_for, key_name))
    common_snps = dict()
    for table in tables:
        if os.path.isfile(table) and is_valid(
                split_ext_recursive(table), rev_d, remade=remade):
            table_name = get_name(table)
            another_agr = get_another_agr(table, what_for)
            with open(table, 'r') as file:
                for line in file:
                    try:
                        (chromosome, pos, ID, ref, alt, ref_c, alt_c, repeat,
                         in_callers, BAD, Quals, seg_c, sum_cov, p_ref, p_alt,
                         es_ref, es_alt) = unpack(line, use_in="Aggregation")
                    except ValueError:
                        if line.startswith('#'):
                            continue
                        else:
                            raise
                    if np.isnan(p_ref) or ID == '.':
                        continue
                    cov = ref_c + alt_c

                    try:
                        common_snps[(chromosome, pos, ID, ref, alt,
                                     repeat)].append(
                                         (cov, ref_c, alt_c, in_callers, BAD,
                                          Quals, seg_c, sum_cov, p_ref, p_alt,
                                          es_ref, es_alt, table_name,
                                          another_agr))
                    except KeyError:
                        common_snps[(chromosome, pos, ID, ref, alt,
                                     repeat)] = [
                                         (cov, ref_c, alt_c, in_callers, BAD,
                                          Quals, seg_c, sum_cov, p_ref, p_alt,
                                          es_ref, es_alt, table_name,
                                          another_agr)
                                     ]
        else:
            print("There is no {}".format(table))
    print('Writing {}'.format(key_name))

    with open(table_path, 'w') as out:
        out.write(
            pack([
                '#chr', 'pos', 'ID', 'ref', 'alt', 'repeat_type',
                'n_peak_calls', 'n_peak_callers', 'mean_BAD',
                'mean_SNP_per_segment', 'n_aggregated', 'refc_mostsig_ref',
                'altc_mostsig_ref', 'BAD_mostsig_ref', 'es_mostsig_ref',
                'p_mostsig_ref', 'refc_mostsig_alt', 'altc_mostsig_alt',
                'BAD_mostsig_alt', 'es_mostsig_alt', 'p_mostsig_alt',
                'min_cover', 'max_cover', 'median_cover', 'total_cover',
                'es_mean_ref', 'es_mean_alt', 'logitp_ref', 'logitp_alt'
            ]))

        SNP_counter = 0
        print('{} snps'.format(len(common_snps)))

        if len(common_snps) == 0:
            os.remove(table_path)
            sys.exit(0)
        origin_of_snp_dict = OrderedDict()
        keys = list(common_snps.keys())
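        # Two stable sorts (by position, then by chromosome) leave the keys
        # ordered by (chromosome, position).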
        keys = sorted(keys, key=lambda chr_pos: chr_pos[1])
        keys = sorted(keys, key=lambda chr_pos: chr_pos[0])
        for key in keys:
            chromosome, pos, ID, ref, alt, repeat = key
            value = common_snps[key]
            SNP_counter += 1
            if SNP_counter % 10000 == 0:
                print('done {}'.format(SNP_counter))
            unique_callers_counter = dict(
                zip(callers_names, [False] * len(callers_names)))
            total_callers_counter = 0
            BAD_array = []
            SNPs_per_segment_array = []
            p_ref_array = []
            p_alt_array = []
            cover_array = []
            ref_effect_size_array = []
            alt_effect_size_array = []
            table_names_array = []
            another_agr_name = []
            ref_counts_array = []
            alt_counts_array = []

            for v in value:
                cov, ref_c, alt_c, in_callers, BAD, Quals, seg_c, sum_cov, p_ref, p_alt, es_ref, es_alt, table_name, \
                another_agr = v

                table_names_array.append(table_name)
                another_agr_name.append(another_agr)
                for caller in callers_names:
                    unique_callers_counter[caller] = unique_callers_counter[
                        caller] or in_callers[caller]
                    total_callers_counter += in_callers[caller]
                BAD_array.append(BAD)
                SNPs_per_segment_array.append(seg_c)
                p_ref_array.append(p_ref)
                p_alt_array.append(p_alt)
                if not np.isnan(es_ref):
                    ref_effect_size_array.append(es_ref / np.log(2))
                if not np.isnan(es_alt):
                    alt_effect_size_array.append(es_alt / np.log(2))
                cover_array.append(cov)

                ref_counts_array.append(ref_c)
                alt_counts_array.append(alt_c)
                p = 1 / (BAD + 1)  # computed for each record but not used below

            min_cover = min(cover_array)
            max_cover = max(cover_array)
            med_cover = median_grouped(cover_array)
            total_cover = sum(cover_array)
            unique_callers = sum(unique_callers_counter[caller]
                                 for caller in callers_names)
            mean_BAD = np.round(np.mean(BAD_array), 2)
            mean_SNPs_per_segment = np.round(np.mean(SNPs_per_segment_array),
                                             1)
            n_aggregated = len(value)

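            # Combine the per-dataset p-values for this SNP with the logit method.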
            logitp_ref = logit_combine_p_values(p_ref_array)
            logitp_alt = logit_combine_p_values(p_alt_array)

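            # Effect sizes are averaged with -log10(p) weights; the dataset with the
            # smallest p-value supplies the *_mostsig_* columns.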
            if ref_effect_size_array:
                weights = [-1 * np.log10(x) for x in p_ref_array if x != 1]
                es_mean_ref = np.round(
                    np.average(ref_effect_size_array, weights=weights), 3)
                es_mostsig_ref = ref_effect_size_array[int(np.argmax(weights))]
                idx = int(np.argmax([-x for x in p_ref_array]))
                p_mostsig_ref = p_ref_array[idx]
                ref_c_mostsig_ref = ref_counts_array[idx]
                alt_c_mostsig_ref = alt_counts_array[idx]
                BAD_mostsig_ref = BAD_array[idx]
            else:
                es_mean_ref = 'NaN'
                es_mostsig_ref = 'NaN'
                ref_c_mostsig_ref = 'NaN'
                p_mostsig_ref = 'NaN'
                alt_c_mostsig_ref = 'NaN'
                BAD_mostsig_ref = 'NaN'

            if alt_effect_size_array:
                weights = [-1 * np.log10(x) for x in p_alt_array if x != 1]
                es_mean_alt = np.round(
                    np.average(alt_effect_size_array, weights=weights), 3)
                es_mostsig_alt = alt_effect_size_array[int(np.argmax(weights))]
                idx = int(np.argmax([-x for x in p_alt_array]))
                p_mostsig_alt = p_alt_array[idx]
                ref_c_mostsig_alt = ref_counts_array[idx]
                alt_c_mostsig_alt = alt_counts_array[idx]
                BAD_mostsig_alt = BAD_array[idx]
            else:
                es_mean_alt = 'NaN'
                es_mostsig_alt = 'NaN'
                ref_c_mostsig_alt = 'NaN'
                p_mostsig_alt = 'NaN'
                alt_c_mostsig_alt = 'NaN'
                BAD_mostsig_alt = 'NaN'

            out.write(
                pack([
                    chromosome, pos, ID, ref, alt, repeat,
                    total_callers_counter, unique_callers, mean_BAD,
                    mean_SNPs_per_segment, n_aggregated, ref_c_mostsig_ref,
                    alt_c_mostsig_ref, BAD_mostsig_ref, es_mostsig_ref,
                    p_mostsig_ref, ref_c_mostsig_alt, alt_c_mostsig_alt,
                    BAD_mostsig_alt, es_mostsig_alt, p_mostsig_alt, min_cover,
                    max_cover, med_cover, total_cover, es_mean_ref,
                    es_mean_alt, logitp_ref, logitp_alt
                ]))
            origin_of_snp_dict["\t".join(map(str, key))] = {
                'aligns': table_names_array,
                expected_args[what_for]: another_agr_name,
                'ref_counts': ref_counts_array,
                'alt_counts': alt_counts_array,
                'ref_ef': ref_effect_size_array,
                'alt_ef': alt_effect_size_array,
                'BAD': BAD_array,
                'ref_pvalues': p_ref_array,
                'alt_pvalues': p_alt_array,
            }

    print("Counting FDR")

    table = pd.read_table(table_path)
    if table.empty:
        os.remove(table_path)
        sys.exit(0)

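    # BH correction (alpha=0.05) is applied only to SNPs with max_cover >= 20;
    # all other rows keep NaN in the FDR columns.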
    mc_filter_array = np.array(table['max_cover'] >= 20)
    if sum(mc_filter_array) != 0:
        bool_ar_ref, p_val_ref, _, _ = statsmodels.stats.multitest.multipletests(
            table[mc_filter_array]["logitp_ref"], alpha=0.05, method='fdr_bh')
        bool_ar_alt, p_val_alt, _, _ = statsmodels.stats.multitest.multipletests(
            table[mc_filter_array]["logitp_alt"], alpha=0.05, method='fdr_bh')
    else:
        p_val_ref = []
        p_val_alt = []
        bool_ar_ref = []
        bool_ar_alt = []

    fdr_by_ref = np.full(len(table.index), np.nan)
    fdr_by_ref[mc_filter_array] = p_val_ref
    table["fdrp_bh_ref"] = fdr_by_ref

    fdr_by_alt = np.full(len(table.index), np.nan)
    fdr_by_alt[mc_filter_array] = p_val_alt
    table["fdrp_bh_alt"] = fdr_by_alt

    table.to_csv(table_path, sep="\t", index=False)

    bool_ar = np.zeros(len(table.index), dtype=bool)
    bool_ar[mc_filter_array] = bool_ar_alt + bool_ar_ref

    with open(
            os.path.join(results_path,
                         what_for + '_DICTS/{}.json'.format(key_name)),
            'w') as out:
        json.dump(origin_of_snp_dict, out)