import json
import os
import sys
from collections import OrderedDict
from statistics import median_grouped

import numpy as np
import pandas as pd
import statsmodels.stats.multitest

# Project-specific helpers (get_ending, get_results_file, pack/unpack,
# Intersection, is_valid, callers_names, segmentation_states, ...) come from
# the pipeline's own modules; their imports are omitted here.


def main(key, remade=True):
    table_annotated = key + get_ending("annotation")
    output = get_results_file(key, 'BAD')
    with open(get_merged_badmaps_dict_path(remade=remade), "r") as read_file:
        d = json.load(read_file)
        rev_d = make_reverse_dict(d)
    badmap_file_name = rev_d[key]
    print('Now doing {}\nwith BAD map file {}'.format(table_annotated, badmap_file_name))
    badmap_file_path = create_badmaps_path_function(badmap_file_name, valid=remade)
    with open(badmap_file_path, 'r') as badmap_file, \
            open(output, 'w') as out, \
            open(table_annotated, 'r') as table_file:
        out.write(pack(['#chr', 'pos', 'ID', 'ref', 'alt', 'ref_read_counts', 'alt_read_counts',
                        'repeat_type'] + callers_names + ['BAD'] +
                       ["Q{:.2f}".format(x) for x in segmentation_states] +
                       ['SNP_count', 'sum_cover']))
        u = UnpackBadSegments(None)
        for chromosome, pos, ID, ref, alt, ref_c, alt_c, repeat_type, in_callers, \
                in_intersection, segment_BAD, segment_snps, segment_snp_ids, \
                segment_sumcov, Qual in \
                Intersection(table_file, badmap_file, write_segment_args=True, write_intersect=True,
                             unpack_snp_function=lambda x: unpack(x, use_in='Pcounter'),
                             unpack_segments_function=lambda x: u.unpack_bad_segments(x, segmentation_states)):
            if in_intersection and ID.startswith('rs'):
                out.write(pack([chromosome, pos, ID, ref, alt, ref_c, alt_c, repeat_type] +
                               [in_callers[name] for name in callers_names] +
                               [segment_BAD] + [Qual[x] for x in Qual] +
                               [segment_snp_ids, segment_sumcov]))
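# Hedged sketch of the reverse-dict helper used throughout this file. The real
# make_reverse_dict lives in the project's helper module; this version assumes
# the merged BAD maps dict maps a BAD map name to the list of dataset keys it
# was built from, so the reverse maps each dataset key back to its BAD map.
def _make_reverse_dict_sketch(dictionary):
    reverse = {}
    for badmap_name, dataset_keys in dictionary.items():
        for dataset_key in dataset_keys:  # assumed structure: name -> [keys]
            reverse[dataset_key] = badmap_name
    return reverse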
def main(remake=False):
    bad_dataset_list = get_bad_dataset_list(remake=remake)
    print('Filtered {} datasets'.format(len(bad_dataset_list)))
    if not remake:
        print('iteration 1')
        new_dict, merged_dict = remake_badmaps_dict(bad_dataset_list)
        with open(get_new_badmaps_dict_path(), 'w') as f:
            json.dump(new_dict, f)
        with open(get_merged_badmaps_dict_path(), 'w') as f:
            json.dump(merged_dict, f)
        copy_good_badmaps(bad_dataset_list)
    else:
        print('iteration 2')
        delete_bad_badmaps(bad_dataset_list)
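# Illustrative two-pass invocation (hypothetical driver; the actual
# orchestration lives elsewhere in the pipeline):
#   main()             # iteration 1: write new/merged BAD map dicts, keep good maps
#   main(remake=True)  # iteration 2: delete BAD maps that failed re-validation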
def collect_fixed_alt_statistics(master_df, key_name=None, BAD=None, suffix='', remade=True):
    out_t = None
    with open(get_merged_badmaps_dict_path(remade=remade), "r") as read_file:
        d = json.load(read_file)
        rev_d = make_reverse_dict(d)
    for index, row in master_df.iterrows():
        if key_name is not None and row['CELLS'] not in key_name:
            continue
        base_path = create_path_from_master_list_df(row)
        if not is_valid(base_path, rev_d, remade=remade):
            continue
        bad_table_path = create_path_from_master_list_df(row, 'BAD')
        if not os.path.isfile(bad_table_path):
            continue
        df = pd.read_table(bad_table_path)
        if df.empty:
            continue
        if BAD is not None:
            sum_df = df[df['BAD'] == BAD][['ref_read_counts', 'alt_read_counts']]
        else:
            sum_df = df[['ref_read_counts', 'alt_read_counts']]
        tmp_df = pd.DataFrame()
        tmp_df['alt_counts'] = sum_df['alt_read_counts']
        tmp_df['ref_counts'] = sum_df['ref_read_counts']
        tmp_df = tmp_df.groupby(['alt_counts', 'ref_counts']).size().reset_index(name='counts')
        tmp_df.fillna(0, inplace=True)
        if out_t is None:
            out_t = tmp_df
        else:
            # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
            out_t = pd.concat([out_t, tmp_df]).groupby(['alt_counts', 'ref_counts'],
                                                       as_index=False).sum()
    if out_t is None:
        return
    out_t.to_csv(create_neg_bin_stats_path_function(BAD, suffix), sep="\t", index=False)
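# A minimal, self-contained illustration (toy data) of the counting idiom used
# in collect_fixed_alt_statistics above: identical (alt_counts, ref_counts)
# pairs are grouped and tallied into a 'counts' column.
def _example_count_pairs():
    import pandas as pd
    df = pd.DataFrame({'alt_counts': [5, 5, 7], 'ref_counts': [10, 10, 3]})
    counts = df.groupby(['alt_counts', 'ref_counts']).size().reset_index(name='counts')
    #    alt_counts  ref_counts  counts
    # 0           5          10       2
    # 1           7           3       1
    return counts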
def main(what_for, remade=True):
    check_if_in_expected_args(what_for)
    aggregation_dict_path = get_aggregation_dict_path(what_for)
    with open(get_merged_badmaps_dict_path(remade=remade), "r") as read_file:
        old_rev_d = make_reverse_dict(json.load(read_file))
        rev_d = {get_results_file(k, 'p-value', False): v for k, v in old_rev_d.items()}
    with open(aggregation_dict_path, 'r') as read_file:
        d = json.loads(read_file.readline())
    with open(out_path, 'w') as file:
        for key in sorted(d.keys()):
            is_empty = True
            for value in d[key]:
                if os.path.isfile(value) and is_valid(split_ext_recursive(value), rev_d, remade=remade):
                    is_empty = False
                    break
            if is_empty:
                continue
            file.write(key + '\n')
def main(for_what, remade=True):
    master_df = pd.read_table(master_list_path, dtype=dtype_dict)
    master_df = master_df[~master_df['EXP_TYPE'].isin(['chip_control', 'chipexo_control'])]
    master_df['path'] = master_df.apply(create_path_from_master_list_df, axis=1)
    master_df = master_df[master_df['path'].apply(lambda x: os.path.isfile(x + get_ending('vcf')))]
    if for_what == 'badmaps':
        with open(get_merged_badmaps_dict_path(remade=remade), "r") as read_file:
            d = json.load(read_file)
            rev_d = make_reverse_dict(d)
        master_df = master_df[master_df.apply(
            lambda row: os.path.isfile(row['path'] + get_ending('annotation'))
                        and is_valid(row['path'], rev_d, remade=remade),
            axis=1)]
        master_df['path'].to_csv(out_path, sep='\t', index=False, header=False)
    elif for_what == 'annotation':
        master_df[['path', 'PEAKS']].to_csv(out_path, sep='\t', index=False, header=False)
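# Hypothetical sketch of the row-wise filtering pattern above: apply() with
# axis=1 evaluates a predicate per row and the boolean mask subsets the frame.
# The '.vcf.gz' suffix is an assumption; real suffixes come from get_ending().
def _example_row_filter():
    import os
    import pandas as pd
    toy = pd.DataFrame({'path': ['/data/exp1', '/data/exp2']})
    mask = toy.apply(lambda row: os.path.isfile(row['path'] + '.vcf.gz'), axis=1)
    return toy[mask]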
def main(what_for, key_name, remade=True):
    check_if_in_expected_args(what_for)
    table_path = get_result_table_path(what_for, key_name)
    with open(get_merged_badmaps_dict_path(remade=remade), "r") as read_file:
        old_rev_d = make_reverse_dict(json.load(read_file))
        rev_d = {get_results_file(k, 'p-value', False): v for k, v in old_rev_d.items()}
    tables = []
    if what_for == "CL":
        tables = cell_lines_dict[key_name]
    if what_for == "TF":
        tables = tf_dict[key_name]
    print('Reading datasets for {} {}'.format(what_for, key_name))
    common_snps = dict()
    for table in tables:
        if os.path.isfile(table) and is_valid(split_ext_recursive(table), rev_d, remade=remade):
            table_name = get_name(table)
            another_agr = get_another_agr(table, what_for)
            with open(table, 'r') as file:
                for line in file:
                    try:
                        (chromosome, pos, ID, ref, alt, ref_c, alt_c, repeat,
                         in_callers, BAD, Quals, seg_c, sum_cov,
                         p_ref, p_alt, es_ref, es_alt) = unpack(line, use_in="Aggregation")
                    except ValueError:
                        if line.startswith('#'):
                            continue
                        raise
                    if np.isnan(p_ref) or ID == '.':
                        continue
                    cov = ref_c + alt_c
                    common_snps.setdefault((chromosome, pos, ID, ref, alt, repeat), []).append(
                        (cov, ref_c, alt_c, in_callers, BAD, Quals, seg_c, sum_cov,
                         p_ref, p_alt, es_ref, es_alt, table_name, another_agr))
        else:
            print("There is no {}".format(table))
    print('Writing {}'.format(key_name))
    with open(table_path, 'w') as out:
        out.write(pack([
            '#chr', 'pos', 'ID', 'ref', 'alt', 'repeat_type',
            'n_peak_calls', 'n_peak_callers', 'mean_BAD',
            'mean_SNP_per_segment', 'n_aggregated',
            'refc_mostsig_ref', 'altc_mostsig_ref', 'BAD_mostsig_ref',
            'es_mostsig_ref', 'p_mostsig_ref',
            'refc_mostsig_alt', 'altc_mostsig_alt', 'BAD_mostsig_alt',
            'es_mostsig_alt', 'p_mostsig_alt',
            'min_cover', 'max_cover', 'median_cover', 'total_cover',
            'es_mean_ref', 'es_mean_alt', 'logitp_ref', 'logitp_alt'
        ]))
        SNP_counter = 0
        print('{} snps'.format(len(common_snps)))
        if len(common_snps) == 0:
            os.remove(table_path)
            sys.exit(0)
        origin_of_snp_dict = OrderedDict()
        # Sort by chromosome, then position (replaces the original pair of stable sorts).
        keys = sorted(common_snps.keys(), key=lambda chr_pos: (chr_pos[0], chr_pos[1]))
        for key in keys:
            chromosome, pos, ID, ref, alt, repeat = key
            value = common_snps[key]
            SNP_counter += 1
            if SNP_counter % 10000 == 0:
                print('done {}'.format(SNP_counter))
            unique_callers_counter = dict(zip(callers_names, [False] * len(callers_names)))
            total_callers_counter = 0
            BAD_array = []
            SNPs_per_segment_array = []
            p_ref_array = []
            p_alt_array = []
            cover_array = []
            ref_effect_size_array = []
            alt_effect_size_array = []
            table_names_array = []
            another_agr_name = []
            ref_counts_array = []
            alt_counts_array = []
            for v in value:
                (cov, ref_c, alt_c, in_callers, BAD, Quals, seg_c, sum_cov,
                 p_ref, p_alt, es_ref, es_alt, table_name, another_agr) = v
                table_names_array.append(table_name)
                another_agr_name.append(another_agr)
                for caller in callers_names:
                    unique_callers_counter[caller] = unique_callers_counter[caller] or in_callers[caller]
                    total_callers_counter += in_callers[caller]
                BAD_array.append(BAD)
                SNPs_per_segment_array.append(seg_c)
                p_ref_array.append(p_ref)
                p_alt_array.append(p_alt)
                # Convert natural-log effect sizes to log2 units; NaNs are skipped.
                if not np.isnan(es_ref):
                    ref_effect_size_array.append(es_ref / np.log(2))
                if not np.isnan(es_alt):
                    alt_effect_size_array.append(es_alt / np.log(2))
                cover_array.append(cov)
                ref_counts_array.append(ref_c)
                alt_counts_array.append(alt_c)
            min_cover = min(cover_array)
            max_cover = max(cover_array)
            med_cover = median_grouped(cover_array)
            total_cover = sum(cover_array)
            unique_callers = sum(unique_callers_counter[caller] for caller in callers_names)
            mean_BAD = np.round(np.mean(BAD_array), 2)
            mean_SNPs_per_segment = np.round(np.mean(SNPs_per_segment_array), 1)
            n_aggregated = len(value)
            logitp_ref = logit_combine_p_values(p_ref_array)
            logitp_alt = logit_combine_p_values(p_alt_array)
            if ref_effect_size_array:
                # Weight effect sizes by -log10(p); assumes p == 1 entries are exactly
                # the ones with NaN effect sizes, keeping lengths aligned.
                weights = [-np.log10(x) for x in p_ref_array if x != 1]
                es_mean_ref = np.round(np.average(ref_effect_size_array, weights=weights), 3)
                es_mostsig_ref = ref_effect_size_array[int(np.argmax(weights))]
                idx = int(np.argmin(p_ref_array))
                p_mostsig_ref = p_ref_array[idx]
                ref_c_mostsig_ref = ref_counts_array[idx]
                alt_c_mostsig_ref = alt_counts_array[idx]
                BAD_mostsig_ref = BAD_array[idx]
            else:
                es_mean_ref = 'NaN'
                es_mostsig_ref = 'NaN'
                p_mostsig_ref = 'NaN'
                ref_c_mostsig_ref = 'NaN'
                alt_c_mostsig_ref = 'NaN'
                BAD_mostsig_ref = 'NaN'
            if alt_effect_size_array:
                weights = [-np.log10(x) for x in p_alt_array if x != 1]
                es_mean_alt = np.round(np.average(alt_effect_size_array, weights=weights), 3)
                es_mostsig_alt = alt_effect_size_array[int(np.argmax(weights))]
                idx = int(np.argmin(p_alt_array))
                p_mostsig_alt = p_alt_array[idx]
                ref_c_mostsig_alt = ref_counts_array[idx]
                alt_c_mostsig_alt = alt_counts_array[idx]
                BAD_mostsig_alt = BAD_array[idx]
            else:
                es_mean_alt = 'NaN'
                es_mostsig_alt = 'NaN'
                p_mostsig_alt = 'NaN'
                ref_c_mostsig_alt = 'NaN'
                alt_c_mostsig_alt = 'NaN'
                BAD_mostsig_alt = 'NaN'
            out.write(pack([
                chromosome, pos, ID, ref, alt, repeat,
                total_callers_counter, unique_callers, mean_BAD,
                mean_SNPs_per_segment, n_aggregated,
                ref_c_mostsig_ref, alt_c_mostsig_ref, BAD_mostsig_ref,
                es_mostsig_ref, p_mostsig_ref,
                ref_c_mostsig_alt, alt_c_mostsig_alt, BAD_mostsig_alt,
                es_mostsig_alt, p_mostsig_alt,
                min_cover, max_cover, med_cover, total_cover,
                es_mean_ref, es_mean_alt, logitp_ref, logitp_alt
            ]))
            origin_of_snp_dict["\t".join(map(str, key))] = {
                'aligns': table_names_array,
                expected_args[what_for]: another_agr_name,
                'ref_counts': ref_counts_array,
                'alt_counts': alt_counts_array,
                'ref_ef': ref_effect_size_array,
                'alt_ef': alt_effect_size_array,
                'BAD': BAD_array,
                'ref_pvalues': p_ref_array,
                'alt_pvalues': p_alt_array,
            }
    print("Counting FDR")
    table = pd.read_table(table_path)
    if table.empty:
        os.remove(table_path)
        sys.exit(0)
    mc_filter_array = np.array(table['max_cover'] >= 20)
    if sum(mc_filter_array) != 0:
        bool_ar_ref, p_val_ref, _, _ = statsmodels.stats.multitest.multipletests(
            table[mc_filter_array]["logitp_ref"], alpha=0.05, method='fdr_bh')
        bool_ar_alt, p_val_alt, _, _ = statsmodels.stats.multitest.multipletests(
            table[mc_filter_array]["logitp_alt"], alpha=0.05, method='fdr_bh')
    else:
        p_val_ref = []
        p_val_alt = []
        bool_ar_ref = []
        bool_ar_alt = []
    # np.float128 is unavailable on some platforms; float64 NaNs suffice here.
    fdr_by_ref = np.full(len(table.index), np.nan, dtype=np.float64)
    fdr_by_ref[mc_filter_array] = p_val_ref
    table["fdrp_bh_ref"] = fdr_by_ref
    fdr_by_alt = np.full(len(table.index), np.nan, dtype=np.float64)
    fdr_by_alt[mc_filter_array] = p_val_alt
    table["fdrp_bh_alt"] = fdr_by_alt
    table.to_csv(table_path, sep="\t", index=False)
    # np.bool was removed from modern numpy; use the builtin bool dtype and an
    # explicit elementwise OR instead of '+' on boolean arrays.
    bool_ar = np.zeros(len(table.index), dtype=bool)
    bool_ar[mc_filter_array] = np.asarray(bool_ar_alt, dtype=bool) | np.asarray(bool_ar_ref, dtype=bool)
    with open(os.path.join(results_path, what_for + '_DICTS/{}.json'.format(key_name)), 'w') as out:
        json.dump(origin_of_snp_dict, out)
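# Self-contained sketch of the multiple-testing machinery used above. The
# project's logit_combine_p_values is its own implementation; scipy ships a
# comparable logit-type combiner ('mudholkar_george'), and statsmodels performs
# the Benjamini-Hochberg correction exactly as called in main(). Values are toy data.
def _example_fdr():
    from scipy.stats import combine_pvalues
    from statsmodels.stats.multitest import multipletests
    # Combine per-dataset p-values for one SNP into a single p-value.
    stat, combined_p = combine_pvalues([0.04, 0.20, 0.003], method='mudholkar_george')
    # BH-correct the combined p-values across SNPs at alpha = 0.05.
    reject, p_adj, _, _ = multipletests([combined_p, 0.5, 0.01], alpha=0.05, method='fdr_bh')
    return reject, p_adj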