def _normalized_binary_entropy(values):
    """Return the mean binary entropy of ``values`` divided by ``log2(0.5)``.

    Only entries strictly inside the open interval (0, 1) contribute; NaN
    and every value ``<= 0`` or ``>= 1`` are skipped as outliers.

    :param values: array-like of numbers (any shape; it is flattened).
    :return: the accumulated ``v*log2(v) + (1-v)*log2(1-v)`` over the valid
        entries, divided by ``count * log2(0.5)``; NaN when no entry is
        valid.
    """
    # reshape(-1) keeps a single selected value iterable as a 1-element
    # vector (np.squeeze would turn it into a 0-d array).
    values = np.asarray(values, dtype=np.float64).reshape(-1)
    # NaN compares False on both sides, so the same mask drops it.
    with np.errstate(invalid='ignore'):
        valid = values[(values > 0.0) & (values < 1.0)]
    if valid.size == 0:
        return np.nan
    total = np.sum(
        valid * np.log2(valid) + (1.0 - valid) * np.log2(1.0 - valid)
    )
    return total / (valid.size * np.log2(0.5))


def load_entropy(config):
    """Load the per-subject entropy vector from cache, or compute and cache it.

    Sets ``config.entropy_list``, ``config.entropy_dict``,
    ``config.entropy_missed_dict`` and ``config.entropy_data`` (a float32
    vector with one entry per column of the source data matrix).

    On a cache miss the source matrix is chosen by
    ``config.experiment.data_params['data']`` (``'betas'``, ``'betas_adj'``
    or ``'residuals'``) and loaded through the matching loader. As a side
    effect ``config.experiment.data_params`` is modified before that loader
    is called: it is replaced by ``{}`` for ``'betas'`` and has its
    ``'data'`` key popped otherwise. The result is written to
    ``entropy<suffix>.npz`` in the cache directory.

    :param config: experiment configuration object; mutated in place.
    :raises ValueError: if ``config.experiment.data_params`` is empty or
        names an unsupported data source.
    """
    if not config.experiment.data_params:
        raise ValueError('Exog for entropy is empty.')

    # Snapshot the params: the branches below mutate the originals.
    data_params = copy.deepcopy(config.experiment.data_params)
    suffix = '_' + config.experiment.get_data_params_str()
    fn_data = get_cache_path(config) + '/' + 'entropy' + suffix + '.npz'

    config.entropy_list = ['entropy']
    config.entropy_dict = {'entropy': 0}
    config.entropy_missed_dict = {'entropy': []}

    if os.path.isfile(fn_data):
        data = np.load(fn_data)
        config.entropy_data = data['data']
        return

    data_kind = data_params['data']
    if data_kind == 'betas':
        config.experiment.data_params = {}
        load_betas(config)
        data = config.betas_data
        data_dict = config.betas_dict
    elif data_kind == 'betas_adj':
        config.experiment.data_params.pop('data')
        load_betas_adj(config)
        data = config.betas_adj_data
        data_dict = config.betas_adj_dict
    elif data_kind == 'residuals':
        config.experiment.data_params.pop('data')
        load_residuals(config)
        data = config.residuals_data
        data_dict = config.residuals_dict
    else:
        raise ValueError('Unsupported data for entropy.')

    num_subjects = data.shape[1]
    config.entropy_data = np.zeros(num_subjects, dtype=np.float32)
    # Row indices of the selected CpGs that are present in the data matrix.
    rows = [data_dict[item] for item in config.cpg_list if item in data_dict]

    for subj_id in tqdm(range(0, num_subjects), mininterval=60.0,
                        desc='entropy_data creating'):
        column = np.asarray(data[np.ix_(rows, [subj_id])])
        config.entropy_data[subj_id] = _normalized_binary_entropy(column)

    np.savez_compressed(fn_data, data=config.entropy_data)
def subset_annotations(config):
    """Build (or load from cache) the CpG / gene / BOP lookup tables.

    Populates ``config.cpg_list``, ``config.cpg_gene_dict``,
    ``config.cpg_bop_dict``, ``config.gene_cpg_dict``,
    ``config.gene_bop_dict``, ``config.bop_cpg_dict`` and
    ``config.bop_gene_dict``.

    If ``aux_data.pkl`` exists in the cache directory the tables are read
    from it. Otherwise every annotation row accepted by
    ``check_conditions`` is added to the tables, the CpG lists in
    ``gene_cpg_dict`` and ``bop_cpg_dict`` are ordered by integer map_info,
    and the result is pickled to ``aux_data.pkl``.

    :param config: configuration object; mutated in place.
    """
    aux_data_fn = get_cache_path(config) + '/' + 'aux_data.pkl'

    if os.path.isfile(aux_data_fn):
        # NOTE(review): pickle.load can execute arbitrary code; this is only
        # safe while the cache directory holds trusted files.
        with open(aux_data_fn, 'rb') as f:
            aux_data = pickle.load(f)

        config.cpg_list = aux_data['cpg_list']
        config.cpg_gene_dict = aux_data['cpg_gene_dict']
        config.cpg_bop_dict = aux_data['cpg_bop_dict']
        config.gene_cpg_dict = aux_data['gene_cpg_dict']
        config.gene_bop_dict = aux_data['gene_bop_dict']
        config.bop_cpg_dict = aux_data['bop_cpg_dict']
        config.bop_gene_dict = aux_data['bop_gene_dict']
        return

    config.cpg_list = []
    config.cpg_gene_dict = {}
    config.cpg_bop_dict = {}
    config.gene_cpg_dict = {}
    config.gene_bop_dict = {}
    config.bop_cpg_dict = {}
    config.bop_gene_dict = {}

    cpgs = config.annotations_dict[AnnotationKey.cpg.value]
    genes = config.annotations_dict[AnnotationKey.gene.value]
    bops = config.annotations_dict[AnnotationKey.bop.value]
    map_infos = config.annotations_dict[AnnotationKey.map_info.value]

    for row_id in range(0, len(cpgs)):
        if row_id % 10000 == 0:
            print('id: ' + str(row_id))

        # One annotation record: the value of every column at this row.
        curr_ann_dict = {
            key: config.annotations_dict[key][row_id]
            for key in config.annotations_dict
        }
        if not check_conditions(config, curr_ann_dict):
            continue

        cpg = cpgs[row_id]
        # Gene names are ';'-separated; duplicates are collapsed.
        curr_genes = list(set(genes[row_id].split(';')))
        bop = bops[row_id]

        config.cpg_list.append(cpg)
        config.cpg_gene_dict[cpg] = curr_genes
        config.cpg_bop_dict[cpg] = bop

        for gene in curr_genes:
            config.gene_cpg_dict.setdefault(gene, []).append(cpg)
            config.gene_bop_dict.setdefault(gene, []).append(bop)

        if len(bop) > 0:
            config.bop_cpg_dict.setdefault(bop, []).append(cpg)
            # NOTE(review): the gene list of the last CpG seen for a BOP
            # overwrites earlier ones. Placement inside this `if` was
            # reconstructed from unindented source -- confirm.
            config.bop_gene_dict[bop] = curr_genes

    # First-occurrence row of every CpG, built once so the sorting below does
    # not rescan the whole annotation list for each CpG (list.index also
    # returns the first occurrence).
    first_row = {}
    for position, name in enumerate(cpgs):
        first_row.setdefault(name, position)

    # Sorting cpgs by map_info in gene dict, then in bop dict
    for groups in (config.gene_cpg_dict, config.bop_cpg_dict):
        for group_name, curr_cpgs in groups.items():
            curr_map_infos = [
                int(map_infos[first_row[curr_cpg]]) for curr_cpg in curr_cpgs
            ]
            order = np.argsort(curr_map_infos)
            # Re-assigning an existing key does not resize the dict, so it
            # is safe while iterating over it.
            groups[group_name] = list(np.array(curr_cpgs)[order])

    aux_data = {
        'cpg_list': config.cpg_list,
        'cpg_gene_dict': config.cpg_gene_dict,
        'cpg_bop_dict': config.cpg_bop_dict,
        'gene_cpg_dict': config.gene_cpg_dict,
        'gene_bop_dict': config.gene_bop_dict,
        'bop_cpg_dict': config.bop_cpg_dict,
        'bop_gene_dict': config.bop_gene_dict,
    }

    with open(aux_data_fn, 'wb') as f:
        pickle.dump(aux_data, f, pickle.HIGHEST_PROTOCOL)
def load_genes(config):
    """Load gene-level data from cache, or aggregate it from CpG-level data.

    Sets ``config.genes_list``, ``config.genes_dict`` (gene -> row index),
    ``config.genes_missed_dict`` and ``config.genes_data`` (float32 matrix,
    one row per gene, one column per subject).

    On a cache miss each gene row is the per-subject mean of the rows of all
    its CpGs (from ``config.gene_cpg_dict``) that are present in the source
    matrix; subjects listed in the source "missed" dict for a CpG are left
    out of that CpG's contribution. A subject with no contributing CpG gets
    a 0/0 division, i.e. NaN. Results are written to the cache directory as
    pickles, ``.npz`` and tab-separated text.

    Side effect: ``'source'`` is popped from
    ``config.experiment.data_params`` before the source loader is called.

    :param config: experiment configuration object; mutated in place.
    :raises ValueError: if ``config.experiment.data_params`` is empty or the
        popped source is not ``'betas'``, ``'betas_adj'`` or ``'residuals'``.
    """
    if not config.experiment.data_params:
        raise ValueError('Data params for genes are empty')

    # The suffix must be taken before 'source' is popped from the params.
    suffix_gene = '_' + str(config.experiment.get_data_params_str())
    source = config.experiment.data_params.pop('source')

    cache_path = get_cache_path(config)
    fn_list_txt = cache_path + '/' + 'genes_list.txt'
    fn_list_pkl = cache_path + '/' + 'genes_list.pkl'
    fn_dict_pkl = cache_path + '/' + 'genes_dict.pkl'
    fn_missed_dict_pkl = cache_path + '/' + 'genes_missed_dict.pkl'
    fn_data_npz = cache_path + '/' + 'genes' + suffix_gene + '.npz'
    fn_data_txt = cache_path + '/' + 'genes' + suffix_gene + '.txt'

    # Every file read on the cached path must exist, including the
    # missed-dict pickle; otherwise fall through and rebuild.
    cached_files = (fn_dict_pkl, fn_list_pkl, fn_missed_dict_pkl, fn_data_npz)
    if all(os.path.isfile(fn) for fn in cached_files):
        # NOTE(review): pickle.load can execute arbitrary code; this is only
        # safe while the cache directory holds trusted files.
        with open(fn_list_pkl, 'rb') as f:
            config.genes_list = pickle.load(f)
        with open(fn_dict_pkl, 'rb') as f:
            config.genes_dict = pickle.load(f)
        with open(fn_missed_dict_pkl, 'rb') as f:
            config.genes_missed_dict = pickle.load(f)

        data = np.load(fn_data_npz)
        config.genes_data = data['data']
        return

    if source == 'betas':
        load_betas(config)
        source_dict = config.betas_dict
        source_data = config.betas_data
        source_missed_dict = config.betas_missed_dict
    elif source == 'betas_adj':
        load_betas_adj(config)
        source_dict = config.betas_adj_dict
        source_data = config.betas_adj_data
        source_missed_dict = config.betas_adj_missed_dict
    elif source == 'residuals':
        load_residuals(config)
        source_dict = config.residuals_dict
        source_data = config.residuals_data
        source_missed_dict = config.residuals_missed_dict
    else:
        raise ValueError('Source for genes is not specified')

    # Size the output from the matrix actually being aggregated rather than
    # from config.betas_data, which is not the source for the other options.
    num_subjects = source_data.shape[1]

    # Keep only genes that have at least one CpG in the source matrix.
    config.genes_list = [
        gene
        for gene in tqdm(config.gene_cpg_dict, mininterval=60.0,
                         desc='genes_list creating')
        if any(cpg in source_dict for cpg in config.gene_cpg_dict[gene])
    ]

    config.genes_dict = {}
    config.genes_missed_dict = {'any': []}
    config.genes_data = np.zeros(
        (len(config.genes_list), num_subjects), dtype=np.float32
    )

    for gene_id, gene in tqdm(enumerate(config.genes_list), mininterval=60.0,
                              desc='genes_data creating'):
        config.genes_dict[gene] = gene_id
        denominators = np.zeros(num_subjects, dtype=np.float32)

        for cpg in config.gene_cpg_dict[gene]:
            if cpg not in source_dict:
                continue

            source_values_raw = source_data[source_dict[cpg], :]
            missed = source_missed_dict[cpg]

            if len(missed) > 0:
                # Boolean mask of subjects that do have a value for this CpG.
                missed_ids = set(missed)
                present = np.fromiter(
                    (val_id not in missed_ids
                     for val_id in range(num_subjects)),
                    dtype=bool,
                    count=num_subjects,
                )
                source_values = np.zeros(num_subjects, dtype=np.float32)
                source_values[present] = source_values_raw[present]
                denominators += present
            else:
                source_values = source_values_raw
                denominators += np.ones(num_subjects, dtype=np.float32)

            config.genes_data[gene_id] += source_values

        # Sum -> mean over the CpGs that contributed for each subject.
        config.genes_data[gene_id] /= denominators

    with open(fn_list_pkl, 'wb') as f:
        pickle.dump(config.genes_list, f, pickle.HIGHEST_PROTOCOL)
    with open(fn_dict_pkl, 'wb') as f:
        pickle.dump(config.genes_dict, f, pickle.HIGHEST_PROTOCOL)
    with open(fn_missed_dict_pkl, 'wb') as f:
        pickle.dump(config.genes_missed_dict, f, pickle.HIGHEST_PROTOCOL)

    np.savez_compressed(fn_data_npz, data=config.genes_data)
    np.savetxt(fn_data_txt, config.genes_data, delimiter='\t', fmt='%.8e')

    with open(fn_list_txt, 'w') as f:
        for item in config.genes_list:
            f.write("%s\n" % item)
def _read_aux_data(aux_data_fn):
    """Unpickle and return the cached aux-data dict stored at ``aux_data_fn``."""
    # NOTE(review): pickle.load can execute arbitrary code; this is only safe
    # while the cache directory holds trusted files.
    with open(aux_data_fn, 'rb') as f:
        return pickle.load(f)


def _write_aux_data(aux_data_fn, aux_data):
    """Pickle ``aux_data`` to ``aux_data_fn`` with the highest protocol."""
    with open(aux_data_fn, 'wb') as f:
        pickle.dump(aux_data, f, pickle.HIGHEST_PROTOCOL)


def _sort_cpgs_by_map_info(cpg_groups, cpg_map_info_dict):
    """Order every CpG list in ``cpg_groups`` by integer map_info, in place."""
    for group_name, cpgs in cpg_groups.items():
        map_infos = [int(cpg_map_info_dict[cpg]) for cpg in cpgs]
        order = np.argsort(map_infos)
        # Re-assigning an existing key does not resize the dict, so it is
        # safe while iterating over it.
        cpg_groups[group_name] = list(np.array(cpgs)[order])


def _subset_annotations_450k(config, aux_data_fn):
    """Fill the CpG/gene/BOP tables on ``config`` for annotations type '450k'."""
    if os.path.isfile(aux_data_fn):
        aux_data = _read_aux_data(aux_data_fn)
        config.cpg_list = aux_data['cpg_list']
        config.cpg_gene_dict = aux_data['cpg_gene_dict']
        config.cpg_bop_dict = aux_data['cpg_bop_dict']
        config.gene_cpg_dict = aux_data['gene_cpg_dict']
        config.gene_bop_dict = aux_data['gene_bop_dict']
        config.bop_cpg_dict = aux_data['bop_cpg_dict']
        config.bop_gene_dict = aux_data['bop_gene_dict']
        config.cpg_map_info_dict = aux_data['cpg_map_info_dict']
        return

    config.cpg_list = []
    config.cpg_gene_dict = {}
    config.cpg_bop_dict = {}
    config.gene_cpg_dict = {}
    config.gene_bop_dict = {}
    config.bop_cpg_dict = {}
    config.bop_gene_dict = {}
    config.cpg_map_info_dict = {}

    cpgs_all = config.annotations_dict[config.annotations.id_name]
    genes_all = config.annotations_dict['UCSC_REFGENE_NAME']
    bops_all = config.annotations_dict['BOP']
    map_infos_all = config.annotations_dict['MAPINFO']

    for index, cpg_entry in enumerate(cpgs_all):
        if not global_check(config, index):
            continue

        cpg = cpg_entry[0]
        config.cpg_list.append(cpg)

        map_info = map_infos_all[index][0]
        if map_info == 'NA':
            map_info = '0'
        config.cpg_map_info_dict[cpg] = int(map_info)

        genes = genes_all[index]
        if len(genes) > 0:
            config.cpg_gene_dict[cpg] = genes
            for gene in genes:
                config.gene_cpg_dict.setdefault(gene, []).append(cpg)

        bops = bops_all[index]
        if len(bops) > 0:
            config.cpg_bop_dict[cpg] = bops
            for bop in bops:
                config.bop_cpg_dict.setdefault(bop, []).append(cpg)

        if len(genes) > 0 and len(bops) > 0:
            for gene in genes:
                if gene in config.gene_bop_dict:
                    config.gene_bop_dict[gene] += bops
                else:
                    # Copy so later `+=` does not grow the annotation entry.
                    config.gene_bop_dict[gene] = copy.deepcopy(bops)
            for bop in bops:
                if bop in config.bop_gene_dict:
                    config.bop_gene_dict[bop] += genes
                else:
                    config.bop_gene_dict[bop] = copy.deepcopy(genes)

    # Sorting cpgs by map_info in gene dict and in bop dict
    _sort_cpgs_by_map_info(config.gene_cpg_dict, config.cpg_map_info_dict)
    _sort_cpgs_by_map_info(config.bop_cpg_dict, config.cpg_map_info_dict)

    _write_aux_data(aux_data_fn, {
        'cpg_list': config.cpg_list,
        'cpg_gene_dict': config.cpg_gene_dict,
        'cpg_bop_dict': config.cpg_bop_dict,
        'gene_cpg_dict': config.gene_cpg_dict,
        'gene_bop_dict': config.gene_bop_dict,
        'bop_cpg_dict': config.bop_cpg_dict,
        'bop_gene_dict': config.bop_gene_dict,
        'cpg_map_info_dict': config.cpg_map_info_dict
    })


def _subset_annotations_850k(config, aux_data_fn):
    """Fill the CpG/gene tables and ``config.bops`` for annotations type '850k'."""
    if os.path.isfile(aux_data_fn):
        aux_data = _read_aux_data(aux_data_fn)
        config.cpg_list = aux_data['cpg_list']
        config.cpg_gene_dict = aux_data['cpg_gene_dict']
        config.gene_cpg_dict = aux_data['gene_cpg_dict']
        config.cpg_map_info_dict = aux_data['cpg_map_info_dict']
        config.bops = aux_data['bops']
        return

    config.cpg_list = []
    config.cpg_gene_dict = {}
    config.gene_cpg_dict = {}
    config.cpg_map_info_dict = {}

    cpgs_all = config.annotations_dict[config.annotations.id_name]
    genes_all = config.annotations_dict['UCSC_RefGene_Name']
    map_infos_all = config.annotations_dict['MAPINFO']
    chr_all = config.annotations_dict['CHR']
    geo_all = config.annotations_dict['UCSC_CpG_Islands_Name']
    geo_type_all = config.annotations_dict['Relation_to_UCSC_CpG_Island']

    config.bops = {}

    for index, cpg_entry in enumerate(cpgs_all):
        if not global_check(config, index):
            continue

        cpg = cpg_entry[0]
        config.cpg_list.append(cpg)

        map_info = map_infos_all[index]
        if len(map_info) > 0:
            map_info = map_info[0]
            if map_info == 'NA':
                map_info = '0'
        else:
            map_info = 0
        config.cpg_map_info_dict[cpg] = int(map_info)

        genes = genes_all[index]
        # In-place sort: this also reorders the entry held in annotations_dict.
        genes.sort()
        if len(genes) > 0:
            config.cpg_gene_dict[cpg] = genes
            for gene in genes:
                config.gene_cpg_dict.setdefault(gene, []).append(cpg)

        chrom = chr_all[index][0] if len(chr_all[index]) > 0 else ''
        geo = geo_all[index][0] if len(geo_all[index]) > 0 else ''
        if len(geo_type_all[index]) > 0:
            geo_type = geo_type_all[index][0]
        else:
            geo_type = ''

        # BOP class: A/B when an island name is present (with/without
        # genes), C/D when it is empty (with/without genes).
        if geo == '':
            if len(genes) > 0:
                bop_class = 'C'
                bop_names = [f'{chrom}*{gene}' for gene in genes]
            else:
                bop_class = 'D'
                bop_names = [cpg]
        else:
            if len(genes) > 0:
                bop_class = 'A'
                bop_names = [f'{geo}*{geo_type}']
            else:
                bop_class = 'B'
                bop_names = [f'{geo}*{geo_type}*nogene']

        for bop_name in bop_names:
            if bop_name in config.bops:
                bop_entry = config.bops[bop_name]
                if bop_entry['class'] != bop_class:
                    raise ValueError(
                        f'Error: Different classes in BOP creation: {bop_name}'
                    )
                bop_entry['cpg'].append(cpg)
                bop_entry['map_info'].append(map_info)
                bop_entry['gene'].update(set(genes))
            else:
                config.bops[bop_name] = {
                    'class': bop_class,
                    'cpg': [cpg],
                    'map_info': [map_info],
                    'gene': set(genes),
                }

    # Sorting cpgs by map_info in gene dict
    _sort_cpgs_by_map_info(config.gene_cpg_dict, config.cpg_map_info_dict)

    # Sorting cpgs by map_info in bop
    for bop_entry in config.bops.values():
        # The stored map_info values are the raw annotation values (strings,
        # or the int 0 fallback); convert to int so the order is numeric
        # rather than lexicographic ('100' would sort before '20').
        positions = [int(map_info) for map_info in bop_entry['map_info']]
        order = np.argsort(positions)
        # NOTE(review): only 'cpg' is reordered; 'map_info' keeps insertion
        # order, exactly as before -- confirm that callers expect this.
        bop_entry['cpg'] = list(np.array(bop_entry['cpg'])[order])

    _write_aux_data(aux_data_fn, {
        'cpg_list': config.cpg_list,
        'cpg_gene_dict': config.cpg_gene_dict,
        'gene_cpg_dict': config.gene_cpg_dict,
        'cpg_map_info_dict': config.cpg_map_info_dict,
        'bops': config.bops
    })


def _subset_annotations_epityper(config, aux_data_fn):
    """Fill ``config.cpg_list`` for annotations type 'epityper' (no filtering)."""
    if os.path.isfile(aux_data_fn):
        config.cpg_list = _read_aux_data(aux_data_fn)['cpg_list']
        return

    cpgs_all = config.annotations_dict[config.annotations.id_name]
    config.cpg_list = [cpg_entry[0] for cpg_entry in cpgs_all]

    _write_aux_data(aux_data_fn, {
        'cpg_list': config.cpg_list,
    })


def subset_annotations(config):
    """Build (or load from cache) the annotation lookup tables on ``config``.

    The tables depend on ``config.annotations.type``:

    * ``'450k'``: ``cpg_list``, ``cpg_gene_dict``, ``cpg_bop_dict``,
      ``gene_cpg_dict``, ``gene_bop_dict``, ``bop_cpg_dict``,
      ``bop_gene_dict`` and ``cpg_map_info_dict``.
    * ``'850k'``: ``cpg_list``, ``cpg_gene_dict``, ``gene_cpg_dict``,
      ``cpg_map_info_dict`` and ``bops`` (name -> dict with ``'class'``,
      ``'cpg'``, ``'map_info'`` and ``'gene'``).
    * ``'epityper'``: ``cpg_list`` only.

    Any other type leaves ``config`` untouched. For '450k' and '850k' only
    rows accepted by ``global_check`` are used. Tables are read from
    ``aux_data.pkl`` in the cache directory when it exists and written
    there after being built.

    :param config: configuration object; mutated in place.
    :raises ValueError: in the '850k' branch, if one BOP name is produced
        with two different classes.
    """
    aux_data_fn = get_cache_path(config) + '/' + 'aux_data.pkl'

    annotations_type = config.annotations.type
    if annotations_type == '450k':
        _subset_annotations_450k(config, aux_data_fn)
    elif annotations_type == '850k':
        _subset_annotations_850k(config, aux_data_fn)
    elif annotations_type == 'epityper':
        _subset_annotations_epityper(config, aux_data_fn)