def load_epimutations(config):
    """Load (or build and cache) the per-CpG epimutation indicator matrix.

    A subject is flagged (1) for a CpG when its beta value lies outside
    ``[Q1 - 3*IQR, Q3 + 3*IQR]`` computed across all subjects for that CpG.
    Results are cached as ``epimutations_dict.pkl`` and
    ``epimutations<suffix>.npz`` / ``.txt``.

    Side effects: sets ``config.epimutations_dict`` and
    ``config.epimutations_data``; on a cache miss it clears
    ``config.experiment.data_params``, loads betas, and deletes
    ``config.betas_data`` afterwards.
    """
    fn_dict = get_data_base_path(config) + '/' + 'epimutations_dict'
    suffix = ''
    if bool(config.experiment.data_params):
        suffix += '_' + str(config.experiment.get_data_params_str())
    fn_data = get_data_base_path(config) + '/' + 'epimutations' + suffix
    if os.path.isfile(fn_dict + '.pkl') and os.path.isfile(fn_data + '.npz'):
        # Cache hit: restore dict and indicator matrix.
        with open(fn_dict + '.pkl', 'rb') as f:
            config.epimutations_dict = pickle.load(f)
        data = np.load(fn_data + '.npz')
        config.epimutations_data = data['data']
    else:
        config.experiment.data_params = {}
        load_betas(config)
        config.epimutations_dict = config.betas_dict
        with open(fn_dict + '.pkl', 'wb') as f:
            pickle.dump(config.epimutations_dict, f, pickle.HIGHEST_PROTOCOL)
        save_table_dict_csv(
            fn_dict,
            {
                'item': list(config.epimutations_dict.keys()),
                'row': list(config.epimutations_dict.values())
            }
        )
        num_cpgs = config.betas_data.shape[0]
        num_subjects = config.betas_data.shape[1]
        # Bug fix: np.int was deprecated in NumPy 1.20 and removed in 1.24;
        # the builtin int is the documented replacement.
        config.epimutations_data = np.zeros((num_cpgs, num_subjects),
                                            dtype=int)
        for cpg, row in tqdm(config.betas_dict.items(), mininterval=60.0):
            betas = config.betas_data[row, :]
            quartiles = np.percentile(betas, [25, 75])
            iqr = quartiles[1] - quartiles[0]
            left = quartiles[0] - (3.0 * iqr)
            right = quartiles[1] + (3.0 * iqr)
            # Vectorized outlier test replaces the per-subject Python loop.
            curr_row = ((betas < left) | (betas > right)).astype(int)
            config.epimutations_data[row] = curr_row
        np.savez_compressed(fn_data + '.npz', data=config.epimutations_data)
        np.savetxt(fn_data + '.txt', config.epimutations_data,
                   delimiter='\t', fmt='%d')
        # Clear data
        del config.betas_data
def load_entropy(config):
    """Load (or build and cache) per-subject methylation entropy.

    For each subject, entropy is the normalized sum of binary entropies of
    beta-like values over ``config.cpg_list`` (restricted to CpGs present in
    the chosen data source). Values that are NaN or outside (0, 1) are
    counted as outliers and excluded. The result is cached as
    ``entropy<suffix>.npz``.

    Raises:
        ValueError: if ``config.experiment.data_params`` is empty or names
            an unsupported data source.
    """
    suffix = ''
    if bool(config.experiment.data_params):
        data_params = copy.deepcopy(config.experiment.data_params)
        suffix += '_' + config.experiment.get_data_params_str()
    else:
        raise ValueError('Exog for entropy is empty.')
    fn_data = get_cache_path(config) + '/' + 'entropy' + suffix + '.npz'
    # Entropy is exposed to the rest of the pipeline as a single pseudo-feature.
    config.entropy_list = ['entropy']
    config.entropy_dict = {'entropy': 0}
    config.entropy_missed_dict = {'entropy': []}
    if os.path.isfile(fn_data):
        # Cache hit: restore the per-subject entropy vector.
        data = np.load(fn_data)
        config.entropy_data = data['data']
    else:
        # Select the underlying data matrix/dict; note each branch mutates
        # config.experiment.data_params before delegating to the loader.
        if data_params['data'] == 'betas':
            config.experiment.data_params = {}
            load_betas(config)
            data = config.betas_data
            data_dict = config.betas_dict
        elif data_params['data'] == 'betas_adj':
            config.experiment.data_params.pop('data')
            load_betas_adj(config)
            data = config.betas_adj_data
            data_dict = config.betas_adj_dict
        elif data_params['data'] == 'residuals':
            config.experiment.data_params.pop('data')
            load_residuals(config)
            data = config.residuals_data
            data_dict = config.residuals_dict
        else:
            raise ValueError('Unsupported data for entropy.')
        num_subjects = data.shape[1]
        config.entropy_data = np.zeros(num_subjects, dtype=np.float32)
        # Rows of the data matrix corresponding to the configured CpG list.
        rows = [data_dict[item] for item in config.cpg_list if item in data_dict]
        for subj_id in tqdm(range(0, num_subjects), mininterval=60.0, desc='entropy_data creating'):
            values = np.squeeze(np.asarray(data[np.ix_(rows, [subj_id])]))
            entropy = 0.0
            outliers = 0
            for val in values:
                if not math.isnan(val):
                    if 0.0 < val < 1.0:
                        # Binary entropy contribution of this CpG value.
                        entropy += val * np.log2(val) + (1.0 - val) * np.log2(1.0 - val)
                    else:
                        outliers += 1
                else:
                    outliers += 1
            # Normalize by the count of usable values; log2(0.5) == -1, which
            # flips the sign so the stored entropy is non-negative.
            entropy /= ((len(values) - outliers) * np.log2(0.5))
            config.entropy_data[subj_id] = entropy
        np.savez_compressed(fn_data, data=config.entropy_data)
def test_load_cpg_check_files_creation(self):
    """Loading betas must create both the dict pickle and the npz cache."""
    base_path = get_data_base_path(self.config)
    dict_path = base_path + '/' + 'betas_dict.pkl'
    npz_path = base_path + '/' + 'betas' + '.npz'
    load_betas(self.config)
    both_exist = os.path.isfile(dict_path) and os.path.isfile(npz_path)
    self.assertEqual(True, both_exist)
def load(self, config, configs_child):
    """Load betas, expose them as the experiment's base data, and propagate
    the loaded state to child configs.

    NOTE(review): ``load_betas`` populates ``config.betas_dict`` /
    ``config.betas_data``, yet this method assigns ``config.betas_adj_dict``
    and ``config.betas_adj_data`` to the base fields — looks like a
    copy-paste slip from an adjusted-betas loader (or it relies on adjusted
    betas having been loaded earlier). Confirm against callers.
    """
    load_betas(config)
    config.base_list = config.cpg_list
    config.base_dict = config.betas_adj_dict
    config.base_data = config.betas_adj_data
    self.inherit_childs(config, configs_child)
    if config.is_load_child:
        for config_child in configs_child:
            self.load_child(config_child)
def load_betas_spec(config):
    """Build a CSV of beta values restricted to a user-specified CpG list.

    Reads the CpG list named by ``data_params['file']``, then writes
    ``<filename>.csv`` with one row per requested CpG (header row
    ``ProbeID`` + subject headers) and the subject columns selected by
    ``config.attributes_indexes``. Missing CpGs/values become the string
    ``'NaN'``. Skips the build if the CSV already exists.

    Raises:
        ValueError: if the CpG list file does not exist.
    """
    file = config.experiment.data_params.pop('file', None)
    filename, file_extension = os.path.splitext(file)
    fn_list = get_data_base_path(config) + '/' + file
    fn_data_csv = get_data_base_path(config) + '/' + filename + '.csv'
    if os.path.isfile(fn_list):
        # 'with' guarantees the handle is closed even on error.
        with open(fn_list) as f:
            cpgs = [x.rstrip() for x in f.readlines()]
    else:
        raise ValueError('File with cpgs is not exists.')
    if not os.path.isfile(fn_data_csv):
        indexes = config.attributes_indexes
        fn_betas = get_data_base_path(config) + '/' + 'betas.txt'
        with open(fn_betas) as f:
            header_line = f.readline()
        headers = get_line_list(header_line)
        subject_headers = list(np.array(headers[1::])[indexes])
        load_betas(config)
        num_cols = len(subject_headers) + 1  # header col
        num_rows = len(cpgs) + 1  # header row
        betas = np.zeros((num_rows, num_cols), dtype=object)
        row_id = 0
        betas[row_id] = ['ProbeID'] + subject_headers
        row_id += 1
        for cpg in tqdm(cpgs, mininterval=60.0, desc='betas_spec creating'):
            if cpg in config.betas_dict:
                cpg_row_id = config.betas_dict[cpg]
                curr_betas = list(
                    np.array(config.betas_data[cpg_row_id])[indexes])
                # Missed measurements are emitted as the literal string 'NaN'.
                for missed_id in config.betas_missed_dict[cpg]:
                    curr_betas[missed_id] = 'NaN'
                line = [cpg] + curr_betas
            else:
                # CpG absent from the loaded betas: whole row is 'NaN'.
                line = [cpg] + ['NaN'] * (num_cols - 1)
            betas[row_id] = line
            row_id += 1
        pd.DataFrame(betas).to_csv(fn_data_csv, index=False, header=False)
def load_betas_horvath_calculator(config):
    """Build the CSV of beta values expected by Horvath's epigenetic-age
    calculator.

    Rows are the calculator's CpG list (header row ``ProbeID`` + subject
    headers taken from the betas text file); missing CpGs/values are written
    as the string ``'NaN'``. Skips the build if the CSV already exists.

    Raises:
        ValueError: if the calculator CpG list file does not exist.
    """
    fn_list = get_data_base_path(config) + '/' + 'cpgs_horvath_calculator.txt'
    fn_data_csv = get_data_base_path(config) + '/' + 'betas_horvath_calculator.csv'
    suffix = ''
    if bool(config.experiment.data_params):
        suffix += '_' + config.experiment.get_data_params_str()
    if os.path.isfile(fn_list):
        with open(fn_list) as f:
            cpgs = [x.rstrip() for x in f.readlines()]
    else:
        raise ValueError('No specified file with cpgs for Horvath\'s calculator.')
    if not os.path.isfile(fn_data_csv):
        fn_betas = get_data_base_path(config) + '/' + 'betas' + suffix + '.txt'
        # Bug fix: the original left this handle open; 'with' closes it.
        with open(fn_betas) as f:
            header_line = f.readline()
        headers = get_line_list(header_line)
        load_betas(config)
        num_cols = len(headers)
        num_rows = len(cpgs) + 1  # header row
        betas = np.zeros((num_rows, num_cols), dtype=object)
        row_id = 0
        betas[row_id] = ['ProbeID'] + headers[1::]
        row_id += 1
        for cpg in tqdm(cpgs, mininterval=60.0, desc='betas_horvath_calculator creating'):
            if cpg in config.betas_dict:
                cpg_row_id = config.betas_dict[cpg]
                curr_betas = list(config.betas_data[cpg_row_id])
                # Missed measurements are emitted as the literal string 'NaN'.
                for missed_id in config.betas_missed_dict[cpg]:
                    curr_betas[missed_id] = 'NaN'
                line = [cpg] + curr_betas
            else:
                line = [cpg] + ['NaN'] * (num_cols - 1)
            betas[row_id] = line
            row_id += 1
        pd.DataFrame(betas).to_csv(fn_data_csv, index=False, header=False)
def load(self, config, configs_child):
    """Initialize base data for BOP analysis from the configured source and
    propagate the loaded state to child configs.

    On first init, loads either raw betas or residuals (per
    ``data_params['source']``) and exposes them via ``config.base_*`` /
    ``config.target_dict``; the base list/dict come from ``config.bops``.

    Raises:
        ValueError: if ``source`` is neither 'betas' nor 'residuals'.
    """
    if config.is_init:
        source = config.experiment.data_params.pop('source')
        if source == 'betas':
            load_betas(config)
            config.base_missed_dict = config.betas_missed_dict
            config.base_data = config.betas_data
            config.target_dict = config.betas_dict
        elif source == 'residuals':
            load_residuals(config)
            config.base_missed_dict = config.residuals_missed_dict
            config.base_data = config.residuals_data
            config.target_dict = config.residuals_dict
        else:
            # Bug fix: an unknown source previously fell through silently and
            # later raised UnboundLocalError; fail fast like sibling loaders.
            raise ValueError('Unsupported source for bops.')
        config.base_list = list(config.bops.keys())
        config.base_dict = config.bops
    self.inherit_childs(config, configs_child)
    if config.is_load_child:
        for config_child in configs_child:
            self.load_child(config_child)
def load_genes(config):
    """Load (or build and cache) gene-level methylation data.

    Each gene's per-subject value is the mean of the source values (betas,
    adjusted betas, or residuals) over the CpGs mapped to it in
    ``config.gene_cpg_dict``, skipping missed measurements. Results are
    cached under the cache path as pickles plus ``genes<suffix>.npz`` /
    ``.txt`` and a plain-text gene list.

    Raises:
        ValueError: if data params are empty or the source is unsupported.
    """
    suffix_gene = ''
    if bool(config.experiment.data_params):
        suffix_gene += '_' + str(config.experiment.get_data_params_str())
        source = config.experiment.data_params.pop('source')
    else:
        raise ValueError('Data params for genes are empty')
    fn_list_txt = get_cache_path(config) + '/' + 'genes_list.txt'
    fn_list_pkl = get_cache_path(config) + '/' + 'genes_list.pkl'
    fn_dict_pkl = get_cache_path(config) + '/' + 'genes_dict.pkl'
    fn_missed_dict_pkl = get_cache_path(config) + '/' + 'genes_missed_dict.pkl'
    fn_data_npz = get_cache_path(config) + '/' + 'genes' + suffix_gene + '.npz'
    fn_data_txt = get_cache_path(config) + '/' + 'genes' + suffix_gene + '.txt'
    # Bug fix: the missed-dict pickle is read below, so it must be part of the
    # cache-hit condition (previously a partial cache crashed here).
    if os.path.isfile(fn_dict_pkl) and os.path.isfile(fn_list_pkl) \
            and os.path.isfile(fn_missed_dict_pkl) and os.path.isfile(fn_data_npz):
        with open(fn_list_pkl, 'rb') as f:
            config.genes_list = pickle.load(f)
        with open(fn_dict_pkl, 'rb') as f:
            config.genes_dict = pickle.load(f)
        with open(fn_missed_dict_pkl, 'rb') as f:
            config.genes_missed_dict = pickle.load(f)
        data = np.load(fn_data_npz)
        config.genes_data = data['data']
    else:
        if source == 'betas':
            load_betas(config)
            source_dict = config.betas_dict
            source_data = config.betas_data
            source_missed_dict = config.betas_missed_dict
        elif source == 'betas_adj':
            load_betas_adj(config)
            source_dict = config.betas_adj_dict
            source_data = config.betas_adj_data
            source_missed_dict = config.betas_adj_missed_dict
        elif source == 'residuals':
            load_residuals(config)
            source_dict = config.residuals_dict
            source_data = config.residuals_data
            source_missed_dict = config.residuals_missed_dict
        else:
            raise ValueError('Source for genes is not specified')
        # Bug fix: subject count must come from the selected source matrix;
        # config.betas_data may not exist when source != 'betas'.
        num_subjects = source_data.shape[1]
        # A gene is kept if at least one of its CpGs exists in the source.
        config.genes_list = []
        for gene in tqdm(config.gene_cpg_dict, mininterval=60.0,
                         desc='genes_list creating'):
            cpgs = config.gene_cpg_dict[gene]
            for cpg in cpgs:
                if cpg in source_dict:
                    config.genes_list.append(gene)
                    break
        config.genes_dict = {}
        config.genes_missed_dict = {'any': []}
        config.genes_data = np.zeros((len(config.genes_list), num_subjects),
                                     dtype=np.float32)
        for gene_id, gene in tqdm(enumerate(config.genes_list),
                                  mininterval=60.0,
                                  desc='genes_data creating'):
            config.genes_dict[gene] = gene_id
            cpgs = config.gene_cpg_dict[gene]
            # Per-subject count of CpGs actually contributing to the mean.
            denominators = np.zeros(num_subjects, dtype=np.float32)
            for cpg in cpgs:
                if cpg in source_dict:
                    row_id = source_dict[cpg]
                    source_values_raw = source_data[row_id, :]
                    if len(source_missed_dict[cpg]) > 0:
                        # Zero out missed subjects and only count the rest.
                        source_values = np.zeros(num_subjects, dtype=np.float32)
                        for val_id in range(0, num_subjects):
                            if val_id not in source_missed_dict[cpg]:
                                source_values[val_id] = source_values_raw[val_id]
                                denominators[val_id] += 1.0
                    else:
                        source_values = source_values_raw
                        denominators += np.ones(num_subjects, dtype=np.float32)
                    config.genes_data[gene_id] += source_values
            for val_id in range(0, num_subjects):
                config.genes_data[gene_id][val_id] /= denominators[val_id]
        with open(fn_list_pkl, 'wb') as f:
            pickle.dump(config.genes_list, f, pickle.HIGHEST_PROTOCOL)
        with open(fn_dict_pkl, 'wb') as f:
            pickle.dump(config.genes_dict, f, pickle.HIGHEST_PROTOCOL)
        with open(fn_missed_dict_pkl, 'wb') as f:
            pickle.dump(config.genes_missed_dict, f, pickle.HIGHEST_PROTOCOL)
        np.savez_compressed(fn_data_npz, data=config.genes_data)
        np.savetxt(fn_data_txt, config.genes_data, delimiter='\t', fmt='%.8e')
        with open(fn_list_txt, 'w') as f:
            for item in config.genes_list:
                f.write("%s\n" % item)
def load_betas_adj(config):
    """Load (or build and cache) beta values adjusted via residuals.

    Adjusted betas are the per-CpG residuals shifted by the raw betas' mean
    (or by ``abs(min(residuals))`` when the mean shift would produce negative
    values). Cached as ``betas_adj_dict<suffix>.pkl``,
    ``betas_adj_missed_dict<suffix>.pkl`` and ``betas_adj<suffix>.npz``.

    Raises:
        ValueError: if ``config.experiment.data_params`` is empty.
    """
    suffix = ''
    if bool(config.experiment.data_params):
        suffix += '_' + config.experiment.get_data_params_str()
    else:
        raise ValueError('Exog for residuals is empty.')
    fn_dict = get_data_base_path(
        config) + '/' + 'betas_adj_dict' + suffix + '.pkl'
    fn_missed_dict = get_data_base_path(
        config) + '/' + 'betas_adj_missed_dict' + suffix + '.pkl'
    fn_data = get_data_base_path(config) + '/' + 'betas_adj' + suffix + '.npz'
    # Bug fix: fn_missed_dict is read in the cache-hit branch, so it must be
    # part of the cache condition (previously a partial cache crashed).
    if os.path.isfile(fn_dict) and os.path.isfile(fn_missed_dict) \
            and os.path.isfile(fn_data):
        with open(fn_dict, 'rb') as f:
            config.betas_adj_dict = pickle.load(f)
        with open(fn_missed_dict, 'rb') as f:
            config.betas_adj_missed_dict = pickle.load(f)
        data = np.load(fn_data)
        config.betas_adj_data = data['data']
    else:
        load_residuals(config)
        # Clearing data_params makes load_betas load the raw betas cache.
        config.experiment.data_params = {}
        load_betas(config)
        config.betas_adj_dict = config.residuals_dict
        with open(fn_dict, 'wb') as f:
            pickle.dump(config.betas_adj_dict, f, pickle.HIGHEST_PROTOCOL)
        config.betas_adj_missed_dict = config.residuals_missed_dict
        with open(fn_missed_dict, 'wb') as f:
            # Bug fix: previously dumped config.betas_missed_dict here, so the
            # cached file disagreed with config.betas_adj_missed_dict.
            pickle.dump(config.betas_adj_missed_dict, f,
                        pickle.HIGHEST_PROTOCOL)
        num_cpgs = config.betas_data.shape[0]
        num_subjects = config.betas_data.shape[1]
        config.betas_adj_data = np.zeros((num_cpgs, num_subjects),
                                         dtype=np.float32)
        for cpg in tqdm(config.betas_adj_dict, mininterval=60.0,
                        desc='betas_adj_data creating'):
            residuals = config.residuals_data[config.residuals_dict[cpg], :]
            betas = config.betas_data[config.betas_dict[cpg], :]
            min_residuals = np.min(residuals)
            mean_betas = np.mean(betas)
            # Shift residuals to the betas' mean, unless that would drop any
            # value below zero — then shift just enough to stay non-negative.
            shift = mean_betas
            if min_residuals + shift < 0:
                shift = abs(min_residuals)
            betas_adj = residuals + shift
            config.betas_adj_data[config.residuals_dict[cpg]] = betas_adj
        np.savez_compressed(fn_data, data=config.betas_adj_data)
        # Clear data
        del config.residuals_data
        del config.betas_data
def load_betas_adj(config):
    """Load (or build and cache) betas adjusted by per-CpG OLS regression.

    For each CpG, the betas are regressed on the configured exogenous
    variables (cell-type fractions and/or observables); the adjusted value is
    the residual plus the CpG's mean beta. Cached as
    ``betas_adj<suffix>.npz`` alongside ``betas_adj_dict.pkl``.

    Raises:
        ValueError: if data params are empty, or the requested cells /
            observables do not all exist in the loaded dicts.
    """
    fn_dict = get_data_base_path(config) + '/' + 'betas_adj_dict.pkl'
    suffix = ''
    if bool(config.experiment.data_params):
        data_params = config.experiment.data_params
        suffix += '_' + config.experiment.get_data_params_str()
    else:
        raise ValueError(f'Exog for residuals is empty.')
    fn_data = get_data_base_path(config) + '/' + 'betas_adj' + suffix + '.npz'
    if os.path.isfile(fn_dict) and os.path.isfile(fn_data):
        # Cache hit: restore dict and adjusted-betas matrix.
        f = open(fn_dict, 'rb')
        config.betas_adj_dict = pickle.load(f)
        f.close()
        data = np.load(fn_data)
        config.betas_adj_data = data['data']
    else:
        # data_params was captured above; clearing it here makes load_betas
        # load the raw (un-suffixed) betas cache.
        config.experiment.data_params = {}
        load_betas(config)
        config.betas_adj_dict = config.betas_dict
        f = open(fn_dict, 'wb')
        pickle.dump(config.betas_adj_dict, f, pickle.HIGHEST_PROTOCOL)
        f.close()
        # Assemble the exogenous regressors requested in data_params.
        exog_dict = {}
        if 'cells' in data_params:
            cells_dict = load_cells_dict(config)
            if isinstance(data_params['cells'], list):
                # Keep only the requested cell types; a count mismatch means
                # at least one requested type was not found.
                all_types = list(cells_dict.keys())
                for key in all_types:
                    if key not in data_params['cells']:
                        cells_dict.pop(key)
                if len(list(cells_dict.keys())) != len(data_params['cells']):
                    raise ValueError(f'Wrong number of cells types.')
            exog_dict.update(cells_dict)
        if 'observables' in data_params:
            observables_dict = load_observables_dict(config)
            if isinstance(data_params['observables'], list):
                all_types = list(observables_dict.keys())
                for key in all_types:
                    if key not in data_params['observables']:
                        observables_dict.pop(key)
                if len(list(observables_dict.keys())) != len(
                        data_params['observables']):
                    raise ValueError(f'Wrong number of observables types.')
            exog_dict.update(observables_dict)
        exog_df = pd.DataFrame(exog_dict)
        num_cpgs = config.betas_data.shape[0]
        num_subjects = config.betas_data.shape[1]
        config.betas_adj_data = np.zeros((num_cpgs, num_subjects), dtype=np.float32)
        for cpg, row in tqdm(config.betas_dict.items(), mininterval=60.0, desc='betas_adj_data creating'):
            betas = config.betas_data[row, :]
            mean = np.mean(betas)
            endog_dict = {cpg: betas}
            endog_df = pd.DataFrame(endog_dict)
            # Per-CpG OLS fit; adjusted betas = residuals re-centered at the
            # CpG's mean beta.
            reg_res = sm.OLS(endog=endog_df, exog=exog_df).fit()
            residuals = list(map(np.float32, reg_res.resid))
            betas_adj = residuals + mean
            config.betas_adj_data[row] = betas_adj
        np.savez_compressed(fn_data, data=config.betas_adj_data)
        # Clear data
        del config.betas_data
def load_resid_old(config):
    """Load (or build and cache) 'old-style' per-CpG regression residuals.

    For each CpG, the betas (with missed subjects excluded) are regressed on
    the configured exogenous variables (cell fractions and/or categorical
    observables); residuals are scattered back to full subject positions,
    with NaN at missed positions. Cached as
    ``resid_old_dict<suffix>.pkl`` / ``resid_old_missed_dict<suffix>.pkl`` /
    ``resid_old<suffix>.npz``.

    Raises:
        ValueError: if data params are empty, or the requested cells /
            observables do not all exist.
    """
    suffix = ''
    if bool(config.experiment.data_params):
        data_params = config.experiment.data_params
        suffix += '_' + config.experiment.get_data_params_str()
    else:
        raise ValueError('Exog for resid_old is empty.')
    fn_dict = get_data_base_path(
        config) + '/' + 'resid_old_dict' + suffix + '.pkl'
    fn_missed_dict = get_data_base_path(
        config) + '/' + 'resid_old_missed_dict' + suffix + '.pkl'
    fn_data = get_data_base_path(config) + '/' + 'resid_old' + suffix + '.npz'
    if os.path.isfile(fn_dict) and os.path.isfile(fn_data):
        # Cache hit: restore dicts and the residuals matrix.
        f = open(fn_dict, 'rb')
        config.resid_old_dict = pickle.load(f)
        f.close()
        f = open(fn_missed_dict, 'rb')
        config.resid_old_missed_dict = pickle.load(f)
        f.close()
        data = np.load(fn_data)
        config.resid_old_data = data['data']
    else:
        # Reload betas keeping only the params shared with the betas cache
        # (so the betas suffix matches the existing cache files).
        data_params_copy = copy.deepcopy(config.experiment.data_params)
        common_keys = ['norm']
        config.experiment.data_params = {}
        for key in common_keys:
            if key in data_params_copy:
                config.experiment.data_params[key] = data_params_copy[key]
        load_betas(config)
        config.resid_old_dict = config.betas_dict
        f = open(fn_dict, 'wb')
        pickle.dump(config.resid_old_dict, f, pickle.HIGHEST_PROTOCOL)
        f.close()
        config.resid_old_missed_dict = config.betas_missed_dict
        f = open(fn_missed_dict, 'wb')
        pickle.dump(config.resid_old_missed_dict, f, pickle.HIGHEST_PROTOCOL)
        f.close()
        # Assemble the exogenous regressors requested in data_params.
        exog_dict = {}
        if 'cells' in data_params:
            cells_dict = load_cells_dict(config)
            if isinstance(data_params['cells'], list):
                # Keep only the requested cell types; a count mismatch means
                # at least one requested type was not found.
                all_types = list(cells_dict.keys())
                for key in all_types:
                    if key not in data_params['cells']:
                        cells_dict.pop(key)
                if len(list(cells_dict.keys())) != len(data_params['cells']):
                    raise ValueError('Wrong number of cells types.')
            exog_dict.update(cells_dict)
        if 'observables' in data_params:
            observables_categorical_dict = load_observables_categorical_dict(
                config)
            if isinstance(data_params['observables'], list):
                all_types = list(observables_categorical_dict.keys())
                for key in all_types:
                    if key not in data_params['observables']:
                        observables_categorical_dict.pop(key)
                if len(list(observables_categorical_dict.keys())) != len(
                        data_params['observables']):
                    raise ValueError('Wrong number of observables types.')
            exog_dict.update(observables_categorical_dict)
        num_cpgs = config.betas_data.shape[0]
        num_subjects = config.betas_data.shape[1]
        config.resid_old_data = np.zeros((num_cpgs, num_subjects),
                                         dtype=np.float32)
        for cpg, row in tqdm(config.betas_dict.items(), mininterval=60.0,
                             desc='resid_old_data creating'):
            raw_betas = config.betas_data[row, :]
            # Drop missed subjects from both betas and the exog columns so the
            # regression sees aligned, complete rows only.
            current_exog_dict = copy.deepcopy(exog_dict)
            if len(config.betas_missed_dict[cpg]) > 0:
                for key in current_exog_dict:
                    values = []
                    for value_id in range(0, len(current_exog_dict[key])):
                        if value_id not in config.betas_missed_dict[cpg]:
                            values.append(current_exog_dict[key][value_id])
                    current_exog_dict[key] = values
                betas = []
                passed_ids = []
                for beta_id in range(0, len(raw_betas)):
                    if beta_id not in config.betas_missed_dict[cpg]:
                        betas.append(raw_betas[beta_id])
                        passed_ids.append(beta_id)
            else:
                betas = raw_betas
                passed_ids = list(range(0, len(betas)))
            endog_dict = {cpg: betas}
            endog_df = pd.DataFrame(endog_dict)
            exog_df = pd.DataFrame(current_exog_dict)
            reg_res = sm.OLS(endog=endog_df, exog=exog_df).fit()
            resid_old = list(map(np.float32, reg_res.resid))
            # Scatter residuals back to full subject positions; missed -> NaN.
            resid_old_raw = np.zeros(num_subjects, dtype=np.float32)
            for beta_id in range(0, len(passed_ids)):
                resid_old_raw[passed_ids[beta_id]] = resid_old[beta_id]
            for missed_id in config.resid_old_missed_dict[cpg]:
                resid_old_raw[missed_id] = np.float32('nan')
            config.resid_old_data[row] = resid_old_raw
        np.savez_compressed(fn_data, data=config.resid_old_data)
        # Clear data
        del config.betas_data
def test_load_cpg_check_shape_cpg_data(self):
    """Betas matrix must be 300 CpGs x 729 subjects after loading."""
    load_betas(self.config)
    actual_shape = self.config.betas_data.shape
    self.assertEqual((300, 729), actual_shape)
def test_load_cpg_check_len_cpg_dict(self):
    """Betas dict must contain exactly 300 CpG entries after loading."""
    load_betas(self.config)
    num_cpgs = len(list(self.config.betas_dict))
    self.assertEqual(300, num_cpgs)
def load_epimutations(config):
    """Load (or build and cache) the per-CpG epimutation indicator matrix,
    skipping missed measurements.

    A subject is flagged (1) for a CpG when its beta value lies outside
    ``[Q1 - 3*IQR, Q3 + 3*IQR]``, where the quartiles are computed over the
    non-missed subjects only. Exposed as a single pseudo-feature via
    ``config.epimutations_*``; cached as ``epimutations<suffix>.npz``/``.txt``.
    """
    suffix = ''
    if bool(config.experiment.data_params):
        suffix += '_' + str(config.experiment.get_data_params_str())
    fn_data = get_data_base_path(config) + '/' + 'epimutations' + suffix
    config.epimutations_list = ['epimutations']
    config.epimutations_dict = {'epimutations': 0}
    config.epimutations_missed_dict = {'epimutations': []}
    # Load betas keeping only the params shared with the betas cache.
    data_params_copy = copy.deepcopy(config.experiment.data_params)
    common_keys = ['part', 'norm']
    config.experiment.data_params = {}
    for key in common_keys:
        if key in data_params_copy:
            config.experiment.data_params[key] = data_params_copy[key]
    load_betas(config)
    if os.path.isfile(fn_data + '.npz'):
        data = np.load(fn_data + '.npz')
        config.epimutations_data = data['data']
    else:
        num_cpgs = config.betas_data.shape[0]
        num_subjects = config.betas_data.shape[1]
        # Bug fix: np.int was removed in NumPy 1.24; use the builtin int.
        config.epimutations_data = np.zeros((num_cpgs, num_subjects),
                                            dtype=int)
        for cpg, row in tqdm(config.betas_dict.items(), mininterval=60.0,
                             desc='epimutations_data creating'):
            betas_raw = config.betas_data[row, :]
            # Quartiles are computed over non-missed values only.
            if len(config.betas_missed_dict[cpg]) > 0:
                betas = [betas_raw[beta_id]
                         for beta_id in range(0, len(betas_raw))
                         if beta_id not in config.betas_missed_dict[cpg]]
            else:
                betas = betas_raw
            quartiles = np.percentile(betas, [25, 75])
            iqr = quartiles[1] - quartiles[0]
            left = quartiles[0] - (3.0 * iqr)
            right = quartiles[1] + (3.0 * iqr)
            curr_row = np.zeros(num_subjects, dtype=int)
            for subject_id in range(0, num_subjects):
                # Bug fix: index the full per-subject vector, not the filtered
                # list — after removing missed values the filtered list's
                # indices no longer correspond to subjects and can run past
                # its end. Missed subjects are simply left unflagged.
                if subject_id in config.betas_missed_dict[cpg]:
                    continue
                curr_point = betas_raw[subject_id]
                if not math.isnan(curr_point):
                    if curr_point < left or curr_point > right:
                        curr_row[subject_id] = 1
            config.epimutations_data[row] = curr_row
        np.savez_compressed(fn_data + '.npz', data=config.epimutations_data)
        np.savetxt(fn_data + '.txt', config.epimutations_data,
                   delimiter='\t', fmt='%d')
    # Clear data
    del config.betas_data