def load_epimutations(config):
    """Load (or build and cache) the epimutations matrix for *config*.

    A subject's beta for a CpG counts as an epimutation when it falls
    outside [Q1 - 3*IQR, Q3 + 3*IQR] for that CpG. The CpG->row dict is
    cached as .pkl/.csv and the 0/1 matrix as .npz/.txt; cached files
    are reloaded on subsequent calls.
    """
    fn_dict = get_data_base_path(config) + '/' + 'epimutations_dict'
    suffix = ''
    if bool(config.experiment.data_params):
        suffix += '_' + str(config.experiment.get_data_params_str())
    fn_data = get_data_base_path(config) + '/' + 'epimutations' + suffix
    if os.path.isfile(fn_dict + '.pkl') and os.path.isfile(fn_data + '.npz'):
        with open(fn_dict + '.pkl', 'rb') as f:
            config.epimutations_dict = pickle.load(f)
        data = np.load(fn_data + '.npz')
        config.epimutations_data = data['data']
    else:
        config.experiment.data_params = {}
        load_betas(config)
        config.epimutations_dict = config.betas_dict
        with open(fn_dict + '.pkl', 'wb') as f:
            pickle.dump(config.epimutations_dict, f, pickle.HIGHEST_PROTOCOL)
        save_table_dict_csv(
            fn_dict,
            {
                'item': list(config.epimutations_dict.keys()),
                'row': list(config.epimutations_dict.values())
            }
        )
        num_cpgs = config.betas_data.shape[0]
        num_subjects = config.betas_data.shape[1]
        # BUG FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24;
        # the builtin int gives the same platform-default integer dtype.
        config.epimutations_data = np.zeros((num_cpgs, num_subjects), dtype=int)
        for cpg, row in tqdm(config.betas_dict.items(), mininterval=60.0):
            betas = config.betas_data[row, :]
            # Tukey-style outlier fences with a 3*IQR multiplier.
            quartiles = np.percentile(betas, [25, 75])
            iqr = quartiles[1] - quartiles[0]
            left = quartiles[0] - (3.0 * iqr)
            right = quartiles[1] + (3.0 * iqr)
            curr_row = np.zeros(num_subjects, dtype=int)
            for subject_id in range(0, num_subjects):
                curr_point = betas[subject_id]
                if curr_point < left or curr_point > right:
                    curr_row[subject_id] = 1
            config.epimutations_data[row] = curr_row
        np.savez_compressed(fn_data + '.npz', data=config.epimutations_data)
        np.savetxt(fn_data + '.txt', config.epimutations_data, delimiter='\t', fmt='%d')
        # Clear data
        del config.betas_data
def test_load_residuals_check_files_creation(self):
    """load_residuals_common must create both the dict pkl and the data npz."""
    suffix = '_' + self.config.experiment.get_data_params_str()
    base = get_data_base_path(self.config)
    fn_dict = base + '/' + 'residuals_common_dict.pkl'
    fn_data = base + '/' + 'residuals_common' + suffix + '.npz'
    load_residuals_common(self.config)
    both_exist = os.path.isfile(fn_dict) and os.path.isfile(fn_data)
    self.assertEqual(True, both_exist)
def load_betas(config):
    """Load cached betas, or parse them from the raw betas txt file.

    Fills config.betas_dict (CpG -> row index) and config.betas_data
    (float32 matrix of shape (num_cpgs, num_subjects)); both are cached
    on disk as .pkl/.npz for later calls.
    """
    fn_dict = get_data_base_path(config) + '/' + 'betas_dict.pkl'
    suffix = ''
    if bool(config.experiment.data_params):
        suffix += '_' + str(config.experiment.get_data_params_str())
    fn_data = get_data_base_path(config) + '/' + 'betas' + suffix
    fn_txt = fn_data + '.txt'
    fn_npz = fn_data + '.npz'
    if os.path.isfile(fn_dict) and os.path.isfile(fn_npz):
        # Cached artifacts exist: restore dict and matrix directly.
        with open(fn_dict, 'rb') as f:
            config.betas_dict = pickle.load(f)
        config.betas_data = np.load(fn_npz)['data']
    else:
        # First pass over the txt: map each CpG name to its row index.
        config.betas_dict = {}
        f = open(fn_txt)
        f.readline()
        cpg_id = 0
        for line in tqdm(f, mininterval=60.0, desc='betas_dict creating'):
            cpg = get_line_list(line)[0]
            config.betas_dict[cpg] = cpg_id
            cpg_id += 1
        f.close()
        with open(fn_dict, 'wb') as f:
            pickle.dump(config.betas_dict, f, pickle.HIGHEST_PROTOCOL)
        num_cpgs = cpg_id
        # Second pass: fill the (num_cpgs x num_subjects) float32 matrix.
        f = open(fn_txt)
        headers = [x.rstrip() for x in f.readline().split('\t')]
        subjects = headers[1:len(headers)]
        config.betas_data = np.zeros((num_cpgs, len(subjects)), dtype=np.float32)
        cpg_id = 0
        for line in tqdm(f, mininterval=60.0, desc='betas_data creating'):
            line_list = get_line_list(line)
            config.betas_data[cpg_id] = list(map(np.float32, line_list[1::]))
            cpg_id += 1
        f.close()
        np.savez_compressed(fn_npz, data=config.betas_data)
def test_load_cpg_check_files_creation(self):
    """load_betas must create the dict pkl and the compressed data file."""
    base = get_data_base_path(self.config)
    fn_dict = base + '/' + 'betas_dict.pkl'
    fn_npz = base + '/' + 'betas' + '.npz'
    load_betas(self.config)
    created = os.path.isfile(fn_dict) and os.path.isfile(fn_npz)
    self.assertEqual(True, created)
def load_betas_spec(config):
    """Write a betas CSV restricted to the CpGs listed in data_params['file']
    and the subjects selected by config.attributes_indexes.

    CpGs absent from the betas table, and missed per-subject measurements,
    are written as the string 'NaN'. The CSV is only built if it does not
    already exist.

    Raises:
        ValueError: if data_params has no 'file' entry or the CpG list
            file does not exist.
    """
    file = config.experiment.data_params.pop('file', None)
    if file is None:
        # BUG FIX: without this guard, os.path.splitext(None) fails with a
        # cryptic TypeError when the caller forgot the 'file' data param.
        raise ValueError("No 'file' key in experiment data_params.")
    filename, file_extension = os.path.splitext(file)
    fn_list = get_data_base_path(config) + '/' + file
    fn_data_csv = get_data_base_path(config) + '/' + filename + '.csv'
    if os.path.isfile(fn_list):
        with open(fn_list) as f:
            cpgs = [x.rstrip() for x in f.readlines()]
    else:
        raise ValueError('File with cpgs is not exists.')
    if not os.path.isfile(fn_data_csv):
        indexes = config.attributes_indexes
        fn_betas = get_data_base_path(config) + '/' + 'betas.txt'
        with open(fn_betas) as f:
            header_line = f.readline()
        headers = get_line_list(header_line)
        subject_headers = list(np.array(headers[1::])[indexes])
        load_betas(config)
        num_cols = len(subject_headers) + 1  # header col
        num_rows = len(cpgs) + 1  # header row
        betas = np.zeros((num_rows, num_cols), dtype=object)
        row_id = 0
        betas[row_id] = ['ProbeID'] + subject_headers
        row_id += 1
        for cpg in tqdm(cpgs, mininterval=60.0, desc='betas_spec creating'):
            if cpg in config.betas_dict:
                cpg_row_id = config.betas_dict[cpg]
                curr_betas = list(
                    np.array(config.betas_data[cpg_row_id])[indexes])
                # NOTE(review): missed_id indexes the full subject range but
                # curr_betas was filtered by `indexes` — confirm `indexes`
                # always covers all subjects here.
                for missed_id in config.betas_missed_dict[cpg]:
                    curr_betas[missed_id] = 'NaN'
                line = [cpg] + curr_betas
                betas[row_id] = line
            else:
                line = [cpg] + ['NaN'] * (num_cols - 1)
                betas[row_id] = line
            row_id += 1
        pd.DataFrame(betas).to_csv(fn_data_csv, index=False, header=False)
def load_cpg(config):
    """Load cached CpG data, or parse it from the raw txt named by config.data.name.

    Fills config.cpg_dict (CpG -> row index) and config.cpg_data
    (float32 matrix); both are cached as .pkl/.npz.
    """
    base = get_data_base_path(config)
    fn_dict = base + '/' + 'cpg_dict.pkl'
    fn_data = base + '/' + config.data.name
    fn_txt = fn_data + '.txt'
    fn_npz = fn_data + '.npz'
    if os.path.isfile(fn_dict) and os.path.isfile(fn_npz):
        with open(fn_dict, 'rb') as f:
            config.cpg_dict = pickle.load(f)
        config.cpg_data = np.load(fn_npz)['cpg_data']
    else:
        # First pass: CpG name -> row index.
        config.cpg_dict = {}
        with open(fn_txt) as f:
            f.readline()
            cpg_id = 0
            for line in f:
                config.cpg_dict[get_line_list(line)[0]] = cpg_id
                cpg_id += 1
        with open(fn_dict, 'wb') as f:
            pickle.dump(config.cpg_dict, f, pickle.HIGHEST_PROTOCOL)
        num_cpgs = cpg_id
        # Second pass: fill the data matrix row by row.
        with open(fn_txt) as f:
            headers = [x.rstrip() for x in f.readline().split('\t')]
            subjects = headers[1:len(headers)]
            config.cpg_data = np.zeros((num_cpgs, len(subjects)), dtype=np.float32)
            cpg_id = 0
            for line in f:
                line_list = get_line_list(line)
                config.cpg_data[cpg_id] = list(map(np.float32, line_list[1::]))
                cpg_id += 1
        np.savez_compressed(fn_npz, cpg_data=config.cpg_data)
def load_betas_horvath_calculator(config):
    """Produce the betas CSV expected by Horvath's online age calculator.

    The CSV holds one row per CpG from cpgs_horvath_calculator.txt, with
    'NaN' for CpGs absent from the betas table and for missed values.
    It is only built if it does not already exist.
    """
    base = get_data_base_path(config)
    fn_list = base + '/' + 'cpgs_horvath_calculator.txt'
    fn_data_csv = base + '/' + 'betas_horvath_calculator.csv'
    suffix = ''
    if bool(config.experiment.data_params):
        suffix += '_' + config.experiment.get_data_params_str()
    if not os.path.isfile(fn_list):
        raise ValueError('No specified file with cpgs for Horvath\'s calculator.')
    with open(fn_list) as f:
        cpgs = [x.rstrip() for x in f.readlines()]
    if not os.path.isfile(fn_data_csv):
        fn_betas = base + '/' + 'betas' + suffix + '.txt'
        with open(fn_betas) as f:
            headers = get_line_list(f.readline())
        load_betas(config)
        num_cols = len(headers)
        num_rows = len(cpgs) + 1  # header row
        betas = np.zeros((num_rows, num_cols), dtype=object)
        betas[0] = ['ProbeID'] + headers[1::]
        row_id = 1
        for cpg in tqdm(cpgs, mininterval=60.0, desc='betas_horvath_calculator creating'):
            if cpg in config.betas_dict:
                curr_betas = list(config.betas_data[config.betas_dict[cpg]])
                for missed_id in config.betas_missed_dict[cpg]:
                    curr_betas[missed_id] = 'NaN'
                betas[row_id] = [cpg] + curr_betas
            else:
                betas[row_id] = [cpg] + ['NaN'] * (num_cols - 1)
            row_id += 1
        pd.DataFrame(betas).to_csv(fn_data_csv, index=False, header=False)
def load_excluded(config):
    """Return the list of excluded CpGs, caching the parsed txt as a pkl.

    An empty list is returned when config.annotations.exclude is the
    'none' sentinel.
    """
    exclude = []
    if config.annotations.exclude != CommonTypes.none.value:
        fn = get_data_base_path(config) + '/' + config.annotations.exclude
        fn_txt = fn + '.txt'
        fn_pkl = fn + '.pkl'
        if os.path.isfile(fn_pkl):
            with open(fn_pkl, 'rb') as f:
                exclude = pickle.load(f)
        else:
            with open(fn_txt) as f:
                exclude = [x.rstrip() for x in f.readlines()]
            with open(fn_pkl, 'wb') as f:
                pickle.dump(exclude, f, pickle.HIGHEST_PROTOCOL)
    return exclude
def clear_cache(config):
    """Delete every cached .npz/.pkl artifact under the data base path."""
    base = get_data_base_path(config)
    cached_exts = ('.npz', '.pkl')
    for dir_path, _, file_names in os.walk(base):
        for name in file_names:
            if name.lower().endswith(cached_exts):
                os.remove(os.path.join(dir_path, name))
def test_load_annotations_check_pkl_file_creation(self):
    """load_annotations_dict must cache the annotations as a pkl file."""
    load_annotations_dict(self.config)
    fn_pkl = (get_data_base_path(self.config) + '/'
              + self.config.annotations.name + '.pkl')
    self.assertEqual(True, os.path.isfile(fn_pkl))
def test_load_excluded_check_pkl_creation(self):
    """load_excluded must cache the exclusion list as a pkl file."""
    self.config.annotations.exclude = 'excluded'
    fn_pkl = (get_data_base_path(self.config) + '/'
              + self.config.annotations.exclude + '.pkl')
    self.config.excluded = load_excluded(self.config)
    self.assertEqual(True, os.path.isfile(fn_pkl))
def test_load_observables_dict_check_pkl_file_creation(self):
    """load_observables_dict must cache the observables as a pkl file."""
    load_observables_dict(self.config)
    fn_pkl = (get_data_base_path(self.config) + '/'
              + self.config.attributes.observables.name + '.pkl')
    self.assertEqual(True, os.path.isfile(fn_pkl))
def load_observables_dict(config):
    """Load subject observables as {column: [values]}, cached as a pkl.

    Sources, in priority order: cached .pkl, then .xlsx, then .txt.
    In the txt path, numeric strings become int (when integral) or float;
    everything else stays a string.

    Raises:
        ValueError: when no observables file exists in any format.
    """
    fn = get_data_base_path(config) + '/' + config.attributes.observables.name
    fn_txt = fn + '.txt'
    fn_xlsx = fn + '.xlsx'
    fn_pkl = fn + '.pkl'
    if os.path.isfile(fn_pkl):
        with open(fn_pkl, 'rb') as f:
            observables_dict = pickle.load(f)
    else:
        if os.path.isfile(fn_xlsx):
            df = pd.read_excel(fn_xlsx)
            observables_dict = {key: list(col.values())
                                for key, col in df.to_dict().items()}
        elif os.path.isfile(fn_txt):
            with open(fn_txt) as f:
                keys = [x.rstrip() for x in f.readline().split('\t')]
                observables_dict = {key: [] for key in keys}
                for line in f:
                    values = line.split('\t')
                    for key_id, key in enumerate(keys):
                        value = values[key_id].rstrip()
                        if is_float(value):
                            number = float(value)
                            if number.is_integer():
                                observables_dict[key].append(int(number))
                            else:
                                observables_dict[key].append(float(number))
                        else:
                            observables_dict[key].append(value)
        else:
            raise ValueError('No observables file')
        with open(fn_pkl, 'wb') as f:
            pickle.dump(observables_dict, f, pickle.HIGHEST_PROTOCOL)
    return observables_dict
def load_cells_dict(config):
    """Load cell-composition columns as {cell_type: [values]}, cached as a pkl.

    Only columns listed in config.attributes.cells.types are kept;
    the 'any' sentinel keeps all columns. The first txt column is the
    sample name and is always skipped.
    """
    fn = get_data_base_path(config) + '/' + config.attributes.cells.name
    fn_txt = fn + '.txt'
    fn_pkl = fn + '.pkl'
    if os.path.isfile(fn_pkl):
        with open(fn_pkl, 'rb') as f:
            cells_dict = pickle.load(f)
    else:
        with open(fn_txt) as f:
            key_line = f.readline()
            # First column is always sample name
            keys = [x.rstrip() for x in key_line.split('\t')][1::]
            if isinstance(config.attributes.cells.types, list):
                possible_keys = config.attributes.cells.types
            elif config.attributes.cells.types == CommonTypes.any.value:
                possible_keys = keys
            else:
                possible_keys = []
            cells_dict = {key: [] for key in keys if key in possible_keys}
            for line in f:
                values = line.split('\t')[1::]
                for key_id, key in enumerate(keys):
                    if key in possible_keys:
                        value = values[key_id].rstrip()
                        if is_float(value):
                            cells_dict[key].append(float(value))
                        else:
                            cells_dict[key].append(value)
        with open(fn_pkl, 'wb') as f:
            pickle.dump(cells_dict, f, pickle.HIGHEST_PROTOCOL)
    return cells_dict
def load_observables_categorical_dict(config):
    """Build {observable: np.ndarray} with non-numeric columns categorized.

    Columns whose entries are all numeric (after NA-string normalization)
    are passed through as arrays; columns with any non-numeric entry are
    run through categorize_data. The result is cached as a pkl.
    """
    fn = get_data_base_path(config) + '/' + config.attributes.observables.name + '_categorical'
    fn_pkl = fn + '.pkl'
    if os.path.isfile(fn_pkl):
        with open(fn_pkl, 'rb') as f:
            observables_categorical_dict = pickle.load(f)
    else:
        observables_categorical_dict = {}
        if config.observables_dict is not None:
            observables_dict = config.observables_dict
        else:
            observables_dict = load_observables_dict(config)
        # Strings that should be treated as missing (pandas-style NA markers).
        na_values = ['', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN',
                     '-NaN', '-nan', '1.#IND', '1.#QNAN', '<NA>', 'N/A', 'NA',
                     'NULL', 'NaN', 'n/a', 'nan', 'null', '-', '--']
        for key in observables_dict:
            all_numeric = True
            for i in range(0, len(observables_dict[key])):
                value = observables_dict[key][i]
                if value in na_values:
                    value = np.nan
                if is_float(value):
                    value = float(value)
                    if value.is_integer():
                        observables_dict[key][i] = value
                    else:
                        observables_dict[key][i] = float(value)
                else:
                    observables_dict[key][i] = value
                    all_numeric = False
            # BUG FIX: read from the local observables_dict, not
            # config.observables_dict — the latter may still be None when
            # the data was loaded via load_observables_dict above.
            if all_numeric:
                observables_categorical_dict[key] = np.asarray(observables_dict[key])
            else:
                observables_categorical_dict[key] = categorize_data(np.asarray(observables_dict[key]))
        with open(fn_pkl, 'wb') as f:
            pickle.dump(observables_categorical_dict, f, pickle.HIGHEST_PROTOCOL)
    return observables_categorical_dict
def load_attributes_dict(config):
    """Load the target column plus typed observables as {name: [values]}.

    Only the target attribute and columns named in
    config.attributes.observables.types are kept. Numeric strings become
    int (when integral) or float. Cached as a pkl.
    """
    fn = get_data_base_path(config) + '/' + \
        config.attributes.observables.name
    fn_txt = fn + '.txt'
    fn_pkl = fn + '.pkl'
    if os.path.isfile(fn_pkl):
        with open(fn_pkl, 'rb') as f:
            attributes_dict = pickle.load(f)
    else:
        with open(fn_txt) as f:
            keys = [x.rstrip() for x in f.readline().split('\t')]
            possible_keys = ([config.attributes.target]
                             + list(config.attributes.observables.types.keys()))
            attributes_dict = {key: [] for key in keys if key in possible_keys}
            for line in f:
                values = line.split('\t')
                for key_id, key in enumerate(keys):
                    if key in possible_keys:
                        value = values[key_id].rstrip()
                        if is_float(value):
                            number = float(value)
                            if number.is_integer():
                                attributes_dict[key].append(int(number))
                            else:
                                attributes_dict[key].append(float(number))
                        else:
                            attributes_dict[key].append(value)
        with open(fn_pkl, 'wb') as f:
            pickle.dump(attributes_dict, f, pickle.HIGHEST_PROTOCOL)
    return attributes_dict
def load_cells_dict(config):
    """Load all cell-composition columns as {name: [values]}, cached as a pkl.

    Returns None when neither the cached pkl nor the source txt exists.
    """
    fn = get_data_base_path(config) + '/' + config.attributes.cells.name
    fn_txt = fn + '.txt'
    fn_pkl = fn + '.pkl'
    if os.path.isfile(fn_pkl):
        with open(fn_pkl, 'rb') as f:
            cells_dict = pickle.load(f)
    elif not os.path.isfile(fn_txt):
        return None
    else:
        with open(fn_txt) as f:
            keys = [x.rstrip() for x in f.readline().split('\t')]
            cells_dict = {key: [] for key in keys}
            for line in f:
                values = line.split('\t')
                for key_id, key in enumerate(keys):
                    value = values[key_id].rstrip()
                    if is_float(value):
                        cells_dict[key].append(float(value))
                    else:
                        cells_dict[key].append(value)
        with open(fn_pkl, 'wb') as f:
            pickle.dump(cells_dict, f, pickle.HIGHEST_PROTOCOL)
    return cells_dict
def load_annotations_dict(config):
    """Load annotations as {column: [list-of-values per row]}, cached as a pkl.

    Each txt cell may hold several ';'-separated values; duplicates within
    a cell are dropped (order not preserved). Empty cells become empty lists.
    """
    fn = get_data_base_path(config) + '/' + config.annotations.name
    fn_txt = fn + '.txt'
    fn_pkl = fn + '.pkl'
    if os.path.isfile(fn_pkl):
        with open(fn_pkl, 'rb') as f:
            annotations_dict = pickle.load(f)
    else:
        with open(fn_txt) as f:
            keys = [x.rstrip() for x in f.readline().split('\t')]
            annotations_dict = {key: [] for key in keys}
            for line in f:
                values = line.split('\t')
                for key_id, key in enumerate(keys):
                    cell = values[key_id].rstrip()
                    if cell == '':
                        annotations_dict[key].append([])
                    else:
                        annotations_dict[key].append(list(set(cell.split(';'))))
        with open(fn_pkl, 'wb') as f:
            pickle.dump(annotations_dict, f, pickle.HIGHEST_PROTOCOL)
    return annotations_dict
def load_annotations_dict(config):
    """Load the annotation columns recognized by AnnotationKey, cached as a pkl.

    Columns whose header is not an AnnotationKey value are skipped; kept
    columns map to flat lists of stripped string values.
    """
    fn = get_data_base_path(config) + '/' + config.annotations.name
    fn_txt = fn + '.txt'
    fn_pkl = fn + '.pkl'
    if os.path.isfile(fn_pkl):
        with open(fn_pkl, 'rb') as f:
            annotations_dict = pickle.load(f)
    else:
        possible_keys = [x.value for x in AnnotationKey]
        with open(fn_txt) as f:
            keys = [x.rstrip() for x in f.readline().split('\t')]
            annotations_dict = {key: [] for key in keys if key in possible_keys}
            for line in f:
                values = line.split('\t')
                for key_id, key in enumerate(keys):
                    if key in possible_keys:
                        annotations_dict[key].append(values[key_id].rstrip())
        with open(fn_pkl, 'wb') as f:
            pickle.dump(annotations_dict, f, pickle.HIGHEST_PROTOCOL)
    return annotations_dict
def load_epimutations(config):
    """Load or build the epimutations matrix (missed-value-aware variant).

    A subject's beta for a CpG counts as an epimutation when it falls
    outside [Q1 - 3*IQR, Q3 + 3*IQR], with the quartiles computed over
    the non-missed betas of that CpG. The 0/1 matrix is cached as
    .npz/.txt and reloaded on subsequent calls.
    """
    suffix = ''
    if bool(config.experiment.data_params):
        suffix += '_' + str(config.experiment.get_data_params_str())
    fn_data = get_data_base_path(config) + '/' + 'epimutations' + suffix
    config.epimutations_list = ['epimutations']
    config.epimutations_dict = {'epimutations': 0}
    config.epimutations_missed_dict = {'epimutations': []}
    # Load betas using only the data params shared with betas ('part', 'norm').
    data_params_copy = copy.deepcopy(config.experiment.data_params)
    common_keys = ['part', 'norm']
    config.experiment.data_params = {}
    for key in common_keys:
        if key in data_params_copy:
            config.experiment.data_params[key] = data_params_copy[key]
    load_betas(config)
    if os.path.isfile(fn_data + '.npz'):
        data = np.load(fn_data + '.npz')
        config.epimutations_data = data['data']
    else:
        num_cpgs = config.betas_data.shape[0]
        num_subjects = config.betas_data.shape[1]
        # BUG FIX: np.int was removed in NumPy 1.24; use the builtin int.
        config.epimutations_data = np.zeros((num_cpgs, num_subjects), dtype=int)
        for cpg, row in tqdm(config.betas_dict.items(), mininterval=60.0,
                             desc='epimutations_data creating'):
            betas_raw = config.betas_data[row, :]
            # Quartiles must ignore missed (NaN) values, so filter them out
            # for the percentile computation only.
            if len(config.betas_missed_dict[cpg]) > 0:
                betas = []
                for beta_id in range(0, len(betas_raw)):
                    if beta_id not in config.betas_missed_dict[cpg]:
                        betas.append(betas_raw[beta_id])
            else:
                betas = betas_raw
            quartiles = np.percentile(betas, [25, 75])
            iqr = quartiles[1] - quartiles[0]
            left = quartiles[0] - (3.0 * iqr)
            right = quartiles[1] + (3.0 * iqr)
            curr_row = np.zeros(num_subjects, dtype=int)
            for subject_id in range(0, num_subjects):
                # BUG FIX: index the unfiltered betas_raw here. The filtered
                # list is shorter when values are missed, which misaligned
                # subject ids (and could raise IndexError); the NaN check
                # below only makes sense on the raw values.
                curr_point = betas_raw[subject_id]
                if not math.isnan(curr_point):
                    if curr_point < left or curr_point > right:
                        curr_row[subject_id] = 1
            config.epimutations_data[row] = curr_row
        np.savez_compressed(fn_data + '.npz', data=config.epimutations_data)
        np.savetxt(fn_data + '.txt', config.epimutations_data, delimiter='\t', fmt='%d')
    # Clear data (betas were loaded on both paths above).
    del config.betas_data
def load_resid_old(config):
    """Load or build per-CpG OLS residuals ('resid_old' variant).

    Regresses each CpG's betas on exogenous variables chosen via
    config.experiment.data_params ('cells' and/or 'observables'),
    skipping missed measurements, and stores the residuals (NaN at the
    missed positions). Results are cached as .pkl/.npz keyed by the
    data-params suffix.

    Raises:
        ValueError: when data_params is empty, or the requested
            cells/observables types cannot all be found.
    """
    suffix = ''
    if bool(config.experiment.data_params):
        data_params = config.experiment.data_params
        suffix += '_' + config.experiment.get_data_params_str()
    else:
        raise ValueError('Exog for resid_old is empty.')
    fn_dict = get_data_base_path(
        config) + '/' + 'resid_old_dict' + suffix + '.pkl'
    fn_missed_dict = get_data_base_path(
        config) + '/' + 'resid_old_missed_dict' + suffix + '.pkl'
    fn_data = get_data_base_path(config) + '/' + 'resid_old' + suffix + '.npz'
    if os.path.isfile(fn_dict) and os.path.isfile(fn_data):
        # Cached: restore both dicts and the residuals matrix.
        f = open(fn_dict, 'rb')
        config.resid_old_dict = pickle.load(f)
        f.close()
        f = open(fn_missed_dict, 'rb')
        config.resid_old_missed_dict = pickle.load(f)
        f.close()
        data = np.load(fn_data)
        config.resid_old_data = data['data']
    else:
        # Load betas with only the params betas itself understands ('norm');
        # the full data_params were captured in `data_params` above.
        data_params_copy = copy.deepcopy(config.experiment.data_params)
        common_keys = ['norm']
        config.experiment.data_params = {}
        for key in common_keys:
            if key in data_params_copy:
                config.experiment.data_params[key] = data_params_copy[key]
        load_betas(config)
        # resid_old reuses the betas row mapping and missed-value dict.
        config.resid_old_dict = config.betas_dict
        f = open(fn_dict, 'wb')
        pickle.dump(config.resid_old_dict, f, pickle.HIGHEST_PROTOCOL)
        f.close()
        config.resid_old_missed_dict = config.betas_missed_dict
        f = open(fn_missed_dict, 'wb')
        pickle.dump(config.resid_old_missed_dict, f, pickle.HIGHEST_PROTOCOL)
        f.close()
        # Assemble exogenous regressors from cells and/or observables.
        exog_dict = {}
        if 'cells' in data_params:
            cells_dict = load_cells_dict(config)
            if isinstance(data_params['cells'], list):
                all_types = list(cells_dict.keys())
                for key in all_types:
                    if key not in data_params['cells']:
                        cells_dict.pop(key)
                if len(list(cells_dict.keys())) != len(data_params['cells']):
                    raise ValueError('Wrong number of cells types.')
            exog_dict.update(cells_dict)
        if 'observables' in data_params:
            observables_categorical_dict = load_observables_categorical_dict(
                config)
            if isinstance(data_params['observables'], list):
                all_types = list(observables_categorical_dict.keys())
                for key in all_types:
                    if key not in data_params['observables']:
                        observables_categorical_dict.pop(key)
                if len(list(observables_categorical_dict.keys())) != len(
                        data_params['observables']):
                    raise ValueError('Wrong number of observables types.')
            exog_dict.update(observables_categorical_dict)
        num_cpgs = config.betas_data.shape[0]
        num_subjects = config.betas_data.shape[1]
        config.resid_old_data = np.zeros((num_cpgs, num_subjects), dtype=np.float32)
        for cpg, row in tqdm(config.betas_dict.items(), mininterval=60.0,
                             desc='resid_old_data creating'):
            raw_betas = config.betas_data[row, :]
            # Work on a copy of the regressors so per-CpG filtering of
            # missed subjects does not corrupt the shared exog_dict.
            current_exog_dict = copy.deepcopy(exog_dict)
            if len(config.betas_missed_dict[cpg]) > 0:
                # Drop missed subjects from both regressors and betas,
                # remembering which positions survived (passed_ids).
                for key in current_exog_dict:
                    values = []
                    for value_id in range(0, len(current_exog_dict[key])):
                        if value_id not in config.betas_missed_dict[cpg]:
                            values.append(current_exog_dict[key][value_id])
                    current_exog_dict[key] = values
                betas = []
                passed_ids = []
                for beta_id in range(0, len(raw_betas)):
                    if beta_id not in config.betas_missed_dict[cpg]:
                        betas.append(raw_betas[beta_id])
                        passed_ids.append(beta_id)
            else:
                betas = raw_betas
                passed_ids = list(range(0, len(betas)))
            endog_dict = {cpg: betas}
            endog_df = pd.DataFrame(endog_dict)
            exog_df = pd.DataFrame(current_exog_dict)
            # OLS without an explicit intercept: exog columns only.
            reg_res = sm.OLS(endog=endog_df, exog=exog_df).fit()
            resid_old = list(map(np.float32, reg_res.resid))
            # Scatter residuals back to full subject positions; missed
            # positions get NaN.
            resid_old_raw = np.zeros(num_subjects, dtype=np.float32)
            for beta_id in range(0, len(passed_ids)):
                resid_old_raw[passed_ids[beta_id]] = resid_old[beta_id]
            for missed_id in config.resid_old_missed_dict[cpg]:
                resid_old_raw[missed_id] = np.float32('nan')
            config.resid_old_data[row] = resid_old_raw
        np.savez_compressed(fn_data, data=config.resid_old_data)
        # Clear data
        del config.betas_data
def load_betas_adj(config):
    """Load or build covariate-adjusted betas (residual + per-CpG mean).

    Regresses each CpG's betas on the exogenous variables named in
    config.experiment.data_params ('cells' and/or 'observables') and
    stores residual + mean(betas) per CpG; cached as .pkl/.npz.

    Raises:
        ValueError: when data_params is empty, or the requested
            cells/observables types cannot all be found.
    """
    fn_dict = get_data_base_path(config) + '/' + 'betas_adj_dict.pkl'
    suffix = ''
    if bool(config.experiment.data_params):
        data_params = config.experiment.data_params
        suffix += '_' + config.experiment.get_data_params_str()
    else:
        raise ValueError(f'Exog for residuals is empty.')
    fn_data = get_data_base_path(config) + '/' + 'betas_adj' + suffix + '.npz'
    if os.path.isfile(fn_dict) and os.path.isfile(fn_data):
        # Cached: restore dict and adjusted-betas matrix.
        f = open(fn_dict, 'rb')
        config.betas_adj_dict = pickle.load(f)
        f.close()
        data = np.load(fn_data)
        config.betas_adj_data = data['data']
    else:
        # Betas are loaded with no data params; the adjusted dict reuses
        # the betas row mapping.
        config.experiment.data_params = {}
        load_betas(config)
        config.betas_adj_dict = config.betas_dict
        f = open(fn_dict, 'wb')
        pickle.dump(config.betas_adj_dict, f, pickle.HIGHEST_PROTOCOL)
        f.close()
        # Assemble exogenous regressors from cells and/or observables.
        exog_dict = {}
        if 'cells' in data_params:
            cells_dict = load_cells_dict(config)
            if isinstance(data_params['cells'], list):
                all_types = list(cells_dict.keys())
                for key in all_types:
                    if key not in data_params['cells']:
                        cells_dict.pop(key)
                if len(list(cells_dict.keys())) != len(data_params['cells']):
                    raise ValueError(f'Wrong number of cells types.')
            exog_dict.update(cells_dict)
        if 'observables' in data_params:
            observables_dict = load_observables_dict(config)
            if isinstance(data_params['observables'], list):
                all_types = list(observables_dict.keys())
                for key in all_types:
                    if key not in data_params['observables']:
                        observables_dict.pop(key)
                if len(list(observables_dict.keys())) != len(
                        data_params['observables']):
                    raise ValueError(f'Wrong number of observables types.')
            exog_dict.update(observables_dict)
        exog_df = pd.DataFrame(exog_dict)
        num_cpgs = config.betas_data.shape[0]
        num_subjects = config.betas_data.shape[1]
        config.betas_adj_data = np.zeros((num_cpgs, num_subjects), dtype=np.float32)
        for cpg, row in tqdm(config.betas_dict.items(), mininterval=60.0,
                             desc='betas_adj_data creating'):
            betas = config.betas_data[row, :]
            mean = np.mean(betas)
            endog_dict = {cpg: betas}
            endog_df = pd.DataFrame(endog_dict)
            # OLS without an explicit intercept: exog columns only.
            reg_res = sm.OLS(endog=endog_df, exog=exog_df).fit()
            residuals = list(map(np.float32, reg_res.resid))
            # Adding the mean re-centers residuals on the original level.
            # NOTE(review): list + numpy scalar relies on numpy's __radd__
            # broadcasting the list to an array — works, but subtle.
            betas_adj = residuals + mean
            config.betas_adj_data[row] = betas_adj
        np.savez_compressed(fn_data, data=config.betas_adj_data)
        # Clear data
        del config.betas_data
def load_betas(config):
    """Load or build betas with missed-value tracking.

    Produces:
      config.betas_dict        CpG -> row index
      config.betas_missed_dict CpG -> sorted list of missed subject indexes
      config.betas_data        (num_cpgs, num_subjects) float32, NaN at
                               missed positions
    All three are cached on disk (.pkl/.npz) keyed by the data-params suffix.
    """
    suffix = ''
    if bool(config.experiment.data_params):
        suffix += '_' + str(config.experiment.get_data_params_str())
    fn_dict = get_data_base_path(config) + '/' + 'betas_dict' + suffix + '.pkl'
    fn_missed_dict = get_data_base_path(config) + '/' + 'betas_missed_dict' + suffix + '.pkl'
    fn_data = get_data_base_path(config) + '/' + 'betas' + suffix
    fn_txt = fn_data + '.txt'
    fn_npz = fn_data + '.npz'
    # BUG FIX: the original tested os.path.isfile(fn_dict) twice; the second
    # check must be fn_missed_dict, which is unpickled below.
    if os.path.isfile(fn_dict) and os.path.isfile(fn_missed_dict) and os.path.isfile(fn_npz):
        with open(fn_dict, 'rb') as f:
            config.betas_dict = pickle.load(f)
        with open(fn_missed_dict, 'rb') as f:
            config.betas_missed_dict = pickle.load(f)
        data = np.load(fn_npz)
        config.betas_data = data['data']
    else:
        # First pass: build row mapping and record missed-value positions.
        config.betas_dict = {}
        config.betas_missed_dict = {}
        config.betas_missed_dict['any'] = []
        f = open(fn_txt)
        f.readline()
        cpg_id = 0
        for line in tqdm(f, mininterval=60.0, desc='betas_dict creating'):
            line_list = get_line_list(line)
            cpg = line_list[0]
            betas = line_list[1::]
            missed_indexes = []
            for missed_value in config.annotations.missed_values:
                indexes = [i for i, x in enumerate(betas) if x == missed_value]
                missed_indexes += indexes
            missed_indexes.sort()
            config.betas_missed_dict[cpg] = missed_indexes
            config.betas_dict[cpg] = cpg_id
            cpg_id += 1
        f.close()
        with open(fn_dict, 'wb') as f:
            pickle.dump(config.betas_dict, f, pickle.HIGHEST_PROTOCOL)
        with open(fn_missed_dict, 'wb') as f:
            pickle.dump(config.betas_missed_dict, f, pickle.HIGHEST_PROTOCOL)
        num_cpgs = cpg_id
        # Second pass: fill the float32 matrix, NaN for missed values.
        f = open(fn_txt)
        header_line = f.readline()
        headers = get_line_list(header_line)
        subjects = headers[1:len(headers)]
        config.betas_data = np.zeros((num_cpgs, len(subjects)), dtype=np.float32)
        cpg_id = 0
        for line in tqdm(f, mininterval=60.0, desc='betas_data creating'):
            line_list = get_line_list(line)
            cpg = line_list[0]
            betas = line_list[1::]
            for beta_id in range(0, len(betas)):
                if beta_id in config.betas_missed_dict[cpg]:
                    betas[beta_id] = np.float32('nan')
                else:
                    betas[beta_id] = np.float32(betas[beta_id])
            config.betas_data[cpg_id] = betas
            cpg_id += 1
        f.close()
        np.savez_compressed(fn_npz, data=config.betas_data)
def load_betas_adj(config):
    """Load or build residual-based adjusted betas.

    Adjusted betas are the residuals shifted to stay non-negative:
    shift = mean(betas), or |min(residuals)| when the mean shift would
    leave negative values. Dicts and matrix are cached as .pkl/.npz.

    Raises:
        ValueError: when config.experiment.data_params is empty.
    """
    suffix = ''
    if bool(config.experiment.data_params):
        suffix += '_' + config.experiment.get_data_params_str()
    else:
        raise ValueError('Exog for residuals is empty.')
    fn_dict = get_data_base_path(
        config) + '/' + 'betas_adj_dict' + suffix + '.pkl'
    fn_missed_dict = get_data_base_path(
        config) + '/' + 'betas_adj_missed_dict' + suffix + '.pkl'
    fn_data = get_data_base_path(config) + '/' + 'betas_adj' + suffix + '.npz'
    if os.path.isfile(fn_dict) and os.path.isfile(fn_data):
        with open(fn_dict, 'rb') as f:
            config.betas_adj_dict = pickle.load(f)
        with open(fn_missed_dict, 'rb') as f:
            config.betas_adj_missed_dict = pickle.load(f)
        data = np.load(fn_data)
        config.betas_adj_data = data['data']
    else:
        load_residuals(config)
        config.experiment.data_params = {}
        load_betas(config)
        config.betas_adj_dict = config.residuals_dict
        with open(fn_dict, 'wb') as f:
            pickle.dump(config.betas_adj_dict, f, pickle.HIGHEST_PROTOCOL)
        config.betas_adj_missed_dict = config.residuals_missed_dict
        with open(fn_missed_dict, 'wb') as f:
            # BUG FIX: the original pickled config.betas_missed_dict here,
            # so the cached file did not match config.betas_adj_missed_dict
            # and cached reloads returned the wrong missed dict.
            pickle.dump(config.betas_adj_missed_dict, f, pickle.HIGHEST_PROTOCOL)
        num_cpgs = config.betas_data.shape[0]
        num_subjects = config.betas_data.shape[1]
        config.betas_adj_data = np.zeros((num_cpgs, num_subjects), dtype=np.float32)
        for cpg in tqdm(config.betas_adj_dict, mininterval=60.0,
                        desc='betas_adj_data creating'):
            residuals = config.residuals_data[config.residuals_dict[cpg], :]
            betas = config.betas_data[config.betas_dict[cpg], :]
            min_residuals = np.min(residuals)
            mean_betas = np.mean(betas)
            # Shift by the betas mean, unless that would leave negative
            # adjusted values; then shift by |min(residuals)| instead.
            shift = mean_betas
            if min_residuals + shift < 0:
                shift = abs(min_residuals)
            betas_adj = residuals + shift
            config.betas_adj_data[config.residuals_dict[cpg]] = betas_adj
        np.savez_compressed(fn_data, data=config.betas_adj_data)
        # Clear data
        del config.residuals_data
        del config.betas_data