Esempio n. 1
0
def load_entropy(config):
    suffix = ''
    if bool(config.experiment.data_params):
        data_params = copy.deepcopy(config.experiment.data_params)
        suffix += '_' + config.experiment.get_data_params_str()
    else:
        raise ValueError('Exog for entropy is empty.')

    fn_data = get_cache_path(config) + '/' + 'entropy' + suffix + '.npz'

    config.entropy_list = ['entropy']
    config.entropy_dict = {'entropy': 0}
    config.entropy_missed_dict = {'entropy': []}

    if os.path.isfile(fn_data):

        data = np.load(fn_data)
        config.entropy_data = data['data']

    else:

        if data_params['data'] == 'betas':
            config.experiment.data_params = {}
            load_betas(config)
            data = config.betas_data
            data_dict = config.betas_dict
        elif data_params['data'] == 'betas_adj':
            config.experiment.data_params.pop('data')
            load_betas_adj(config)
            data = config.betas_adj_data
            data_dict = config.betas_adj_dict
        elif data_params['data'] == 'residuals':
            config.experiment.data_params.pop('data')
            load_residuals(config)
            data = config.residuals_data
            data_dict = config.residuals_dict
        else:
            raise ValueError('Unsupported data for entropy.')

        num_subjects = data.shape[1]
        config.entropy_data = np.zeros(num_subjects, dtype=np.float32)

        rows = [data_dict[item] for item in config.cpg_list if item in data_dict]

        for subj_id in tqdm(range(0, num_subjects), mininterval=60.0, desc='entropy_data creating'):
            values = np.squeeze(np.asarray(data[np.ix_(rows, [subj_id])]))
            entropy = 0.0
            outliers = 0
            for val in values:
                if not math.isnan(val):
                    if 0.0 < val < 1.0:
                        entropy += val * np.log2(val) + (1.0 - val) * np.log2(1.0 - val)
                    else:
                        outliers += 1
                else:
                    outliers += 1
            entropy /= ((len(values) - outliers) * np.log2(0.5))
            config.entropy_data[subj_id] = entropy

        np.savez_compressed(fn_data, data=config.entropy_data)
    def test_load_residuals_check_files_creation(self):
        suffix = '_' + self.config.experiment.get_data_params_str()
        fn_dict = get_data_base_path(
            self.config) + '/' + 'residuals_dict' + suffix + '.pkl'
        fn_data = get_data_base_path(
            self.config) + '/' + 'residuals' + suffix + '.npz'

        load_residuals(self.config)

        self.assertEqual(True,
                         os.path.isfile(fn_dict) and os.path.isfile(fn_data))
Esempio n. 3
0
    def load(self, config, configs_child):
        if config.is_init:
            load_residuals(config)
            config.base_list = config.cpg_list
            config.base_dict = config.residuals_dict
            config.base_data = config.residuals_data
            config.base_missed_dict = config.residuals_missed_dict

            self.inherit_childs(config, configs_child)

        if config.is_load_child:

            for config_child in configs_child:
                self.load_child(config_child)
Esempio n. 4
0
    def load(self, config, configs_child):
        if config.is_init:
            source = config.experiment.data_params.pop('source')
            if source == 'betas':
                load_betas(config)
                config.base_missed_dict = config.betas_missed_dict
                config.base_data = config.betas_data
                config.target_dict = config.betas_dict
            elif source == 'residuals':
                load_residuals(config)
                config.base_missed_dict = config.residuals_missed_dict
                config.base_data = config.residuals_data
                config.target_dict = config.residuals_dict
            config.base_list = list(config.bops.keys())
            config.base_dict = config.bops

            self.inherit_childs(config, configs_child)

        if config.is_load_child:

            for config_child in configs_child:
                self.load_child(config_child)
 def test_load_residuals_check_shape_cpg_data(self):
     load_residuals(self.config)
     self.assertEqual((300, 729), self.config.residuals_data.shape)
 def test_load_residuals_check_len_cpg_dict(self):
     load_residuals(self.config)
     self.assertEqual(300, len(list(self.config.residuals_dict)))
Esempio n. 7
0
def load_genes(config):

    suffix_gene = ''
    if bool(config.experiment.data_params):
        suffix_gene += '_' + str(config.experiment.get_data_params_str())
        source = config.experiment.data_params.pop('source')
    else:
        raise ValueError('Data params for genes are empty')

    fn_list_txt = get_cache_path(config) + '/' + 'genes_list.txt'
    fn_list_pkl = get_cache_path(config) + '/' + 'genes_list.pkl'
    fn_dict_pkl = get_cache_path(config) + '/' + 'genes_dict.pkl'
    fn_missed_dict_pkl = get_cache_path(config) + '/' + 'genes_missed_dict.pkl'
    fn_data_npz = get_cache_path(config) + '/' + 'genes' + suffix_gene + '.npz'
    fn_data_txt = get_cache_path(config) + '/' + 'genes' + suffix_gene + '.txt'

    if os.path.isfile(fn_dict_pkl) and os.path.isfile(
            fn_list_pkl) and os.path.isfile(fn_data_npz):

        f = open(fn_list_pkl, 'rb')
        config.genes_list = pickle.load(f)
        f.close()

        f = open(fn_dict_pkl, 'rb')
        config.genes_dict = pickle.load(f)
        f.close()

        f = open(fn_missed_dict_pkl, 'rb')
        config.genes_missed_dict = pickle.load(f)
        f.close()

        data = np.load(fn_data_npz)
        config.genes_data = data['data']

    else:

        if source == 'betas':
            load_betas(config)
            source_dict = config.betas_dict
            source_data = config.betas_data
            source_missed_dict = config.betas_missed_dict
        elif source == 'betas_adj':
            load_betas_adj(config)
            source_dict = config.betas_adj_dict
            source_data = config.betas_adj_data
            source_missed_dict = config.betas_adj_missed_dict
        elif source == 'residuals':
            load_residuals(config)
            source_dict = config.residuals_dict
            source_data = config.residuals_data
            source_missed_dict = config.residuals_missed_dict
        else:
            raise ValueError('Source for genes is not specified')

        num_subjects = config.betas_data.shape[1]

        config.genes_list = []
        for gene_id, gene in tqdm(enumerate(config.gene_cpg_dict),
                                  mininterval=60.0,
                                  desc='genes_list creating'):
            cpgs = config.gene_cpg_dict[gene]
            for cpg in cpgs:
                if cpg in source_dict:
                    config.genes_list.append(gene)
                    break

        config.genes_dict = {}
        config.genes_missed_dict = {'any': []}
        config.genes_data = np.zeros((len(config.genes_list), num_subjects),
                                     dtype=np.float32)

        for gene_id, gene in tqdm(enumerate(config.genes_list),
                                  mininterval=60.0,
                                  desc='genes_data creating'):
            config.genes_dict[gene] = gene_id

            cpgs = config.gene_cpg_dict[gene]

            denominators = np.zeros(num_subjects, dtype=np.float32)

            for cpg in cpgs:
                if cpg in source_dict:
                    row_id = source_dict[cpg]
                    source_values_raw = source_data[row_id, :]
                    if len(source_missed_dict[cpg]) > 0:
                        source_values = np.zeros(num_subjects,
                                                 dtype=np.float32)
                        for val_id in range(0, num_subjects):
                            if val_id not in source_missed_dict[cpg]:
                                source_values[val_id] = source_values_raw[
                                    val_id]
                                denominators[val_id] += 1.0

                    else:
                        source_values = source_values_raw
                        denominators += np.ones(num_subjects, dtype=np.float32)
                    config.genes_data[gene_id] += source_values

            for val_id in range(0, num_subjects):
                config.genes_data[gene_id][val_id] /= denominators[val_id]

        f = open(fn_list_pkl, 'wb')
        pickle.dump(config.genes_list, f, pickle.HIGHEST_PROTOCOL)
        f.close()

        f = open(fn_dict_pkl, 'wb')
        pickle.dump(config.genes_dict, f, pickle.HIGHEST_PROTOCOL)
        f.close()

        f = open(fn_missed_dict_pkl, 'wb')
        pickle.dump(config.genes_missed_dict, f, pickle.HIGHEST_PROTOCOL)
        f.close()

        np.savez_compressed(fn_data_npz, data=config.genes_data)
        np.savetxt(fn_data_txt, config.genes_data, delimiter='\t', fmt='%.8e')

        with open(fn_list_txt, 'w') as f:
            for item in config.genes_list:
                f.write("%s\n" % item)
Esempio n. 8
0
def load_betas_adj(config):

    suffix = ''
    if bool(config.experiment.data_params):
        suffix += '_' + config.experiment.get_data_params_str()
    else:
        raise ValueError('Exog for residuals is empty.')

    fn_dict = get_data_base_path(
        config) + '/' + 'betas_adj_dict' + suffix + '.pkl'
    fn_missed_dict = get_data_base_path(
        config) + '/' + 'betas_adj_missed_dict' + suffix + '.pkl'
    fn_data = get_data_base_path(config) + '/' + 'betas_adj' + suffix + '.npz'

    if os.path.isfile(fn_dict) and os.path.isfile(fn_data):

        f = open(fn_dict, 'rb')
        config.betas_adj_dict = pickle.load(f)
        f.close()

        f = open(fn_missed_dict, 'rb')
        config.betas_adj_missed_dict = pickle.load(f)
        f.close()

        data = np.load(fn_data)
        config.betas_adj_data = data['data']

    else:
        load_residuals(config)

        config.experiment.data_params = {}
        load_betas(config)

        config.betas_adj_dict = config.residuals_dict
        f = open(fn_dict, 'wb')
        pickle.dump(config.betas_adj_dict, f, pickle.HIGHEST_PROTOCOL)
        f.close()

        config.betas_adj_missed_dict = config.residuals_missed_dict
        f = open(fn_missed_dict, 'wb')
        pickle.dump(config.betas_missed_dict, f, pickle.HIGHEST_PROTOCOL)
        f.close()

        num_cpgs = config.betas_data.shape[0]
        num_subjects = config.betas_data.shape[1]
        config.betas_adj_data = np.zeros((num_cpgs, num_subjects),
                                         dtype=np.float32)

        for cpg in tqdm(config.betas_adj_dict,
                        mininterval=60.0,
                        desc='betas_adj_data creating'):

            residuals = config.residuals_data[config.residuals_dict[cpg], :]
            betas = config.betas_data[config.betas_dict[cpg], :]

            min_residuals = np.min(residuals)
            mean_betas = np.mean(betas)

            shift = mean_betas
            if min_residuals + shift < 0:
                shift = abs(min_residuals)

            betas_adj = residuals + shift

            config.betas_adj_data[config.residuals_dict[cpg]] = betas_adj

        np.savez_compressed(fn_data, data=config.betas_adj_data)

        # Clear data
        del config.residuals_data
        del config.betas_data