Example #1
0
def load_epimutations(config):
    """Load (or build and cache) the per-subject epimutation matrix.

    A beta value is flagged as an epimutation (1) when it falls outside
    [Q1 - 3*IQR, Q3 + 3*IQR] for its CpG row.  Results are cached as
    ``epimutations_dict.pkl`` and ``epimutations<suffix>.npz`` under the
    data base path and stored on ``config.epimutations_dict`` /
    ``config.epimutations_data``.
    """
    fn_dict = get_data_base_path(config) + '/' + 'epimutations_dict'

    suffix = ''
    if bool(config.experiment.data_params):
        suffix += '_' + str(config.experiment.get_data_params_str())

    fn_data = get_data_base_path(config) + '/' + 'epimutations' + suffix

    if os.path.isfile(fn_dict + '.pkl') and os.path.isfile(fn_data + '.npz'):

        with open(fn_dict + '.pkl', 'rb') as f:
            config.epimutations_dict = pickle.load(f)

        data = np.load(fn_data + '.npz')
        config.epimutations_data = data['data']

    else:

        config.experiment.data_params = {}
        load_betas(config)

        config.epimutations_dict = config.betas_dict
        with open(fn_dict + '.pkl', 'wb') as f:
            pickle.dump(config.epimutations_dict, f, pickle.HIGHEST_PROTOCOL)

        save_table_dict_csv(
            fn_dict,
            {
                'item': list(config.epimutations_dict.keys()),
                'row': list(config.epimutations_dict.values())
            }
        )

        num_cpgs = config.betas_data.shape[0]
        num_subjects = config.betas_data.shape[1]
        # np.int was removed in NumPy 1.24; the builtin int is equivalent.
        config.epimutations_data = np.zeros((num_cpgs, num_subjects), dtype=int)

        for cpg, row in tqdm(config.betas_dict.items(), mininterval=60.0):
            betas = config.betas_data[row, :]
            quartiles = np.percentile(betas, [25, 75])
            iqr = quartiles[1] - quartiles[0]
            left = quartiles[0] - (3.0 * iqr)
            right = quartiles[1] + (3.0 * iqr)

            # Vectorized outlier mask over all subjects for this CpG
            # (NaN compares False on both sides, i.e. stays 0 as before).
            curr_row = ((betas < left) | (betas > right)).astype(int)

            config.epimutations_data[row] = curr_row

        np.savez_compressed(fn_data + '.npz', data=config.epimutations_data)
        np.savetxt(fn_data + '.txt', config.epimutations_data, delimiter='\t', fmt='%d')

        # Clear data
        del config.betas_data
Example #2
0
    def test_load_residuals_check_files_creation(self):
        """Loading residuals must create both the dict .pkl and data .npz."""
        base = get_data_base_path(self.config)
        params_suffix = '_' + self.config.experiment.get_data_params_str()
        dict_path = base + '/' + 'residuals_common_dict.pkl'
        data_path = base + '/' + 'residuals_common' + params_suffix + '.npz'

        load_residuals_common(self.config)

        both_exist = os.path.isfile(dict_path) and os.path.isfile(data_path)
        self.assertEqual(True, both_exist)
Example #3
0
def load_betas(config):
    """Load (or parse and cache) the betas matrix from ``betas<suffix>.txt``.

    Populates ``config.betas_dict`` (CpG name -> row index) and
    ``config.betas_data`` (float32 matrix: CpGs x subjects), caching them
    as ``betas_dict.pkl`` and ``betas<suffix>.npz`` under the data base path.
    """
    fn_dict = get_data_base_path(config) + '/' + 'betas_dict.pkl'

    suffix = ''
    if bool(config.experiment.data_params):
        suffix += '_' + str(config.experiment.get_data_params_str())

    fn_data = get_data_base_path(config) + '/' + 'betas' + suffix
    fn_txt = fn_data + '.txt'
    fn_npz = fn_data + '.npz'

    if os.path.isfile(fn_dict) and os.path.isfile(fn_npz):

        with open(fn_dict, 'rb') as f:
            config.betas_dict = pickle.load(f)

        data = np.load(fn_npz)
        config.betas_data = data['data']

    else:

        config.betas_dict = {}

        # First pass: map each CpG id (first column) to its row number.
        with open(fn_txt) as f:
            f.readline()  # skip header
            cpg_id = 0
            for line in tqdm(f, mininterval=60.0, desc='betas_dict creating'):
                line_list = get_line_list(line)
                cpg = line_list[0]
                config.betas_dict[cpg] = cpg_id
                cpg_id += 1

        with open(fn_dict, 'wb') as f:
            pickle.dump(config.betas_dict, f, pickle.HIGHEST_PROTOCOL)

        num_cpgs = cpg_id

        # Second pass: parse the numeric matrix now that its size is known.
        with open(fn_txt) as f:
            header_line = f.readline()
            headers = [x.rstrip() for x in header_line.split('\t')]
            subjects = headers[1:]

            config.betas_data = np.zeros((num_cpgs, len(subjects)),
                                         dtype=np.float32)

            cpg_id = 0
            for line in tqdm(f, mininterval=60.0, desc='betas_data creating'):
                line_list = get_line_list(line)
                config.betas_data[cpg_id] = list(map(np.float32, line_list[1:]))
                cpg_id += 1

        np.savez_compressed(fn_npz, data=config.betas_data)
Example #4
0
    def test_load_cpg_check_files_creation(self):
        """load_betas must create betas_dict.pkl and betas.npz."""
        base = get_data_base_path(self.config)
        dict_path = base + '/' + 'betas_dict.pkl'
        npz_path = base + '/' + 'betas' + '.npz'

        load_betas(self.config)

        self.assertEqual(
            True, os.path.isfile(dict_path) and os.path.isfile(npz_path))
Example #5
0
def load_betas_spec(config):
    """Export betas for a user-supplied CpG list to a CSV file.

    ``data_params['file']`` names a text file (one CpG per line) under the
    data base path.  The CSV written next to it has a header row
    ('ProbeID' + subject names restricted to ``config.attributes_indexes``)
    and one row per requested CpG; missing CpGs and missed measurements
    are written as the string 'NaN'.

    Raises ValueError when no 'file' param is given or the file is absent.
    """
    file = config.experiment.data_params.pop('file', None)
    if file is None:
        # Without this guard os.path.splitext(None) raises an opaque TypeError.
        raise ValueError('No cpgs file specified in data_params.')
    filename, file_extension = os.path.splitext(file)
    fn_list = get_data_base_path(config) + '/' + file
    fn_data_csv = get_data_base_path(config) + '/' + filename + '.csv'

    if os.path.isfile(fn_list):
        with open(fn_list) as f:
            cpgs = [x.rstrip() for x in f.readlines()]
    else:
        raise ValueError('File with cpgs is not exists.')

    if not os.path.isfile(fn_data_csv):

        indexes = config.attributes_indexes

        fn_betas = get_data_base_path(config) + '/' + 'betas.txt'
        with open(fn_betas) as f:
            header_line = f.readline()
        headers = get_line_list(header_line)
        subject_headers = list(np.array(headers[1:])[indexes])

        load_betas(config)

        num_cols = len(subject_headers) + 1  # header col
        num_rows = len(cpgs) + 1  # header row

        betas = np.zeros((num_rows, num_cols), dtype=object)
        row_id = 0
        betas[row_id] = ['ProbeID'] + subject_headers
        row_id += 1

        for cpg in tqdm(cpgs, mininterval=60.0, desc='betas_spec creating'):
            if cpg in config.betas_dict:
                cpg_row_id = config.betas_dict[cpg]
                curr_betas = list(
                    np.array(config.betas_data[cpg_row_id])[indexes])
                # Missed measurements are exported as the string 'NaN'.
                for missed_id in config.betas_missed_dict[cpg]:
                    curr_betas[missed_id] = 'NaN'
                line = [cpg] + curr_betas
            else:
                line = [cpg] + ['NaN'] * (num_cols - 1)
            betas[row_id] = line

            row_id += 1

        pd.DataFrame(betas).to_csv(fn_data_csv, index=False, header=False)
Example #6
0
def load_cpg(config):
    """Load (or parse and cache) the CpG matrix named by ``config.data.name``.

    Populates ``config.cpg_dict`` (CpG name -> row index) and
    ``config.cpg_data`` (float32 matrix), caching them as
    ``cpg_dict.pkl`` and ``<name>.npz`` under the data base path.
    """
    fn_dict = get_data_base_path(config) + '/' + 'cpg_dict.pkl'
    fn_data = get_data_base_path(config) + '/' + config.data.name
    fn_txt = fn_data + '.txt'
    fn_npz = fn_data + '.npz'

    if os.path.isfile(fn_dict) and os.path.isfile(fn_npz):

        with open(fn_dict, 'rb') as f:
            config.cpg_dict = pickle.load(f)

        data = np.load(fn_npz)
        config.cpg_data = data['cpg_data']

    else:

        config.cpg_dict = {}

        # First pass: map each CpG id (first column) to its row number.
        with open(fn_txt) as f:
            f.readline()  # skip header
            cpg_id = 0
            for line in f:
                line_list = get_line_list(line)
                config.cpg_dict[line_list[0]] = cpg_id
                cpg_id += 1

        with open(fn_dict, 'wb') as f:
            pickle.dump(config.cpg_dict, f, pickle.HIGHEST_PROTOCOL)

        num_cpgs = cpg_id

        # Second pass: parse the numeric matrix now that its size is known.
        with open(fn_txt) as f:
            header_line = f.readline()
            headers = [x.rstrip() for x in header_line.split('\t')]
            subjects = headers[1:]

            config.cpg_data = np.zeros((num_cpgs, len(subjects)),
                                       dtype=np.float32)

            cpg_id = 0
            for line in f:
                line_list = get_line_list(line)
                config.cpg_data[cpg_id] = list(map(np.float32, line_list[1:]))
                cpg_id += 1

        np.savez_compressed(fn_npz, cpg_data=config.cpg_data)
Example #7
0
def load_betas_horvath_calculator(config):
    """Export betas for the Horvath-calculator CpG list to a CSV file.

    Reads the CpG list from ``cpgs_horvath_calculator.txt`` and writes
    ``betas_horvath_calculator.csv`` with one row per CpG; missing CpGs
    and missed measurements are written as the string 'NaN'.

    Raises ValueError when the CpG list file is absent.
    """
    fn_list = get_data_base_path(config) + '/' + 'cpgs_horvath_calculator.txt'

    fn_data_csv = get_data_base_path(config) + '/' + 'betas_horvath_calculator.csv'

    suffix = ''
    if bool(config.experiment.data_params):
        suffix += '_' + config.experiment.get_data_params_str()

    if os.path.isfile(fn_list):
        with open(fn_list) as f:
            cpgs = [x.rstrip() for x in f.readlines()]
    else:
        raise ValueError('No specified file with cpgs for Horvath\'s calculator.')

    if not os.path.isfile(fn_data_csv):

        fn_betas = get_data_base_path(config) + '/' + 'betas' + suffix + '.txt'

        with open(fn_betas) as f:
            header_line = f.readline()
        headers = get_line_list(header_line)

        load_betas(config)

        num_cols = len(headers)
        num_rows = len(cpgs) + 1  # header row

        betas = np.zeros((num_rows, num_cols), dtype=object)
        row_id = 0
        betas[row_id] = ['ProbeID'] + headers[1:]
        row_id += 1

        for cpg in tqdm(cpgs, mininterval=60.0, desc='betas_horvath_calculator creating'):
            if cpg in config.betas_dict:
                cpg_row_id = config.betas_dict[cpg]
                curr_betas = list(config.betas_data[cpg_row_id])
                # Missed measurements are exported as the string 'NaN'.
                for missed_id in config.betas_missed_dict[cpg]:
                    curr_betas[missed_id] = 'NaN'
                line = [cpg] + curr_betas
            else:
                line = [cpg] + ['NaN'] * (num_cols - 1)
            betas[row_id] = line

            row_id += 1

        pd.DataFrame(betas).to_csv(fn_data_csv, index=False, header=False)
Example #8
0
def load_excluded(config):
    """Return the list of excluded ids (possibly empty).

    When ``config.annotations.exclude`` is not 'none', reads the cached
    ``<name>.pkl`` if present, otherwise parses ``<name>.txt`` (one id per
    line) and writes the pickle cache.
    """
    exclude = []

    if config.annotations.exclude != CommonTypes.none.value:
        fn = get_data_base_path(config) + '/' + config.annotations.exclude
        fn_txt = fn + '.txt'
        fn_pkl = fn + '.pkl'

        if os.path.isfile(fn_pkl):
            with open(fn_pkl, 'rb') as f:
                exclude = pickle.load(f)
        else:
            with open(fn_txt) as f:
                exclude = [x.rstrip() for x in f.readlines()]

            with open(fn_pkl, 'wb') as f:
                pickle.dump(exclude, f, pickle.HIGHEST_PROTOCOL)

    return exclude
Example #9
0
def clear_cache(config):
    """Delete every cached .npz/.pkl file under the data base path."""
    cache_exts = ('.npz', '.pkl')
    base = get_data_base_path(config)
    for dir_path, _sub_dirs, file_names in os.walk(base):
        for name in file_names:
            if name.lower().endswith(cache_exts):
                os.remove(os.path.join(dir_path, name))
Example #10
0
    def test_load_annotations_check_pkl_file_creation(self):
        """Loading annotations must create the <name>.pkl cache file."""
        load_annotations_dict(self.config)

        pkl_path = (get_data_base_path(self.config) + '/' +
                    self.config.annotations.name + '.pkl')

        self.assertEqual(True, os.path.isfile(pkl_path))
Example #11
0
    def test_load_excluded_check_pkl_creation(self):
        """load_excluded must create the excluded .pkl cache file."""
        self.config.annotations.exclude = 'excluded'
        pkl_path = (get_data_base_path(self.config) + '/' +
                    self.config.annotations.exclude + '.pkl')

        self.config.excluded = load_excluded(self.config)

        self.assertEqual(True, os.path.isfile(pkl_path))
Example #12
0
    def test_load_observables_dict_check_pkl_file_creation(self):
        """Loading observables must create the <name>.pkl cache file."""
        load_observables_dict(self.config)

        pkl_path = (get_data_base_path(self.config) + '/' +
                    self.config.attributes.observables.name + '.pkl')

        self.assertEqual(True, os.path.isfile(pkl_path))
Example #13
0
def load_observables_dict(config):
    """Load (or build and cache) the observables table as a dict of columns.

    Prefers the cached ``.pkl``; otherwise reads the ``.xlsx`` (via pandas)
    or the tab-separated ``.txt``.  Numeric strings become int when
    integral, otherwise float.  Raises ValueError when no source exists.
    """
    fn = get_data_base_path(config) + '/' + config.attributes.observables.name
    fn_txt = fn + '.txt'
    fn_xlsx = fn + '.xlsx'
    fn_pkl = fn + '.pkl'

    if os.path.isfile(fn_pkl):

        with open(fn_pkl, 'rb') as f:
            observables_dict = pickle.load(f)

    else:

        if os.path.isfile(fn_xlsx):
            df = pd.read_excel(fn_xlsx)
            tmp_dict = df.to_dict()
            observables_dict = {}
            for key in tmp_dict:
                observables_dict[key] = list(tmp_dict[key].values())

        elif os.path.isfile(fn_txt):
            with open(fn_txt) as f:
                key_line = f.readline()
                keys = [x.rstrip() for x in key_line.split('\t')]

                observables_dict = {key: [] for key in keys}

                for line in f:
                    values = line.split('\t')
                    for key_id, key in enumerate(keys):
                        value = values[key_id].rstrip()
                        if is_float(value):
                            value = float(value)
                            if value.is_integer():
                                # Integral values are stored as int.
                                observables_dict[key].append(int(value))
                            else:
                                # value is already a float here.
                                observables_dict[key].append(value)
                        else:
                            observables_dict[key].append(value)

        else:
            raise ValueError('No observables file')

        with open(fn_pkl, 'wb') as f:
            pickle.dump(observables_dict, f, pickle.HIGHEST_PROTOCOL)

    return observables_dict
Example #14
0
def load_cells_dict(config):
    """Load (or parse and cache) cell-composition columns as a dict.

    Only columns listed in ``config.attributes.cells.types`` (or every
    column when that setting is 'any') are kept; the first column is the
    sample name and is always skipped.
    """
    fn = get_data_base_path(config) + '/' + config.attributes.cells.name
    fn_txt = fn + '.txt'
    fn_pkl = fn + '.pkl'

    if os.path.isfile(fn_pkl):

        with open(fn_pkl, 'rb') as f:
            cells_dict = pickle.load(f)

    else:

        with open(fn_txt) as f:
            key_line = f.readline()
            # First column is always sample name
            keys = [x.rstrip() for x in key_line.split('\t')][1:]

            if isinstance(config.attributes.cells.types, list):
                possible_keys = config.attributes.cells.types
            elif config.attributes.cells.types == CommonTypes.any.value:
                possible_keys = keys
            else:
                possible_keys = []

            cells_dict = {key: [] for key in keys if key in possible_keys}

            for line in f:
                values = line.split('\t')[1:]
                for key_id, key in enumerate(keys):
                    if key in possible_keys:
                        value = values[key_id].rstrip()
                        if is_float(value):
                            cells_dict[key].append(float(value))
                        else:
                            cells_dict[key].append(value)

        with open(fn_pkl, 'wb') as f:
            pickle.dump(cells_dict, f, pickle.HIGHEST_PROTOCOL)

    return cells_dict
Example #15
0
def load_observables_categorical_dict(config):
    """Load (or build and cache) a categorical encoding of the observables.

    Fully numeric columns are kept as numpy arrays; columns with any
    non-numeric entry are passed through ``categorize_data``.  Recognized
    NA markers are replaced by ``np.nan`` before the numeric test.
    """
    fn = get_data_base_path(config) + '/' + config.attributes.observables.name + '_categorical'
    fn_pkl = fn + '.pkl'

    if os.path.isfile(fn_pkl):

        with open(fn_pkl, 'rb') as f:
            observables_categorical_dict = pickle.load(f)

    else:

        observables_categorical_dict = {}

        if config.observables_dict is not None:
            observables_dict = config.observables_dict
        else:
            observables_dict = load_observables_dict(config)

        na_values = ['', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', '1.#QNAN', '<NA>',
                     'N/A', 'NA', 'NULL', 'NaN', 'n/a', 'nan', 'null', '-', '--']

        for key in observables_dict:
            all_numeric = True
            for i in range(0, len(observables_dict[key])):
                value = observables_dict[key][i]
                if value in na_values:
                    value = np.nan
                if is_float(value):
                    # The original's is_integer() branches both stored the
                    # same float value; a single assignment is equivalent.
                    observables_dict[key][i] = float(value)
                else:
                    observables_dict[key][i] = value
                    all_numeric = False
            # BUG FIX: index the locally resolved observables_dict, not
            # config.observables_dict, which may be None on the load path.
            if all_numeric:
                observables_categorical_dict[key] = np.asarray(observables_dict[key])
            else:
                observables_categorical_dict[key] = categorize_data(np.asarray(observables_dict[key]))

        with open(fn_pkl, 'wb') as f:
            pickle.dump(observables_categorical_dict, f, pickle.HIGHEST_PROTOCOL)

    return observables_categorical_dict
Example #16
0
def load_attributes_dict(config):
    """Load (or parse and cache) observable columns used as attributes.

    Keeps the target column plus every column declared in
    ``config.attributes.observables.types``; numeric strings become int
    when integral, otherwise float.
    """
    fn = get_data_base_path(config) + '/' + \
         config.attributes.observables.name
    fn_txt = fn + '.txt'
    fn_pkl = fn + '.pkl'

    if os.path.isfile(fn_pkl):

        with open(fn_pkl, 'rb') as f:
            attributes_dict = pickle.load(f)

    else:

        with open(fn_txt) as f:
            key_line = f.readline()
            keys = [x.rstrip() for x in key_line.split('\t')]
            possible_keys = [config.attributes.target] + \
                list(config.attributes.observables.types.keys())

            attributes_dict = {key: [] for key in keys if key in possible_keys}

            for line in f:
                values = line.split('\t')
                for key_id, key in enumerate(keys):
                    if key in possible_keys:
                        value = values[key_id].rstrip()
                        if is_float(value):
                            value = float(value)
                            if value.is_integer():
                                attributes_dict[key].append(int(value))
                            else:
                                attributes_dict[key].append(float(value))
                        else:
                            attributes_dict[key].append(value)

        with open(fn_pkl, 'wb') as f:
            pickle.dump(attributes_dict, f, pickle.HIGHEST_PROTOCOL)

    return attributes_dict
Example #17
0
def load_cells_dict(config):
    """Load (or parse and cache) all cell-composition columns as a dict.

    Returns None when neither the cached ``.pkl`` nor the source ``.txt``
    exists.  Numeric strings become float; anything else stays a string.
    """
    fn = get_data_base_path(config) + '/' + config.attributes.cells.name
    fn_txt = fn + '.txt'
    fn_pkl = fn + '.pkl'

    if os.path.isfile(fn_pkl):

        with open(fn_pkl, 'rb') as f:
            cells_dict = pickle.load(f)

    elif not os.path.isfile(fn_txt):

        return None

    else:

        with open(fn_txt) as f:
            key_line = f.readline()
            keys = [x.rstrip() for x in key_line.split('\t')]

            cells_dict = {key: [] for key in keys}

            for line in f:
                values = line.split('\t')
                for key_id, key in enumerate(keys):
                    value = values[key_id].rstrip()
                    if is_float(value):
                        cells_dict[key].append(float(value))
                    else:
                        cells_dict[key].append(value)

        with open(fn_pkl, 'wb') as f:
            pickle.dump(cells_dict, f, pickle.HIGHEST_PROTOCOL)

    return cells_dict
Example #18
0
def load_annotations_dict(config):
    """Load (or parse and cache) annotations as lists of value-lists.

    Each cell is split on ';' and de-duplicated; empty cells become [].
    """
    fn = get_data_base_path(config) + '/' + config.annotations.name
    fn_txt = fn + '.txt'
    fn_pkl = fn + '.pkl'

    if os.path.isfile(fn_pkl):

        with open(fn_pkl, 'rb') as f:
            annotations_dict = pickle.load(f)

    else:

        with open(fn_txt) as f:
            key_line = f.readline()
            keys = [x.rstrip() for x in key_line.split('\t')]

            annotations_dict = {key: [] for key in keys}

            for line in f:
                values = line.split('\t')
                for key_id, key in enumerate(keys):
                    values_for_key = values[key_id].rstrip()
                    if values_for_key == '':
                        annotations_dict[key].append([])
                    else:
                        # Split multi-valued cells and drop duplicates.
                        annotations_dict[key].append(
                            list(set(values_for_key.split(';'))))

        with open(fn_pkl, 'wb') as f:
            pickle.dump(annotations_dict, f, pickle.HIGHEST_PROTOCOL)

    return annotations_dict
Example #19
0
def load_annotations_dict(config):
    """Load (or parse and cache) annotation columns as plain strings.

    Only columns whose header matches an ``AnnotationKey`` value are kept.
    """
    fn = get_data_base_path(config) + '/' + config.annotations.name
    fn_txt = fn + '.txt'
    fn_pkl = fn + '.pkl'

    if os.path.isfile(fn_pkl):

        with open(fn_pkl, 'rb') as f:
            annotations_dict = pickle.load(f)

    else:

        possible_keys = [x.value for x in AnnotationKey]
        with open(fn_txt) as f:
            key_line = f.readline()
            keys = [x.rstrip() for x in key_line.split('\t')]

            annotations_dict = {key: [] for key in keys if key in possible_keys}

            for line in f:
                values = line.split('\t')
                for key_id, key in enumerate(keys):
                    if key in possible_keys:
                        annotations_dict[key].append(values[key_id].rstrip())

        with open(fn_pkl, 'wb') as f:
            pickle.dump(annotations_dict, f, pickle.HIGHEST_PROTOCOL)

    return annotations_dict
Example #20
0
def load_epimutations(config):
    """Load (or build and cache) the per-subject epimutation matrix.

    A value is flagged (1) when it lies outside [Q1 - 3*IQR, Q3 + 3*IQR]
    computed over the non-missed betas of its CpG.  Betas are loaded with
    only the shared 'part'/'norm' data params kept; results are cached as
    ``epimutations<suffix>.npz`` / ``.txt``.
    """
    suffix = ''
    if bool(config.experiment.data_params):
        suffix += '_' + str(config.experiment.get_data_params_str())

    fn_data = get_data_base_path(config) + '/' + 'epimutations' + suffix

    config.epimutations_list = ['epimutations']
    config.epimutations_dict = {'epimutations': 0}
    config.epimutations_missed_dict = {'epimutations': []}

    # Load betas with only the params common to both representations.
    data_params_copy = copy.deepcopy(config.experiment.data_params)
    common_keys = ['part', 'norm']
    config.experiment.data_params = {}
    for key in common_keys:
        if key in data_params_copy:
            config.experiment.data_params[key] = data_params_copy[key]
    load_betas(config)

    if os.path.isfile(fn_data + '.npz'):

        data = np.load(fn_data + '.npz')
        config.epimutations_data = data['data']

    else:

        num_cpgs = config.betas_data.shape[0]
        num_subjects = config.betas_data.shape[1]
        # np.int was removed in NumPy 1.24; the builtin int is equivalent.
        config.epimutations_data = np.zeros((num_cpgs, num_subjects),
                                            dtype=int)

        for cpg, row in tqdm(config.betas_dict.items(),
                             mininterval=60.0,
                             desc='epimutations_data creating'):

            betas_raw = config.betas_data[row, :]
            # Percentiles are computed over non-missed values only.
            if len(config.betas_missed_dict[cpg]) > 0:
                betas = [betas_raw[beta_id]
                         for beta_id in range(0, len(betas_raw))
                         if beta_id not in config.betas_missed_dict[cpg]]
            else:
                betas = betas_raw

            quartiles = np.percentile(betas, [25, 75])
            iqr = quartiles[1] - quartiles[0]
            left = quartiles[0] - (3.0 * iqr)
            right = quartiles[1] + (3.0 * iqr)

            curr_row = np.zeros(num_subjects, dtype=int)
            for subject_id in range(0, num_subjects):
                # BUG FIX: index the raw (unfiltered) row so subject_id stays
                # aligned with subjects; the original indexed the filtered
                # list, which misaligns (and can overflow) once any missed
                # values were removed above.  The isnan guard skips missed
                # entries stored as NaN.
                curr_point = betas_raw[subject_id]
                if not math.isnan(curr_point):
                    if curr_point < left or curr_point > right:
                        curr_row[subject_id] = 1

            config.epimutations_data[row] = curr_row

        np.savez_compressed(fn_data + '.npz', data=config.epimutations_data)
        np.savetxt(fn_data + '.txt',
                   config.epimutations_data,
                   delimiter='\t',
                   fmt='%d')

    # Clear data
    del config.betas_data
Example #21
0
def load_resid_old(config):
    """Load (or compute and cache) OLS residuals of betas on exog columns.

    Exogenous regressors come from 'cells' and/or 'observables' entries in
    ``config.experiment.data_params`` (which must not be empty).  Missed
    betas are excluded from each per-CpG fit and written back as NaN.
    Results are cached as ``resid_old<suffix>.npz`` plus two dict pickles.

    Raises ValueError when data_params is empty or the requested cells /
    observables types cannot all be found.
    """
    suffix = ''
    if bool(config.experiment.data_params):
        data_params = config.experiment.data_params
        suffix += '_' + config.experiment.get_data_params_str()
    else:
        raise ValueError('Exog for resid_old is empty.')

    fn_dict = get_data_base_path(
        config) + '/' + 'resid_old_dict' + suffix + '.pkl'
    fn_missed_dict = get_data_base_path(
        config) + '/' + 'resid_old_missed_dict' + suffix + '.pkl'
    fn_data = get_data_base_path(config) + '/' + 'resid_old' + suffix + '.npz'

    if os.path.isfile(fn_dict) and os.path.isfile(fn_data):

        with open(fn_dict, 'rb') as f:
            config.resid_old_dict = pickle.load(f)

        with open(fn_missed_dict, 'rb') as f:
            config.resid_old_missed_dict = pickle.load(f)

        data = np.load(fn_data)
        config.resid_old_data = data['data']

    else:

        # Load betas with only the shared data params.
        data_params_copy = copy.deepcopy(config.experiment.data_params)
        common_keys = ['norm']
        config.experiment.data_params = {}
        for key in common_keys:
            if key in data_params_copy:
                config.experiment.data_params[key] = data_params_copy[key]

        load_betas(config)

        config.resid_old_dict = config.betas_dict
        with open(fn_dict, 'wb') as f:
            pickle.dump(config.resid_old_dict, f, pickle.HIGHEST_PROTOCOL)

        config.resid_old_missed_dict = config.betas_missed_dict
        with open(fn_missed_dict, 'wb') as f:
            pickle.dump(config.resid_old_missed_dict, f,
                        pickle.HIGHEST_PROTOCOL)

        # Assemble the exogenous regressor columns.
        exog_dict = {}

        if 'cells' in data_params:

            cells_dict = load_cells_dict(config)

            if isinstance(data_params['cells'], list):
                all_types = list(cells_dict.keys())
                for key in all_types:
                    if key not in data_params['cells']:
                        cells_dict.pop(key)

                if len(list(cells_dict.keys())) != len(data_params['cells']):
                    raise ValueError('Wrong number of cells types.')

                exog_dict.update(cells_dict)

        if 'observables' in data_params:
            observables_categorical_dict = load_observables_categorical_dict(
                config)
            if isinstance(data_params['observables'], list):
                all_types = list(observables_categorical_dict.keys())
                for key in all_types:
                    if key not in data_params['observables']:
                        observables_categorical_dict.pop(key)

                if len(list(observables_categorical_dict.keys())) != len(
                        data_params['observables']):
                    raise ValueError('Wrong number of observables types.')

                exog_dict.update(observables_categorical_dict)

        num_cpgs = config.betas_data.shape[0]
        num_subjects = config.betas_data.shape[1]
        config.resid_old_data = np.zeros((num_cpgs, num_subjects),
                                         dtype=np.float32)

        for cpg, row in tqdm(config.betas_dict.items(),
                             mininterval=60.0,
                             desc='resid_old_data creating'):
            raw_betas = config.betas_data[row, :]

            current_exog_dict = copy.deepcopy(exog_dict)

            if len(config.betas_missed_dict[cpg]) > 0:

                # Drop the missed subjects from every regressor column...
                for key in current_exog_dict:
                    values = []
                    for value_id in range(0, len(current_exog_dict[key])):
                        if value_id not in config.betas_missed_dict[cpg]:
                            values.append(current_exog_dict[key][value_id])
                    current_exog_dict[key] = values

                # ...and from the response, remembering original positions.
                betas = []
                passed_ids = []
                for beta_id in range(0, len(raw_betas)):
                    if beta_id not in config.betas_missed_dict[cpg]:
                        betas.append(raw_betas[beta_id])
                        passed_ids.append(beta_id)
            else:

                betas = raw_betas
                passed_ids = list(range(0, len(betas)))

            endog_dict = {cpg: betas}
            endog_df = pd.DataFrame(endog_dict)
            exog_df = pd.DataFrame(current_exog_dict)

            reg_res = sm.OLS(endog=endog_df, exog=exog_df).fit()

            # Scatter the residuals back into subject order; missed
            # positions are filled with NaN below.
            resid_old = list(map(np.float32, reg_res.resid))
            resid_old_raw = np.zeros(num_subjects, dtype=np.float32)
            for beta_id in range(0, len(passed_ids)):
                resid_old_raw[passed_ids[beta_id]] = resid_old[beta_id]

            for missed_id in config.resid_old_missed_dict[cpg]:
                resid_old_raw[missed_id] = np.float32('nan')

            config.resid_old_data[row] = resid_old_raw

        np.savez_compressed(fn_data, data=config.resid_old_data)

        # Clear data
        del config.betas_data
Example #22
0
def load_betas_adj(config):
    """Load (or compute and cache) mean-recentered adjusted betas.

    Each CpG row is regressed (OLS) on the exogenous columns requested in
    ``config.experiment.data_params`` ('cells' / 'observables'); the
    residuals plus the row mean form the adjusted betas, cached as
    ``betas_adj<suffix>.npz``.

    Raises ValueError when data_params is empty or the requested cells /
    observables types cannot all be found.
    """
    fn_dict = get_data_base_path(config) + '/' + 'betas_adj_dict.pkl'

    suffix = ''
    if bool(config.experiment.data_params):
        data_params = config.experiment.data_params
        suffix += '_' + config.experiment.get_data_params_str()
    else:
        raise ValueError('Exog for residuals is empty.')

    fn_data = get_data_base_path(config) + '/' + 'betas_adj' + suffix + '.npz'

    if os.path.isfile(fn_dict) and os.path.isfile(fn_data):

        with open(fn_dict, 'rb') as f:
            config.betas_adj_dict = pickle.load(f)

        data = np.load(fn_data)
        config.betas_adj_data = data['data']

    else:

        config.experiment.data_params = {}
        load_betas(config)

        config.betas_adj_dict = config.betas_dict
        with open(fn_dict, 'wb') as f:
            pickle.dump(config.betas_adj_dict, f, pickle.HIGHEST_PROTOCOL)

        # Assemble the exogenous regressor columns.
        exog_dict = {}

        if 'cells' in data_params:

            cells_dict = load_cells_dict(config)

            if isinstance(data_params['cells'], list):
                all_types = list(cells_dict.keys())
                for key in all_types:
                    if key not in data_params['cells']:
                        cells_dict.pop(key)

                if len(list(cells_dict.keys())) != len(data_params['cells']):
                    raise ValueError('Wrong number of cells types.')

                exog_dict.update(cells_dict)

        if 'observables' in data_params:

            observables_dict = load_observables_dict(config)
            if isinstance(data_params['observables'], list):
                all_types = list(observables_dict.keys())
                for key in all_types:
                    if key not in data_params['observables']:
                        observables_dict.pop(key)

                if len(list(observables_dict.keys())) != len(
                        data_params['observables']):
                    raise ValueError('Wrong number of observables types.')

                exog_dict.update(observables_dict)

        exog_df = pd.DataFrame(exog_dict)

        num_cpgs = config.betas_data.shape[0]
        num_subjects = config.betas_data.shape[1]
        config.betas_adj_data = np.zeros((num_cpgs, num_subjects),
                                         dtype=np.float32)

        for cpg, row in tqdm(config.betas_dict.items(),
                             mininterval=60.0,
                             desc='betas_adj_data creating'):
            betas = config.betas_data[row, :]

            mean = np.mean(betas)

            endog_dict = {cpg: betas}
            endog_df = pd.DataFrame(endog_dict)

            reg_res = sm.OLS(endog=endog_df, exog=exog_df).fit()

            # BUG FIX: the original added a scalar to a Python list
            # (TypeError at runtime); use ndarray arithmetic to recenter
            # the residuals around the row mean.
            residuals = np.asarray(reg_res.resid, dtype=np.float32)
            betas_adj = residuals + np.float32(mean)
            config.betas_adj_data[row] = betas_adj

        np.savez_compressed(fn_data, data=config.betas_adj_data)

        # Clear data
        del config.betas_data
Example #23
0
def load_betas(config):
    """Load methylation beta values into ``config``, building caches on first use.

    Populates:
      config.betas_dict        : cpg name -> row index into ``betas_data``
      config.betas_missed_dict : cpg name -> sorted list of subject indexes with
                                 missed values (plus an ``'any'`` key, kept empty here)
      config.betas_data        : float32 matrix of shape (num_cpgs, num_subjects),
                                 NaN where a value was missed

    On the first call the raw tab-separated text file is parsed in two passes
    (first to index cpgs and missed values, then to fill the matrix) and the
    results are cached as pickles plus a compressed npz; later calls load the
    caches directly.
    """
    suffix = ''
    if bool(config.experiment.data_params):
        suffix += '_' + str(config.experiment.get_data_params_str())

    fn_dict = get_data_base_path(config) + '/' + 'betas_dict' + suffix + '.pkl'
    fn_missed_dict = get_data_base_path(config) + '/' + 'betas_missed_dict' + suffix + '.pkl'
    fn_data = get_data_base_path(config) + '/' + 'betas' + suffix
    fn_txt = fn_data + '.txt'
    fn_npz = fn_data + '.npz'

    # BUG FIX: the original tested fn_dict twice and never verified that the
    # missed-values cache exists, although it is unpickled unconditionally below.
    if os.path.isfile(fn_dict) and os.path.isfile(fn_missed_dict) and os.path.isfile(fn_npz):

        with open(fn_dict, 'rb') as f:
            config.betas_dict = pickle.load(f)

        with open(fn_missed_dict, 'rb') as f:
            config.betas_missed_dict = pickle.load(f)

        data = np.load(fn_npz)
        config.betas_data = data['data']

    else:

        config.betas_dict = {}
        config.betas_missed_dict = {'any': []}

        # First pass: assign a row id to every cpg and record the positions of
        # missed values (raw string tokens compared against the configured markers).
        with open(fn_txt) as f:
            f.readline()  # skip header line
            cpg_id = 0
            for line in tqdm(f, mininterval=60.0, desc='betas_dict creating'):
                line_list = get_line_list(line)
                cpg = line_list[0]
                betas = line_list[1:]

                missed_indexes = []
                for missed_value in config.annotations.missed_values:
                    missed_indexes += [i for i, x in enumerate(betas) if x == missed_value]
                missed_indexes.sort()
                config.betas_missed_dict[cpg] = missed_indexes

                config.betas_dict[cpg] = cpg_id
                cpg_id += 1

        with open(fn_dict, 'wb') as f:
            pickle.dump(config.betas_dict, f, pickle.HIGHEST_PROTOCOL)

        with open(fn_missed_dict, 'wb') as f:
            pickle.dump(config.betas_missed_dict, f, pickle.HIGHEST_PROTOCOL)

        num_cpgs = cpg_id

        # Second pass: fill the data matrix, substituting NaN for missed values.
        with open(fn_txt) as f:
            header_line = f.readline()
            headers = get_line_list(header_line)
            subjects = headers[1:]

            config.betas_data = np.zeros((num_cpgs, len(subjects)), dtype=np.float32)

            cpg_id = 0
            for line in tqdm(f, mininterval=60.0, desc='betas_data creating'):
                line_list = get_line_list(line)
                cpg = line_list[0]
                betas = line_list[1:]
                # set lookup instead of O(n) list membership per subject
                missed = set(config.betas_missed_dict[cpg])
                for beta_id in range(len(betas)):
                    if beta_id in missed:
                        betas[beta_id] = np.float32('nan')
                    else:
                        betas[beta_id] = np.float32(betas[beta_id])
                config.betas_data[cpg_id] = betas
                cpg_id += 1

        np.savez_compressed(fn_npz, data=config.betas_data)
Example #24
0
def load_betas_adj(config):
    """Load shift-adjusted betas: regression residuals moved back to a beta-like level.

    Populates:
      config.betas_adj_dict        : cpg name -> row index (shared with residuals)
      config.betas_adj_missed_dict : cpg name -> missed subject indexes
      config.betas_adj_data        : float32 matrix (num_cpgs, num_subjects)

    For each cpg the adjusted values are its residuals shifted by the mean of
    the raw betas; if that shift would leave a negative value, |min(residuals)|
    is used instead. Results are cached (pickles + compressed npz) and reloaded
    on subsequent calls.

    Raises:
      ValueError: if ``config.experiment.data_params`` is empty (no exog to
        regress out, so there is nothing to adjust for).
    """
    suffix = ''
    if bool(config.experiment.data_params):
        suffix += '_' + config.experiment.get_data_params_str()
    else:
        raise ValueError('Exog for residuals is empty.')

    fn_dict = get_data_base_path(
        config) + '/' + 'betas_adj_dict' + suffix + '.pkl'
    fn_missed_dict = get_data_base_path(
        config) + '/' + 'betas_adj_missed_dict' + suffix + '.pkl'
    fn_data = get_data_base_path(config) + '/' + 'betas_adj' + suffix + '.npz'

    # BUG FIX: also require the missed-values cache before taking the cached
    # branch — it is unpickled unconditionally below.
    if os.path.isfile(fn_dict) and os.path.isfile(fn_missed_dict) and os.path.isfile(fn_data):

        with open(fn_dict, 'rb') as f:
            config.betas_adj_dict = pickle.load(f)

        with open(fn_missed_dict, 'rb') as f:
            config.betas_adj_missed_dict = pickle.load(f)

        data = np.load(fn_data)
        config.betas_adj_data = data['data']

    else:
        # Residuals are computed with the requested exog (data_params); the raw
        # betas are then reloaded WITHOUT any data_params suffix for the shift.
        load_residuals(config)

        config.experiment.data_params = {}
        load_betas(config)

        config.betas_adj_dict = config.residuals_dict
        with open(fn_dict, 'wb') as f:
            pickle.dump(config.betas_adj_dict, f, pickle.HIGHEST_PROTOCOL)

        config.betas_adj_missed_dict = config.residuals_missed_dict
        # BUG FIX: the original pickled config.betas_missed_dict here, caching
        # the raw-betas dict under the betas_adj_missed_dict filename, so later
        # cached-branch loads got the wrong dict.
        with open(fn_missed_dict, 'wb') as f:
            pickle.dump(config.betas_adj_missed_dict, f, pickle.HIGHEST_PROTOCOL)

        num_cpgs = config.betas_data.shape[0]
        num_subjects = config.betas_data.shape[1]
        config.betas_adj_data = np.zeros((num_cpgs, num_subjects),
                                         dtype=np.float32)

        for cpg in tqdm(config.betas_adj_dict,
                        mininterval=60.0,
                        desc='betas_adj_data creating'):

            residuals = config.residuals_data[config.residuals_dict[cpg], :]
            betas = config.betas_data[config.betas_dict[cpg], :]

            min_residuals = np.min(residuals)

            # Shift residuals up to the raw betas' mean level; fall back to
            # |min(residuals)| so no adjusted value ends up negative.
            shift = np.mean(betas)
            if min_residuals + shift < 0:
                shift = abs(min_residuals)

            config.betas_adj_data[config.residuals_dict[cpg]] = residuals + shift

        np.savez_compressed(fn_data, data=config.betas_adj_data)

        # Clear large intermediates to free memory
        del config.residuals_data
        del config.betas_data