Example #1
0
    def generate_test_bench(self, count_file_path, **kwargs):
        """Write the benchmark count file and its hidden companion data.

        The matrix returned by ``self._load_data()`` is stripped of rows
        whose sum is not positive, passed through
        ``shuffle_and_rename_columns`` and then stored twice: inside a gzip
        pickle under ``settings.STORAGE_DIR`` (together with the classes
        and the column bookkeeping) and as a CSV at ``count_file_path``.

        Args:
            count_file_path: Destination of the CSV count file. It is made
                absolute before use and its parent directory is created.
            **kwargs: Must contain ``preserve_columns``; it is forwarded as
                ``disabled`` to ``shuffle_and_rename_columns``.

        Raises:
            KeyError: If ``preserve_columns`` is missing from ``kwargs``.
        """
        keep_column_order = kwargs['preserve_columns']
        count_file_path = os.path.abspath(count_file_path)

        counts, classes = self._load_data()

        # Keep only the rows whose total count is positive.
        has_counts = np.sum(counts, axis=1) > 0
        counts = counts[has_counts].copy()

        # Column shuffling/renaming is delegated to the helper; the flag is
        # passed through as `disabled`.
        shuffled = shuffle_and_rename_columns(counts,
                                              disabled=keep_column_order)
        counts, original_columns, column_permutation = shuffled

        # Hidden data: the published matrix plus what is needed to map it
        # back (classes, original column names, permutation).
        make_sure_dir_exists(settings.STORAGE_DIR)
        hidden_file_name = "%s.hidden.pkl.gz" % self.uid
        hidden_data_file_path = os.path.join(settings.STORAGE_DIR,
                                             hidden_file_name)
        hidden_payload = [
            counts.to_sparse(),
            classes,
            original_columns,
            column_permutation,
        ]
        dump_gzip_pickle(hidden_payload, hidden_data_file_path)
        log("Benchmark hidden data saved to `%s`" % hidden_data_file_path)

        # Public count file.
        output_dir = os.path.dirname(count_file_path)
        make_sure_dir_exists(output_dir)
        write_csv(counts, count_file_path)
        log("Count file saved to `%s`" % count_file_path)
Example #2
0
    def generate_test_bench(self, count_file_path, **kwargs):
        """Write the RNA count file and the hidden RNA/ADT companion data.

        The ``"RNA"`` and ``"ADT"`` entries are read from ``self.data_set``.
        The RNA table is passed through ``shuffle_and_rename_columns`` and
        then stripped of rows whose sum is not positive. The resulting RNA
        table is written as CSV to ``count_file_path``; a gzip pickle with
        the RNA table, the column bookkeeping, the ADT table and
        ``self.protein_rna_mapping`` is stored under
        ``settings.STORAGE_DIR``.

        Args:
            count_file_path: Destination of the CSV count file. It is made
                absolute before use and its parent directory is created.
            **kwargs: Must contain ``preserve_columns``; it is forwarded as
                ``disabled`` to ``shuffle_and_rename_columns``.

        Raises:
            KeyError: If ``preserve_columns`` is missing from ``kwargs``.
        """
        keep_column_order = kwargs['preserve_columns']
        count_file_path = os.path.abspath(count_file_path)

        rna_counts = self.data_set.get("RNA")
        adt_counts = self.data_set.get("ADT")

        # Only the RNA table goes through the column helper; the ADT table
        # is stored as loaded.
        shuffled = shuffle_and_rename_columns(rna_counts,
                                              disabled=keep_column_order)
        rna_counts, original_columns, column_permutation = shuffled

        # Keep only the RNA rows whose total count is positive.
        has_counts = np.sum(rna_counts, axis=1) > 0
        rna_counts = rna_counts[has_counts].copy()

        # Hidden data.
        make_sure_dir_exists(settings.STORAGE_DIR)
        hidden_file_name = "%s.hidden.pkl.gz" % self.uid
        hidden_data_file_path = os.path.join(settings.STORAGE_DIR,
                                             hidden_file_name)
        hidden_payload = [
            rna_counts.to_sparse(),
            original_columns,
            column_permutation,
            adt_counts.to_sparse(),
            self.protein_rna_mapping,
        ]
        dump_gzip_pickle(hidden_payload, hidden_data_file_path)
        log("Benchmark hidden data saved to `%s`" % hidden_data_file_path)

        # Public count file.
        output_dir = os.path.dirname(count_file_path)
        make_sure_dir_exists(output_dir)
        write_csv(rna_counts, count_file_path)
        log("Count file saved to `%s`" % count_file_path)
Example #3
0
    def generate_test_bench(self, count_file_path, **kwargs):
        """Create a count file with randomly zeroed entries plus hidden data.

        Entries that lie in the rows returned by ``self.get_hvg_genes`` and
        whose value is at least ``min_expression`` are candidates for
        elimination. ``dropout_count`` of them are drawn without replacement
        and set to zero in the published matrix. Rows whose sum is not
        positive after elimination are removed from the published matrix,
        the original data and the mask alike.

        The published matrix is passed through ``shuffle_and_rename_columns``
        and written as CSV to ``count_file_path``. The original data, the
        mask and the column bookkeeping go into a gzip pickle under
        ``settings.STORAGE_DIR``.

        Args:
            count_file_path: Destination of the CSV count file. It is made
                absolute before use and its parent directory is created.
            **kwargs: Must contain ``n_samples`` (forwarded to
                ``self._load_data``), ``dropout_count``, ``min_expression``,
                ``hvg_frac`` (forwarded to ``self.get_hvg_genes``) and
                ``preserve_columns`` (forwarded as ``disabled`` to
                ``shuffle_and_rename_columns``).

        Raises:
            KeyError: If one of the required keyword arguments is missing.
            ValueError: If ``dropout_count`` exceeds the number of candidate
                entries.
        """
        n_samples = kwargs['n_samples']
        dropout_count = kwargs['dropout_count']
        min_expression = kwargs['min_expression']
        hvg_frac = kwargs['hvg_frac']
        preserve_columns = kwargs['preserve_columns']

        count_file_path = os.path.abspath(count_file_path)
        data = self._load_data(n_samples)

        hvg_indices = self.get_hvg_genes(data, hvg_frac)

        # Generate elimination mask.
        # Candidate (row, column) positions are found in one vectorized
        # pass. `np.argwhere` reports hits in row-major order, i.e. in the
        # order of `hvg_indices` and then by column, so a given random draw
        # selects the same entries as a nested row/column scan would.
        data_values = data.values
        hvg_rows = np.asarray(list(hvg_indices), dtype=int)
        hits = np.argwhere(data_values[hvg_rows, :] >= min_expression)
        candidate_rows = hvg_rows[hits[:, 0]]
        candidate_cols = hits[:, 1]
        del data_values, hits

        n_candidates = len(candidate_rows)
        if dropout_count > n_candidates:
            raise ValueError(
                "Cannot eliminate %s entries: only %d entries satisfy the "
                "expression threshold" % (dropout_count, n_candidates))

        mask = np.zeros_like(data)
        chosen = np.random.choice(n_candidates, dropout_count, replace=False)
        mask[candidate_rows[chosen], candidate_cols[chosen]] = 1

        mask = pd.DataFrame(mask, index=data.index, columns=data.columns)

        # Elimination: masked entries become zero, all others are unchanged.
        low_quality_data = data * (1 - mask.values)

        # Drop rows that are empty after elimination from all three tables
        # so they stay aligned.
        is_nonzero = np.sum(low_quality_data, axis=1) > 0
        mask = mask[is_nonzero].copy()
        data = data[is_nonzero].copy()
        low_quality_data = low_quality_data[is_nonzero].copy()

        # Shuffle columns (only the published matrix goes through the
        # helper; `data` and `mask` are stored as they are).
        low_quality_data, original_columns, column_permutation = \
            shuffle_and_rename_columns(low_quality_data, disabled=preserve_columns)

        # Save hidden data
        make_sure_dir_exists(settings.STORAGE_DIR)
        hidden_data_file_path = os.path.join(settings.STORAGE_DIR,
                                             "%s.hidden.pkl.gz" % self.uid)
        dump_gzip_pickle([
            data.to_sparse(),
            mask.to_sparse(), original_columns, column_permutation
        ], hidden_data_file_path)
        log("Benchmark hidden data saved to `%s`" % hidden_data_file_path)

        make_sure_dir_exists(os.path.dirname(count_file_path))
        write_csv(low_quality_data, count_file_path)
        log("Count file saved to `%s`" % count_file_path)
Example #4
0
    def generate_test_bench(self, count_file_path, **kwargs):
        """Create a down-sampled count file plus hidden reference data.

        Every read of the integer-cast data matrix gets an id in
        ``[0, n_all_reads)``, numbered cell by cell in row-major order.
        ``int(read_ratio * n_all_reads)`` ids are drawn with
        ``np.random.choice`` and each drawn id is counted in the cell that
        owns it, which yields the down-sampled matrix.

        The down-sampled matrix is passed through
        ``shuffle_and_rename_columns``; rows whose down-sampled sum is not
        positive are removed from both the down-sampled matrix and the
        original data. The down-sampled matrix is written as CSV to
        ``count_file_path``. The original data, ``read_ratio`` and the
        column bookkeeping go into a gzip pickle under
        ``settings.STORAGE_DIR``.

        Args:
            count_file_path: Destination of the CSV count file. It is made
                absolute before use and its parent directory is created.
            **kwargs: Must contain ``n_samples`` (forwarded to
                ``self._load_data``), ``read_ratio``, ``replace`` (forwarded
                to ``np.random.choice``) and ``preserve_columns`` (forwarded
                as ``disabled`` to ``shuffle_and_rename_columns``).

        Raises:
            KeyError: If one of the required keyword arguments is missing.
        """
        n_samples = kwargs['n_samples']
        read_ratio = kwargs['read_ratio']
        replace = kwargs['replace']
        preserve_columns = kwargs['preserve_columns']

        count_file_path = os.path.abspath(count_file_path)
        data = self._load_data(n_samples)

        # Cumulative read counts over the flattened (row-major) matrix:
        # cell i owns the read ids in [boundaries[i - 1], boundaries[i]).
        data_values = data.astype(int).values
        n_all_reads = np.sum(data_values)
        read_boundaries = np.cumsum(data_values)

        # Sample from original dataset
        new_reads = np.sort(
            np.random.choice(n_all_reads,
                             int(read_ratio * n_all_reads),
                             replace=replace))

        # For every cell, count the sampled ids strictly below its upper
        # boundary; the difference between consecutive cells is the number
        # of sampled reads that fall into the cell. The running maximum is
        # a no-op for non-negative counts and keeps the result identical to
        # a sequential scan if the cumulative sum ever decreases.
        reads_below = np.searchsorted(
            new_reads, np.maximum.accumulate(read_boundaries), side='left')
        reads_per_cell = np.diff(np.concatenate(([0], reads_below)))
        low_quality_data = reads_per_cell.reshape(data_values.shape).astype(
            data_values.dtype)

        # Convert to data frame
        low_quality_data = pd.DataFrame(low_quality_data,
                                        index=data.index,
                                        columns=data.columns)

        # Shuffle columns
        low_quality_data, original_columns, column_permutation = \
            shuffle_and_rename_columns(low_quality_data, disabled=preserve_columns)

        # Remove zero rows (decided on the down-sampled matrix, applied to
        # both tables so their rows stay aligned).
        has_reads = np.sum(low_quality_data, axis=1) > 0
        data = data[has_reads].copy()
        low_quality_data = low_quality_data[has_reads].copy()

        # Save hidden data
        make_sure_dir_exists(settings.STORAGE_DIR)
        hidden_data_file_path = os.path.join(settings.STORAGE_DIR,
                                             "%s.hidden.pkl.gz" % self.uid)
        dump_gzip_pickle([
            data.to_sparse(), read_ratio, original_columns, column_permutation
        ], hidden_data_file_path)
        log("Benchmark hidden data saved to `%s`" % hidden_data_file_path)

        make_sure_dir_exists(os.path.dirname(count_file_path))
        write_csv(low_quality_data, count_file_path)
        log("Count file saved to `%s`" % count_file_path)
Example #5
0
    def generate_test_bench(self, count_file_path, **kwargs):
        """Write the filtered count file and its hidden companion data.

        The table returned by ``self._load_and_combine_data()`` is filtered
        according to the flags in ``kwargs`` and rows whose sum equals zero
        are dropped. The filtered table is passed through
        ``shuffle_and_rename_columns``; the helper's output is written as
        CSV to ``count_file_path``, while the filtered table itself and the
        column bookkeeping go into a gzip pickle under
        ``settings.STORAGE_DIR``.

        Args:
            count_file_path: Destination of the CSV count file. It is made
                absolute before use and its parent directory is created.
            **kwargs: Must contain
                ``rm_ercc`` (drop rows whose label starts with ``"ERCC-"``),
                ``rm_mt`` (drop rows whose label starts with ``"mt-"``),
                ``rm_lq`` (drop columns whose sum is below ``1e6``) and
                ``preserve_columns`` (forwarded as ``disabled`` to
                ``shuffle_and_rename_columns``).

        Returns:
            None.

        Raises:
            KeyError: If one of the required keyword arguments is missing.
        """
        count_file_path = os.path.abspath(count_file_path)
        rm_ercc = kwargs['rm_ercc']
        rm_mt = kwargs['rm_mt']
        rm_lq = kwargs['rm_lq']
        keep_column_order = kwargs['preserve_columns']

        # Load dataset
        data = self._load_and_combine_data()

        # Optional row removal by label prefix, applied in this order.
        for enabled, prefix in ((rm_ercc, "ERCC-"), (rm_mt, "mt-")):
            if enabled:
                matching_rows = [
                    label for label in data.index.values
                    if label.startswith(prefix)
                ]
                data = data.drop(matching_rows)

        # Optional removal of columns whose total is below 1e6.
        if rm_lq:
            low_total_columns = data.columns.values[data.sum(axis=0) < 1e6]
            data = data.drop(columns=low_total_columns)

        # Rows that sum to zero are always removed.
        empty_rows = data.index.values[data.sum(axis=1) == 0]
        data = data.drop(empty_rows)

        # The helper's output is what gets published; `data` itself is kept
        # for the hidden file.
        shuffled = shuffle_and_rename_columns(data, disabled=keep_column_order)
        new_data, original_columns, column_permutation = shuffled

        # Hidden data.
        make_sure_dir_exists(settings.STORAGE_DIR)
        hidden_file_name = "%s.hidden.pkl.gz" % self.uid
        hidden_data_file_path = os.path.join(settings.STORAGE_DIR,
                                             hidden_file_name)
        hidden_payload = [data.to_sparse(), original_columns, column_permutation]
        dump_gzip_pickle(hidden_payload, hidden_data_file_path)
        log("Benchmark hidden data saved to `%s`" % hidden_data_file_path)

        # Public count file.
        output_dir = os.path.dirname(count_file_path)
        make_sure_dir_exists(output_dir)
        write_csv(new_data, count_file_path)
        log("Count file saved to `%s`" % count_file_path)