def generate_test_bench(self, count_file_path, **kwargs):
    """Write the benchmark count file and store the hidden evaluation state.

    The RNA table from ``self.data_set`` is passed through
    ``shuffle_and_rename_columns``, rows whose sum is not positive are
    dropped, and the result is written to ``count_file_path``. The RNA table,
    the column bookkeeping returned by the shuffle helper, the ADT table and
    ``self.protein_rna_mapping`` are pickled under ``settings.STORAGE_DIR``.

    Args:
        count_file_path: Destination of the generated count file; it is
            converted to an absolute path first.
        **kwargs: Must contain ``preserve_columns``, which is forwarded to
            ``shuffle_and_rename_columns`` as ``disabled``.

    Raises:
        KeyError: If ``preserve_columns`` is missing from ``kwargs``.
    """
    keep_column_order = kwargs['preserve_columns']
    count_file_path = os.path.abspath(count_file_path)

    rna_counts = self.data_set.get("RNA")
    adt_counts = self.data_set.get("ADT")

    # Shuffle columns (the helper is told to skip it via `disabled`).
    shuffled = shuffle_and_rename_columns(rna_counts,
                                          disabled=keep_column_order)
    rna_counts, original_columns, column_permutation = shuffled

    # Keep only rows that have a positive total.
    has_counts = np.sum(rna_counts, axis=1) > 0
    rna_counts = rna_counts[has_counts].copy()

    # Persist everything the evaluation step needs later on.
    make_sure_dir_exists(settings.STORAGE_DIR)
    hidden_data_file_path = os.path.join(settings.STORAGE_DIR,
                                         "%s.hidden.pkl.gz" % self.uid)
    hidden_state = [
        rna_counts.to_sparse(),
        original_columns,
        column_permutation,
        adt_counts.to_sparse(),
        self.protein_rna_mapping,
    ]
    dump_gzip_pickle(hidden_state, hidden_data_file_path)
    log("Benchmark hidden data saved to `%s`" % hidden_data_file_path)

    # Write the visible benchmark input.
    make_sure_dir_exists(os.path.dirname(count_file_path))
    write_csv(rna_counts, count_file_path)
    log("Count file saved to `%s`" % count_file_path)
def generate_test_bench(self, count_file_path, **kwargs):
    """Write the benchmark count file and store the hidden evaluation state.

    The count matrix returned by ``self._load_data()`` is stripped of rows
    whose sum is not positive, passed through ``shuffle_and_rename_columns``
    and written to ``count_file_path``. The resulting matrix, the class
    information and the column bookkeeping returned by the shuffle helper are
    pickled under ``settings.STORAGE_DIR``.

    Args:
        count_file_path: Destination of the generated count file; it is
            converted to an absolute path first.
        **kwargs: Must contain ``preserve_columns``, which is forwarded to
            ``shuffle_and_rename_columns`` as ``disabled``.

    Raises:
        KeyError: If ``preserve_columns`` is missing from ``kwargs``.
    """
    keep_column_order = kwargs['preserve_columns']
    count_file_path = os.path.abspath(count_file_path)

    counts, classes = self._load_data()

    # Keep only rows that have a positive total.
    has_counts = np.sum(counts, axis=1) > 0
    counts = counts[has_counts].copy()

    # Shuffle columns (the helper is told to skip it via `disabled`).
    shuffled = shuffle_and_rename_columns(counts, disabled=keep_column_order)
    counts, original_columns, column_permutation = shuffled

    # Persist everything the evaluation step needs later on.
    make_sure_dir_exists(settings.STORAGE_DIR)
    hidden_data_file_path = os.path.join(settings.STORAGE_DIR,
                                         "%s.hidden.pkl.gz" % self.uid)
    hidden_state = [
        counts.to_sparse(),
        classes,
        original_columns,
        column_permutation,
    ]
    dump_gzip_pickle(hidden_state, hidden_data_file_path)
    log("Benchmark hidden data saved to `%s`" % hidden_data_file_path)

    # Write the visible benchmark input.
    make_sure_dir_exists(os.path.dirname(count_file_path))
    write_csv(counts, count_file_path)
    log("Count file saved to `%s`" % count_file_path)
def generate_test_bench(self, count_file_path, **kwargs):
    """Generate a benchmark file in which selected entries are zeroed out.

    Entries that lie in the rows returned by ``self.get_hvg_genes`` and whose
    value is at least ``min_expression`` are candidates; ``dropout_count`` of
    them are drawn without replacement and set to zero in the written file.
    Rows whose sum is not positive after the elimination are removed from the
    data, the mask and the written table alike. The original data, the mask
    and the column bookkeeping returned by ``shuffle_and_rename_columns`` are
    pickled under ``settings.STORAGE_DIR``.

    Args:
        count_file_path: Destination of the generated count file; it is
            converted to an absolute path first.
        **kwargs: Must contain ``n_samples`` (forwarded to
            ``self._load_data``), ``dropout_count``, ``min_expression``,
            ``hvg_frac`` (forwarded to ``self.get_hvg_genes``) and
            ``preserve_columns`` (forwarded to the shuffle helper as
            ``disabled``).

    Raises:
        KeyError: If one of the required keyword arguments is missing.
        ValueError: If ``dropout_count`` exceeds the number of candidate
            entries.
    """
    n_samples = kwargs['n_samples']
    dropout_count = kwargs['dropout_count']
    min_expression = kwargs['min_expression']
    hvg_frac = kwargs['hvg_frac']
    preserve_columns = kwargs['preserve_columns']
    count_file_path = os.path.abspath(count_file_path)

    data = self._load_data(n_samples)
    hvg_indices = self.get_hvg_genes(data, hvg_frac)

    # Generate elimination mask.
    # Candidates are collected in one vectorized pass. np.nonzero walks the
    # selected rows in row-major order, which is the order the former nested
    # Python loops produced, so a seeded random draw selects the same cells.
    hvg_rows = np.asarray(list(hvg_indices), dtype=int)
    local_rows, candidate_cols = np.nonzero(
        data.values[hvg_rows, :] >= min_expression)
    candidate_rows = hvg_rows[local_rows]
    n_candidates = len(candidate_rows)

    if dropout_count > n_candidates:
        # np.random.choice would raise ValueError here as well; this message
        # simply states the actual cause.
        raise ValueError(
            "Cannot mask %d entries: only %d entries reach the minimum "
            "expression threshold" % (dropout_count, n_candidates))

    chosen = np.random.choice(n_candidates, dropout_count, replace=False)
    mask = np.zeros_like(data)
    mask[candidate_rows[chosen], candidate_cols[chosen]] = 1
    mask = pd.DataFrame(mask, index=data.index, columns=data.columns)

    # Elimination: zero the masked entries, then drop rows left without any
    # positive total from all three tables so that they stay aligned.
    low_quality_data = data * (1 - mask.values)
    is_nonzero = np.sum(low_quality_data, axis=1) > 0
    mask = mask[is_nonzero].copy()
    data = data[is_nonzero].copy()
    low_quality_data = low_quality_data[is_nonzero].copy()

    # Shuffle columns (the helper is told to skip it via `disabled`).
    low_quality_data, original_columns, column_permutation = \
        shuffle_and_rename_columns(low_quality_data,
                                   disabled=preserve_columns)

    # Save hidden data
    make_sure_dir_exists(settings.STORAGE_DIR)
    hidden_data_file_path = os.path.join(settings.STORAGE_DIR,
                                         "%s.hidden.pkl.gz" % self.uid)
    dump_gzip_pickle([
        data.to_sparse(),
        mask.to_sparse(),
        original_columns,
        column_permutation,
    ], hidden_data_file_path)
    log("Benchmark hidden data saved to `%s`" % hidden_data_file_path)

    make_sure_dir_exists(os.path.dirname(count_file_path))
    write_csv(low_quality_data, count_file_path)
    log("Count file saved to `%s`" % count_file_path)
def generate_test_bench(self, count_file_path, **kwargs):
    """Generate a benchmark file by sub-sampling the reads of the data set.

    Every unit of count in the (integer-cast) data is treated as one read.
    ``int(read_ratio * total_reads)`` read ids are drawn with
    ``np.random.choice`` and each drawn read is credited to the cell it
    belongs to, giving the down-sampled table that is written to
    ``count_file_path``. Rows whose down-sampled sum is not positive are
    removed from both the original and the down-sampled table. The original
    data, ``read_ratio`` and the column bookkeeping returned by
    ``shuffle_and_rename_columns`` are pickled under ``settings.STORAGE_DIR``.

    Args:
        count_file_path: Destination of the generated count file; it is
            converted to an absolute path first.
        **kwargs: Must contain ``n_samples`` (forwarded to
            ``self._load_data``), ``read_ratio``, ``replace`` (forwarded to
            ``np.random.choice``) and ``preserve_columns`` (forwarded to the
            shuffle helper as ``disabled``).

    Raises:
        KeyError: If one of the required keyword arguments is missing.
    """
    n_samples = kwargs['n_samples']
    read_ratio = kwargs['read_ratio']
    replace = kwargs['replace']
    preserve_columns = kwargs['preserve_columns']
    count_file_path = os.path.abspath(count_file_path)

    data = self._load_data(n_samples)

    # Cumulative read counts over the cells in row-major order: cell k owns
    # the read ids in [cumsum[k - 1], cumsum[k]).
    data_values = data.astype(int).values
    n_all_reads = np.sum(data_values)
    read_boundaries = np.cumsum(data_values)

    # Sample from original dataset
    new_reads = np.sort(
        np.random.choice(n_all_reads,
                         int(read_ratio * n_all_reads),
                         replace=replace))

    # For each cell, count the sampled reads that fall below its upper
    # boundary; the difference between consecutive cells is the number of
    # reads credited to that cell. The running maximum is a no-op for
    # non-negative counts and keeps the result identical to a sequential
    # scan should the cumulative sum ever decrease.
    reads_below = np.searchsorted(new_reads,
                                  np.maximum.accumulate(read_boundaries),
                                  side='left')
    reads_per_cell = np.diff(np.concatenate(([0], reads_below)))
    low_quality_values = reads_per_cell.reshape(data_values.shape).astype(
        data_values.dtype)

    # Convert to data frame
    low_quality_data = pd.DataFrame(low_quality_values,
                                    index=data.index,
                                    columns=data.columns)

    # Shuffle columns (the helper is told to skip it via `disabled`).
    low_quality_data, original_columns, column_permutation = \
        shuffle_and_rename_columns(low_quality_data,
                                   disabled=preserve_columns)

    # Remove zero rows: filter the original data with the same row selection
    # so both tables keep the same rows.
    data = data[np.sum(low_quality_data, axis=1) > 0].copy()
    low_quality_data = low_quality_data[
        np.sum(low_quality_data, axis=1) > 0].copy()

    # Save hidden data
    make_sure_dir_exists(settings.STORAGE_DIR)
    hidden_data_file_path = os.path.join(settings.STORAGE_DIR,
                                         "%s.hidden.pkl.gz" % self.uid)
    dump_gzip_pickle([
        data.to_sparse(),
        read_ratio,
        original_columns,
        column_permutation,
    ], hidden_data_file_path)
    log("Benchmark hidden data saved to `%s`" % hidden_data_file_path)

    make_sure_dir_exists(os.path.dirname(count_file_path))
    write_csv(low_quality_data, count_file_path)
    log("Count file saved to `%s`" % count_file_path)
def generate_test_bench(self, count_file_path, **kwargs):
    """Write a filtered benchmark count file and store the hidden state.

    The table returned by ``self._load_and_combine_data()`` is filtered
    according to the flags below, rows summing to zero are dropped, and the
    result is passed through ``shuffle_and_rename_columns`` before being
    written to ``count_file_path``. The filtered (unshuffled) table and the
    column bookkeeping returned by the shuffle helper are pickled under
    ``settings.STORAGE_DIR``.

    Args:
        count_file_path: Destination of the generated count file; it is
            converted to an absolute path first.
        **kwargs: Must contain
            ``rm_ercc`` (drop rows whose label starts with ``"ERCC-"``),
            ``rm_mt`` (drop rows whose label starts with ``"mt-"``),
            ``rm_lq`` (drop columns whose sum is below ``1e6``) and
            ``preserve_columns`` (forwarded to the shuffle helper as
            ``disabled``).

    Returns:
        None.

    Raises:
        KeyError: If one of the required keyword arguments is missing.
    """
    count_file_path = os.path.abspath(count_file_path)
    drop_ercc = kwargs['rm_ercc']
    drop_mt = kwargs['rm_mt']
    drop_low_quality = kwargs['rm_lq']
    keep_column_order = kwargs['preserve_columns']

    # Load dataset
    data = self._load_and_combine_data()

    # Drop rows by label prefix, one enabled filter after the other.
    for enabled, prefix in ((drop_ercc, "ERCC-"), (drop_mt, "mt-")):
        if not enabled:
            continue
        unwanted_rows = []
        for symbol in data.index.values:
            if symbol.startswith(prefix):
                unwanted_rows.append(symbol)
        data = data.drop(unwanted_rows)

    # Drop columns whose total is below the threshold.
    if drop_low_quality:
        column_totals = data.sum(axis=0)
        unwanted_columns = data.columns.values[column_totals < 1e6]
        data = data.drop(columns=unwanted_columns)

    # Remove empty rows
    row_totals = data.sum(axis=1)
    empty_rows = data.index.values[row_totals == 0]
    data = data.drop(empty_rows)

    # Shuffle columns (the helper is told to skip it via `disabled`).
    shuffled = shuffle_and_rename_columns(data, disabled=keep_column_order)
    new_data, original_columns, column_permutation = shuffled

    # Persist everything the evaluation step needs later on.
    make_sure_dir_exists(settings.STORAGE_DIR)
    hidden_data_file_path = os.path.join(settings.STORAGE_DIR,
                                         "%s.hidden.pkl.gz" % self.uid)
    hidden_state = [data.to_sparse(), original_columns, column_permutation]
    dump_gzip_pickle(hidden_state, hidden_data_file_path)
    log("Benchmark hidden data saved to `%s`" % hidden_data_file_path)

    # Write the visible benchmark input.
    make_sure_dir_exists(os.path.dirname(count_file_path))
    write_csv(new_data, count_file_path)
    log("Count file saved to `%s`" % count_file_path)
def evaluate_result(self, processed_count_file_path, result_dir,
                    visualization, **kwargs):
    """Score a processed count file against the hidden class labels.

    The processed table is read, its columns are restored with
    ``rearrange_and_rename_columns``, negative entries are clipped to zero,
    and the selected normalization and transformation are applied.
    Embeddings are obtained from ``self._get_embeddings`` (or from the cache
    file ``files/embedded_data.pkl.gz`` inside ``result_dir``). For every row
    of the hidden ``classes`` table and every embedding, a K-means clustering
    is fitted and four scores are recorded. Per-embedding CSV files,
    ``info.csv``, ``classes.csv`` and ``result.txt`` are written below
    ``result_dir``.

    Args:
        processed_count_file_path: Path of the processed count file to
            evaluate.
        result_dir: Directory that receives all output files.
        visualization: Output type handed to ``self.visualize_result``; the
            string ``"none"`` skips visualization.
        **kwargs: Must contain ``normalization`` and ``transformation`` (keys
            into ``normalizations`` / ``transformations``) and
            ``clear_cache`` (truthy to recompute the embeddings even if the
            cache file exists).

    Returns:
        dict: Metric name mapped to its score.

    Raises:
        KeyError: If one of the required keyword arguments is missing.
    """
    normalization = kwargs['normalization']
    transformation = kwargs['transformation']
    clear_cache = kwargs['clear_cache']

    files_dir = os.path.join(result_dir, "files")
    make_sure_dir_exists(files_dir)
    info = []

    # Load hidden state; the stored count matrix itself is not needed here.
    _, classes, original_columns, column_permutation = \
        self._load_hidden_state()

    # Load the processed table and restore column names and order.
    imputed_data = rearrange_and_rename_columns(
        read_table_file(processed_count_file_path),
        original_columns,
        column_permutation)

    # Data transformations
    if (imputed_data.values < 0).any():
        log("Observed some negative values!")
        imputed_data[imputed_data < 0] = 0
    transform = transformations[transformation]
    normalize = normalizations[normalization]
    imputed_data = transform(normalize(imputed_data))

    # Save class details for future
    write_csv(classes, os.path.join(files_dir, "classes.csv"))

    # Evaluation: reuse cached embeddings unless a refresh was requested.
    metric_results = {}
    embedded_data_file_path = os.path.join(files_dir, "embedded_data.pkl.gz")
    if clear_cache or not os.path.exists(embedded_data_file_path):
        embedded_data = self._get_embeddings(imputed_data)
        dump_gzip_pickle(embedded_data, embedded_data_file_path)
    else:
        embedded_data = load_gzip_pickle(embedded_data_file_path)

    log("Evaluating ...")
    for class_label in classes.index.values:
        class_names = classes.loc[class_label].values
        n_classes = len(set(class_names))

        for embedding_name in embedded_data:
            emb, emb_2d = embedded_data[embedding_name]
            embedding_slug = embedding_name.replace(" ", "_").lower()

            # One cluster per distinct class value.
            k_means = KMeans(n_clusters=n_classes)
            k_means.fit(emb)
            clusters = k_means.predict(emb)

            embedding_df = pd.DataFrame(emb)
            embedding_df["X"] = emb_2d[:, 0]
            embedding_df["Y"] = emb_2d[:, 1]
            embedding_df["class"] = class_names
            embedding_df["k_means_clusters"] = clusters

            csv_name = "%s_%s.csv" % (class_label, embedding_slug)
            write_csv(embedding_df, os.path.join(files_dir, csv_name))
            info.append({
                'filename': csv_name,
                'description':
                    '%s embedding of cells along %s labels' %
                    (embedding_name, class_label),
                'plot_description':
                    '%s embedding of cells along %s labels (Classes can be identified '
                    'with their colors and K-means clusters are marked '
                    'with different shapes)' % (embedding_name, class_label),
            })

            key_suffix = (embedding_slug, class_label)
            metric_results[
                'kmeans_on_%s_%s_adjusted_mutual_info_score' % key_suffix
            ] = adjusted_mutual_info_score(class_names, clusters,
                                           average_method="arithmetic")
            metric_results[
                'kmeans_on_%s_%s_v_measure_score' % key_suffix
            ] = v_measure_score(class_names, clusters)
            metric_results[
                'embedding_%s_%s_calinski_harabaz_score' % key_suffix
            ] = calinski_harabaz_score(emb, class_names)
            metric_results[
                'embedding_%s_%s_silhouette_score' % key_suffix
            ] = silhouette_score(emb, class_names)

    write_csv(pd.DataFrame(info), os.path.join(files_dir, "info.csv"))

    # Metrics are written sorted by name, one tab-separated pair per line.
    result_path = os.path.join(result_dir, "result.txt")
    with open(result_path, 'w') as result_file:
        result_file.write("## METRICS:\n")
        for metric_name, score in sorted(metric_results.items()):
            result_file.write("%s\t%4f\n" % (metric_name, score))
        result_file.write("##\n## ADDITIONAL INFO:\n")
    log("Evaluation results saved to `%s`" % result_path)

    if visualization != "none":
        self.visualize_result(result_dir, output_type=visualization)

    return metric_results
def evaluate_result(self, processed_count_file, result_dir, visualization,
                    **kwargs):
    """Score a processed count file on the rows selected by ``_get_related_part``.

    Gold-standard classes are taken from the part of each original column
    name before the first underscore. Negative entries of the processed table
    are clipped to zero, the selected normalization and transformation are
    applied, and the two tables returned by ``self._get_related_part`` are
    written out and concatenated. On the concatenated table the method
    records the mean SVM / KNN accuracies from
    ``self._get_classification_results`` and, for every embedding plus the
    untransformed ("identity") table, scores based on a 3-cluster K-means
    fit. CSV files, ``info.csv`` and ``result.txt`` are written below
    ``result_dir``.

    Args:
        processed_count_file: Processed count file to evaluate; forwarded to
            ``self._load_data_and_imputed_data_for_evaluation``.
        result_dir: Directory that receives all output files.
        visualization: Output type handed to ``self.visualize_result``; the
            string ``"none"`` skips visualization.
        **kwargs: Must contain ``normalization`` and ``transformation`` (keys
            into ``normalizations`` / ``transformations``) and
            ``clear_cache`` (truthy to recompute the embeddings even if the
            cache file exists).

    Returns:
        dict: Metric name mapped to its score.

    Raises:
        KeyError: If one of the required keyword arguments is missing.
    """
    normalization = kwargs['normalization']
    transformation = kwargs['transformation']
    clear_cache = kwargs['clear_cache']

    files_dir = os.path.join(result_dir, "files")
    make_sure_dir_exists(files_dir)
    info = []

    data, imputed_data = self._load_data_and_imputed_data_for_evaluation(
        processed_count_file)
    cell_names = data.columns.values
    gold_standard_classes = [name.split("_")[0] for name in cell_names]

    # Data transformations
    if (imputed_data.values < 0).any():
        log("Observed some negative values!")
        imputed_data[imputed_data < 0] = 0
    transform = transformations[transformation]
    normalize = normalizations[normalization]
    imputed_data = transform(normalize(imputed_data))

    g1_s_part, g2_m_part = self._get_related_part(imputed_data)
    related_part = pd.concat([g1_s_part, g2_m_part])

    # Store both selected tables together with their info entries.
    part_outputs = (
        (g1_s_part, {
            'filename': 'G1_S_related_part_of_imputed_data.csv',
            'description': 'Vales of genes related to G1/S',
            'plot_description': 'Heatmap of Genes related to G1/S',
        }),
        (g2_m_part, {
            'filename': 'G2_M_related_part_of_imputed_data.csv',
            'description': 'Vales of genes related to G2/M',
            'plot_description': 'Heatmap of Genes related to G2/M',
        }),
    )
    for part, part_info in part_outputs:
        write_csv(part, os.path.join(files_dir, part_info['filename']))
        info.append(part_info)

    svm_results, knn_results = self._get_classification_results(
        related_part, gold_standard_classes)

    # Reuse cached embeddings unless a refresh was requested.
    embedded_data_file_path = os.path.join(files_dir, "embedded_data.pkl.gz")
    if clear_cache or not os.path.exists(embedded_data_file_path):
        embedded_data = self._get_embeddings(related_part)
        dump_gzip_pickle(embedded_data, embedded_data_file_path)
    else:
        embedded_data = load_gzip_pickle(embedded_data_file_path)

    metric_results = {
        "classification_svm_mean_accuracy": np.mean(svm_results),
        "classification_knn_mean_accuracy": np.mean(knn_results),
    }

    # The untransformed table is scored as well; it is added only after the
    # cache has been written, so it never ends up in the cache file.
    embedded_data["identity"] = related_part.transpose()

    for embedding_name in embedded_data:
        emb = embedded_data[embedding_name]

        k_means = KMeans(n_clusters=3)
        k_means.fit(emb)
        clusters = k_means.predict(emb)

        embedding_slug = embedding_name.replace(" ", "_").lower()

        # Only real embeddings get a CSV file and an info entry.
        if embedding_name != "identity":
            csv_name = "%s.csv" % embedding_slug
            embedding_df = pd.DataFrame(
                {
                    "X": emb[:, 0],
                    "Y": emb[:, 1],
                    "class": gold_standard_classes,
                    "k_means_clusters": clusters,
                },
                index=cell_names)
            write_csv(embedding_df, os.path.join(files_dir, csv_name))
            info.append({
                'filename': csv_name,
                'description':
                    '%s embedding of cells considering genes related '
                    'to cell-cycle' % embedding_name,
                'plot_description':
                    '%s embedding of cells considering genes related '
                    'to cell-cycle (K-means clusters are marked '
                    'with different shapes)' % embedding_name,
            })

        metric_results[
            'kmeans_on_%s_adjusted_mutual_info_score' % embedding_slug
        ] = adjusted_mutual_info_score(gold_standard_classes, clusters,
                                       average_method="arithmetic")
        metric_results[
            'kmeans_on_%s_v_measure_score' % embedding_slug
        ] = v_measure_score(gold_standard_classes, clusters)
        metric_results[
            'embedding_%s_calinski_harabaz_score' % embedding_slug
        ] = calinski_harabaz_score(emb, gold_standard_classes)
        metric_results[
            'embedding_%s_silhouette_score' % embedding_slug
        ] = silhouette_score(emb, gold_standard_classes)

    write_csv(pd.DataFrame(info), os.path.join(files_dir, "info.csv"))

    # Metrics are written sorted by name, followed by the raw accuracies.
    result_path = os.path.join(result_dir, "result.txt")
    with open(result_path, 'w') as result_file:
        result_file.write("## METRICS:\n")
        for metric_name, score in sorted(metric_results.items()):
            result_file.write("%s\t%4f\n" % (metric_name, score))
        result_file.write("##\n## ADDITIONAL INFO:\n")
        result_file.write("## SVM classifiers accuracies: %s\n" %
                          str(svm_results))
        result_file.write("## KNN classifiers accuracies: %s\n" %
                          str(knn_results))
    log("Evaluation results saved to `%s`" % result_path)

    if visualization != "none":
        self.visualize_result(result_dir, output_type=visualization)

    return metric_results