def _load_data_and_imputed_data_for_evaluation(self, processed_count_file):
    hidden_data_file_path = os.path.join(settings.STORAGE_DIR,
                                         "%s.hidden.pkl.gz" % self.uid)
    sparse_data, original_columns, column_permutation = load_gzip_pickle(
        hidden_data_file_path)
    data = sparse_data.to_dense()
    del sparse_data

    imputed_data = read_table_file(processed_count_file)

    # Restore original column names and order
    imputed_data = rearrange_and_rename_columns(imputed_data, original_columns,
                                                column_permutation)

    # Remove ERCC spike-in controls and mitochondrial RNAs
    remove_list = [
        symbol for symbol in imputed_data.index.values
        if symbol.startswith("ERCC-") or symbol.startswith("mt-")
    ]

    imputed_data = imputed_data.drop(remove_list)
    data = data.drop(remove_list)

    return data, imputed_data
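# `rearrange_and_rename_columns` is used above but not defined in this excerpt.
# A minimal sketch of what it is assumed to do, given that `column_permutation`
# records how the hidden-state columns were shuffled before being handed to the
# method under evaluation (assumption: the shuffled data was produced via
# `data.iloc[:, column_permutation]`):
def rearrange_and_rename_columns(data, original_columns, column_permutation):
    # Invert the permutation to restore the original column order ...
    inverse_permutation = np.argsort(column_permutation)
    data = data.iloc[:, inverse_permutation]
    # ... and restore the original (pre-anonymization) column names.
    data.columns = original_columns
    return data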
def read_table_file(filename):
    if filename.endswith(".csv") or filename.endswith(".tsv") or \
            filename.endswith(".csv.gz") or filename.endswith(".tsv.gz"):
        return read_csv(filename)
    elif filename.endswith(".pkl.gz"):
        return load_gzip_pickle(filename)
    else:
        raise NotImplementedError("Unrecognized format for file %s" % filename)
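# The I/O helpers used throughout this excerpt (`load_gzip_pickle`,
# `dump_gzip_pickle`, `read_csv`, `write_csv`) are not shown here. Minimal
# sketches of plausible implementations, assuming gzip-compressed pickles and
# pandas-based table handling (the separator heuristic is an assumption):
import gzip
import pickle

import pandas as pd


def load_gzip_pickle(filename):
    # Read an object from a gzip-compressed pickle file.
    with gzip.open(filename, "rb") as f:
        return pickle.load(f)


def dump_gzip_pickle(obj, filename):
    # Write an object as a gzip-compressed pickle file.
    with gzip.open(filename, "wb") as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)


def read_csv(filename):
    # Assumption: row labels live in the first column, TSV files are
    # tab-separated, and pandas handles the .gz extension transparently.
    sep = "\t" if ".tsv" in filename else ","
    return pd.read_csv(filename, sep=sep, index_col=0)


def write_csv(data_frame, filename):
    data_frame.to_csv(filename)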
def _load_hidden_state(self):
    hidden_data_file_path = os.path.join(settings.STORAGE_DIR,
                                         "%s.hidden.pkl.gz" % self.uid)
    sparse_data, read_ratio, original_columns, column_permutation = load_gzip_pickle(
        hidden_data_file_path)
    # Rescale the counts by the stored read ratio
    scaled_data = sparse_data.to_dense() * read_ratio
    return scaled_data, original_columns, column_permutation
def _load_hidden_state(self):
    hidden_data_file_path = os.path.join(settings.STORAGE_DIR,
                                         "%s.hidden.pkl.gz" % self.uid)
    sparse_count_matrix, classes, original_columns, column_permutation = load_gzip_pickle(
        hidden_data_file_path)
    count_matrix = sparse_count_matrix.to_dense()
    del sparse_count_matrix
    return count_matrix, classes, original_columns, column_permutation
def _load_hidden_state(self):
    hidden_data_file_path = os.path.join(settings.STORAGE_DIR,
                                         "%s.hidden.pkl.gz" % self.uid)
    sparse_data, sparse_mask, original_columns, column_permutation = \
        load_gzip_pickle(hidden_data_file_path)
    data = sparse_data.to_dense()
    mask = sparse_mask.to_dense()
    del sparse_data
    del sparse_mask
    return data, mask, original_columns, column_permutation
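# Counterpart sketch (an assumption, not shown in this excerpt): how the
# hidden-state file read by the masking benchmark above might be written.
# The matrices are kept sparse to reduce file size; `.to_sparse()` matches the
# old pandas sparse API implied by the `.to_dense()` calls above:
def _dump_hidden_state_sketch(data, mask, original_columns, column_permutation, uid):
    hidden_data_file_path = os.path.join(settings.STORAGE_DIR,
                                         "%s.hidden.pkl.gz" % uid)
    dump_gzip_pickle(
        (data.to_sparse(), mask.to_sparse(), original_columns, column_permutation),
        hidden_data_file_path)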
def _load_hidden_state(self):
    hidden_data_file_path = os.path.join(settings.STORAGE_DIR,
                                         "%s.hidden.pkl.gz" % self.uid)
    sparse_count_rna, original_columns, column_permutation, sparse_count_adt, protein_rna_mapping = \
        load_gzip_pickle(hidden_data_file_path)
    count_rna = sparse_count_rna.to_dense()
    count_adt = sparse_count_adt.to_dense()
    del sparse_count_rna
    del sparse_count_adt
    return count_rna, original_columns, column_permutation, count_adt, protein_rna_mapping
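# Both `evaluate_result` implementations below look up callables in global
# `normalizations` and `transformations` dicts that are not part of this
# excerpt. A minimal sketch of their assumed shape (the names and formulas are
# illustrative, not the project's actual options):
import numpy as np

normalizations = {
    "none": lambda data: data,
    # Hypothetical library-size normalization: scale each cell (column) so
    # that its counts sum to one million (counts per million).
    "cpm": lambda data: data * 1e6 / data.sum(axis=0),
}

transformations = {
    "none": lambda data: data,
    "log": lambda data: np.log(data + 1),
    "sqrt": lambda data: np.sqrt(data),
}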
def evaluate_result(self, processed_count_file_path, result_dir, visualization, **kwargs):
    normalization = kwargs['normalization']
    transformation = kwargs['transformation']
    clear_cache = kwargs['clear_cache']

    make_sure_dir_exists(os.path.join(result_dir, "files"))
    info = []

    # Load hidden state and data
    count_matrix, classes, original_columns, column_permutation = self._load_hidden_state()

    # Load imputed data
    imputed_data = read_table_file(processed_count_file_path)

    # Restore column names and order
    imputed_data = rearrange_and_rename_columns(imputed_data, original_columns,
                                                column_permutation)

    # Data transformations
    if np.sum(imputed_data.values < 0) > 0:
        log("Observed some negative values!")
        imputed_data[imputed_data < 0] = 0
    imputed_data = transformations[transformation](
        normalizations[normalization](imputed_data))

    # Save class details for future reference
    write_csv(classes, os.path.join(result_dir, "files", "classes.csv"))

    # Evaluation
    metric_results = dict()

    embedded_data_file_path = os.path.join(result_dir, "files", "embedded_data.pkl.gz")
    if os.path.exists(embedded_data_file_path) and not clear_cache:
        embedded_data = load_gzip_pickle(embedded_data_file_path)
    else:
        embedded_data = self._get_embeddings(imputed_data)
        dump_gzip_pickle(embedded_data, embedded_data_file_path)

    log("Evaluating ...")
    for class_label in classes.index.values:
        class_names = classes.loc[class_label].values
        for embedding_name in embedded_data:
            emb, emb_2d = embedded_data[embedding_name]

            embedding_slug = embedding_name.replace(" ", "_").lower()

            k_means = KMeans(n_clusters=len(set(class_names)))
            k_means.fit(emb)
            clusters = k_means.predict(emb)

            embedding_df = pd.DataFrame(emb)
            embedding_df["X"] = emb_2d[:, 0]
            embedding_df["Y"] = emb_2d[:, 1]
            embedding_df["class"] = class_names
            embedding_df["k_means_clusters"] = clusters
            write_csv(embedding_df,
                      os.path.join(result_dir, "files",
                                   "%s_%s.csv" % (class_label, embedding_slug)))
            info.append({
                'filename': "%s_%s.csv" % (class_label, embedding_slug),
                'description': '%s embedding of cells along %s labels'
                               % (embedding_name, class_label),
                'plot_description': '%s embedding of cells along %s labels '
                                    '(Classes can be identified with their colors '
                                    'and K-means clusters are marked with '
                                    'different shapes)'
                                    % (embedding_name, class_label),
            })

            metric_results.update({
                'kmeans_on_%s_%s_adjusted_mutual_info_score' % (embedding_slug, class_label):
                    adjusted_mutual_info_score(class_names, clusters,
                                               average_method="arithmetic"),
                'kmeans_on_%s_%s_v_measure_score' % (embedding_slug, class_label):
                    v_measure_score(class_names, clusters),
                'embedding_%s_%s_calinski_harabaz_score' % (embedding_slug, class_label):
                    calinski_harabaz_score(emb, class_names),
                'embedding_%s_%s_silhouette_score' % (embedding_slug, class_label):
                    silhouette_score(emb, class_names)
            })

    write_csv(pd.DataFrame(info), os.path.join(result_dir, "files", "info.csv"))

    result_path = os.path.join(result_dir, "result.txt")
    with open(result_path, 'w') as file:
        file.write("## METRICS:\n")
        for metric in sorted(metric_results):
            file.write("%s\t%.4f\n" % (metric, metric_results[metric]))

        file.write("##\n## ADDITIONAL INFO:\n")

    log("Evaluation results saved to `%s`" % result_path)

    if visualization != "none":
        self.visualize_result(result_dir, output_type=visualization)

    return metric_results
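# In the clustering evaluator above, `_get_embeddings` is assumed to return a
# dict mapping an embedding name to a pair (emb, emb_2d): a higher-dimensional
# embedding used for clustering and metrics, plus its 2-D projection for
# plotting. A minimal sketch under that assumption (the benchmark's actual
# embeddings may differ; note the cell-cycle evaluator below instead expects
# plain 2-D arrays from its own `_get_embeddings`):
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE


def _get_embeddings_sketch(data):
    # Cells are columns of `data`, so embed the transpose (cells as rows).
    cells = data.transpose().values
    emb = PCA(n_components=min(32, cells.shape[0], cells.shape[1])).fit_transform(cells)
    emb_2d = TSNE(n_components=2).fit_transform(emb)
    return {"PCA + tSNE": (emb, emb_2d)}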
def evaluate_result(self, processed_count_file, result_dir, visualization, **kwargs):
    normalization = kwargs['normalization']
    transformation = kwargs['transformation']
    clear_cache = kwargs['clear_cache']

    make_sure_dir_exists(os.path.join(result_dir, "files"))
    info = []

    data, imputed_data = self._load_data_and_imputed_data_for_evaluation(
        processed_count_file)
    gold_standard_classes = [
        column_name.split("_")[0] for column_name in data.columns.values
    ]

    # Data transformations
    if np.sum(imputed_data.values < 0) > 0:
        log("Observed some negative values!")
        imputed_data[imputed_data < 0] = 0
    imputed_data = transformations[transformation](
        normalizations[normalization](imputed_data))

    G1_S_related_part_of_imputed_data, G2_M_related_part_of_imputed_data = \
        self._get_related_part(imputed_data)

    related_part_of_imputed_data = pd.concat([
        G1_S_related_part_of_imputed_data, G2_M_related_part_of_imputed_data
    ])

    write_csv(G1_S_related_part_of_imputed_data,
              os.path.join(result_dir, "files",
                           "G1_S_related_part_of_imputed_data.csv"))
    info.append({
        'filename': 'G1_S_related_part_of_imputed_data.csv',
        'description': 'Values of genes related to G1/S',
        'plot_description': 'Heatmap of genes related to G1/S',
    })

    write_csv(G2_M_related_part_of_imputed_data,
              os.path.join(result_dir, "files",
                           "G2_M_related_part_of_imputed_data.csv"))
    info.append({
        'filename': 'G2_M_related_part_of_imputed_data.csv',
        'description': 'Values of genes related to G2/M',
        'plot_description': 'Heatmap of genes related to G2/M',
    })

    svm_results, knn_results = self._get_classification_results(
        related_part_of_imputed_data, gold_standard_classes)

    embedded_data_file_path = os.path.join(result_dir, "files", "embedded_data.pkl.gz")
    if os.path.exists(embedded_data_file_path) and not clear_cache:
        embedded_data = load_gzip_pickle(embedded_data_file_path)
    else:
        embedded_data = self._get_embeddings(related_part_of_imputed_data)
        dump_gzip_pickle(embedded_data, embedded_data_file_path)

    metric_results = {
        "classification_svm_mean_accuracy": np.mean(svm_results),
        "classification_knn_mean_accuracy": np.mean(knn_results)
    }

    embedded_data["identity"] = related_part_of_imputed_data.transpose()

    for i, embedding_name in enumerate(embedded_data):
        emb = embedded_data[embedding_name]

        k_means = KMeans(n_clusters=3)
        k_means.fit(emb)
        clusters = k_means.predict(emb)

        embedding_slug = embedding_name.replace(" ", "_").lower()

        if embedding_name != "identity":
            embedding_df = pd.DataFrame(
                {
                    "X": emb[:, 0],
                    "Y": emb[:, 1],
                    "class": gold_standard_classes,
                    "k_means_clusters": clusters
                },
                index=data.columns.values)
            write_csv(embedding_df,
                      os.path.join(result_dir, "files", "%s.csv" % embedding_slug))
            info.append({
                'filename': "%s.csv" % embedding_slug,
                'description': '%s embedding of cells considering genes related '
                               'to cell-cycle' % embedding_name,
                'plot_description': '%s embedding of cells considering genes related '
                                    'to cell-cycle (K-means clusters are marked '
                                    'with different shapes)' % embedding_name,
            })

        metric_results.update({
            'kmeans_on_%s_adjusted_mutual_info_score' % embedding_slug:
                adjusted_mutual_info_score(gold_standard_classes, clusters,
                                           average_method="arithmetic"),
            'kmeans_on_%s_v_measure_score' % embedding_slug:
                v_measure_score(gold_standard_classes, clusters),
            'embedding_%s_calinski_harabaz_score' % embedding_slug:
                calinski_harabaz_score(emb, gold_standard_classes),
            'embedding_%s_silhouette_score' % embedding_slug:
                silhouette_score(emb, gold_standard_classes)
        })

    write_csv(pd.DataFrame(info), os.path.join(result_dir, "files", "info.csv"))
    result_path = os.path.join(result_dir, "result.txt")
    with open(result_path, 'w') as file:
        file.write("## METRICS:\n")
        for metric in sorted(metric_results):
            file.write("%s\t%.4f\n" % (metric, metric_results[metric]))

        file.write("##\n## ADDITIONAL INFO:\n")
        file.write("## SVM classifiers accuracies: %s\n" % str(svm_results))
        file.write("## KNN classifiers accuracies: %s\n" % str(knn_results))

    log("Evaluation results saved to `%s`" % result_path)

    if visualization != "none":
        self.visualize_result(result_dir, output_type=visualization)

    return metric_results
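# `_get_classification_results` above is not defined in this excerpt. A minimal
# sketch of a plausible implementation, assuming it returns per-fold
# cross-validated accuracies for an SVM and a KNN classifier (the classifier
# choices, hyperparameters, and fold count are assumptions):
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC


def _get_classification_results_sketch(data, labels):
    # Cells are columns of `data`; scikit-learn expects samples as rows.
    X = data.transpose().values
    svm_results = cross_val_score(SVC(), X, labels, cv=5)
    knn_results = cross_val_score(KNeighborsClassifier(), X, labels, cv=5)
    return svm_results, knn_results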