def visualize_result(self, result_dir, output_type, **kwargs):
    """Write one scatter-plot PDF per (class label, embedding) pair.

    Reads ``files/info.csv`` and ``files/classes.csv`` below ``result_dir``.
    For every row label of the classes table and each of the five embedding
    names, ``files/<class_label>_<embedding_slug>.csv`` is loaded and
    ``plot_<class_label>_<embedding_slug>.pdf`` is written to ``result_dir``.
    Points are colored per class name and use one marker symbol per value of
    the ``k_means_clusters`` column.

    :param result_dir: directory containing the ``files`` sub-directory; the
        PDFs are written directly into it.
    :param output_type: only ``"pdf"`` is implemented.
    :param kwargs: accepted but not used.
    :raises NotImplementedError: for every ``output_type`` other than
        ``"pdf"`` (raised after the info and classes tables were read).
    """
    files_dir = os.path.join(result_dir, "files")
    info = read_table_file(os.path.join(files_dir, "info.csv"))
    info = info.set_index("filename")
    classes = read_table_file(os.path.join(files_dir, "classes.csv"))

    if output_type != "pdf":
        # "html" and unknown output types are equally unsupported.
        raise NotImplementedError()

    import plotly.graph_objs as go
    import plotly.io as pio

    for class_label in classes.index.values:
        class_names = classes.loc[class_label].astype("str").values
        distinct_names = sorted(set(class_names))

        for embedding_name in ["PCA", "ICA", "Truncated SVD", "tSNE", "UMAP"]:
            embedding_slug = embedding_name.replace(" ", "_").lower()
            filename = "%s_%s.csv" % (class_label, embedding_slug)
            embedded_df = read_table_file(os.path.join(files_dir, filename))

            plot_title = info.loc[filename]["plot_description"]
            fig = go.Figure(
                layout=go.Layout(title=plot_title, font=dict(size=8)))

            palette = self._get_color_scales(class_names)
            cluster_ids = embedded_df["k_means_clusters"].values
            xs = embedded_df["X"].values
            ys = embedded_df["Y"].values

            for position, class_name in enumerate(distinct_names):
                # Row positions of the cells that carry this class name.
                members = [
                    row for row, name in enumerate(class_names)
                    if name == class_name
                ]
                # NOTE(review): `ployly_symbols` is referenced exactly as
                # spelled in the original; confirm it matches the name that
                # is defined/imported at module level.
                marker_symbols = [
                    ployly_symbols[cluster] for cluster in cluster_ids[members]
                ]
                fig.add_scatter(x=xs[members],
                                y=ys[members],
                                mode='markers',
                                marker=dict(color=palette[position],
                                            opacity=0.5,
                                            symbol=marker_symbols),
                                name=class_name)

            pdf_name = "plot_%s_%s.pdf" % (class_label, embedding_slug)
            pio.write_image(fig,
                            os.path.join(result_dir, pdf_name),
                            width=800,
                            height=600)
def visualize_result(self, result_dir, output_type, **kwargs):
    """Write six correlation heatmaps (Spearman/Pearson x ADT/RNA blocks).

    Reads ``files/info.csv``, ``files/spearman_correlations.csv`` and
    ``files/pearson_correlations.csv`` below ``result_dir``. Each correlation
    table is split at half of the Spearman table's row count into an
    upper-left, a lower-right and an upper-right block, and each block is
    written to ``result_dir`` as a PDF heatmap.

    :param result_dir: directory containing the ``files`` sub-directory; the
        PDFs are written directly into it.
    :param output_type: ``"pdf"`` writes the heatmaps, ``"html"`` only prints
        a message.
    :param kwargs: accepted but not used.
    :raises NotImplementedError: for any other ``output_type``.
    """
    files_dir = os.path.join(result_dir, "files")
    # The info table is loaded (and indexed) although nothing below reads it.
    info = read_table_file(os.path.join(files_dir, "info.csv"))
    info = info.set_index("filename")
    spearman = read_table_file(
        os.path.join(files_dir, "spearman_correlations.csv"))
    pearson = read_table_file(
        os.path.join(files_dir, "pearson_correlations.csv"))

    half = spearman.shape[0] // 2
    first_half = slice(None, half)
    second_half = slice(half, None)

    if output_type == "html":
        print("Nothing to visualize")
        return
    if output_type != "pdf":
        raise NotImplementedError()

    import plotly.graph_objs as go
    import plotly.io as pio

    # (title, correlation table, row slice, column slice, output file name)
    heatmaps = [
        ("Pairwise Spearman correlations between ADT values",
         spearman, first_half, first_half, "heatmap_adt_adt_spearmanr.pdf"),
        ("Pairwise Spearman correlations between RNA values",
         spearman, second_half, second_half, "heatmap_rna_rna_spearmanr.pdf"),
        ("Pairwise Spearman correlations between ADT and RNA values",
         spearman, first_half, second_half, "heatmap_adt_rna_spearmanr.pdf"),
        ("Pairwise Pearson correlations between ADT values",
         pearson, first_half, first_half, "heatmap_adt_adt_pearsonr.pdf"),
        ("Pairwise Pearson correlations between RNA values",
         pearson, second_half, second_half, "heatmap_rna_rna_pearsonr.pdf"),
        ("Pairwise Pearson correlations between ADT and RNA values",
         pearson, first_half, second_half, "heatmap_adt_rna_pearsonr.pdf"),
    ]

    for title, table, rows, columns, filename in heatmaps:
        block = table.iloc[rows, columns]
        fig = go.Figure(layout=go.Layout(title=title, font=dict(size=9)))
        fig.add_heatmap(z=block.values,
                        x=block.columns.values,
                        y=block.index.values,
                        colorscale='Picnic')  # RdBu is also good
        pio.write_image(fig,
                        os.path.join(result_dir, filename),
                        width=600,
                        height=700)
def _load_data_and_imputed_data_for_evaluation(self, processed_count_file):
    """Load the hidden reference data and the imputed table for comparison.

    The hidden pickle ``<settings.STORAGE_DIR>/<self.uid>.hidden.pkl.gz``
    provides the sparse reference data plus the original column names and the
    column permutation. The imputed table gets its columns restored with
    these, and rows whose symbol starts with ``"ERCC-"`` or ``"mt-"`` are
    dropped from both tables.

    :param processed_count_file: path of the imputed table, as understood by
        ``read_table_file``.
    :return: tuple ``(data, imputed_data)`` after the row filtering.
    """
    hidden_path = os.path.join(settings.STORAGE_DIR,
                               "%s.hidden.pkl.gz" % self.uid)
    sparse_data, original_columns, column_permutation = load_gzip_pickle(
        hidden_path)
    data = sparse_data.to_dense()
    # Drop the sparse copy right away; only the dense form is used below.
    del sparse_data

    # Restoring original column names
    imputed_data = rearrange_and_rename_columns(
        read_table_file(processed_count_file), original_columns,
        column_permutation)

    # Remove ERCC and mitochondrial RNAs, identified by their symbol prefix.
    excluded_prefixes = ("ERCC-", "mt-")
    excluded_symbols = [
        name for name in imputed_data.index.values
        if name.startswith(excluded_prefixes)
    ]

    imputed_data = imputed_data.drop(excluded_symbols)
    data = data.drop(excluded_symbols)
    return data, imputed_data
def visualize_result(self, result_dir, output_type, **kwargs):
    """Plot predicted against original masked values on log and sqrt scales.

    Reads ``files/info.csv`` and ``files/predictions.csv`` below
    ``result_dir`` and writes ``prediction_plot_log_scale.pdf`` and
    ``prediction_plot_sqrt_scale.pdf`` into ``result_dir``. Both axes span
    from 0 to the transformed maximum of the ``original`` and ``predicted``
    columns.

    :param result_dir: directory containing the ``files`` sub-directory; the
        PDFs are written directly into it.
    :param output_type: only ``"pdf"`` is implemented.
    :param kwargs: accepted but not used.
    :raises NotImplementedError: for every ``output_type`` other than
        ``"pdf"`` (raised after the tables were read).
    """
    files_dir = os.path.join(result_dir, "files")
    # The info table is loaded (and indexed) although nothing below reads it.
    info = read_table_file(os.path.join(files_dir, "info.csv"))
    info = info.set_index("filename")
    predictions = read_table_file(os.path.join(files_dir, "predictions.csv"))
    original_values = predictions["original"]
    predicted_values = predictions["predicted"]

    if output_type != "pdf":
        # "html" and unknown output types are equally unsupported.
        raise NotImplementedError()

    import plotly.graph_objs as go
    import plotly.io as pio

    largest_value = float(max(original_values.max(), predicted_values.max()))

    for scale_name in ["log", "sqrt"]:
        to_scale = transformations[scale_name]
        layout = go.Layout(
            title='Predicted values vs. original masked values (%s scale)' %
            scale_name,
            font=dict(size=12),
            xaxis=go.layout.XAxis(range=[0, to_scale(largest_value)]),
            yaxis=go.layout.YAxis(range=[0, to_scale(largest_value)]))
        fig = go.Figure(layout=layout)
        fig.add_scatter(x=to_scale(original_values),
                        y=to_scale(predicted_values),
                        mode='markers',
                        marker=dict(opacity=0.3))

        pdf_name = "prediction_plot_%s_scale.pdf" % scale_name
        pio.write_image(fig,
                        os.path.join(result_dir, pdf_name),
                        width=800,
                        height=800)
def evaluate_result(self, processed_count_file_path, result_dir,
                    visualization, **kwargs):
    """Embed the imputed data, cluster each embedding and score it per label.

    The imputed table is restored to the original column layout, negative
    values are zeroed, and the requested normalization and transformation
    are applied. Embeddings come from ``self._get_embeddings`` and are cached
    in ``files/embedded_data.pkl.gz``. For every class label and embedding a
    K-means clustering is fitted, a CSV with the embedding, its 2-D
    coordinates, the class names and the cluster ids is written, and four
    scores are collected. Metrics are also written to ``result.txt``.

    :param processed_count_file_path: path of the imputed table.
    :param result_dir: output directory (``files`` is created inside it).
    :param visualization: passed to ``visualize_result`` as ``output_type``
        unless it equals ``"none"``.
    :param kwargs: must contain ``normalization``, ``transformation`` and
        ``clear_cache``.
    :return: dict mapping metric names to their scores.
    """
    normalization = kwargs['normalization']
    transformation = kwargs['transformation']
    clear_cache = kwargs['clear_cache']

    files_dir = os.path.join(result_dir, "files")
    make_sure_dir_exists(files_dir)
    info = []

    # Load hidden state and data (the count matrix itself is not used here)
    _count_matrix, classes, original_columns, column_permutation = \
        self._load_hidden_state()

    # Load imputed data and restore column names and order
    imputed_data = rearrange_and_rename_columns(
        read_table_file(processed_count_file_path), original_columns,
        column_permutation)

    # Data transformations
    if (imputed_data.values < 0).any():
        log("Observed some negative values!")
        imputed_data[imputed_data < 0] = 0
    transform = transformations[transformation]
    normalize = normalizations[normalization]
    imputed_data = transform(normalize(imputed_data))

    # Save class details for future
    write_csv(classes, os.path.join(files_dir, "classes.csv"))

    # Evaluation
    metric_results = dict()

    embedded_data_file_path = os.path.join(files_dir, "embedded_data.pkl.gz")
    if not os.path.exists(embedded_data_file_path) or clear_cache:
        embedded_data = self._get_embeddings(imputed_data)
        dump_gzip_pickle(embedded_data, embedded_data_file_path)
    else:
        embedded_data = load_gzip_pickle(embedded_data_file_path)

    log("Evaluating ...")
    for class_label in classes.index.values:
        class_names = classes.loc[class_label].values
        # One K-means cluster per distinct class name.
        cluster_count = len(set(class_names))

        for embedding_name in embedded_data:
            emb, emb_2d = embedded_data[embedding_name]
            embedding_slug = embedding_name.replace(" ", "_").lower()
            csv_name = "%s_%s.csv" % (class_label, embedding_slug)
            key_suffix = (embedding_slug, class_label)

            k_means = KMeans(n_clusters=cluster_count)
            k_means.fit(emb)
            clusters = k_means.predict(emb)

            embedding_df = pd.DataFrame(emb)
            embedding_df["X"] = emb_2d[:, 0]
            embedding_df["Y"] = emb_2d[:, 1]
            embedding_df["class"] = class_names
            embedding_df["k_means_clusters"] = clusters
            write_csv(embedding_df, os.path.join(files_dir, csv_name))

            info.append({
                'filename':
                csv_name,
                'description':
                '%s embedding of cells along %s labels' %
                (embedding_name, class_label),
                'plot_description':
                '%s embedding of cells along %s labels (Classes can be identified '
                'with their colors and K-means clusters are marked '
                'with different shapes)' % (embedding_name, class_label),
            })

            metric_results[
                'kmeans_on_%s_%s_adjusted_mutual_info_score' %
                key_suffix] = adjusted_mutual_info_score(
                    class_names, clusters, average_method="arithmetic")
            metric_results['kmeans_on_%s_%s_v_measure_score' %
                           key_suffix] = v_measure_score(class_names, clusters)
            metric_results['embedding_%s_%s_calinski_harabaz_score' %
                           key_suffix] = calinski_harabaz_score(
                               emb, class_names)
            metric_results['embedding_%s_%s_silhouette_score' %
                           key_suffix] = silhouette_score(emb, class_names)

    write_csv(pd.DataFrame(info), os.path.join(files_dir, "info.csv"))

    result_path = os.path.join(result_dir, "result.txt")
    with open(result_path, 'w') as file:
        file.write("## METRICS:\n")
        for metric_name in sorted(metric_results):
            file.write("%s\t%4f\n" %
                       (metric_name, metric_results[metric_name]))
        file.write("##\n## ADDITIONAL INFO:\n")

    log("Evaluation results saved to `%s`" % result_path)

    if visualization != "none":
        self.visualize_result(result_dir, output_type=visualization)

    return metric_results
def evaluate_result(self, processed_count_file_path, result_dir,
                    visualization, **kwargs):
    """Compare imputed values with the high-quality counts, cell by cell.

    The imputed table is restored to the original column layout, clipped at
    zero, and normalized/transformed the same way as the high-quality matrix.
    For every column (cell) only the entries that are positive in the
    transformed high-quality matrix and exactly zero in the low-quality
    matrix are compared. If the selected imputed values have a positive sum
    they are rescaled to the sum of the selected high-quality values. Per
    cell, RMSE, MAE and the euclidean/cosine/correlation distances are
    computed; the returned metrics are their means over all cells. The
    metrics and the per-cell values are written to ``result.txt``.

    :param processed_count_file_path: path of the imputed table.
    :param result_dir: output directory (``files`` is created inside it).
    :param visualization: passed to ``visualize_result`` as ``output_type``
        unless it equals ``"none"``.
    :param kwargs: must contain ``normalization`` and ``transformation``.
    :return: dict mapping metric names to the mean over all cells.
    """
    normalization = kwargs['normalization']
    transformation = kwargs['transformation']

    # Load hidden state and data
    count_matrix_lq, original_columns, column_permutation, count_matrix_hq = \
        self._load_hidden_state()

    # Load imputed data
    imputed_data = read_table_file(processed_count_file_path)

    # Restore column names and order
    imputed_data = rearrange_and_rename_columns(imputed_data,
                                                original_columns,
                                                column_permutation)

    # Replace negative values with zero
    imputed_data = imputed_data.clip(lower=0)

    # Data transformations
    imputed_data = transformations[transformation](
        normalizations[normalization](imputed_data))
    count_matrix_hq = transformations[transformation](
        normalizations[normalization](count_matrix_hq))

    # Evaluation
    rmse_distances = []
    mae_distances = []
    euclidean_distances = []
    cosine_distances = []
    correlation_distances = []

    hq_values = count_matrix_hq.values
    lq_values = count_matrix_lq.values
    imputed_values = imputed_data.values

    for i in range(count_matrix_hq.shape[1]):
        # Entries seen in the high-quality data but zero in the low-quality one.
        selected = np.logical_and(hq_values[:, i] > 0, lq_values[:, i] == 0)
        hq = hq_values[selected, i]
        y = imputed_values[selected, i]
        if np.sum(y) > 0:
            # Bring the imputed values to the same total as the reference.
            y = y * np.sum(hq) / np.sum(y)

        # The square root must be taken AFTER averaging the squared errors;
        # taking it element-wise first would reduce to the mean absolute
        # error and make this metric a duplicate of `mae_distances`.
        rmse_distances.append(float(np.mean(np.square(hq - y)) ** 0.5))
        mae_distances.append(float(np.mean(np.abs(hq - y))))

        pair = np.vstack((hq, y))
        euclidean_distances.append(pdist(pair, 'euclidean')[0])
        cosine_distances.append(pdist(pair, 'cosine')[0])
        correlation_distances.append(pdist(pair, 'correlation')[0])

    metric_results = {
        'cell_root_mean_squared_error': np.mean(rmse_distances),
        'cell_mean_absolute_error': np.mean(mae_distances),
        'cell_mean_euclidean_distance': np.mean(euclidean_distances),
        'cell_mean_cosine_distance': np.mean(cosine_distances),
        'cell_mean_correlation_distance': np.mean(correlation_distances)
    }

    # Save results to a file
    make_sure_dir_exists(os.path.join(result_dir, "files"))
    result_path = os.path.join(result_dir, "result.txt")
    with open(result_path, 'w') as file:
        file.write("## METRICS:\n")
        for metric in sorted(metric_results):
            file.write("%s\t%4f\n" % (metric, float(metric_results[metric])))

        file.write("##\n## ADDITIONAL INFO:\n")
        file.write(
            "# CELL\troot_mean_squared_error\tmean_absolute_error\tmean_euclidean_distance\t"
            "mean_cosine_distance\tmean_correlation_distance:\n")
        for i in range(count_matrix_hq.shape[1]):
            file.write(
                "# %s\t%f\t%f\t%f\t%f\t%f\n" %
                (count_matrix_hq.columns.values[i], rmse_distances[i],
                 mae_distances[i], euclidean_distances[i],
                 cosine_distances[i], correlation_distances[i]))

    log("Evaluation results saved to `%s`" % result_path)

    if visualization != "none":
        self.visualize_result(result_dir, output_type=visualization)

    return metric_results
def evaluate_result(self, processed_count_file_path, result_dir,
                    visualization, **kwargs):
    """Correlate protein (ADT) expressions with the imputed RNA expressions.

    After restoring the imputed table's columns and transforming both
    tables, only proteins whose mapped gene is present in the imputed table
    are kept. The matched ADT and RNA rows are written to ``files/adt.csv``
    and ``files/rna.csv``. Spearman and Pearson correlations of the stacked
    (ADT, RNA) rows are written to ``files`` as well. Four metrics are
    derived from the ADT/ADT, RNA/RNA and ADT/RNA blocks, and all blocks are
    dumped into ``result.txt``.

    :param processed_count_file_path: path of the imputed table.
    :param result_dir: output directory (``files`` is created inside it).
    :param visualization: passed to ``visualize_result`` as ``output_type``
        unless it equals ``"none"``.
    :param kwargs: must contain ``transformation``.
    :return: dict mapping metric names to values.
    """
    files_dir = os.path.join(result_dir, "files")
    make_sure_dir_exists(files_dir)

    transformation = kwargs['transformation']

    # Load hidden state and data
    _, original_columns, column_permutation, count_adt, protein_rna_mapping = \
        self._load_hidden_state()

    # Load imputed data and restore column names and order
    imputed_rna = rearrange_and_rename_columns(
        read_table_file(processed_count_file_path), original_columns,
        column_permutation)

    # Data transformations
    imputed_rna = transformations[transformation](imputed_rna)
    count_adt = transformations[transformation](count_adt)

    # Use related data: proteins whose mapped gene exists in the imputed table
    available_genes = imputed_rna.index.values
    matched_proteins = [
        protein for protein in count_adt.index.values
        if protein_rna_mapping[protein] in available_genes
    ]
    matched_genes = [
        protein_rna_mapping[protein] for protein in matched_proteins
    ]

    adt = count_adt.loc[matched_proteins].copy()
    adt.index = ["prot_" + name for name in adt.index.values]
    rna = imputed_rna.loc[matched_genes]
    rna.index = ["gene_" + name for name in rna.index.values]

    info = []

    def save_table(table, filename, description):
        # Store a table under `files` and record it (same text for both
        # description fields) in the info list.
        write_csv(table, os.path.join(files_dir, filename))
        info.append({
            'filename': filename,
            'description': description,
            'plot_description': description,
        })

    save_table(adt, "adt.csv",
               'Protein expressions (adt) after transformation')
    save_table(
        rna, "rna.csv",
        'Gene expressions of genes related to adt data after transformation')

    n = adt.shape[0]
    # Rows are features, so transpose to correlate features pairwise.
    combined_df = pd.concat((adt, rna)).transpose()

    def split_blocks(table):
        # Return the (adt/adt, rna/rna, adt/rna) blocks of a correlation table.
        return table.iloc[:n, :n], table.iloc[n:, n:], table.iloc[:n, n:]

    # Calculating Spearman correlations
    spearman = combined_df.corr(method="spearman")
    adt_adt_spearmanr, rna_rna_spearmanr, adt_rna_spearmanr = split_blocks(
        spearman)
    save_table(
        spearman, "spearman_correlations.csv",
        'Pairwise Spearman correlations (first n items are '
        'adt expressions and second n items are rna expressions)')

    # Calculating Pearson correlations
    pearson = combined_df.corr(method="pearson")
    adt_adt_pearsonr, rna_rna_pearsonr, adt_rna_pearsonr = split_blocks(
        pearson)
    save_table(
        pearson, "pearson_correlations.csv",
        'Pairwise Pearson correlations (first n items are '
        'adt expressions and second n items are rna expressions)')

    # Evaluation (metric keys are runtime strings and kept exactly as spelled)
    metric_results = {
        'rna_protein_mean_spearman_correlatoin':
        np.mean(adt_rna_spearmanr.values.diagonal()),
        'rna_protein_mean_pearson_correlatoin':
        np.mean(adt_rna_pearsonr.values.diagonal()),
        'MSE_of_adt_adt_and_rna_rna_spearman_correlations':
        np.mean((adt_adt_spearmanr.values - rna_rna_spearmanr.values)**2),
        'MSE_of_adt_adt_and_rna_rna_pearson_correlations':
        np.mean((adt_adt_pearsonr.values - rna_rna_pearsonr.values)**2)
    }

    write_csv(pd.DataFrame(info), os.path.join(files_dir, "info.csv"))

    def as_comment_block(table):
        # Prefix every line of the table's text form with "## ".
        return "## " + "\n## ".join(table.to_string().split("\n")) + "\n"

    # Save results to a file
    result_path = os.path.join(result_dir, "result.txt")
    with open(result_path, 'w') as file:
        file.write("## METRICS:\n")
        for metric_name in sorted(metric_results):
            file.write("%s\t%4f\n" %
                       (metric_name, float(metric_results[metric_name])))

        file.write("##\n## ADDITIONAL INFO:\n")
        file.write("## Pearson of adt/rna:\n")
        file.write(as_comment_block(adt_rna_pearsonr))
        file.write('## Spearman of adt/rna:\n')
        file.write(as_comment_block(adt_rna_spearmanr))
        file.write("## Pearson of adt/adt:\n")
        file.write(as_comment_block(adt_adt_pearsonr))
        file.write("## Pearson of rna/rna:\n")
        file.write(as_comment_block(rna_rna_pearsonr))
        file.write('## Spearman of adt/adt:\n')
        file.write(as_comment_block(adt_adt_spearmanr))
        file.write('## Spearman of rna/rna:\n')
        file.write(as_comment_block(rna_rna_spearmanr))

    log("Evaluation results saved to `%s`" % result_path)

    if visualization != "none":
        self.visualize_result(result_dir, output_type=visualization)

    return metric_results
def visualize_result(self, result_dir, output_type, **kwargs):
    """Write the cell-cycle heatmaps and one scatter plot per embedding.

    Reads ``files/info.csv``, the two ``*_related_part_of_imputed_data.csv``
    tables and one ``files/<embedding_slug>.csv`` per embedding name below
    ``result_dir``. Two heatmap PDFs (row-standardized values, transposed)
    and five embedding PDFs are written into ``result_dir``. In the scatter
    plots the ``class`` column selects the color and the
    ``k_means_clusters`` column selects the marker symbol.

    :param result_dir: directory containing the ``files`` sub-directory; the
        PDFs are written directly into it.
    :param output_type: only ``"pdf"`` is implemented.
    :param kwargs: accepted but not used.
    :raises NotImplementedError: for every ``output_type`` other than
        ``"pdf"`` (raised after all tables were read).
    """
    files_dir = os.path.join(result_dir, "files")
    # The info table is loaded (and indexed) although nothing below reads it.
    info = read_table_file(os.path.join(files_dir, "info.csv"))
    info = info.set_index("filename")
    G1_S_related_part_of_imputed_data = read_table_file(
        os.path.join(files_dir, "G1_S_related_part_of_imputed_data.csv"))
    G2_M_related_part_of_imputed_data = read_table_file(
        os.path.join(files_dir, "G2_M_related_part_of_imputed_data.csv"))

    embedding_names = ["PCA", "ICA", "Truncated SVD", "tSNE", "UMAP"]
    embedded_dfs = {
        name: read_table_file(
            os.path.join(files_dir,
                         "%s.csv" % name.replace(" ", "_").lower()))
        for name in embedding_names
    }

    if output_type != "pdf":
        # "html" and unknown output types are equally unsupported.
        raise NotImplementedError()

    import plotly.graph_objs as go
    import plotly.io as pio

    def standardize_rows(frame):
        # Subtract each row's mean and divide by each row's standard deviation.
        centered = frame.subtract(frame.mean(axis=1), axis=0)
        return centered.divide(frame.std(axis=1), axis=0)

    # (title, table, output file name) for the two marker-gene heatmaps
    heatmap_specs = [
        ('Heatmap of Genes related to G1/S',
         G1_S_related_part_of_imputed_data,
         "plot_G1_S_related_genes_heatmap.pdf"),
        ('Heatmap of Genes related to G2/M',
         G2_M_related_part_of_imputed_data,
         "plot_G2_M_related_genes_heatmap.pdf"),
    ]

    # Build both figures before writing either file.
    finished_heatmaps = []
    for title, table, pdf_name in heatmap_specs:
        heatmap_fig = go.Figure(layout=go.Layout(
            title=title,
            font=dict(size=5),
            xaxis=dict(title='Marker Genes', tickangle=60)))
        heatmap_fig.add_heatmap(z=standardize_rows(table).values.T,
                                x=table.index.values,
                                y=table.columns.values,
                                colorscale='Viridis')
        finished_heatmaps.append((heatmap_fig, pdf_name))

    for heatmap_fig, pdf_name in finished_heatmaps:
        pio.write_image(heatmap_fig,
                        os.path.join(result_dir, pdf_name),
                        width=600,
                        height=700)

    phase_colors = [("G1", "red"), ("G2M", "green"), ("S", "blue")]
    cluster_symbols = ["circle-open", "diamond", "cross"]

    for embedding_name in embedding_names:
        embedding_slug = embedding_name.replace(" ", "_").lower()
        fig = go.Figure(layout=go.Layout(
            title='%s embedding of cells considering genes related '
            'to cell-cycle (K-means clusters are marked '
            'with different shapes)' % embedding_name,
            font=dict(size=8)))

        embedding_df = embedded_dfs[embedding_name]
        xs = embedding_df["X"].values
        ys = embedding_df["Y"].values
        phases = embedding_df["class"].values
        cluster_ids = embedding_df["k_means_clusters"].values

        for state, color in phase_colors:
            # Row positions of the cells labelled with this phase.
            members = [
                row for row, phase in enumerate(phases) if phase == state
            ]
            fig.add_scatter(x=xs[members],
                            y=ys[members],
                            mode='markers',
                            marker=dict(color=color,
                                        symbol=[
                                            cluster_symbols[cluster]
                                            for cluster in cluster_ids[members]
                                        ]),
                            name="%s Phase" % state)

        pio.write_image(fig,
                        os.path.join(result_dir,
                                     "plot_%s.pdf" % embedding_slug),
                        width=800,
                        height=600)
def evaluate_result(self, processed_count_file_path, result_dir,
                    visualization, **kwargs):
    """Measure how well the imputed table matches the reference on non-zeros.

    The imputed table is restored to the original column layout and clipped
    at zero; both tables are then transformed. Errors are evaluated only
    where the transformed reference is positive: once over all entries (RMSE
    and MAE) and once per column (RMSE, MAE, and cosine/euclidean/correlation
    distances), whose means are reported. Metrics and per-column values are
    written to ``result.txt``.

    :param processed_count_file_path: path of the imputed table.
    :param result_dir: output directory (``files`` is created inside it).
    :param visualization: passed to ``visualize_result`` as ``output_type``
        unless it equals ``"none"``.
    :param kwargs: must contain ``transformation``.
    :return: dict mapping metric names to values.
    """
    transformation = kwargs['transformation']

    make_sure_dir_exists(os.path.join(result_dir, "files"))

    # Load hidden state and data
    scaled_data, original_columns, column_permutation = \
        self._load_hidden_state()

    # Load imputed data and restore column names and order
    imputed_data = rearrange_and_rename_columns(
        read_table_file(processed_count_file_path), original_columns,
        column_permutation)

    # Replace negative values with zero
    imputed_data = imputed_data.clip(lower=0)

    # Data transformation
    scaled_data = transformations[transformation](scaled_data)
    imputed_data = transformations[transformation](imputed_data)

    # Evaluation over all entries where the reference is positive
    reference = scaled_data.values
    estimate = imputed_data.values
    observed = np.where(reference > 0, 1, 0)
    observed_count = np.sum(observed)
    residual = reference - estimate

    rmse = float(np.sum(observed * np.square(residual)) /
                 observed_count)**0.5
    mae = float(np.sum(observed * np.abs(residual)) / observed_count)

    # Evaluation per column, again restricted to positive reference entries
    rmse_distances = []
    mae_distances = []
    euclidean_distances = []
    cosine_distances = []
    correlation_distances = []

    column_count = scaled_data.shape[1]
    for column in range(column_count):
        positive = reference[:, column] > 0
        expected = reference[positive, column]
        predicted = estimate[positive, column]
        positive_count = np.sum(positive)

        rmse_distances.append(
            float(np.sum(np.square(expected - predicted)) /
                  positive_count)**0.5)
        mae_distances.append(
            float(np.sum(np.abs(expected - predicted)) / positive_count))

        pair = np.vstack((expected, predicted))
        cosine_distances.append(pdist(pair, 'cosine')[0])
        euclidean_distances.append(pdist(pair, 'euclidean')[0])
        correlation_distances.append(pdist(pair, 'correlation')[0])

    metric_results = {
        'all_mean_absolute_error_on_non_zeros':
        mae,
        'all_root_mean_squared_error_on_non_zeros':
        rmse,
        'cell_mean_mean_absolute_error_on_non_zeros':
        np.mean(mae_distances),
        'cell_mean_root_mean_squared_error_on_non_zeros':
        np.mean(rmse_distances),
        'cell_mean_euclidean_distance':
        np.mean(euclidean_distances),
        'cell_mean_cosine_distance':
        np.mean(cosine_distances),
        'cell_mean_correlation_distance':
        np.mean(correlation_distances),
    }

    # Save results to a file
    result_path = os.path.join(result_dir, "result.txt")
    with open(result_path, 'w') as file:
        file.write("## METRICS:\n")
        for metric_name in sorted(metric_results):
            file.write("%s\t%4f\n" %
                       (metric_name, float(metric_results[metric_name])))

        file.write("##\n## ADDITIONAL INFO:\n")
        file.write(
            "# CELL\troot_mean_squared_error_on_non_zeros\tmean_absolute_error_on_non_zeros\t"
            "euclidean_distance_on_non_zeros\tcosine_distance_on_non_zeros\tcorrelation_distance_on_non_zeros:\n"
        )
        column_names = scaled_data.columns.values
        for column in range(column_count):
            file.write(
                "# %s\t%f\t%f\t%f\t%f\t%f\n" %
                (column_names[column], rmse_distances[column],
                 mae_distances[column], euclidean_distances[column],
                 cosine_distances[column], correlation_distances[column]))

    log("Evaluation results saved to `%s`" % result_path)

    if visualization != "none":
        self.visualize_result(result_dir, output_type=visualization)

    return metric_results
def evaluate_result(self, processed_count_file_path, result_dir,
                    visualization, **kwargs):
    """Score the imputed values at the masked entries of the reference data.

    The imputed table is restored to the original column layout and clipped
    at zero. Absolute differences between reference and imputed data are
    taken on the ``"log"`` and ``"sqrt"`` scales and averaged with weights
    ``mask * (data != 0)``, giving RMSE and MAE on both scales. The original
    and predicted value of every entry whose mask equals 1 are written to
    ``files/predictions.csv`` and listed in ``result.txt``.

    :param processed_count_file_path: path of the imputed table.
    :param result_dir: output directory (``files`` is created inside it).
    :param visualization: passed to ``visualize_result`` as ``output_type``
        unless it equals ``"none"``.
    :param kwargs: accepted but not used.
    :return: dict with ``RMSE_sqrt``, ``MAE_sqrt``, ``RMSE_log``, ``MAE_log``.
    """
    files_dir = os.path.join(result_dir, "files")
    make_sure_dir_exists(files_dir)

    info = []

    # Load hidden state and data
    data, mask, original_columns, column_permutation = \
        self._load_hidden_state()

    # Load imputed data and restore column names and order
    imputed_data = rearrange_and_rename_columns(
        read_table_file(processed_count_file_path), original_columns,
        column_permutation)

    # Replace negative values with zero
    imputed_data = imputed_data.clip(lower=0)

    # Evaluation
    to_log = transformations["log"]
    log_diff = np.abs(to_log(data) - to_log(imputed_data))
    to_sqrt = transformations["sqrt"]
    sqrt_diff = np.abs(to_sqrt(data) - to_sqrt(imputed_data))

    # Only masked entries with a non-zero reference value contribute.
    weights = mask * np.where(data != 0, 1, 0)
    total_weight = np.sum(np.sum(weights))

    def weighted_mean(errors):
        # Weighted average of an error table over all contributing entries.
        return float(np.sum(np.sum(weights * errors)) / total_weight)

    mse_on_log = weighted_mean(np.square(log_diff))
    mae_on_log = weighted_mean(np.abs(log_diff))
    mse_on_sqrt = weighted_mean(np.square(sqrt_diff))
    mae_on_sqrt = weighted_mean(np.abs(sqrt_diff))

    metric_results = {
        'RMSE_sqrt': mse_on_sqrt**0.5,
        'MAE_sqrt': mae_on_sqrt,
        'RMSE_log': mse_on_log**0.5,
        'MAE_log': mae_on_log
    }

    # (row, column) positions of the masked entries, in row-major order
    mask_values = mask.values
    row_count, column_count = mask_values.shape[0], mask_values.shape[1]
    masked_locations = [(row, column) for row in range(row_count)
                        for column in range(column_count)
                        if mask_values[row, column] == 1]

    original_values = np.asarray(
        [data.iloc[row, column] for (row, column) in masked_locations])
    predicted_values = np.asarray(
        [imputed_data.iloc[row, column] for (row, column) in masked_locations])

    predictions_df = pd.DataFrame({
        'original': original_values,
        'predicted': predicted_values
    })
    write_csv(predictions_df, os.path.join(files_dir, "predictions.csv"))
    info.append({
        'filename': "predictions.csv",
        'description': 'Original masked values along predicted values',
        'plot_description': 'Predicted values vs. original masked values',
    })

    write_csv(pd.DataFrame(info), os.path.join(files_dir, "info.csv"))

    # Save results to a file
    result_path = os.path.join(result_dir, "result.txt")
    with open(result_path, 'w') as file:
        file.write("## METRICS:\n")
        for metric_name in sorted(metric_results):
            file.write("%s\t%4f\n" %
                       (metric_name, metric_results[metric_name]))

        file.write("##\n## ADDITIONAL INFO:\n")
        file.write("# GENE\tCELL\tGOLD_STANDARD\tRESULT:\n")
        for (row, column) in masked_locations:
            file.write("# %s\t%s\t%f\t%f\n" %
                       (data.index.values[row], data.columns.values[column],
                        data.iloc[row, column], imputed_data.iloc[row,
                                                                  column]))

    log("Evaluation results saved to `%s`" % result_path)

    if visualization != "none":
        self.visualize_result(result_dir, output_type=visualization)

    return metric_results