Example #1
0
    def visualize_result(self, result_dir, output_type, **kwargs):
        """Draw one scatter plot per (class label, embedding) combination.

        Reads ``files/info.csv``, ``files/classes.csv`` and the tables
        ``files/<class_label>_<embedding_slug>.csv`` below ``result_dir``.
        For ``output_type == "pdf"`` a file named
        ``plot_<class_label>_<embedding_slug>.pdf`` is written into
        ``result_dir`` for each combination: points are coloured by class
        and their marker symbol is picked by K-means cluster id.

        Raises:
            NotImplementedError: for any ``output_type`` other than "pdf".
        """
        files_dir = os.path.join(result_dir, "files")

        info = read_table_file(os.path.join(files_dir, "info.csv"))
        info = info.set_index("filename")
        classes = read_table_file(os.path.join(files_dir, "classes.csv"))

        if output_type != "pdf":
            # "html" and unknown output types are rejected in the same way.
            raise NotImplementedError()

        from plotly import graph_objs as go, io as pio

        embedding_names = ["PCA", "ICA", "Truncated SVD", "tSNE", "UMAP"]

        for class_label in classes.index.values:
            labels = classes.loc[class_label].astype("str").values
            distinct_labels = sorted(set(labels))

            for embedding_name in embedding_names:
                slug = embedding_name.replace(" ", "_").lower()
                table_name = "%s_%s.csv" % (class_label, slug)
                table = read_table_file(os.path.join(files_dir, table_name))

                # The plot title is the description stored for this table.
                layout = go.Layout(
                    title=info.loc[table_name]["plot_description"],
                    font=dict(size=8))
                fig = go.Figure(layout=layout)

                palette = self._get_color_scales(labels)
                cluster_ids = table["k_means_clusters"].values
                xs = table["X"].values
                ys = table["Y"].values

                # One trace per class so that each class gets a legend entry.
                for position, label in enumerate(distinct_labels):
                    members = [
                        row for row, value in enumerate(labels)
                        if value == label
                    ]
                    marker = dict(
                        color=palette[position],
                        opacity=0.5,
                        symbol=[
                            ployly_symbols[cluster]
                            for cluster in cluster_ids[members]
                        ])
                    fig.add_scatter(x=xs[members],
                                    y=ys[members],
                                    mode='markers',
                                    marker=marker,
                                    name=label)

                target = os.path.join(
                    result_dir, "plot_%s_%s.pdf" % (class_label, slug))
                pio.write_image(fig, target, width=800, height=600)
Example #2
0
    def visualize_result(self, result_dir, output_type, **kwargs):
        """Render the stored correlation matrices as heatmaps.

        Loads ``files/info.csv``, ``files/spearman_correlations.csv`` and
        ``files/pearson_correlations.csv`` from ``result_dir``. Each matrix
        is split at half of the Spearman matrix' row count: the leading half
        is treated as ADT entries, the trailing half as RNA entries. For
        ``output_type == "pdf"`` six heatmaps (ADT/ADT, RNA/RNA and ADT/RNA
        for each correlation method) are written into ``result_dir``.

        Raises:
            NotImplementedError: for output types other than "pdf"/"html".
        """
        files_dir = os.path.join(result_dir, "files")

        info = read_table_file(os.path.join(files_dir, "info.csv"))
        info = info.set_index("filename")

        correlation_tables = (
            ("Spearman", "spearmanr",
             read_table_file(
                 os.path.join(files_dir, "spearman_correlations.csv"))),
            ("Pearson", "pearsonr",
             read_table_file(
                 os.path.join(files_dir, "pearson_correlations.csv"))),
        )

        # The split point is derived from the Spearman matrix only.
        n = correlation_tables[0][2].shape[0] // 2
        head, tail = slice(None, n), slice(n, None)

        # (subject used in the title, file name fragment, rows, columns)
        sections = (
            ("ADT values", "adt_adt", head, head),
            ("RNA values", "rna_rna", tail, tail),
            ("ADT and RNA values", "adt_rna", head, tail),
        )

        if output_type == "pdf":
            import plotly.graph_objs as go
            import plotly.io as pio

            for method, suffix, matrix in correlation_tables:
                for subject, pair, rows, columns in sections:
                    title = "Pairwise %s correlations between %s" % (
                        method, subject)
                    filename = "heatmap_%s_%s.pdf" % (pair, suffix)
                    data_frame = matrix.iloc[rows, columns]

                    fig = go.Figure(
                        layout=go.Layout(title=title, font=dict(size=9)))
                    fig.add_heatmap(z=data_frame.values,
                                    x=data_frame.columns.values,
                                    y=data_frame.index.values,
                                    colorscale='Picnic')  # RdBu is also good

                    pio.write_image(fig,
                                    os.path.join(result_dir, filename),
                                    width=600,
                                    height=700)
            return

        if output_type == "html":
            print("Nothing to visualize")
            return

        raise NotImplementedError()
Example #3
0
    def _load_data_and_imputed_data_for_evaluation(self, processed_count_file):
        """Load the hidden reference data and the matching imputed table.

        The hidden state is unpickled from ``<uid>.hidden.pkl.gz`` inside
        ``settings.STORAGE_DIR`` and densified. The imputed table is read
        from ``processed_count_file`` and passed through
        ``rearrange_and_rename_columns`` with the stored column names and
        permutation. Rows whose label in the imputed table starts with
        "ERCC-" or "mt-" are dropped from both tables.

        Returns:
            Tuple ``(data, imputed_data)``.
        """
        hidden_path = os.path.join(settings.STORAGE_DIR,
                                   "%s.hidden.pkl.gz" % self.uid)
        sparse_data, original_columns, column_permutation = load_gzip_pickle(
            hidden_path)
        data = sparse_data.to_dense()
        # Drop this reference to the sparse copy once the dense one exists.
        del sparse_data

        # Restoring original column names
        imputed_data = rearrange_and_rename_columns(
            read_table_file(processed_count_file), original_columns,
            column_permutation)

        # Remove (error correction) ERCC and mitochondrial RNAs
        excluded_prefixes = ("ERCC-", "mt-")
        remove_list = [
            gene for gene in imputed_data.index.values
            if gene.startswith(excluded_prefixes)
        ]

        imputed_data = imputed_data.drop(remove_list)
        data = data.drop(remove_list)

        return data, imputed_data
Example #4
0
    def visualize_result(self, result_dir, output_type, **kwargs):
        """Plot predicted against original masked values.

        Reads ``files/info.csv`` and ``files/predictions.csv`` (columns
        "original" and "predicted") from ``result_dir``. For
        ``output_type == "pdf"`` one scatter plot is written per scale
        ("log" and "sqrt") as ``prediction_plot_<scale>_scale.pdf``. Both
        axes run from 0 to the transformed maximum of all plotted values.

        Raises:
            NotImplementedError: for any ``output_type`` other than "pdf".
        """
        files_dir = os.path.join(result_dir, "files")

        info = read_table_file(os.path.join(files_dir, "info.csv"))
        info = info.set_index("filename")

        predictions = read_table_file(
            os.path.join(files_dir, "predictions.csv"))
        original_values = predictions["original"]
        predicted_values = predictions["predicted"]

        if output_type != "pdf":
            # Neither "html" nor any other output type is supported.
            raise NotImplementedError()

        import plotly.graph_objs as go
        import plotly.io as pio

        max_axis = float(max(original_values.max(), predicted_values.max()))

        for transformation_name in ["log", "sqrt"]:
            transformation = transformations[transformation_name]
            # Same upper bound on both axes keeps the diagonal meaningful.
            upper_bound = transformation(max_axis)

            layout = go.Layout(
                title='Predicted values vs. original masked values (%s scale)'
                % transformation_name,
                font=dict(size=12),
                xaxis=go.layout.XAxis(range=[0, upper_bound]),
                yaxis=go.layout.YAxis(range=[0, upper_bound]))
            fig = go.Figure(layout=layout)
            fig.add_scatter(x=transformation(original_values),
                            y=transformation(predicted_values),
                            mode='markers',
                            marker=dict(opacity=0.3))

            target = os.path.join(
                result_dir,
                "prediction_plot_%s_scale.pdf" % transformation_name)
            pio.write_image(fig, target, width=800, height=800)
Example #5
0
    def evaluate_result(self, processed_count_file_path, result_dir,
                        visualization, **kwargs):
        """Score how well embeddings of the imputed data separate the classes.

        Steps, in order:

        1. Load the hidden state and the imputed table, restore its columns,
           zero out negative values, then normalize and transform it.
        2. Write ``files/classes.csv``; load or compute the embeddings
           (cached in ``files/embedded_data.pkl.gz``).
        3. For every class label and embedding: run K-means with as many
           clusters as there are distinct classes, store the embedding table
           as ``files/<class_label>_<embedding_slug>.csv`` and record four
           scores.
        4. Write ``files/info.csv`` and ``result.txt``; optionally visualize.

        Required keyword arguments: ``normalization``, ``transformation``
        and ``clear_cache`` (a ``KeyError`` is raised if one is missing).

        Returns:
            dict mapping metric name to score.
        """
        normalization = kwargs['normalization']
        transformation = kwargs['transformation']
        clear_cache = kwargs['clear_cache']

        files_dir = os.path.join(result_dir, "files")
        make_sure_dir_exists(files_dir)

        # Load hidden state and data (the count matrix itself is not used)
        _, classes, original_columns, column_permutation = \
            self._load_hidden_state()

        # Load imputed data and restore column names and order
        imputed_data = rearrange_and_rename_columns(
            read_table_file(processed_count_file_path), original_columns,
            column_permutation)

        # Data transformations
        if np.any(imputed_data.values < 0):
            log("Observed some negative values!")
            imputed_data[imputed_data < 0] = 0
        transform = transformations[transformation]
        normalize = normalizations[normalization]
        imputed_data = transform(normalize(imputed_data))

        # Save class details for future
        write_csv(classes, os.path.join(files_dir, "classes.csv"))

        # Embeddings are reused from disk unless the cache must be cleared.
        cache_path = os.path.join(files_dir, "embedded_data.pkl.gz")
        if os.path.exists(cache_path) and not clear_cache:
            embedded_data = load_gzip_pickle(cache_path)
        else:
            embedded_data = self._get_embeddings(imputed_data)
            dump_gzip_pickle(embedded_data, cache_path)

        info = []
        metric_results = dict()

        log("Evaluating ...")
        for class_label in classes.index.values:
            class_names = classes.loc[class_label].values
            n_clusters = len(set(class_names))

            for embedding_name, (emb, emb_2d) in embedded_data.items():
                embedding_slug = embedding_name.replace(" ", "_").lower()
                table_name = "%s_%s.csv" % (class_label, embedding_slug)

                k_means = KMeans(n_clusters=n_clusters)
                k_means.fit(emb)
                clusters = k_means.predict(emb)

                embedding_df = pd.DataFrame(emb)
                embedding_df["X"] = emb_2d[:, 0]
                embedding_df["Y"] = emb_2d[:, 1]
                embedding_df["class"] = class_names
                embedding_df["k_means_clusters"] = clusters
                write_csv(embedding_df, os.path.join(files_dir, table_name))

                names = (embedding_name, class_label)
                info.append({
                    'filename':
                    table_name,
                    'description':
                    '%s embedding of cells along %s labels' % names,
                    'plot_description':
                    '%s embedding of cells along %s labels (Classes can be identified '
                    'with their colors and K-means clusters are marked '
                    'with different shapes)' % names,
                })

                # Two scores compare clusters with classes, two score the
                # class separation in the embedding itself.
                key_parts = (embedding_slug, class_label)
                metric_results[
                    'kmeans_on_%s_%s_adjusted_mutual_info_score' %
                    key_parts] = adjusted_mutual_info_score(
                        class_names, clusters, average_method="arithmetic")
                metric_results[
                    'kmeans_on_%s_%s_v_measure_score' %
                    key_parts] = v_measure_score(class_names, clusters)
                metric_results[
                    'embedding_%s_%s_calinski_harabaz_score' %
                    key_parts] = calinski_harabaz_score(emb, class_names)
                metric_results[
                    'embedding_%s_%s_silhouette_score' %
                    key_parts] = silhouette_score(emb, class_names)

        write_csv(pd.DataFrame(info), os.path.join(files_dir, "info.csv"))

        result_path = os.path.join(result_dir, "result.txt")
        with open(result_path, 'w') as report:
            report.write("## METRICS:\n")
            for metric in sorted(metric_results):
                report.write("%s\t%4f\n" % (metric, metric_results[metric]))

            report.write("##\n## ADDITIONAL INFO:\n")

        log("Evaluation results saved to `%s`" % result_path)

        if visualization != "none":
            self.visualize_result(result_dir, output_type=visualization)

        return metric_results
Example #6
0
    def evaluate_result(self, processed_count_file_path, result_dir,
                        visualization, **kwargs):
        """Compare imputed values with the high-quality reference counts.

        The imputed table is read from ``processed_count_file_path``, its
        columns are restored, negative values are clipped to zero, and both
        the imputed and the high-quality matrices are normalized and
        transformed. For every column (cell) only the entries that are
        positive in the high-quality matrix and zero in the low-quality
        matrix are compared. The imputed entries are rescaled so that their
        sum equals the sum of the reference entries before the distances are
        computed.

        Required keyword arguments: ``normalization`` and ``transformation``
        (a ``KeyError`` is raised if one is missing).

        Side effects: writes ``result.txt`` into ``result_dir`` and, unless
        ``visualization == "none"``, calls ``self.visualize_result``.

        Returns:
            dict with the per-cell means of RMSE, MAE, Euclidean, cosine and
            correlation distance.
        """
        normalization = kwargs['normalization']
        transformation = kwargs['transformation']

        # Load hidden state and data
        count_matrix_lq, original_columns, column_permutation, count_matrix_hq = self._load_hidden_state(
        )

        # Load imputed data
        imputed_data = read_table_file(processed_count_file_path)

        # Restore column names and order
        imputed_data = rearrange_and_rename_columns(imputed_data,
                                                    original_columns,
                                                    column_permutation)

        # Replace negative values with zero
        imputed_data = imputed_data.clip(lower=0)

        # Data transformations
        imputed_data = transformations[transformation](
            normalizations[normalization](imputed_data))
        count_matrix_hq = transformations[transformation](
            normalizations[normalization](count_matrix_hq))

        # Evaluation
        rmse_distances = []
        mae_distances = []
        euclidean_distances = []
        cosine_distances = []
        correlation_distances = []

        for i in range(count_matrix_hq.shape[1]):
            # Entries present in the reference but missing in the
            # low-quality matrix.
            non_zeros = np.logical_and(count_matrix_hq.values[:, i] > 0,
                                       count_matrix_lq.values[:, i] == 0)
            hq = count_matrix_hq.values[non_zeros, i]
            y = imputed_data.values[non_zeros, i]
            if np.sum(y) > 0:
                # Match the total of the reference entries.
                y = y * np.sum(hq) / np.sum(y)

            # Root of the mean squared error. Taking the root element-wise
            # before averaging would just reproduce the mean absolute error.
            rmse_distances.append(float(np.mean(np.square(hq - y))**0.5))
            mae_distances.append(float(np.mean(np.abs(hq - y))))

            pair = np.vstack((hq, y))
            euclidean_distances.append(pdist(pair, 'euclidean')[0])
            cosine_distances.append(pdist(pair, 'cosine')[0])
            correlation_distances.append(pdist(pair, 'correlation')[0])

        metric_results = {
            'cell_root_mean_squared_error': np.mean(rmse_distances),
            'cell_mean_absolute_error': np.mean(mae_distances),
            'cell_mean_euclidean_distance': np.mean(euclidean_distances),
            'cell_mean_cosine_distance': np.mean(cosine_distances),
            'cell_mean_correlation_distance': np.mean(correlation_distances)
        }

        # Save results to a file
        make_sure_dir_exists(os.path.join(result_dir, "files"))
        result_path = os.path.join(result_dir, "result.txt")
        with open(result_path, 'w') as file:
            file.write("## METRICS:\n")
            for metric in sorted(metric_results):
                file.write("%s\t%4f\n" %
                           (metric, float(metric_results[metric])))

            file.write("##\n## ADDITIONAL INFO:\n")
            file.write(
                "# CELL\troot_mean_squared_error\tmean_absolute_error\tmean_euclidean_distance\t"
                "mean_cosine_distance\tmean_correlation_distance:\n")
            for i in range(count_matrix_hq.shape[1]):
                file.write(
                    "# %s\t%f\t%f\t%f\t%f\t%f\n" %
                    (count_matrix_hq.columns.values[i], rmse_distances[i],
                     mae_distances[i], euclidean_distances[i],
                     cosine_distances[i], correlation_distances[i]))

        log("Evaluation results saved to `%s`" % result_path)

        if visualization != "none":
            self.visualize_result(result_dir, output_type=visualization)

        return metric_results
Example #7
0
    def evaluate_result(self, processed_count_file_path, result_dir,
                        visualization, **kwargs):
        """Correlate protein (ADT) counts with imputed RNA expression.

        The imputed RNA table is read, its columns are restored, and both it
        and the ADT counts are transformed with ``kwargs['transformation']``.
        Only proteins whose mapped gene occurs in the imputed table are kept.
        ADT rows get a "prot_" prefix, RNA rows a "gene_" prefix. Spearman
        and Pearson correlations are computed over the stacked ADT+RNA
        table, so the first ``n`` rows/columns of each matrix belong to ADT
        and the rest to RNA.

        Side effects: writes ``adt.csv``, ``rna.csv``, both correlation
        matrices and ``info.csv`` into ``<result_dir>/files``, writes
        ``result.txt`` into ``result_dir`` and, unless
        ``visualization == "none"``, calls ``self.visualize_result``.

        Returns:
            dict with the four summary metrics.
        """
        files_dir = os.path.join(result_dir, "files")
        make_sure_dir_exists(files_dir)

        transformation = kwargs['transformation']

        # Load hidden state and data
        (_, original_columns, column_permutation, count_adt,
         protein_rna_mapping) = self._load_hidden_state()

        # Load imputed data and restore column names and order
        imputed_rna = rearrange_and_rename_columns(
            read_table_file(processed_count_file_path), original_columns,
            column_permutation)

        # Data transformations
        imputed_rna = transformations[transformation](imputed_rna)
        count_adt = transformations[transformation](count_adt)

        # Use related data: proteins whose gene is present in the RNA table
        matched_proteins = [
            prot for prot in count_adt.index.values
            if (protein_rna_mapping[prot] in imputed_rna.index.values)
        ]
        adt = count_adt.loc[matched_proteins].copy()
        adt.index = ["prot_" + p for p in adt.index.values]
        rna = imputed_rna.loc[[
            protein_rna_mapping[prot] for prot in matched_proteins
        ]]
        rna.index = ["gene_" + g for g in rna.index.values]

        info = []

        def register(filename, text):
            # Description and plot description are identical for every file.
            info.append({
                'filename': filename,
                'description': text,
                'plot_description': text,
            })

        write_csv(adt, os.path.join(files_dir, "adt.csv"))
        register("adt.csv", 'Protein expressions (adt) after transformation')

        write_csv(rna, os.path.join(files_dir, "rna.csv"))
        register(
            "rna.csv",
            'Gene expressions of genes related to adt data after transformation'
        )

        n = adt.shape[0]
        combined_df = pd.concat((adt, rna)).transpose()

        # Calculating Spearman correlations
        spearman = combined_df.corr(method="spearman")
        adt_adt_spearmanr = spearman.iloc[:n, :n]
        rna_rna_spearmanr = spearman.iloc[n:, n:]
        adt_rna_spearmanr = spearman.iloc[:n, n:]

        write_csv(spearman,
                  os.path.join(files_dir, "spearman_correlations.csv"))
        register(
            "spearman_correlations.csv",
            'Pairwise Spearman correlations (first n items are '
            'adt expressions and second n items are rna expressions)')

        # Calculating Pearson correlations
        pearson = combined_df.corr(method="pearson")
        adt_adt_pearsonr = pearson.iloc[:n, :n]
        rna_rna_pearsonr = pearson.iloc[n:, n:]
        adt_rna_pearsonr = pearson.iloc[:n, n:]

        write_csv(pearson,
                  os.path.join(files_dir, "pearson_correlations.csv"))
        register(
            "pearson_correlations.csv",
            'Pairwise Pearson correlations (first n items are '
            'adt expressions and second n items are rna expressions)')

        # Evaluation
        metric_results = {
            'rna_protein_mean_spearman_correlatoin':
            np.mean(adt_rna_spearmanr.values.diagonal()),
            'rna_protein_mean_pearson_correlatoin':
            np.mean(adt_rna_pearsonr.values.diagonal()),
            'MSE_of_adt_adt_and_rna_rna_spearman_correlations':
            np.mean((adt_adt_spearmanr.values - rna_rna_spearmanr.values)**2),
            'MSE_of_adt_adt_and_rna_rna_pearson_correlations':
            np.mean((adt_adt_pearsonr.values - rna_rna_pearsonr.values)**2)
        }

        write_csv(pd.DataFrame(info), os.path.join(files_dir, "info.csv"))

        def commented(frame):
            # Prefix every line of the rendered table with "## ".
            return "## " + "\n## ".join(frame.to_string().split("\n")) + "\n"

        report_sections = (
            ("## Pearson of adt/rna:\n", adt_rna_pearsonr),
            ("## Spearman of adt/rna:\n", adt_rna_spearmanr),
            ("## Pearson of adt/adt:\n", adt_adt_pearsonr),
            ("## Pearson of rna/rna:\n", rna_rna_pearsonr),
            ("## Spearman of adt/adt:\n", adt_adt_spearmanr),
            ("## Spearman of rna/rna:\n", rna_rna_spearmanr),
        )

        # Save results to a file
        result_path = os.path.join(result_dir, "result.txt")
        with open(result_path, 'w') as report:
            report.write("## METRICS:\n")
            for metric in sorted(metric_results):
                report.write("%s\t%4f\n" %
                             (metric, float(metric_results[metric])))

            report.write("##\n## ADDITIONAL INFO:\n")
            for heading, frame in report_sections:
                report.write(heading)
                report.write(commented(frame))

        log("Evaluation results saved to `%s`" % result_path)

        if visualization != "none":
            self.visualize_result(result_dir, output_type=visualization)

        return metric_results
Example #8
0
    def visualize_result(self, result_dir, output_type, **kwargs):
        """Create cell-cycle heatmaps and embedding scatter plots.

        Reads ``info.csv``, the two marker-gene tables and one table per
        embedding from ``<result_dir>/files``. For ``output_type == "pdf"``
        it writes two heatmaps (each row of the table standardized to zero
        mean and unit standard deviation, then transposed) and one scatter
        plot per embedding. In the scatter plots the colour encodes the cell
        phase (G1, G2M, S) and the symbol encodes the K-means cluster id,
        which is used as an index into a list of three symbols.

        Raises:
            NotImplementedError: for any ``output_type`` other than "pdf".
        """
        files_dir = os.path.join(result_dir, "files")

        info = read_table_file(os.path.join(files_dir, "info.csv"))
        info = info.set_index("filename")

        G1_S_related_part_of_imputed_data = read_table_file(
            os.path.join(files_dir, "G1_S_related_part_of_imputed_data.csv"))
        G2_M_related_part_of_imputed_data = read_table_file(
            os.path.join(files_dir, "G2_M_related_part_of_imputed_data.csv"))

        embeddings = ["PCA", "ICA", "Truncated SVD", "tSNE", "UMAP"]
        embedded_dfs = dict()
        for embedding_name in embeddings:
            slug = embedding_name.replace(" ", "_").lower()
            embedded_dfs[embedding_name] = read_table_file(
                os.path.join(files_dir, "%s.csv" % slug))

        if output_type != "pdf":
            # "html" and unknown output types are both unsupported.
            raise NotImplementedError()

        import plotly.graph_objs as go
        import plotly.io as pio

        def normalize(df):
            # Row-wise standardization.
            centered = df.subtract(df.mean(axis=1), axis=0)
            return centered.divide(df.std(axis=1), axis=0)

        heatmaps = (
            ('Heatmap of Genes related to G1/S',
             G1_S_related_part_of_imputed_data,
             "plot_G1_S_related_genes_heatmap.pdf"),
            ('Heatmap of Genes related to G2/M',
             G2_M_related_part_of_imputed_data,
             "plot_G2_M_related_genes_heatmap.pdf"),
        )
        for title, table, pdf_name in heatmaps:
            heatmap_fig = go.Figure(layout=go.Layout(
                title=title,
                font=dict(size=5),
                xaxis=dict(title='Marker Genes', tickangle=60)))
            heatmap_fig.add_heatmap(z=normalize(table).values.T,
                                    x=table.index.values,
                                    y=table.columns.values,
                                    colorscale='Viridis')
            pio.write_image(heatmap_fig,
                            os.path.join(result_dir, pdf_name),
                            width=600,
                            height=700)

        phase_colors = (("G1", "red"), ("G2M", "green"), ("S", "blue"))
        cluster_symbols = ["circle-open", "diamond", "cross"]

        for embedding_name in embeddings:
            slug = embedding_name.replace(" ", "_").lower()

            fig = go.Figure(layout=go.Layout(
                title='%s embedding of cells considering genes related '
                'to cell-cycle (K-means clusters are marked '
                'with different shapes)' % embedding_name,
                font=dict(size=8)))

            embedding_df = embedded_dfs[embedding_name]
            xs = embedding_df["X"].values
            ys = embedding_df["Y"].values
            phases = embedding_df["class"].values
            cluster_ids = embedding_df["k_means_clusters"].values

            # One trace per phase so that each phase has a legend entry.
            for state, color in phase_colors:
                members = [
                    row for row, phase in enumerate(phases) if phase == state
                ]
                symbols = [cluster_symbols[c] for c in cluster_ids[members]]
                fig.add_scatter(x=xs[members],
                                y=ys[members],
                                mode='markers',
                                marker=dict(color=color, symbol=symbols),
                                name="%s Phase" % state)

            pio.write_image(fig,
                            os.path.join(result_dir, "plot_%s.pdf" % slug),
                            width=800,
                            height=600)
Example #9
0
    def evaluate_result(self, processed_count_file_path, result_dir,
                        visualization, **kwargs):
        """Measure the imputation error on the non-zero reference entries.

        The imputed table is read, its columns are restored and negative
        values are clipped to zero. Both it and the reference ``scaled_data``
        are transformed with ``kwargs['transformation']``. Errors are
        evaluated only where the transformed reference is greater than zero:
        once over the whole matrix (RMSE, MAE) and once per column (RMSE,
        MAE, Euclidean, cosine and correlation distance).

        Side effects: writes ``result.txt`` into ``result_dir`` and, unless
        ``visualization == "none"``, calls ``self.visualize_result``.

        Returns:
            dict with the global errors and the per-column means.
        """
        transformation = kwargs['transformation']

        make_sure_dir_exists(os.path.join(result_dir, "files"))

        # Load hidden state and data
        scaled_data, original_columns, column_permutation = \
            self._load_hidden_state()

        # Load imputed data and restore column names and order
        imputed_data = rearrange_and_rename_columns(
            read_table_file(processed_count_file_path), original_columns,
            column_permutation)

        # Replace negative values with zero
        imputed_data = imputed_data.clip(lower=0)

        # Data transformation
        scaled_data = transformations[transformation](scaled_data)
        imputed_data = transformations[transformation](imputed_data)

        reference = scaled_data.values
        predicted = imputed_data.values

        # 0/1 indicator of the entries that take part in the global errors.
        observed = np.where(reference > 0, 1, 0)
        difference = reference - predicted

        rmse = float(
            np.sum(observed * np.square(difference)) / np.sum(observed))**0.5
        mae = float(np.sum(observed * np.abs(difference)) / np.sum(observed))

        # Evaluation per column
        rmse_distances = []
        mae_distances = []
        euclidean_distances = []
        cosine_distances = []
        correlation_distances = []

        for column in range(scaled_data.shape[1]):
            non_zeros = reference[:, column] > 0
            count = np.sum(non_zeros)
            x = reference[non_zeros, column]
            y = predicted[non_zeros, column]
            pair = np.vstack((x, y))

            rmse_distances.append(float(np.sum(np.square(x - y)) / count)**0.5)
            mae_distances.append(float(np.sum(np.abs(x - y)) / count))
            cosine_distances.append(pdist(pair, 'cosine')[0])
            euclidean_distances.append(pdist(pair, 'euclidean')[0])
            correlation_distances.append(pdist(pair, 'correlation')[0])

        metric_results = {
            'all_mean_absolute_error_on_non_zeros':
            mae,
            'all_root_mean_squared_error_on_non_zeros':
            rmse,
            'cell_mean_mean_absolute_error_on_non_zeros':
            np.mean(mae_distances),
            'cell_mean_root_mean_squared_error_on_non_zeros':
            np.mean(rmse_distances),
            'cell_mean_euclidean_distance':
            np.mean(euclidean_distances),
            'cell_mean_cosine_distance':
            np.mean(cosine_distances),
            'cell_mean_correlation_distance':
            np.mean(correlation_distances),
        }

        # Save results to a file
        result_path = os.path.join(result_dir, "result.txt")
        with open(result_path, 'w') as report:
            report.write("## METRICS:\n")
            for metric in sorted(metric_results):
                report.write("%s\t%4f\n" %
                             (metric, float(metric_results[metric])))

            report.write("##\n## ADDITIONAL INFO:\n")
            report.write(
                "# CELL\troot_mean_squared_error_on_non_zeros\tmean_absolute_error_on_non_zeros\t"
                "euclidean_distance_on_non_zeros\tcosine_distance_on_non_zeros\tcorrelation_distance_on_non_zeros:\n"
            )
            cell_names = scaled_data.columns.values
            for column in range(scaled_data.shape[1]):
                report.write(
                    "# %s\t%f\t%f\t%f\t%f\t%f\n" %
                    (cell_names[column], rmse_distances[column],
                     mae_distances[column], euclidean_distances[column],
                     cosine_distances[column], correlation_distances[column]))

        log("Evaluation results saved to `%s`" % result_path)

        if visualization != "none":
            self.visualize_result(result_dir, output_type=visualization)

        return metric_results
Example #10
0
    def evaluate_result(self, processed_count_file_path, result_dir,
                        visualization, **kwargs):
        """Evaluate how well masked entries were recovered by the imputation.

        The imputed table is read from ``processed_count_file_path``, its
        columns are restored and negative values are clipped to zero.
        RMSE and MAE are computed on the "log" and "sqrt" scale over the
        entries that are selected by ``mask`` and non-zero in ``data``.
        The original and predicted values of every cell whose mask value
        equals 1 are stored in ``files/predictions.csv`` and listed in
        ``result.txt`` (in row-major order).

        Side effects: writes ``files/predictions.csv``, ``files/info.csv``
        and ``result.txt`` below ``result_dir`` and, unless
        ``visualization == "none"``, calls ``self.visualize_result``.

        Returns:
            dict with keys 'RMSE_sqrt', 'MAE_sqrt', 'RMSE_log', 'MAE_log'.
        """
        make_sure_dir_exists(os.path.join(result_dir, "files"))
        info = []

        # Load hidden state and data
        data, mask, original_columns, column_permutation = self._load_hidden_state(
        )

        # Load imputed data
        imputed_data = read_table_file(processed_count_file_path)

        # Restore column names and order
        imputed_data = rearrange_and_rename_columns(imputed_data,
                                                    original_columns,
                                                    column_permutation)

        # Replace negative values with zero
        imputed_data = imputed_data.clip(lower=0)

        # Evaluation
        log_diff = np.abs(transformations["log"](data) -
                          transformations["log"](imputed_data))
        sqrt_diff = np.abs(transformations["sqrt"](data) -
                           transformations["sqrt"](imputed_data))

        # Weight of every entry: its mask value where data is non-zero,
        # otherwise 0. Computed once and shared by all four metrics.
        weights = mask * np.where(data != 0, 1, 0)
        total_weight = np.sum(np.sum(weights))

        def weighted_mean(values):
            # Double sum: the inner one reduces per column, the outer one
            # reduces the column sums to a scalar.
            return float(np.sum(np.sum(weights * values)) / total_weight)

        # The differences are already absolute, so the MAE needs no extra abs.
        mse_on_log = weighted_mean(np.square(log_diff))
        mae_on_log = weighted_mean(log_diff)
        mse_on_sqrt = weighted_mean(np.square(sqrt_diff))
        mae_on_sqrt = weighted_mean(sqrt_diff)

        metric_results = {
            'RMSE_sqrt': mse_on_sqrt**0.5,
            'MAE_sqrt': mae_on_sqrt,
            'RMSE_log': mse_on_log**0.5,
            'MAE_log': mae_on_log
        }

        # Positions of the masked entries. np.nonzero scans in row-major
        # order, i.e. the same order as a nested row/column loop.
        rows, cols = np.nonzero(mask.values == 1)
        masked_locations = list(zip(rows.tolist(), cols.tolist()))

        original_values = data.values[rows, cols]
        predicted_values = imputed_data.values[rows, cols]

        predictions_df = pd.DataFrame({
            'original': original_values,
            'predicted': predicted_values
        })
        write_csv(predictions_df,
                  os.path.join(result_dir, "files", "predictions.csv"))
        info.append({
            'filename':
            "predictions.csv",
            'description':
            'Original masked values along predicted values',
            'plot_description':
            'Predicted values vs. original masked values',
        })

        write_csv(pd.DataFrame(info),
                  os.path.join(result_dir, "files", "info.csv"))

        # Save results to a file
        result_path = os.path.join(result_dir, "result.txt")
        with open(result_path, 'w') as file:
            file.write("## METRICS:\n")
            for metric in sorted(metric_results):
                file.write("%s\t%4f\n" % (metric, metric_results[metric]))

            file.write("##\n## ADDITIONAL INFO:\n")
            file.write("# GENE\tCELL\tGOLD_STANDARD\tRESULT:\n")
            gene_names = data.index.values
            cell_names = data.columns.values
            for (x, y), original, predicted in zip(masked_locations,
                                                   original_values,
                                                   predicted_values):
                file.write("# %s\t%s\t%f\t%f\n" %
                           (gene_names[x], cell_names[y], original,
                            predicted))

        log("Evaluation results saved to `%s`" % result_path)

        if visualization != "none":
            self.visualize_result(result_dir, output_type=visualization)

        return metric_results