Code example #1
    def _load_data_and_imputed_data_for_evaluation(self, processed_count_file):
        hidden_data_file_path = os.path.join(settings.STORAGE_DIR,
                                             "%s.hidden.pkl.gz" % self.uid)
        sparse_data, original_columns, column_permutation = load_gzip_pickle(
            hidden_data_file_path)
        data = sparse_data.to_dense()
        del sparse_data

        imputed_data = read_table_file(processed_count_file)

        # Restoring original column names
        imputed_data = rearrange_and_rename_columns(imputed_data,
                                                    original_columns,
                                                    column_permutation)

        # Remove ERCC spike-in controls and mitochondrial RNAs
        remove_list = [
            symbol for symbol in imputed_data.index.values
            if symbol.startswith("ERCC-") or symbol.startswith("mt-")
        ]

        imputed_data = imputed_data.drop(remove_list)
        data = data.drop(remove_list)

        return data, imputed_data
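The helper rearrange_and_rename_columns used above is not part of these excerpts. A minimal sketch of what it plausibly does, assuming column_permutation records the shuffled column order as integer indices; the actual scFNN implementation may differ:

import numpy as np


def rearrange_and_rename_columns(data, original_columns, column_permutation):
    # Hypothetical reconstruction: undo the column shuffle recorded in
    # `column_permutation`, then restore the original column labels.
    inverse_permutation = np.argsort(column_permutation)
    data = data.iloc[:, inverse_permutation]
    data.columns = original_columns
    return data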
Code example #2
def read_table_file(filename):
    if filename.endswith(".csv") or filename.endswith(".tsv") or \
            filename.endswith(".csv.gz") or filename.endswith(".tsv.gz"):
        return read_csv(filename)
    elif filename.endswith(".pkl.gz"):
        return load_gzip_pickle(filename)
    else:
        raise NotImplementedError("Unrecognized format for file %s" % filename)
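read_table_file and its callers rely on small I/O helpers that these excerpts do not show. A self-contained sketch of plausible implementations, assuming read_csv infers the delimiter from the file extension and that tables are stored with features as rows; these are hypothetical stand-ins, not the actual scFNN utilities:

import gzip
import pickle

import pandas as pd


def load_gzip_pickle(filename):
    # Unpickle an object from a gzip-compressed file.
    with gzip.open(filename, "rb") as f:
        return pickle.load(f)


def dump_gzip_pickle(obj, filename):
    # Pickle an object into a gzip-compressed file.
    with gzip.open(filename, "wb") as f:
        pickle.dump(obj, f)


def read_csv(filename):
    # Read a (possibly gzipped) CSV/TSV table; pandas handles .gz itself.
    sep = "\t" if ".tsv" in filename else ","
    return pd.read_csv(filename, sep=sep, index_col=0)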
Code example #3
File: numerical.py, Project: moinfar/scFNN
    def _load_hidden_state(self):
        hidden_data_file_path = os.path.join(settings.STORAGE_DIR,
                                             "%s.hidden.pkl.gz" % self.uid)
        sparse_data, read_ratio, original_columns, column_permutation = load_gzip_pickle(
            hidden_data_file_path)

        # Rescale the densified counts by the stored read ratio
        scaled_data = sparse_data.to_dense() * read_ratio

        return scaled_data, original_columns, column_permutation
Code example #4
    def _load_hidden_state(self):
        hidden_data_file_path = os.path.join(settings.STORAGE_DIR,
                                             "%s.hidden.pkl.gz" % self.uid)
        sparse_count_matrix, classes, original_columns, column_permutation = load_gzip_pickle(
            hidden_data_file_path)
        count_matrix = sparse_count_matrix.to_dense()

        del sparse_count_matrix

        return count_matrix, classes, original_columns, column_permutation
Code example #5
File: numerical.py, Project: moinfar/scFNN
    def _load_hidden_state(self):
        hidden_data_file_path = os.path.join(settings.STORAGE_DIR,
                                             "%s.hidden.pkl.gz" % self.uid)
        sparse_data, sparse_mask, original_columns, column_permutation = \
            load_gzip_pickle(hidden_data_file_path)
        data = sparse_data.to_dense()
        mask = sparse_mask.to_dense()

        del sparse_data
        del sparse_mask

        return data, mask, original_columns, column_permutation
Code example #6
File: paired_data.py, Project: moinfar/scFNN
    def _load_hidden_state(self):
        hidden_data_file_path = os.path.join(settings.STORAGE_DIR,
                                             "%s.hidden.pkl.gz" % self.uid)
        sparse_count_rna, original_columns, column_permutation, sparse_count_adt, protein_rna_mapping = \
            load_gzip_pickle(hidden_data_file_path)

        count_rna = sparse_count_rna.to_dense()
        count_adt = sparse_count_adt.to_dense()

        del sparse_count_rna
        del sparse_count_adt

        return count_rna, original_columns, column_permutation, count_adt, protein_rna_mapping
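Each _load_hidden_state variant unpickles a tuple whose layout must match whatever the corresponding save step wrote. The save side is not included in these excerpts; a minimal hypothetical counterpart for the variant in code example #5 (method name and argument order are assumptions):

    def _save_hidden_state(self, sparse_data, sparse_mask,
                           original_columns, column_permutation):
        hidden_data_file_path = os.path.join(settings.STORAGE_DIR,
                                             "%s.hidden.pkl.gz" % self.uid)
        # The tuple layout must match what _load_hidden_state unpacks.
        dump_gzip_pickle((sparse_data, sparse_mask,
                          original_columns, column_permutation),
                         hidden_data_file_path)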
Code example #7
    def evaluate_result(self, processed_count_file_path, result_dir,
                        visualization, **kwargs):
        normalization = kwargs['normalization']
        transformation = kwargs['transformation']
        clear_cache = kwargs['clear_cache']

        make_sure_dir_exists(os.path.join(result_dir, "files"))
        info = []

        # Load hidden state and data
        count_matrix, classes, original_columns, column_permutation = \
            self._load_hidden_state()

        # Load imputed data
        imputed_data = read_table_file(processed_count_file_path)

        # Restore column names and order
        imputed_data = rearrange_and_rename_columns(imputed_data,
                                                    original_columns,
                                                    column_permutation)

        # Data transformations
        if np.sum(imputed_data.values < 0) > 0:
            log("Observed some negative values!")
            imputed_data[imputed_data < 0] = 0
        imputed_data = transformations[transformation](
            normalizations[normalization](imputed_data))

        # Save class details for future
        write_csv(classes, os.path.join(result_dir, "files", "classes.csv"))

        # Evaluation
        metric_results = dict()

        embedded_data_file_path = os.path.join(result_dir, "files",
                                               "embedded_data.pkl.gz")
        if os.path.exists(embedded_data_file_path) and not clear_cache:
            embedded_data = load_gzip_pickle(embedded_data_file_path)
        else:
            embedded_data = self._get_embeddings(imputed_data)
            dump_gzip_pickle(embedded_data, embedded_data_file_path)

        log("Evaluating ...")
        for class_label in classes.index.values:
            class_names = classes.loc[class_label].values
            for embedding_name in embedded_data:
                emb, emb_2d = embedded_data[embedding_name]

                embedding_slug = embedding_name.replace(" ", "_").lower()

                k_means = KMeans(n_clusters=len(set(class_names)))
                k_means.fit(emb)
                clusters = k_means.predict(emb)

                embedding_df = pd.DataFrame(emb)
                embedding_df["X"] = emb_2d[:, 0]
                embedding_df["Y"] = emb_2d[:, 1]
                embedding_df["class"] = class_names
                embedding_df["k_means_clusters"] = clusters
                write_csv(
                    embedding_df,
                    os.path.join(result_dir, "files",
                                 "%s_%s.csv" % (class_label, embedding_slug)))
                info.append({
                    'filename': "%s_%s.csv" % (class_label, embedding_slug),
                    'description': '%s embedding of cells along %s labels' %
                                   (embedding_name, class_label),
                    'plot_description':
                        '%s embedding of cells along %s labels (classes can be '
                        'identified by their colors and K-means clusters are '
                        'marked with different shapes)' %
                        (embedding_name, class_label),
                })

                metric_results.update({
                    'kmeans_on_%s_%s_adjusted_mutual_info_score' % (embedding_slug, class_label):
                    adjusted_mutual_info_score(class_names,
                                               clusters,
                                               average_method="arithmetic"),
                    'kmeans_on_%s_%s_v_measure_score' % (embedding_slug, class_label):
                    v_measure_score(class_names, clusters),
                    'embedding_%s_%s_calinski_harabaz_score' % (embedding_slug, class_label):
                    calinski_harabaz_score(emb, class_names),
                    'embedding_%s_%s_silhouette_score' % (embedding_slug, class_label):
                    silhouette_score(emb, class_names)
                })

        write_csv(pd.DataFrame(info),
                  os.path.join(result_dir, "files", "info.csv"))

        result_path = os.path.join(result_dir, "result.txt")
        with open(result_path, 'w') as file:
            file.write("## METRICS:\n")
            for metric in sorted(metric_results):
                file.write("%s\t%4f\n" % (metric, metric_results[metric]))

            file.write("##\n## ADDITIONAL INFO:\n")

        log("Evaluation results saved to `%s`" % result_path)

        if visualization != "none":
            self.visualize_result(result_dir, output_type=visualization)

        return metric_results
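For orientation, a hedged sketch of how this method might be invoked; `evaluator` and the concrete option values are assumptions, while the kwargs keys normalization, transformation and clear_cache are the ones the method actually reads:

metrics = evaluator.evaluate_result(
    "imputed_counts.csv.gz",   # any format accepted by read_table_file
    "results/run_01",          # result_dir; a files/ subdirectory is created
    visualization="none",      # skip the visualize_result step
    normalization="none",      # must be a key of `normalizations`
    transformation="log",      # must be a key of `transformations`
    clear_cache=False)         # reuse a cached embedded_data.pkl.gz if present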
Code example #8
    def evaluate_result(self, processed_count_file, result_dir, visualization,
                        **kwargs):
        normalization = kwargs['normalization']
        transformation = kwargs['transformation']
        clear_cache = kwargs['clear_cache']

        make_sure_dir_exists(os.path.join(result_dir, "files"))
        info = []

        data, imputed_data = self._load_data_and_imputed_data_for_evaluation(
            processed_count_file)
        # Gold-standard classes are encoded as the prefix of each column name
        gold_standard_classes = [
            column_name.split("_")[0] for column_name in data.columns.values
        ]

        # Data transformations
        if np.sum(imputed_data.values < 0) > 0:
            log("Observed some negative values!")
            imputed_data[imputed_data < 0] = 0
        imputed_data = transformations[transformation](
            normalizations[normalization](imputed_data))

        G1_S_related_part_of_imputed_data, G2_M_related_part_of_imputed_data = self._get_related_part(
            imputed_data)

        related_part_of_imputed_data = pd.concat([
            G1_S_related_part_of_imputed_data,
            G2_M_related_part_of_imputed_data
        ])

        write_csv(
            G1_S_related_part_of_imputed_data,
            os.path.join(result_dir, "files",
                         "G1_S_related_part_of_imputed_data.csv"))
        info.append({
            'filename': 'G1_S_related_part_of_imputed_data.csv',
            'description': 'Values of genes related to G1/S',
            'plot_description': 'Heatmap of genes related to G1/S',
        })

        write_csv(
            G2_M_related_part_of_imputed_data,
            os.path.join(result_dir, "files",
                         "G2_M_related_part_of_imputed_data.csv"))
        info.append({
            'filename': 'G2_M_related_part_of_imputed_data.csv',
            'description': 'Values of genes related to G2/M',
            'plot_description': 'Heatmap of genes related to G2/M',
        })

        svm_results, knn_results = self._get_classification_results(
            related_part_of_imputed_data, gold_standard_classes)

        embedded_data_file_path = os.path.join(result_dir, "files",
                                               "embedded_data.pkl.gz")
        if os.path.exists(embedded_data_file_path) and not clear_cache:
            embedded_data = load_gzip_pickle(embedded_data_file_path)
        else:
            embedded_data = self._get_embeddings(related_part_of_imputed_data)
            dump_gzip_pickle(embedded_data, embedded_data_file_path)

        metric_results = {
            "classification_svm_mean_accuracy": np.mean(svm_results),
            "classification_knn_mean_accuracy": np.mean(knn_results)
        }

        embedded_data["identity"] = related_part_of_imputed_data.transpose()

        for embedding_name in embedded_data:
            emb = embedded_data[embedding_name]

            k_means = KMeans(n_clusters=3)
            k_means.fit(emb)
            clusters = k_means.predict(emb)

            embedding_slug = embedding_name.replace(" ", "_").lower()

            if embedding_name != "identity":
                embedding_df = pd.DataFrame(
                    {
                        "X": emb[:, 0],
                        "Y": emb[:, 1],
                        "class": gold_standard_classes,
                        "k_means_clusters": clusters
                    },
                    index=data.columns.values)
                write_csv(
                    embedding_df,
                    os.path.join(result_dir, "files",
                                 "%s.csv" % embedding_slug))
                info.append({
                    'filename': "%s.csv" % embedding_slug,
                    'description':
                        '%s embedding of cells considering genes related '
                        'to cell-cycle' % embedding_name,
                    'plot_description':
                        '%s embedding of cells considering genes related '
                        'to cell-cycle (K-means clusters are marked '
                        'with different shapes)' % embedding_name,
                })

            metric_results.update({
                'kmeans_on_%s_adjusted_mutual_info_score' % embedding_slug:
                adjusted_mutual_info_score(gold_standard_classes,
                                           clusters,
                                           average_method="arithmetic"),
                'kmeans_on_%s_v_measure_score' % embedding_slug:
                v_measure_score(gold_standard_classes, clusters),
                'embedding_%s_calinski_harabaz_score' % embedding_slug:
                calinski_harabaz_score(emb, gold_standard_classes),
                'embedding_%s_silhouette_score' % embedding_slug:
                silhouette_score(emb, gold_standard_classes)
            })

        write_csv(pd.DataFrame(info),
                  os.path.join(result_dir, "files", "info.csv"))

        result_path = os.path.join(result_dir, "result.txt")
        with open(result_path, 'w') as file:
            file.write("## METRICS:\n")
            for metric in sorted(metric_results):
                file.write("%s\t%4f\n" % (metric, metric_results[metric]))

            file.write("##\n## ADDITIONAL INFO:\n")
            file.write("## SVM classifiers accuracies: %s\n" %
                       str(svm_results))
            file.write("## KNN classifiers accuracies: %s\n" %
                       str(knn_results))

        log("Evaluation results saved to `%s`" % result_path)

        if visualization != "none":
            self.visualize_result(result_dir, output_type=visualization)

        return metric_results
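Both evaluate_result variants score embeddings the same way: K-means clusters are compared against gold-standard labels with mutual-information criteria, while the labels themselves are scored geometrically on the embedding. A self-contained sketch of that pattern using current scikit-learn names (the excerpts use the pre-0.23 spelling calinski_harabaz_score, and newer releases default average_method to "arithmetic"):

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import (adjusted_mutual_info_score,
                             calinski_harabasz_score, silhouette_score,
                             v_measure_score)


def clustering_metrics(emb, labels):
    # Cluster the embedding with as many clusters as there are labels,
    # then score the clustering against the gold-standard labels.
    clusters = KMeans(n_clusters=len(set(labels)), n_init=10).fit_predict(emb)
    return {
        "adjusted_mutual_info_score": adjusted_mutual_info_score(labels, clusters),
        "v_measure_score": v_measure_score(labels, clusters),
        "calinski_harabasz_score": calinski_harabasz_score(emb, labels),
        "silhouette_score": silhouette_score(emb, labels),
    }


# Toy usage: two well-separated blobs with known labels.
rng = np.random.RandomState(0)
emb = np.vstack([rng.normal(0, 0.1, (20, 2)), rng.normal(3, 0.1, (20, 2))])
labels = ["a"] * 20 + ["b"] * 20
print(clustering_metrics(emb, labels))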