def perform_data_preprocessing(self):
        #
        # Perform the preprocessing steps
        #
        pp_start_time = time.time()

        # perform log2 transformation over the input data
        if self.perform_log2:
            # np.log2 with `where=` needs an explicit output buffer, otherwise
            # the entries it skips (the zeros) are left uninitialised.
            input_values = self.input_data.to_numpy(dtype=float)
            self.input_data = pd.DataFrame(
                np.log2(input_values,
                        out=np.zeros_like(input_values),
                        where=(input_values != 0.0)),
                index=self.input_data.index,
                columns=self.input_data.columns)
            logging_print("Log 2 transformation is performed on the input data")

        # perform center scaling over the input data
        if self.pre_processing_center_scale:
            scaler = StandardScaler()
            if self.over_samples:
                scaled_np_data = scaler.fit_transform(self.input_data)
            else:
                # Transpose the dataset before and after scaling so the
                # standardisation is applied per row instead of per column
                scaled_np_data = scaler.fit_transform(self.input_data.T).T

            # The scaler returns a NumPy array; wrap it in a new pandas DataFrame
            self.input_data = pd.DataFrame(scaled_np_data,
                                           index=self.input_data.index,
                                           columns=self.input_data.columns)
            logging_print("Centering and scaling is performed on the input data")

        timer_print(pp_start_time,
                    prefix="Data pre-processing ready",
                    time_overview_log=self.process_time_overview)
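
# A minimal standalone sketch of the centring/scaling step above, assuming a
# small genes-by-samples DataFrame; all names and values are illustrative.
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

expression = pd.DataFrame(np.random.rand(4, 3) * 10,
                          index=["gene_a", "gene_b", "gene_c", "gene_d"],
                          columns=["sample_1", "sample_2", "sample_3"])
scaler = StandardScaler()
# Scaling over samples standardises every column; scaling over genes
# transposes first so every row (gene) gets mean 0 and unit variance.
scaled_over_samples = pd.DataFrame(scaler.fit_transform(expression),
                                   index=expression.index,
                                   columns=expression.columns)
scaled_over_genes = pd.DataFrame(scaler.fit_transform(expression.T).T,
                                 index=expression.index,
                                 columns=expression.columns)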
Example #2
    def calculate_wilcox_p_value(self):
        cwpv_st_time = time.time()
        wal_p_values = {}
        for pathway_id in self.auc_values.index:
            if pathway_id in list(self.matrix_data.columns):
                selected_pathway = self.pathway_gene_scores.loc[
                    self.matrix_data.index, pathway_id]
                include_in_pathway = selected_pathway[
                    self.matrix_data.loc[:, pathway_id]]
                include_not_in_pathway = selected_pathway[~self.matrix_data.
                                                          loc[:, pathway_id]]
                try:
                    _, p_value = scipy.stats.mannwhitneyu(
                        include_in_pathway,
                        include_not_in_pathway,
                        use_continuity=True,
                        alternative="two-sided")
                    wal_p_values[pathway_id] = p_value
                except ValueError:
                    logging_print(
                        "ValueError in p-value calculation; p-value 0.0 is set for pathway: {}"
                        .format(pathway_id))
                    wal_p_values[pathway_id] = 0.0

        self.p_values = pd.Series(wal_p_values)
        timer_print(cwpv_st_time, prefix="Wilcox p value calculation ready")
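
# A hedged, self-contained sketch of the comparison above: a two-sided
# Mann-Whitney U test between the scores of genes inside and outside a
# pathway (random toy data, illustrative only).
import numpy as np
import pandas as pd
import scipy.stats

rng = np.random.default_rng(0)
gene_scores = pd.Series(rng.normal(size=100))
in_pathway = pd.Series(rng.random(100) < 0.2)    # boolean membership mask

_, p_value = scipy.stats.mannwhitneyu(gene_scores[in_pathway],
                                      gene_scores[~in_pathway],
                                      use_continuity=True,
                                      alternative="two-sided")
print(p_value)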
Example #3
    def calculate_centriods(self):
        calc_cent_start_time = time.time()
        logging_print("Start calculation of the centroids")
        initial_df = self.overall_components[0]
        gene_names = initial_df.columns
        n_components = initial_df.shape[0]
        initial_np = initial_df.to_numpy()
        signs = np.sign(initial_np)
        initial_np = np.abs(initial_np)
        processed_dfs = [initial_np * signs]

        for ica_df in self.overall_components[1:]:
            ica_df = ica_df.loc[:, gene_names]
            ica_df = ica_df.iloc[:n_components, :]
            corr_table = np.abs(
                np.corrcoef(initial_np, np.abs(ica_df.to_numpy())))
            corr_table = corr_table[n_components:, :n_components]
            max_indexes_columns = np.argmax(corr_table, axis=0)

            ica_df = ica_df.iloc[max_indexes_columns, :]
            processed_dfs.append(ica_df.to_numpy())

        centriods_array = np.c_[np.abs(processed_dfs) * signs]

        centriods = centriods_array.mean(axis=0)
        centriod_df = pd.DataFrame(centriods,
                                   index=initial_df.index,
                                   columns=initial_df.columns)

        timer_print(calc_cent_start_time,
                    prefix="Centroid calculation is ready")
        return centriod_df
    def read_file(self):
        #
        # Method to read all the input files
        #

        # save the start time
        rf_start_time = time.time()

        # Read the input file if present
        if os.path.isfile(self.input_file_path):
            if self.test_run:
                # Read a test dataset: keep only the first 150 rows and 100 columns
                self.input_data = pd.read_csv(self.input_file_path,
                                              sep="\t", index_col=0,
                                              nrows=150)
                self.input_data = self.input_data.iloc[:150, :100]
            else:
                # Check if only a part of the rows must be loaded instead
                # of the complete dataset
                n_rows = None
                if self.n_rows is not None and self.n_rows != '':
                    n_rows = int(self.n_rows)

                # read the input file. This can be a (cached) pandas pickle
                # file or a tab-separated text file, which may be compressed.
                # If force is set to True, the method will not read cached
                # pickle files created from the original txt matrix even if
                # these are present. By default the cached version (with the
                # suffix _cashed.pickle) is loaded if it is present in the
                # same directory as the input matrix

                self.input_data = read_pd_df(
                    self.input_file_path,
                    {
                        "sep": "\t",
                        "index_col": 0,
                        "nrows": n_rows
                    },
                    force=self.force)
        else:
            raise FileNotFoundError("Cannot find input file: {}".format(
                self.input_file_path
            ))

        # if the number of components is not set, default to the
        # smaller dimension of the input matrix
        if self.n_components is None:
            self.n_components = np.min(self.input_data.shape)

        # log some basic info
        logging_print(stats_dict_to_string({
            "Input dataframe n_row": self.input_data.shape[0],
            "Input dataframe n_col": self.input_data.shape[1],
            "first column headers": self.input_data.columns.values[:5],
            "first row index": self.input_data.index.values[:5]
        }))
        timer_print(rf_start_time,
                    prefix="Reading input file ready",
                    time_overview_log=self.process_time_overview)
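
# read_pd_df is a project-specific helper; a rough, hypothetical sketch of the
# caching behaviour described in the comments above (read a tab-separated
# matrix once, then reuse a pickled copy unless force is set) could look like
# this. The "_cashed.pickle" suffix is taken from the comment above.
import os
import pandas as pd

def read_matrix_with_cache(path, force=False, **read_csv_kwargs):
    cache_path = "{}_cashed.pickle".format(path)
    if os.path.isfile(cache_path) and not force:
        # reuse the cached pickle created by an earlier run
        return pd.read_pickle(cache_path)
    df = pd.read_csv(path, **read_csv_kwargs)
    df.to_pickle(cache_path)
    return df

# Usage, mirroring the call above:
# input_data = read_matrix_with_cache("matrix.txt.gz", sep="\t", index_col=0)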
Example #5
 def perform_data_whitening(self, data):
     pdw_start_time = time.time()
     logging_print("FastICA, start data whitening")
     whited_svd_object = SVD_wrapper(svd_type=self.svd_type,
                                     n_components=self.n_components,
                                     white_data=True)
     self.whiten_data = whited_svd_object.fit_transform(data).to_numpy()
     self.whiten_components = whited_svd_object.components.to_numpy()
     timer_print(pdw_start_time, prefix="FastICA, data whitening ready")
Example #6
 def perform_filtering(self):
     if self.minimal_number_of_genes > 0:
         pf_start_time = time.time()
         matrix_selection = self.matrix_data.sum(
             axis=0) >= self.minimal_number_of_genes
         self.matrix_data = self.matrix_data.loc[:, matrix_selection[
             matrix_selection].index]
         logging_print(
             "Minimal gene filtering: {} pathways remaining".format(
                 self.matrix_data.shape[1]))
         timer_print(pf_start_time,
                     prefix="Minimal gene in pathway filtering ready")
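
# A small illustrative sketch of the filter above: keep only the columns of a
# boolean gene-by-pathway matrix that contain at least a minimum number of
# genes (toy data, illustrative names).
import numpy as np
import pandas as pd

rng = np.random.default_rng(3)
matrix_data = pd.DataFrame(rng.random((50, 4)) < 0.1,
                           columns=["pw_a", "pw_b", "pw_c", "pw_d"])

minimal_number_of_genes = 3
keep = matrix_data.sum(axis=0) >= minimal_number_of_genes
matrix_data = matrix_data.loc[:, keep]
print(matrix_data.shape[1], "pathways remaining")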
 def perform_data_whitening(self, data):
     # Method to perform a manual data whitening step, which is the
     # first step in the FastICA analysis
     pdw_start_time = time.time()
     logging_print("FastICA, start data whitening")
     # Use the SVD wrapper to perform the whitening step based on a PCA.
     whited_svd_object = SVD_wrapper(svd_type=self.svd_type,
                                     n_components=self.n_components,
                                     white_data=True)
     # Save the results
     self.whiten_data = whited_svd_object.fit_transform(data).to_numpy()
     self.whiten_components = whited_svd_object.components.to_numpy()
     timer_print(pdw_start_time, prefix="FastICA, data whitening ready")
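
# SVD_wrapper is a project-specific class; a rough sketch of what a PCA-based
# whitening step produces (decorrelated, unit-variance projections plus the
# loadings needed to map back) might look like this with scikit-learn.
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(2)
data = rng.normal(size=(100, 20))          # samples x features, toy data

pca = PCA(n_components=10, whiten=True)
whiten_data = pca.fit_transform(data)      # whitened projections
whiten_components = pca.components_        # loadings used for the projection

# The whitened scores are (numerically close to) decorrelated with unit variance:
print(np.allclose(np.cov(whiten_data, rowvar=False), np.eye(10), atol=1e-6))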
Example #8
    def fit_auto_white(self, data):
        logging_print("Use fastICA with auto whiten")
        fastICA_object = FastICA(n_components=self.n_components,
                                 algorithm="parallel",
                                 fun='logcosh',
                                 max_iter=500,
                                 tol=1e-10)

        sources = fastICA_object.fit_transform(data)
        self.projected_data = sources
        self.components = fastICA_object.components_

        logging_print(
            stats_dict_to_string(
                {"Number of used iterations": fastICA_object.n_iter_}))
Example #9
 def write_base_loginfo(self):
     info = "## START GENE PATHWAY ANALYSIS (new p value method) ##\n" \
            "DATE:\t{date}\n" \
            "Component file:\t{component_file}\n" \
            "Matrix file:\t{matrix_file}\n" \
            "Background gene file:\t{background_gene_file}\n" \
            "Output dir:\t{output_dir}\n" \
            "Analysis type:\t{analysis_type}\n" \
            "Number of cores:\t{num_cores}\n" \
            "".format(date=datetime.now(),
                      component_file=self.components_data_path,
                      matrix_file=self.matrix_path,
                      background_gene_file=self.background_genes_path,
                      output_dir=self.output_dir,
                      analysis_type=self.analysis_type,
                      num_cores=self.n_cores)
     logging_print(info)
    def fit(self, data):
        # Method to fit the FastICA models

        # Check if the whitening step is already done
        if self.whiten_components is None:
            # Perform the whitening step
            self.perform_data_whitening(data)
        fit_start_time = time.time()

        # Create the FastICA object from sklearn without performing
        # the whiten step
        fastICA_object = FastICA(algorithm="parallel",
                                 whiten=False,
                                 fun='logcosh',
                                 max_iter=self.max_iter,
                                 tol=1e-10)
        # Fit the model
        fastICA_object.fit(self.whiten_data[:, :self.n_components])

        # Calculate the independent components and the sources and save
        # the results
        indep_comp = np.dot(fastICA_object.components_,
                            self.whiten_components[:self.n_components, :])
        indep_sources = np.dot(indep_comp, data.to_numpy().T).T

        components_index = pd.RangeIndex(start=1,
                                         stop=self.n_components + 1,
                                         name="IC")

        indep_sources_df = pd.DataFrame(
            indep_sources, index=data.index,
            columns=components_index).add_prefix("IC_")

        indep_comp_df = pd.DataFrame(
            indep_comp, index=components_index,
            columns=data.columns).T.add_prefix("IC_").T

        self.projected_data = indep_sources_df
        self.components = indep_comp_df

        logging_print(
            stats_dict_to_string(
                {"Number of used iterations": fastICA_object.n_iter_}))
        timer_print(fit_start_time,
                    prefix="FastICA component optimisation is ready")
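
# A hedged sketch of the back-projection done above, assuming the data were
# whitened with a PCA first: the unmixing matrix learned on the whitened
# scores is combined with the PCA loadings to express the independent
# components in the original feature (gene) space. Toy data, illustrative names.
import numpy as np
from sklearn.decomposition import PCA, FastICA

rng = np.random.default_rng(5)
data = rng.normal(size=(200, 30))              # samples x genes

pca = PCA(n_components=10, whiten=True)
whiten_data = pca.fit_transform(data)          # samples x 10
whiten_components = pca.components_            # 10 x genes

ica = FastICA(whiten=False, fun="logcosh", max_iter=500, tol=1e-10)
ica.fit(whiten_data)

indep_comp = ica.components_ @ whiten_components   # components x genes
indep_sources = data @ indep_comp.T                # samples x components
print(indep_comp.shape, indep_sources.shape)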
Example #11
 def write_base_loginfo(self):
     logging_print(
         stats_dict_to_string({
             "## START DECOMPOSITION ##": "",
             "DATE": datetime.now(),
             "Input file": self.input_file_path,
             "Output dir": self.output_dir,
             "Analysis type": self.analysis_type,
             "Over samples": self.over_samples,
             "Test run": self.test_run,
             "FastICA max iter": self.fastICA_max_iter,
             "Number of components": self.n_components,
             "Number of rows": self.n_rows,
             "Perform log2 transformation": self.perform_log2,
             "Perform centering and scaling":
             self.pre_processing_center_scale,
             "Force": self.force
         }))
Example #12
    def create_permutations_matrixes(self):
        perm_start_time = time.time()
        permutation_path = "{}_permutation_matrix.pkl".format(
            self.components_data_path)

        if os.path.isfile(permutation_path) and self.force == False:
            self.components_data_permutated = pd.read_pickle(permutation_path)
            logging_print("Permutation dataframe is loaded from '{}', "
                          "shape: {}".format(
                              permutation_path,
                              self.components_data_permutated.shape))
        else:
            self.components_data_permutated = self.components_data.set_index(
                np.random.permutation(self.components_data.index))
            self.components_data_permutated.to_pickle(permutation_path)
            logging_print("Permutation table is created, shape: {}".format(
                self.components_data_permutated.shape))
        timer_print(perm_start_time, prefix="permutation is ready")
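
# A tiny illustration of the permutation above: shuffle the row index of a
# DataFrame so every row keeps its values but gets a random gene label
# (toy data; the fixed seed only keeps the example reproducible).
import numpy as np
import pandas as pd

np.random.seed(0)
components_data = pd.DataFrame(np.arange(6).reshape(3, 2),
                               index=["gene_a", "gene_b", "gene_c"],
                               columns=["IC_1", "IC_2"])
components_data_permutated = components_data.set_index(
    np.random.permutation(components_data.index))
print(components_data_permutated)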
    def calculate_centriods(self):
        # Method to calculate the centroids, i.e. the average components over
        # the components extracted from the different individual runs
        calc_cent_start_time = time.time()
        logging_print("Start calculation of the centroids")
        # start position, extracted from the first run
        initial_df = self.overall_components[0]
        gene_names = initial_df.columns
        n_components = initial_df.shape[0]
        initial_np = initial_df.to_numpy()
        signs = np.sign(initial_np)
        initial_np = np.abs(initial_np)
        processed_dfs = [initial_np * signs]
        # loop through all the component runs
        for ica_df in self.overall_components[1:]:
            # Sort the components based on their highest correlation with the
            # components from the first run. This puts the FastICA components
            # of every run in the same order; by default FastICA returns its
            # components in a random order
            ica_df = ica_df.loc[:, gene_names]
            ica_df = ica_df.iloc[:n_components, :]
            # calculate the correlation
            corr_table = np.abs(
                np.corrcoef(initial_np, np.abs(ica_df.to_numpy())))
            corr_table = corr_table[n_components:, :n_components]

            # find the order based on the maximal correlation
            max_indexes_columns = np.argmax(corr_table, axis=0)
            # Order the components
            ica_df = ica_df.iloc[max_indexes_columns, :]
            # Save the components in the right order
            processed_dfs.append(ica_df.to_numpy())

        # Calculate the centroids, i.e. the average components
        centriods_array = np.c_[np.abs(processed_dfs) * signs]
        centriods = centriods_array.mean(axis=0)
        centriod_df = pd.DataFrame(centriods,
                                   index=initial_df.index,
                                   columns=initial_df.columns)

        timer_print(calc_cent_start_time,
                    prefix="Centroid calculation is ready")
        # return the average components
        return centriod_df
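
# A standalone sketch of the alignment idea above, assuming every run yields
# the same number of components over the same genes: the components of a
# second run are reordered so that each one matches the most strongly
# correlated component of the first run (random toy data, illustrative only).
import numpy as np

rng = np.random.default_rng(1)
run_1 = rng.normal(size=(5, 50))           # 5 components x 50 genes
run_2 = rng.normal(size=(5, 50))           # a second run, components in another order

corr = np.abs(np.corrcoef(np.abs(run_1), np.abs(run_2)))
corr = corr[5:, :5]                        # rows: run_2 components, cols: run_1 components
order = np.argmax(corr, axis=0)            # best run_2 match for every run_1 component
run_2_aligned = run_2[order, :]

# Average the absolute values and re-apply the signs of the first run,
# mirroring the centroid step above.
centroid = np.mean([np.abs(run_1), np.abs(run_2_aligned)], axis=0) * np.sign(run_1)
print(centroid.shape)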
Example #14
    def perform_multinode_processing(self):
        # handle the new start and end of the matrix split
        if self.split_end is not None:
            self.matrix_data = self.matrix_data.iloc[:, :self.split_end]

        if self.split_start is not None:
            self.matrix_data = self.matrix_data.iloc[:, self.split_start:]

        if self.split_start is not None or self.split_end is not None:
            logging_print("Trim matrix, new start: {start}, "
                          "new end: {end}, dataframe size: {df_size}".format(
                              start=self.split_start,
                              end=self.split_end,
                              df_size=self.matrix_data.shape))

        # handle multi node processing on a cluster
        if self.multi_node_num_nodes is not None and \
                self.multi_node_node_id is not None:
            num_pathways = self.matrix_data.shape[1]
            pathways_per_node = num_pathways // self.multi_node_num_nodes
            if pathways_per_node * self.multi_node_num_nodes < num_pathways:
                pathways_per_node += 1

            start_id = self.multi_node_node_id * pathways_per_node
            end_id = (self.multi_node_node_id + 1) * pathways_per_node
            if end_id > num_pathways:
                end_id = num_pathways

            self.matrix_data = self.matrix_data.iloc[:, start_id:end_id]

            logging_print(
                "## Process on multiple nodes ##\n"
                "Node: {node_id} of {node_num}\n"
                "Number pathways per node: {num_pathways_per_node} of {num_pathways}\n"
                "Start id: {start_id}\n"
                "End id: {end_id}\n"
                "Dataframe size: {df_size}".format(
                    node_id=self.multi_node_node_id + 1,
                    node_num=self.multi_node_num_nodes,
                    num_pathways_per_node=pathways_per_node,
                    num_pathways=num_pathways,
                    start_id=start_id,
                    end_id=end_id,
                    df_size=self.matrix_data.shape))
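
# A standalone sketch of the node-splitting arithmetic above: divide the
# pathway columns as evenly as possible over the nodes and take the slice that
# belongs to one node (all values here are illustrative).
num_pathways = 10
multi_node_num_nodes = 3
multi_node_node_id = 1                     # zero-based id of the current node

pathways_per_node = num_pathways // multi_node_num_nodes
if pathways_per_node * multi_node_num_nodes < num_pathways:
    pathways_per_node += 1                 # round up so every pathway is covered

start_id = multi_node_node_id * pathways_per_node
end_id = min((multi_node_node_id + 1) * pathways_per_node, num_pathways)
print(start_id, end_id)                    # -> 4 8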
Example #15
    def read_file(self):
        rf_start_time = time.time()

        if os.path.isfile(self.input_file_path):
            if self.test_run:
                self.input_data = pd.read_csv(self.input_file_path,
                                              sep="\t",
                                              index_col=0,
                                              nrows=150)
                self.input_data = self.input_data.iloc[:150, :100]
            else:
                n_rows = None
                if self.n_rows is not None and self.n_rows != '':
                    n_rows = int(self.n_rows)
                self.input_data = read_pd_df(self.input_file_path, {
                    "sep": "\t",
                    "index_col": 0,
                    "nrows": n_rows
                },
                                             force=self.force)
        else:
            raise FileNotFoundError("Cannot find input file: {}".format(
                self.input_file_path))

        if self.n_components is None:
            self.n_components = np.min(self.input_data.shape)

        logging_print(
            stats_dict_to_string({
                "Input dataframe n_row":
                self.input_data.shape[0],
                "Input dataframe n_col":
                self.input_data.shape[1],
                "first column headers":
                self.input_data.columns.values[:5],
                "first row index":
                self.input_data.index.values[:5]
            }))
        timer_print(rf_start_time,
                    prefix="Reading input file ready",
                    time_overview_log=self.process_time_overview)
    def fit_auto_white(self, data):
        # Method to fit the FastICA model without the manual whitening step
        logging_print("Use fastICA with auto whiten")

        # Use the sklearn implementation of FastICA, including the
        # built-in whitening step
        fastICA_object = FastICA(n_components=self.n_components,
                                 algorithm="parallel",
                                 fun='logcosh',
                                 max_iter=500,
                                 tol=1e-10)
        # Fit the model
        sources = fastICA_object.fit_transform(data)

        # Save the data
        self.projected_data = sources
        self.components = fastICA_object.components_

        logging_print(
            stats_dict_to_string(
                {"Number of used iterations": fastICA_object.n_iter_}))
Example #17
    def calculate_auc_values(self):
        cav_st_stime = time.time()
        auc_values = {}
        for pathway_id in self.pathway_gene_scores.columns:
            if pathway_id in list(self.matrix_data.columns):
                pathway_gene_set = self.matrix_data.loc[:, pathway_id]
                pathway_gene_scores = self.pathway_gene_scores.loc[
                    self.matrix_data.index, pathway_id]
                try:
                    log_reg_fpr, log_reg_tpr, _ = roc_curve(
                        pathway_gene_set * 1, pathway_gene_scores)
                    auc_values[pathway_id] = auc(log_reg_fpr, log_reg_tpr)
                except ValueError:
                    logging_print(
                        "Value error in auc calculation of pathway: {}".format(
                            pathway_id))
            else:
                logging_print("HPO term: {} not in matrix".format(pathway_id))

        auc_calc_values = pd.Series(auc_values)
        auc_calc_values[auc_calc_values > 1.0] = 1.0
        self.auc_values = auc_calc_values
        timer_print(cav_st_stime, prefix="AUC value calculation ready")
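
# A hedged, self-contained sketch of the AUC computation above: how well the
# per-gene pathway scores separate genes annotated to a pathway from the rest
# (random toy labels and scores, illustrative only).
import numpy as np
from sklearn.metrics import roc_curve, auc

rng = np.random.default_rng(6)
in_pathway = rng.random(200) < 0.15          # boolean pathway membership
scores = rng.normal(size=200) + in_pathway   # scores slightly higher for members

fpr, tpr, _ = roc_curve(in_pathway * 1, scores)
print(auc(fpr, tpr))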
Example #18
    def fit(self, data):
        if self.whiten_components is None:
            self.perform_data_whitening(data)
        fit_start_time = time.time()

        fastICA_object = FastICA(algorithm="parallel",
                                 whiten=False,
                                 fun='logcosh',
                                 max_iter=self.max_iter,
                                 tol=1e-10)

        fastICA_object.fit(self.whiten_data[:, :self.n_components])

        indep_comp = np.dot(fastICA_object.components_,
                            self.whiten_components[:self.n_components, :])
        indep_sources = np.dot(indep_comp, data.to_numpy().T).T

        components_index = pd.RangeIndex(start=1,
                                         stop=self.n_components + 1,
                                         name="IC")

        indep_sources_df = pd.DataFrame(
            indep_sources, index=data.index,
            columns=components_index).add_prefix("IC_")

        indep_comp_df = pd.DataFrame(
            indep_comp, index=components_index,
            columns=data.columns).T.add_prefix("IC_").T

        self.projected_data = indep_sources_df
        self.components = indep_comp_df

        logging_print(
            stats_dict_to_string(
                {"Number of used iterations": fastICA_object.n_iter_}))
        timer_print(fit_start_time,
                    prefix="FastICA component optimisation is ready")
Example #19
    def perform_data_preprocessing(self):
        pp_start_time = time.time()
        if self.perform_log2:
            # Give np.log2 an explicit output buffer so the entries it skips
            # (the zeros) stay 0.0 instead of being left uninitialised.
            input_values = self.input_data.to_numpy(dtype=float)
            self.input_data = pd.DataFrame(
                np.log2(input_values,
                        out=np.zeros_like(input_values),
                        where=(input_values != 0.0)),
                index=self.input_data.index,
                columns=self.input_data.columns)
            logging_print(
                "Log 2 transformation is performed on the input data")

        if self.pre_processing_center_scale:
            scaler = StandardScaler()
            if self.over_samples:
                scaled_np_data = scaler.fit_transform(self.input_data)
            else:
                scaled_np_data = scaler.fit_transform(self.input_data.T).T

            self.input_data = pd.DataFrame(scaled_np_data,
                                           index=self.input_data.index,
                                           columns=self.input_data.columns)
            logging_print(
                "Centering and scaling is performed on the input data")

        timer_print(pp_start_time,
                    prefix="Data pre-processing ready",
                    time_overview_log=self.process_time_overview)
Example #20
    def read_files(self):
        rf_start_time = time.time()

        # read input component file
        if os.path.isfile(self.components_data_path):
            self.components_data = read_pd_df(self.components_data_path, {
                "sep": "\t",
                "index_col": 0
            },
                                              force=self.force)
            components_data_info = "Components dataframe n_row: {}, " \
                             "n_col: {}\n" \
                             "first column headers: {}\n" \
                             "first row index: {}".format(
                *self.components_data.shape,
                ", ".join(
                    self.components_data.columns.values[
                    :5]),
                ", ".join(
                    self.components_data.index.values[
                    :5]),
                )
            logging_print(components_data_info)
        else:
            raise FileNotFoundError("Cannot find input file: {}".format(
                self.components_data_path))

        # read matrix file
        self.matrix_data = read_pd_df(self.matrix_path, {
            "sep": "\t",
            "index_col": 0
        },
                                      force=self.force,
                                      proc_df_before_save=lambda df: df == 1.0)

        matrix_data_info = "Matrix dataframe n_row: {}, " \
                         "n_col: {}\n" \
                         "first column headers: {}\n" \
                         "first row index: {}".format(
            *self.matrix_data.shape,
            ", ".join(
                self.matrix_data.columns.values[
                :5]),
            ", ".join(
                self.matrix_data.index.values[
                :5]),
            )
        print(matrix_data_info)
        logging.info(matrix_data_info)

        # reading background gene path
        if self.background_genes_path is not None and self.background_genes_path != '':
            self.background_genes_data = read_pd_df(self.background_genes_path,
                                                    {
                                                        "sep": "\t",
                                                        "header": None
                                                    },
                                                    force=self.force)

            # convert to a Series instead of a DataFrame
            self.background_genes_data = self.background_genes_data.iloc[:, 0]

            background_genes_data_info = "Background genes file loaded:\n" \
                               "number of genes: {}, " \
                               "first genes: {}\n".format(
                self.background_genes_data.shape[0],
                ", ".join(
                    self.background_genes_data.values[
                    :5])
            )
            logging_print(background_genes_data_info)

        timer_print(rf_start_time, prefix="Reading input file ready")
Example #21
    def perform_analysis(self):
        pa_start_time = time.time()
        total_z_score_results = []

        # check if temp files from an earlier run are present
        temp_z_score_paths = glob.glob(
            os.path.join(self.output_dir,
                         "temp_results_analysis_z_scores_*.pkl"))
        temp_z_scores = None
        temp_already_processed_pathways = None
        if len(temp_z_score_paths) > 0:
            temp_z_scores = pd.read_pickle(temp_z_score_paths[0])
            logging_print(
                "Temp file '{}' with already processed pathways loaded. size df: {}"
                .format(temp_z_score_paths[0], temp_z_scores.shape))
            temp_already_processed_pathways = list(
                temp_z_scores.columns.to_numpy())

        pathway_manager = mp.Manager()
        pathway_queue = pathway_manager.Queue()

        retults_manager = mp.Manager()
        results_queue = retults_manager.Queue()

        for index, row in self.matrix_data.iteritems():
            if temp_z_scores is not None:
                if index in temp_already_processed_pathways:
                    total_z_score_results.append(temp_z_scores.loc[:, index])
                else:
                    pathway_queue.put(index)
            else:
                pathway_queue.put(index)

        logging_print("total pathways already done: {}".format(
            len(total_z_score_results)))

        n_workers = self.n_cores - 1
        processes = []
        for _ in range(n_workers):
            processes.append(
                mp.Process(target=single_pathway_worker,
                           args=(self.components_data, self.matrix_data,
                                 pathway_queue, results_queue,
                                 self.background_genes_data,
                                 self.analysis_type, -1,
                                 self.components_data_permutated)))

        for process in processes:
            process.start()

        total_done = 0
        last_save_time = time.time()
        save_time_in_minutes = 2
        z_score_file_path = None
        while True:
            try:
                if total_done >= n_workers:
                    # All workers are ready
                    break
                if results_queue.empty():
                    # Wait for the next results
                    time.sleep(5)
                else:
                    # Process the results
                    results = results_queue.get(True, timeout=1)
                    if isinstance(results, str) and results == "DONE":
                        # One worker is done
                        total_done += 1
                        logging_print("total workers done {} of {}".format(
                            total_done, n_workers))
                    else:
                        # Save the results
                        total_z_score_results.append(results)

                    # save temp results
                    if last_save_time + 60 * save_time_in_minutes < time.time(
                    ):
                        logging_print("save temp results: {}".format(
                            datetime.now()))
                        last_save_time = time.time()
                        temp_dataframe_z_scores = pd.DataFrame(
                            total_z_score_results).T
                        new_z_score_file_path = os.path.join(
                            self.output_dir,
                            "temp_results_analysis_z_scores_{}.pkl".format(
                                datetime.now()))
                        temp_dataframe_z_scores.to_pickle(
                            new_z_score_file_path)

                        if z_score_file_path is not None and os.path.isfile(
                                z_score_file_path):
                            os.remove(z_score_file_path)
                        z_score_file_path = new_z_score_file_path

            except queue.Empty:
                time.sleep(1)
                continue

        for process in processes:
            process.join()

        print("all processes ready")
        self.pathway_gene_scores = pd.DataFrame(total_z_score_results).T

        pathway_gene_scores_temp_file_path = os.path.join(
            self.output_dir,
            "temp_pathway_gene_scores_temp_{}.pkl".format(datetime.now()))
        self.pathway_gene_scores.to_pickle(pathway_gene_scores_temp_file_path)
        timer_print(pa_start_time, prefix="Performing analysis ready")
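
# A simplified, self-contained sketch of the worker pattern used above: tasks
# go into one queue, every worker pushes results plus a final "DONE" sentinel
# into a second queue, and the main process drains results until all workers
# have reported in. The square_worker function is purely illustrative.
import multiprocessing as mp
import queue

def square_worker(task_queue, result_queue):
    while True:
        try:
            value = task_queue.get(timeout=1)
        except queue.Empty:
            break
        result_queue.put(value * value)
    result_queue.put("DONE")

if __name__ == "__main__":
    manager = mp.Manager()
    task_queue = manager.Queue()
    result_queue = manager.Queue()
    for i in range(10):
        task_queue.put(i)

    n_workers = 2
    processes = [mp.Process(target=square_worker,
                            args=(task_queue, result_queue))
                 for _ in range(n_workers)]
    for process in processes:
        process.start()

    results, total_done = [], 0
    while total_done < n_workers:
        item = result_queue.get()
        if isinstance(item, str) and item == "DONE":
            total_done += 1
        else:
            results.append(item)

    for process in processes:
        process.join()
    print(sorted(results))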
Example #22
    def __init__(self,
                 components_data_path,
                 matrix_path,
                 output_dir,
                 analysis_type,
                 minimal_number_of_genes=None,
                 background_genes_path=None,
                 n_cores=None,
                 split_start=None,
                 split_end=None,
                 multi_node_node_id=None,
                 multi_node_num_nodes=None,
                 multi_node_output_dir=None,
                 force=False,
                 output_disable_txt=False,
                 output_disable_gzip=False,
                 output_disable_pickle=False):
        self.start_time = time.time()
        self.components_data_path = components_data_path
        self.output_dir = output_dir
        self.analysis_type = analysis_type
        self.matrix_path = matrix_path
        self.background_genes_path = background_genes_path

        self.minimal_number_of_genes = -1
        if minimal_number_of_genes is not None and minimal_number_of_genes != "":
            self.minimal_number_of_genes = int(minimal_number_of_genes)
        if self.minimal_number_of_genes < 3:
            self.minimal_number_of_genes = 3
            logging_print(
                "Minimal number of genes must be 3 or higher, so the value is set to 3"
            )

        # for selecting and multiprocessing
        self.split_start = split_start
        self.split_end = split_end
        self.multi_node_node_id = multi_node_node_id
        self.multi_node_num_nodes = multi_node_num_nodes
        self.multi_node_output_dir = multi_node_output_dir

        self.n_cores = mp.cpu_count()
        if n_cores is not None and n_cores != "":
            self.n_cores = int(n_cores)

        self.components_data = None
        self.background_genes_data = None
        self.matrix_data = None
        self.components_data_permutated = None

        # results
        self.gene_pathway_count = None
        self.pathway_gene_scores = None
        self.auc_values = None
        self.p_values = None
        self.bonf_p_values = None
        self.reject = None
        self.force = force
        self.method_stats = {}

        # output
        self.output_disable_txt = output_disable_txt
        self.output_disable_gzip = output_disable_gzip
        self.output_disable_pickle = output_disable_pickle
Example #23
    def merge_output_files(self):
        mof_start_time = time.time()
        if self.multi_node_output_dir:
            node_output_auc_pred_files = glob.glob(
                os.path.join(self.multi_node_output_dir, "*",
                             "predictions_auc.pkl"))
            if len(node_output_auc_pred_files) == self.multi_node_num_nodes:
                logging_print("Merge output files")

                # merge AUC prediction files
                comp_df_auc_list = []
                for node_output_auc_file_path in node_output_auc_pred_files:
                    file_auc_df = pd.read_pickle(node_output_auc_file_path)
                    comp_df_auc_list.append(file_auc_df)

                overall_auc_df = pd.concat(comp_df_auc_list)

                bonf_values = overall_auc_df["pValue"] * overall_auc_df.shape[0]
                bonf_values[bonf_values > 1] = 1
                bonf_values[bonf_values < 0] = 0
                overall_auc_df["bonferroni"] = bonf_values

                if self.output_disable_txt == False:
                    output_file_path_csv = os.path.join(
                        self.multi_node_output_dir,
                        "predictions_auc_bonf.txt.gz")
                    if self.output_disable_gzip:
                        output_file_path_csv = os.path.join(
                            self.multi_node_output_dir,
                            "predictions_auc_bonf.txt")
                    overall_auc_df.to_csv(
                        output_file_path_csv,
                        columns=["geneCount", "pValue", "auc", "bonferroni"],
                        sep='\t')

                if self.output_disable_pickle == False:
                    overall_auc_df.to_pickle(
                        os.path.join(self.multi_node_output_dir,
                                     "predictions_auc_bonf.pkl"))

                # merge gene pathway files
                node_output_gene_pathway_pred_files = glob.glob(
                    os.path.join(self.multi_node_output_dir, "*",
                                 "gene_pathway_scores.pkl"))
                comp_df_gene_pathway_list = []
                for node_output_auc_file_path in node_output_gene_pathway_pred_files:
                    file_gene_pathway_df = pd.read_pickle(
                        node_output_auc_file_path)
                    comp_df_gene_pathway_list.append(file_gene_pathway_df)

                overall_gene_pathway_df = pd.concat(comp_df_gene_pathway_list,
                                                    axis=1)

                if self.output_disable_txt == False:
                    output_gene_pathway_pred_path = os.path.join(
                        self.multi_node_output_dir,
                        "gene_pathway_scores.txt.gz")

                    if self.output_disable_gzip:
                        output_gene_pathway_pred_path = os.path.join(
                            self.multi_node_output_dir,
                            "gene_pathway_scores.txt")
                    overall_gene_pathway_df.to_csv(
                        output_gene_pathway_pred_path, sep='\t')

                if self.output_disable_pickle == False:
                    overall_gene_pathway_df.to_pickle(
                        os.path.join(self.multi_node_output_dir,
                                     "gene_pathway_scores.pkl"))
        timer_print(mof_start_time, prefix="Writing merged output file ready")
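
# A small illustration of the Bonferroni step above: multiply every p-value by
# the number of tests and clip the result to the [0, 1] interval (toy values,
# illustrative pathway names).
import pandas as pd

p_values = pd.Series([0.001, 0.02, 0.4], index=["pw_a", "pw_b", "pw_c"])
bonferroni = p_values * p_values.shape[0]
bonferroni[bonferroni > 1] = 1
bonferroni[bonferroni < 0] = 0
print(bonferroni)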