コード例 #1
0
    def run_samples_clustering_pipeline(self):
        """
        Runs data cleaning for samples_clustering_pipeline.

        Args:
            NA

        Returns:
            validation_flag: Boolean type value indicating if input data is valid or not.
            message: A message indicates the status of current check.
        """
        if self.user_spreadsheet_df is None:
            return False, logger.logging

        logger.logging.append('INFO: Start to process user spreadsheet data.')
        # Checks if only non-negative real number appears in user spreadsheet and drop na column wise
        user_spreadsheet_val_chked = SpreadSheet.check_user_spreadsheet_data(self.user_spreadsheet_df,
                                                                             dropna_colwise=True,
                                                                             check_real_number=True,
                                                                             check_positive_number=True)
        if user_spreadsheet_val_chked is None:
            return False, logger.logging

        # Removes NA value and duplication on column and row name
        user_spreadsheet_df_checked = SpreadSheet.remove_dataframe_indexer_duplication(user_spreadsheet_val_chked)

        # Checks the validity of gene name to see if it can be ensemble or not
        user_spreadsheet_df_cleaned, map_filtered_dedup, mapping = SpreadSheet.map_ensemble_gene_name(
            user_spreadsheet_df_checked,
            self.run_parameters)

        if 'gg_network_name_full_path' in self.run_parameters.keys() and \
                not CommonUtil.check_network_data_intersection(user_spreadsheet_df_cleaned.index,
                                                               self.run_parameters):
            return False, logger.logging

        # The logic here ensures that even if phenotype data doesn't fits requirement, the rest pipelines can still run.
        if user_spreadsheet_df_cleaned is None:
            return False, logger.logging
        else:
            IOUtil.write_to_file(user_spreadsheet_df_cleaned, self.run_parameters['spreadsheet_name_full_path'],
                                 self.run_parameters['results_directory'], '_ETL.tsv')

            # writes dedupped mapping between user_supplied_gene_name and ensemble name to a file
            IOUtil.write_to_file(map_filtered_dedup, self.run_parameters['spreadsheet_name_full_path'],
                                 self.run_parameters['results_directory'], '_MAP.tsv', use_index=True, use_header=False)

            # writes user supplied gene name along with its mapping status to a file
            IOUtil.write_to_file(mapping, self.run_parameters['spreadsheet_name_full_path'],
                                 self.run_parameters['results_directory'],
                                 '_User_To_Ensembl.tsv', use_index=False, use_header=True)
            logger.logging.append(
                'INFO: Cleaned user spreadsheet has {} row(s), {} column(s).'.format(
                    user_spreadsheet_df_cleaned.shape[0],
                    user_spreadsheet_df_cleaned.shape[1]))

        if self.phenotype_df is not None:
            logger.logging.append('INFO: Start to process phenotype data.')
            phenotype_df_cleaned = CommonUtil.check_phenotype_intersection(self.phenotype_df,
                                                                           self.user_spreadsheet_df.columns.values)
            if phenotype_df_cleaned is None:
                logger.logging.append('ERROR: Phenotype is emtpy. Please provide a valid phenotype data.')
                return False, logger.logging
            else:
                IOUtil.write_to_file(phenotype_df_cleaned, self.run_parameters['phenotype_name_full_path'],
                                     self.run_parameters['results_directory'], '_ETL.tsv')
                logger.logging.append('INFO: Cleaned phenotype data has {} row(s), {} '
                                      'column(s).'.format(phenotype_df_cleaned.shape[0], phenotype_df_cleaned.shape[1]))
        return True, logger.logging
コード例 #2
0
    def run_signature_analysis_pipeline(self):
        """
        Runs data cleaning for signature_analysis_pipeline.

        Args:
             NA.

        Returns:
            validation_flag: Boolean type value indicating if input data is valid or not.
            message: A message indicates the status of current check.
        """
        if self.signature_df is None or self.user_spreadsheet_df is None:
            return False, logger.logging

        # Removes NA index for both signature data and user spreadsheet data
        signature_df = SpreadSheet.remove_na_index(self.signature_df)
        user_spreadsheet_df = SpreadSheet.remove_na_index(self.user_spreadsheet_df)

        # Checks if only real number and non-NA value appear in user spreadsheet
        if SpreadSheet.check_user_spreadsheet_data(user_spreadsheet_df, check_na=True,
                                                   check_real_number=True,
                                                   check_positive_number=False) is None:
            return False, logger.logging

        # Checks duplicate columns and rows in user spreadsheet data
        if CheckUtil.check_duplicates(user_spreadsheet_df, check_column=True, check_row=True):
            logger.logging.append('ERROR: Found duplicates on user spreadsheet data. Rejecting...')
            return False, logger.logging

        # Checks intersection of genes between signature data and user spreadsheet data
        intersection = CheckUtil.find_intersection(signature_df.index, user_spreadsheet_df.index)
        if intersection is None:
            logger.logging.append('ERROR: Cannot find intersection between spreadsheet genes and signature genes.')
            return False, logger.logging
        logger.logging.append(
            'INFO: Found {} intersected gene(s) between phenotype and spreadsheet data.'.format(len(intersection)))

        # Checks number of unique value in userspread sheet equals to 2
        if not SpreadSheet.check_unique_values(user_spreadsheet_df, cnt=2):
            logger.logging.append(
                'ERROR: user spreadsheet data does not meet the requirment of having at least two unique values.')
            return False, logger.logging

        # Checks intersection among network data, signature data and user spreadsheet data
        if 'gg_network_name_full_path' in self.run_parameters.keys() and \
                not CommonUtil.check_network_data_intersection(intersection,
                                                               self.run_parameters):
            return False, logger.logging

        # The logic here ensures that even if phenotype data doesn't fits requirement, the rest pipelines can still run.
        if user_spreadsheet_df is None:
            return False, logger.logging
        else:
            IOUtil.write_to_file(user_spreadsheet_df,
                                 self.run_parameters['spreadsheet_name_full_path'],
                                 self.run_parameters['results_directory'], '_ETL.tsv')
            logger.logging.append(
                'INFO: Cleaned user spreadsheet has {} row(s), {} column(s).'.format(
                    user_spreadsheet_df.shape[0],
                    user_spreadsheet_df.shape[1]))

        if signature_df is not None:
            IOUtil.write_to_file(signature_df, self.run_parameters['signature_name_full_path'],
                                 self.run_parameters['results_directory'], '_ETL.tsv')
            logger.logging.append(
                'INFO: Cleaned phenotype data has {} row(s), {} column(s).'.format(signature_df.shape[0],
                                                                                   signature_df.shape[1]))
        return True, logger.logging