def run_general_clustering_pipeline(self):
        """
        Runs data cleaning for general_clustering_pipeline.

        Args:
            NA.

        Returns:
            validation_flag: Boolean type value indicating if input data is valid or not.
            message: A message indicates the status of current check.
        """
        if self.user_spreadsheet_df is None:
            return False, logger.logging

        # Checks intersection between user spreadsheet data and phenotype data
        phenotype_df_cleaned = None
        if self.phenotype_df is not None:
            phenotype_df_cleaned = CommonUtil.check_phenotype_intersection(self.phenotype_df,
                                                                           self.user_spreadsheet_df.columns.values)
            if phenotype_df_cleaned is None:
                logger.logging.append('ERROR: Phenotype is emtpy. Please provide a valid phenotype data.')
                return False, logger.logging
        logger.logging.append('INFO: Start to process user spreadsheet data.')

        # Checks if user spreadsheet contains na value and only real number
        user_spreadsheet_df_val_check = SpreadSheet.check_user_spreadsheet_data(self.user_spreadsheet_df,
                                                                                dropna_colwise=True,
                                                                                check_real_number=True,
                                                                                check_positive_number=True)
        if user_spreadsheet_df_val_check is None:
            return False, logger.logging

        user_spreadsheet_df_rm_na_header = SpreadSheet.remove_na_header(user_spreadsheet_df_val_check)
        if user_spreadsheet_df_rm_na_header is None:
            return False, logger.logging

        # Removes NA value and duplication on column and row name
        user_spreadsheet_df_cleaned = SpreadSheet.remove_dataframe_indexer_duplication(user_spreadsheet_df_rm_na_header)
        if user_spreadsheet_df_cleaned is None:
            return False, logger.logging

        IOUtil.write_to_file(user_spreadsheet_df_cleaned, self.run_parameters['spreadsheet_name_full_path'],
                             self.run_parameters['results_directory'], '_ETL.tsv')
        logger.logging.append(
            'INFO: Cleaned user spreadsheet has {} row(s), {} column(s).'.format(
                user_spreadsheet_df_cleaned.shape[0],
                user_spreadsheet_df_cleaned.shape[1]))

        if phenotype_df_cleaned is not None:
            IOUtil.write_to_file(phenotype_df_cleaned, self.run_parameters['phenotype_name_full_path'],
                                 self.run_parameters['results_directory'], '_ETL.tsv')
            logger.logging.append(
                'INFO: Cleaned phenotype data has {} row(s), {} column(s).'.format(phenotype_df_cleaned.shape[0],
                                                                                   phenotype_df_cleaned.shape[1]))
        return True, logger.logging
    def run_gene_prioritization_pipeline(self):
        """
        Runs data cleaning for gene_prioritization_pipeline.

        Args:
            NA.

        Returns:
            validation_flag: Boolean type value indicating if input data is valid or not.
            message: A message indicates the status of current check.
        """
        # Checks user spreadsheet data and phenotype data
        if self.user_spreadsheet_df is None or self.phenotype_df is None:
            return False, logger.logging

        # Imputes na value on user spreadsheet data
        user_spreadsheet_df_imputed = SpreadSheet.impute_na(self.user_spreadsheet_df,
                                                            option=self.run_parameters['impute'])
        if user_spreadsheet_df_imputed is None:
            return False, logger.logging

        # Checks if value of inputs satisfy certain criteria: see details in function validate_inputs_for_gp_fp
        user_spreadsheet_val_chked, phenotype_val_checked = CommonUtil.validate_inputs_for_gp_fp(
            user_spreadsheet_df_imputed, self.phenotype_df, self.run_parameters['correlation_measure'])
        if user_spreadsheet_val_chked is None or phenotype_val_checked is None:
            return False, logger.logging
        # Removes NA value and duplication on column and row name
        user_spreadsheet_df_checked = SpreadSheet.remove_dataframe_indexer_duplication(user_spreadsheet_val_chked)
        # Checks the validity of gene name to see if it can be ensemble or not
        user_spreadsheet_df_cleaned, map_filtered_dedup, mapping = SpreadSheet.map_ensemble_gene_name(
            user_spreadsheet_df_checked,
            self.run_parameters)
        if user_spreadsheet_df_cleaned is None or phenotype_val_checked is None:
            return False, logger.logging
        # Stores cleaned phenotype data (transposed) to a file, dimension: phenotype x sample
        IOUtil.write_to_file(phenotype_val_checked, self.run_parameters['phenotype_name_full_path'],
                             self.run_parameters['results_directory'], '_ETL.tsv')
        IOUtil.write_to_file(user_spreadsheet_df_cleaned, self.run_parameters['spreadsheet_name_full_path'],
                             self.run_parameters['results_directory'], '_ETL.tsv')
        # writes dedupped mapping between user_supplied_gene_name and ensemble name to a file
        IOUtil.write_to_file(map_filtered_dedup, self.run_parameters['spreadsheet_name_full_path'],
                             self.run_parameters['results_directory'], '_MAP.tsv', use_index=True, use_header=False)

        # writes user supplied gene name along with its mapping status to a file
        IOUtil.write_to_file(mapping, self.run_parameters['spreadsheet_name_full_path'],
                             self.run_parameters['results_directory'],
                             '_User_To_Ensembl.tsv', use_index=False, use_header=True)
        logger.logging.append(
            'INFO: Cleaned user spreadsheet has {} row(s), {} column(s).'.format(
                user_spreadsheet_df_cleaned.shape[0],
                user_spreadsheet_df_cleaned.shape[1]))
        logger.logging.append(
            'INFO: Cleaned phenotype data has {} row(s), {} column(s).'.format(phenotype_val_checked.shape[0],
                                                                               phenotype_val_checked.shape[1]))
        return True, logger.logging
    def run_geneset_characterization_pipeline(self):
        """
        Runs data cleaning for geneset_characterization_pipeline.

        Args:
            NA

        Returns:
            validation_flag: Boolean type value indicating if input data is valid or not.
            message: A message indicates the status of current check.
        """
        if self.user_spreadsheet_df is None:
            return False, logger.logging

        # Checks only non-negative real number appears in user spreadsheet, drop na column wise
        user_spreadsheet_val_chked = SpreadSheet.check_user_spreadsheet_data(self.user_spreadsheet_df, check_na=True,
                                                                             check_real_number=True,
                                                                             check_positive_number=True)
        if user_spreadsheet_val_chked is None:
            return False, logger.logging

        # Removes NA value and duplication on column and row name
        user_spreadsheet_df_checked = SpreadSheet.remove_dataframe_indexer_duplication(user_spreadsheet_val_chked)

        # Checks the validity of gene name to see if it can be ensemble or not
        user_spreadsheet_df_cleaned, map_filtered_dedup, mapping = SpreadSheet.map_ensemble_gene_name(
            user_spreadsheet_df_checked,
            self.run_parameters)
        if user_spreadsheet_df_cleaned is None:
            return False, logger.logging

        IOUtil.write_to_file(user_spreadsheet_df_cleaned, self.run_parameters['spreadsheet_name_full_path'],
                             self.run_parameters['results_directory'], '_ETL.tsv')

        # writes dedupped mapping between user_supplied_gene_name and ensemble name to a file
        IOUtil.write_to_file(map_filtered_dedup, self.run_parameters['spreadsheet_name_full_path'],
                             self.run_parameters['results_directory'], '_MAP.tsv', use_index=True, use_header=False)

        # writes user supplied gene name along with its mapping status to a file
        IOUtil.write_to_file(mapping, self.run_parameters['spreadsheet_name_full_path'],
                             self.run_parameters['results_directory'],
                             '_User_To_Ensembl.tsv', use_index=False, use_header=True)

        logger.logging.append(
            'INFO: Cleaned user spreadsheet has {} row(s), {} column(s).'.format(
                user_spreadsheet_df_cleaned.shape[0],
                user_spreadsheet_df_cleaned.shape[1]))
        return True, logger.logging
Example #4
0
    def check_phenotype_intersection(phenotype_df, user_spreadsheet_df_header):
        """
        Pre-processing phenotype data. This includes checking for na index, duplicate column name and row name.
        Args:
            phenotype_df: input phenotype dataframe to be checked

        Returns:
            phenotype_df_genename_dedup: cleaned phenotype dataframe
        """
        logger.logging.append("INFO: Start to pre-process phenotype data.")

        phenotype_df_genename_dedup = SpreadSheet.remove_dataframe_indexer_duplication(phenotype_df)
        if phenotype_df_genename_dedup is None:
            return None

        # checks the intersection on phenotype
        intersection = CheckUtil.find_intersection(phenotype_df_genename_dedup.index.values,
                                                   user_spreadsheet_df_header)
        if intersection is None:
            return None

        logger.logging.append(
            "INFO: Found {} intersected gene(s) between phenotype and spreadsheet data.".format(len(intersection)))
        logger.logging.append("INFO: Finished running sanity check for phenotype data.")
        return phenotype_df_genename_dedup
    def run_phenotype_prediction_pipeline(self):
        """
        Runs data cleaning for phenotype_prediction_pipeline.

        Args:
            NA.
        Returns:
            validation_flag: Boolean type value indicating if input data is valid or not.
            message: A message indicates the status of current check.
        """
        # spreadsheet dimension: sample x phenotype, phenotype dimension : sample x phenotype
        if self.user_spreadsheet_df is None or self.phenotype_df is None:
            return False, logger.logging

        # Checks if user spreadsheet contains only real number and drop na column wise
        user_spreadsheet_dropna = SpreadSheet.check_user_spreadsheet_data(self.user_spreadsheet_df,
                                                                          dropna_colwise=True,
                                                                          check_real_number=True)

        if user_spreadsheet_dropna is None or user_spreadsheet_dropna.empty:
            logger.logging.append('ERROR: After drop NA, user spreadsheet data becomes empty.')
            return None, None

        # Checks for valid intersection between phenotype data and user spreadsheet data
        dataframe_header = list(user_spreadsheet_dropna.columns.values)
        phenotype_df_pxs_trimmed = CheckUtil.check_intersection_for_phenotype_and_user_spreadsheet(dataframe_header,
                                                                                                   self.phenotype_df)

        # Removes NA value and duplication on column and row name
        user_spreadsheet_df_cleaned = SpreadSheet.remove_dataframe_indexer_duplication(user_spreadsheet_dropna)
        if user_spreadsheet_df_cleaned is None or phenotype_df_pxs_trimmed is None:
            return False, logger.logging

        # Stores cleaned phenotype data (transposed) to a file, dimension: phenotype x sample
        IOUtil.write_to_file(phenotype_df_pxs_trimmed, self.run_parameters['phenotype_name_full_path'],
                             self.run_parameters['results_directory'], '_ETL.tsv')
        IOUtil.write_to_file(user_spreadsheet_df_cleaned, self.run_parameters['spreadsheet_name_full_path'],
                             self.run_parameters['results_directory'], '_ETL.tsv')

        logger.logging.append(
            'INFO: Cleaned user spreadsheet has {} row(s), {} column(s).'.format(
                user_spreadsheet_df_cleaned.shape[0],
                user_spreadsheet_df_cleaned.shape[1]))
        logger.logging.append(
            'INFO: Cleaned phenotype data has {} row(s), {} column(s).'.format(phenotype_df_pxs_trimmed.shape[0],
                                                                               phenotype_df_pxs_trimmed.shape[1]))
        return True, logger.logging
    def run_simplified_inpherno_pipeline(self):
        """
        Runs data cleaning for simplified_inpherno_pipeline.

        Args:
            NA.

        Returns:
            validation_flag: Boolean type value indicating if input data is valid or not.
            message: A message indicates the status of current check.
        """
        output_files = ['Pvalue_gene_phenotype', 'expression_sample', 'TFexpression']

        for file in output_files:
            if eval(str('self.' + file)) is None:
                return False, logger.logging

        for file in output_files:
            cur_data = eval(str('self.' + file))
            if SpreadSheet.check_user_spreadsheet_data(cur_data, check_real_number=True,
                                                       check_na=True if file is 'TFexpression' else False) is None:
                return False, logger.logging

            cur_data_cleaned, mapping_dedup, mapping = SpreadSheet.map_ensemble_gene_name(cur_data, self.run_parameters)

            if cur_data_cleaned is None:
                return False, logger.logging

            IOUtil.write_to_file(cur_data, self.run_parameters[file + '_full_path'],
                                 self.run_parameters['results_directory'], '.tsv',
                                 use_header=False if file is 'TFexpression' else True)

            IOUtil.write_to_file(cur_data_cleaned, self.run_parameters[file + '_full_path'],
                                 self.run_parameters['results_directory'], '_ETL.tsv',
                                 use_header=False if file is 'TFexpression' else True)

            # writes dedupped mapping between user_supplied_gene_name and ensemble name to a file
            IOUtil.write_to_file(mapping_dedup, self.run_parameters[file + '_full_path'],
                                 self.run_parameters['results_directory'], '_MAP.tsv', use_index=True, use_header=False)

            # writes user supplied gene name along with its mapping status to a file
            IOUtil.write_to_file(mapping, self.run_parameters[file + '_full_path'],
                                 self.run_parameters['results_directory'],
                                 '_User_To_Ensembl.tsv', use_index=False, use_header=True)

        return True, logger.logging
    def run_feature_prioritization_pipeline(self):
        """
        Run data cleaning for feature prioritization pipeline.

        Args:
            NA.

        Returns:
            validation_flag: Boolean type value indicating if input data is valid or not.
            message: A message indicates the status of current check.
        """
        from knpackage.toolbox import get_spreadsheet_df

        if self.user_spreadsheet_df is None or self.phenotype_df is None:
            return False, logger.logging

        # Imputes na value on user spreadsheet data
        user_spreadsheet_df_imputed = SpreadSheet.impute_na(self.user_spreadsheet_df,
                                                            option=self.run_parameters['impute'])
        if user_spreadsheet_df_imputed is None:
            return False, logger.logging

        # Checks if value of inputs satisfy certain criteria
        user_spreadsheet_val_chked, phenotype_val_chked = CommonUtil.validate_inputs_for_gp_fp(
            user_spreadsheet_df_imputed,
            self.phenotype_df, self.run_parameters[
                'correlation_measure'])
        if user_spreadsheet_val_chked is None or phenotype_val_chked is None:
            return False, logger.logging

        IOUtil.write_to_file(user_spreadsheet_val_chked, self.run_parameters['spreadsheet_name_full_path'],
                             self.run_parameters['results_directory'], '_ETL.tsv')
        logger.logging.append(
            'INFO: Cleaned user spreadsheet has {} row(s), {} column(s).'.format(
                user_spreadsheet_val_chked.shape[0],
                user_spreadsheet_val_chked.shape[1]))

        if self.run_parameters['correlation_measure'] == 't_test':
            phenotype_df = get_spreadsheet_df(self.run_parameters['phenotype_name_full_path'])
            phenotype_output = TransformationUtil.phenotype_expander(phenotype_df)
        else:
            phenotype_output = phenotype_val_chked

        IOUtil.write_to_file(phenotype_output, self.run_parameters['phenotype_name_full_path'],
                             self.run_parameters['results_directory'], '_ETL.tsv')
        logger.logging.append(
            'INFO: Cleaned phenotypic data has {} row(s), {} column(s).'.format(phenotype_val_chked.shape[0],
                                                                                phenotype_val_chked.shape[1]))
        return True, logger.logging
Example #8
0
    def load_data_file_wo_empty_line(file_path):
        """
        Loads data file as a DataFrame object and removes empty line by a given file path. 

        Args:
            file_path: input file, which is uploaded from frontend

        Returns:
            input_df_wo_empty_ln: user input as a DataFrame, which doesn't have empty line
        """
        input_df = IOUtil.load_data_file_default(file_path)

        if input_df is None:
            return None

        # removes rows with 'NA' values (which is a valid value in Gene name)
        input_df_wo_empty_ln = SpreadSheet.remove_empty_row(input_df)
        if input_df_wo_empty_ln is None or input_df_wo_empty_ln.empty:
            logger.logging.append(
                'ERROR: Input data {} becomes empty after removing empty row. Please provide a valid input data.'
                .format(file_path))
            return None
        return input_df_wo_empty_ln
    def run_pasted_gene_set_conversion(self):
        """
        Runs data cleaning for pasted_gene_set_conversion.

        Args:
            NA.

        Returns:
             validation_flag: Boolean type value indicating if input data is valid or not.
            message: A message indicates the status of current check.

        """
        from utils.redis_util import RedisUtil

        # Gets redis database instance by its credential
        redis_db = RedisUtil(self.run_parameters['redis_credential'],
                             self.run_parameters['source_hint'],
                             self.run_parameters['taxonid'])

        # Reads pasted_gene_list as a dataframe
        if self.pasted_gene_df is None:
            return False, logger.logging
        logger.logging.append('INFO: Successfully load spreadsheet data: {} with {} gene(s).'.format(
            self.run_parameters['pasted_gene_list_full_path'], self.pasted_gene_df.shape[0]))

        # Removes nan index rows
        input_small_genes_df = SpreadSheet.remove_na_index(self.pasted_gene_df)

        # casting index to String type
        input_small_genes_df.index = input_small_genes_df.index.map(str)

        if input_small_genes_df is None or len(input_small_genes_df.index) == 0:
            logger.logging.append('ERROR: Input data is empty. Please upload valid input data.')
            return False, logger.logging

        input_small_genes_df['user_supplied_gene_name'] = input_small_genes_df.index

        # Converts pasted_gene_list to ensemble name
        redis_ret = redis_db.get_node_info(input_small_genes_df.index, 'Gene')
        ensemble_names = [x[1] for x in redis_ret]
        input_small_genes_df.index = pandas.Series(ensemble_names)

        # Filters out the unmapped genes
        mapped_small_genes_df = input_small_genes_df[~input_small_genes_df.index.str.contains(r'^unmapped.*$')]

        # Filters the duplicate gene name and write them along with their corresponding user supplied gene name to a file
        mapped_small_genes_df[(~mapped_small_genes_df.index.str.contains(
            r'^unmapped.*$') & mapped_small_genes_df.index.duplicated())][
            'user_supplied_gene_name'] = 'duplicate ensembl name'

        input_small_genes_df['status'] = input_small_genes_df.index

        IOUtil.write_to_file(input_small_genes_df, self.run_parameters['pasted_gene_list_full_path'],
                             self.run_parameters['results_directory'], '_User_To_Ensembl.tsv', use_index=False,
                             use_header=True)

        # Reads the univeral_gene_list
        universal_genes_df = IOUtil.load_data_file_default(self.run_parameters['temp_redis_vector'])
        if universal_genes_df is None:
            return False, logger.logging

        # Inserts a column with value 0
        universal_genes_df.insert(0, 'value', 0)
        # Finds the intersection between pasted_gene_list and universal_gene_list
        common_idx = universal_genes_df.index.intersection(mapped_small_genes_df.index)
        logger.logging.append(
            'INFO: Found {} common gene(s) that shared between pasted gene list and universal gene list.'.format(
                len(common_idx)))
        # inserts a column with value 1
        universal_genes_df.loc[common_idx] = 1
        # names the column of universal_genes_df to be 'uploaded_gene_set'
        universal_genes_df.columns = ['uploaded_gene_set']
        del universal_genes_df.index.name

        # outputs final results
        IOUtil.write_to_file(mapped_small_genes_df, self.run_parameters['pasted_gene_list_full_path'],
                             self.run_parameters['results_directory'], '_MAP.tsv')
        IOUtil.write_to_file(universal_genes_df, self.run_parameters['pasted_gene_list_full_path'],
                             self.run_parameters['results_directory'], '_ETL.tsv')

        logger.logging.append('INFO: Universal gene list contains {} genes.'.format(universal_genes_df.shape[0]))
        logger.logging.append('INFO: Mapped gene list contains {} genes.'.format(mapped_small_genes_df.shape[0]))
        return True, logger.logging
 def test_check_duplicate_row_name_without_dup(self):
     ret_df = SpreadSheet.remove_duplicate_row_name(self.input_df_nodup)
     ret_flag = ret_df is not None
     self.assertEqual(True, ret_flag)
     npytest.assert_array_equal(self.golden_output_nodup, ret_df)
 def test_impute_na_average(self):
     ret = SpreadSheet.impute_na(self.input_df, "average")
     npytest.assert_array_equal(self.golden_output_average, ret)
Example #12
0
    def test_remove_na_header_failure(self):
        ret_df = SpreadSheet.remove_na_header(self.input_df_fail)
        ret_flag = ret_df is not None

        self.assertEqual(False, ret_flag)
Example #13
0
 def test_remove_na_header_nan_header(self):
     ret_df = SpreadSheet.remove_na_header(self.input_df_nan)
     ret_flag = ret_df is not None
     self.assertEqual(True, ret_flag)
Example #14
0
 def test_remove_na_index_fail_na(self):
     ret_df = SpreadSheet.remove_na_index(self.input_df_fail_na)
     ret_flag = ret_df is not None
     self.assertEqual(True, ret_flag)
 def test_map_ensemble_gene_name_empty_mapped(self):
     ret_df_mapped_dedup, map_filtered_dedup, mapping = SpreadSheet.map_ensemble_gene_name(self.input_df_cannot_map,
                                                                                           self.run_parameters)
     ret_val_boolean = True if ret_df_mapped_dedup is not None else False
     self.assertEqual(False, ret_val_boolean)
 def test_impute_na_remove(self):
     ret = SpreadSheet.impute_na(self.input_df, "remove")
     npytest.assert_array_equal(self.golden_output_remove, ret)
    def run_signature_analysis_pipeline(self):
        """
        Runs data cleaning for signature_analysis_pipeline.

        Args:
             NA.

        Returns:
            validation_flag: Boolean type value indicating if input data is valid or not.
            message: A message indicates the status of current check.
        """
        if self.signature_df is None or self.user_spreadsheet_df is None:
            return False, logger.logging

        # Removes NA index for both signature data and user spreadsheet data
        signature_df = SpreadSheet.remove_na_index(self.signature_df)
        user_spreadsheet_df = SpreadSheet.remove_na_index(self.user_spreadsheet_df)

        # Checks if only real number and non-NA value appear in user spreadsheet
        if SpreadSheet.check_user_spreadsheet_data(user_spreadsheet_df, check_na=True,
                                                   check_real_number=True,
                                                   check_positive_number=False) is None:
            return False, logger.logging

        # Checks duplicate columns and rows in user spreadsheet data
        if CheckUtil.check_duplicates(user_spreadsheet_df, check_column=True, check_row=True):
            logger.logging.append('ERROR: Found duplicates on user spreadsheet data. Rejecting...')
            return False, logger.logging

        # Checks intersection of genes between signature data and user spreadsheet data
        intersection = CheckUtil.find_intersection(signature_df.index, user_spreadsheet_df.index)
        if intersection is None:
            logger.logging.append('ERROR: Cannot find intersection between spreadsheet genes and signature genes.')
            return False, logger.logging
        logger.logging.append(
            'INFO: Found {} intersected gene(s) between phenotype and spreadsheet data.'.format(len(intersection)))

        # Checks number of unique value in userspread sheet equals to 2
        if not SpreadSheet.check_unique_values(user_spreadsheet_df, cnt=2):
            logger.logging.append(
                'ERROR: user spreadsheet data does not meet the requirment of having at least two unique values.')
            return False, logger.logging

        # Checks intersection among network data, signature data and user spreadsheet data
        if 'gg_network_name_full_path' in self.run_parameters.keys() and \
                not CommonUtil.check_network_data_intersection(intersection,
                                                               self.run_parameters):
            return False, logger.logging

        # The logic here ensures that even if phenotype data doesn't fits requirement, the rest pipelines can still run.
        if user_spreadsheet_df is None:
            return False, logger.logging
        else:
            IOUtil.write_to_file(user_spreadsheet_df,
                                 self.run_parameters['spreadsheet_name_full_path'],
                                 self.run_parameters['results_directory'], '_ETL.tsv')
            logger.logging.append(
                'INFO: Cleaned user spreadsheet has {} row(s), {} column(s).'.format(
                    user_spreadsheet_df.shape[0],
                    user_spreadsheet_df.shape[1]))

        if signature_df is not None:
            IOUtil.write_to_file(signature_df, self.run_parameters['signature_name_full_path'],
                                 self.run_parameters['results_directory'], '_ETL.tsv')
            logger.logging.append(
                'INFO: Cleaned phenotype data has {} row(s), {} column(s).'.format(signature_df.shape[0],
                                                                                   signature_df.shape[1]))
        return True, logger.logging
 def test_impute_na_bad_option(self):
     ret = SpreadSheet.impute_na(self.input_df, "bad")
     npytest.assert_array_equal(self.input_df, ret)
    def run_samples_clustering_pipeline(self):
        """
        Runs data cleaning for samples_clustering_pipeline.

        Args:
            NA

        Returns:
            validation_flag: Boolean type value indicating if input data is valid or not.
            message: A message indicates the status of current check.
        """
        if self.user_spreadsheet_df is None:
            return False, logger.logging

        logger.logging.append('INFO: Start to process user spreadsheet data.')
        # Checks if only non-negative real number appears in user spreadsheet and drop na column wise
        user_spreadsheet_val_chked = SpreadSheet.check_user_spreadsheet_data(self.user_spreadsheet_df,
                                                                             dropna_colwise=True,
                                                                             check_real_number=True,
                                                                             check_positive_number=True)
        if user_spreadsheet_val_chked is None:
            return False, logger.logging

        # Removes NA value and duplication on column and row name
        user_spreadsheet_df_checked = SpreadSheet.remove_dataframe_indexer_duplication(user_spreadsheet_val_chked)

        # Checks the validity of gene name to see if it can be ensemble or not
        user_spreadsheet_df_cleaned, map_filtered_dedup, mapping = SpreadSheet.map_ensemble_gene_name(
            user_spreadsheet_df_checked,
            self.run_parameters)

        if 'gg_network_name_full_path' in self.run_parameters.keys() and \
                not CommonUtil.check_network_data_intersection(user_spreadsheet_df_cleaned.index,
                                                               self.run_parameters):
            return False, logger.logging

        # The logic here ensures that even if phenotype data doesn't fits requirement, the rest pipelines can still run.
        if user_spreadsheet_df_cleaned is None:
            return False, logger.logging
        else:
            IOUtil.write_to_file(user_spreadsheet_df_cleaned, self.run_parameters['spreadsheet_name_full_path'],
                                 self.run_parameters['results_directory'], '_ETL.tsv')

            # writes dedupped mapping between user_supplied_gene_name and ensemble name to a file
            IOUtil.write_to_file(map_filtered_dedup, self.run_parameters['spreadsheet_name_full_path'],
                                 self.run_parameters['results_directory'], '_MAP.tsv', use_index=True, use_header=False)

            # writes user supplied gene name along with its mapping status to a file
            IOUtil.write_to_file(mapping, self.run_parameters['spreadsheet_name_full_path'],
                                 self.run_parameters['results_directory'],
                                 '_User_To_Ensembl.tsv', use_index=False, use_header=True)
            logger.logging.append(
                'INFO: Cleaned user spreadsheet has {} row(s), {} column(s).'.format(
                    user_spreadsheet_df_cleaned.shape[0],
                    user_spreadsheet_df_cleaned.shape[1]))

        if self.phenotype_df is not None:
            logger.logging.append('INFO: Start to process phenotype data.')
            phenotype_df_cleaned = CommonUtil.check_phenotype_intersection(self.phenotype_df,
                                                                           self.user_spreadsheet_df.columns.values)
            if phenotype_df_cleaned is None:
                logger.logging.append('ERROR: Phenotype is emtpy. Please provide a valid phenotype data.')
                return False, logger.logging
            else:
                IOUtil.write_to_file(phenotype_df_cleaned, self.run_parameters['phenotype_name_full_path'],
                                     self.run_parameters['results_directory'], '_ETL.tsv')
                logger.logging.append('INFO: Cleaned phenotype data has {} row(s), {} '
                                      'column(s).'.format(phenotype_df_cleaned.shape[0], phenotype_df_cleaned.shape[1]))
        return True, logger.logging
Example #20
0
 def test_Remove_dataframe_indexer_duplication(self):
     ret_val = SpreadSheet.remove_dataframe_indexer_duplication(
         self.input_df_good)
     ret_val_boolean = True if ret_val is not None else False
     self.assertEqual(True, ret_val_boolean)