def min_value_for_each_threshold(self, coder_name, column_index):
    """Collect one row per threshold for the given coder and data column.

    For every entry of ``ExperimentsUtils.THRESHOLDS`` (in order), the row
    returned by ``PandasMethods.get_min_row`` is stored in a fresh DataFrame
    that has the same columns as ``self.df``.

    :param coder_name: coder whose rows are selected via ``PandasMethods.coder_df``.
    :param column_index: index translated to a column key by
        ``ResultsToDataframe.data_column_key``.
    :return: DataFrame whose row label ``i`` holds the values of the row
        selected for the ``i``-th threshold.
    """
    column_key = ResultsToDataframe.data_column_key(column_index)
    rows_for_coder = PandasMethods.coder_df(self.df, coder_name)
    result = pd.DataFrame(columns=self.df.columns)

    # Rows are appended one at a time through label-based enlargement.
    for position, threshold in enumerate(ExperimentsUtils.THRESHOLDS):
        selected_row = PandasMethods.get_min_row(rows_for_coder, column_key, threshold)
        result.loc[position] = selected_row.values
    return result
Exemple #2
0
    def create_pdf_pages(self, pdf, dataset_name, filename):
        """Call ``create_pdf_page`` once per column index of the dataset.

        For every index yielded by ``self.column_indexes(dataset_name)`` the
        index is stored in ``self.col_index`` and two ``PandasUtils`` objects
        are built from independent copies of ``self.df_M_local``. One of them
        is passed through ``self.set_global_window`` before both are handed
        to ``self.create_pdf_page``.

        :param pdf: forwarded unchanged to ``create_pdf_page``.
        :param dataset_name: dataset used to pick column indexes and to build
            the ``PandasUtils`` objects.
        :param filename: forwarded to ``PandasUtils`` and ``create_pdf_page``.
        """
        for column_index in self.column_indexes(dataset_name):
            # The current column is kept on the instance, as other methods may read it.
            self.col_index = column_index

            # The helper objects must be created inside the loop to prevent an
            # issue with many datatypes in a single dataset.
            local_window_copy = PandasMethods.copy(self.df_M_local)
            global_window_copy = PandasMethods.copy(self.df_M_local)

            # local with best LOCAL window
            local_window_utils = PandasUtils(dataset_name, filename, local_window_copy, "M")
            # local with best GLOBAL window
            global_window_utils = PandasUtils(dataset_name, filename, global_window_copy, "M")
            global_window_utils = self.set_global_window(global_window_utils)

            # TODO: change order to make Relative Difference <= 0
            self.create_pdf_page(pdf, filename, global_window_utils, local_window_utils)
    def __init__(self, dataset_name, filename, df, mask_mode, check=True, with_gzip=False):
        """Wrap the rows of ``df`` selected by ``PandasMethods.filename_df``.

        :param dataset_name: passed to ``PandasMethods.filename_df`` and to the
            optional consistency check.
        :param filename: passed to ``PandasMethods.filename_df``.
        :param df: source DataFrame.
        :param mask_mode: must be ``"NM"`` or ``"M"`` (asserted).
        :param check: when true, run ``PandasUtilsCheck(...).check_df`` before
            the percentage columns are recalculated.
        :param with_gzip: forwarded to ``PandasUtilsCheck``.
        """
        assert mask_mode in ("NM", "M")

        self.df = PandasMethods.filename_df(df, filename, dataset_name)
        self.mask_mode = mask_mode

        # Half of the columns that remain after discounting the fixed ones;
        # presumably they come in (data, percentage) pairs - TODO confirm.
        non_fixed_columns = len(self.df.columns) - len(PandasUtils.FIXED_ROWS)
        self.data_columns_count = int(non_fixed_columns / 2)

        if check:
            checker = PandasUtilsCheck(self, with_gzip)
            checker.check_df(dataset_name)
        self.__calculate_percentage()
    def __calculate_percentage(self):
        """Recompute each percentage column relative to the ``CoderBase`` row.

        For every data column number in ``1..self.data_columns_count`` the
        percentage is computed as ``100 * value / base``, where ``base`` is the
        value of the first ``CoderBase`` row in that data column. The result is
        compared against the existing percentage column through
        ``__check_difference`` and then renamed to the percentage column key.
        """
        for column_number in range(1, self.data_columns_count + 1):  # [1, ... ]
            data_key = ResultsToDataframe.data_column_key(column_number)
            percentage_key = ResultsToDataframe.percentage_column_key(column_number)
            recomputed_key = 'new_' + percentage_key

            # Total reported by the base coder for this data column.
            base_rows = PandasMethods.coder_df(self.df, "CoderBase")
            base_total = base_rows[data_key].iloc[0]

            self.df[recomputed_key] = 100 * (self.df[data_key] / base_total)
            self.__check_difference(percentage_key, recomputed_key)

            # Give the recomputed column the name of the percentage column.
            # NOTE(review): if the previous percentage column is still present
            # here, this leaves two columns with the same label - confirm that
            # __check_difference takes care of it.
            self.df.rename(columns={recomputed_key: percentage_key}, inplace=True)
 def __check_coder_rows_count(self, coder_name):
     """Assert that ``coder_name`` has the expected number of rows.

     Expected counts: 1 for ``'CoderBase'``, ``len(ExperimentsUtils.THRESHOLDS)``
     for ``'CoderGZIP'`` and ``self.NUMBER_OF_COMBINATIONS`` for any other
     coder. On a mismatch for the last group, the coder rows and the counts
     involved are printed before the assertion fails.

     :param coder_name: name of the coder to verify.
     :raises AssertionError: when the row count differs from the expected one.
     """
     actual_count = self.__coder_rows_count(coder_name)

     if coder_name == 'CoderBase':
         assert actual_count == 1
         return

     if coder_name in ['CoderGZIP']:
         assert actual_count == len(ExperimentsUtils.THRESHOLDS)
         return

     # the rest of the coders have the same number of rows
     if actual_count == self.NUMBER_OF_COMBINATIONS:
         return

     # Dump diagnostics before failing.
     print(PandasMethods.coder_df(self.df, coder_name))
     print(coder_name)
     print(self.NUMBER_OF_COMBINATIONS)
     print(actual_count)
     assert actual_count == self.NUMBER_OF_COMBINATIONS
 def coder_base_df(self):
     """Return the first row of the ``'CoderBase'`` selection of ``self.df``.

     Despite the name, the result is the single row picked with ``.iloc[0]``,
     not the whole sub-frame.
     """
     base_rows = PandasMethods.coder_df(self.df, 'CoderBase')
     first_row = base_rows.iloc[0]
     return first_row
 def min_value_for_threshold(self, coder_name, column_index, threshold, nth=None):
     """Return the row ``PandasMethods.get_min_row`` selects for one threshold.

     :param coder_name: coder whose rows are searched; ``None`` searches the
         whole ``self.df``.
     :param column_index: index translated to a column key by
         ``ResultsToDataframe.data_column_key``.
     :param threshold: must be one of ``ExperimentsUtils.THRESHOLDS`` (asserted).
     :param nth: forwarded unchanged to ``PandasMethods.get_min_row``.
     :return: whatever ``PandasMethods.get_min_row`` returns.
     """
     assert threshold in ExperimentsUtils.THRESHOLDS
     column_key = ResultsToDataframe.data_column_key(column_index)

     if coder_name is None:
         candidates = self.df
     else:
         candidates = PandasMethods.coder_df(self.df, coder_name)

     return PandasMethods.get_min_row(candidates, column_key, threshold, nth)
 def __coder_rows_count(self, coder_name):
     """Return how many rows ``self.df`` holds for ``coder_name``."""
     coder_rows = PandasMethods.coder_df(self.df, coder_name)
     # The shape is unpacked as (rows, columns); only the row count is needed.
     number_of_rows, _number_of_columns = coder_rows.shape
     return number_of_rows