Example #1
    def created_dataset_pdf_file(self):
        self.pdf_name = self.create_pdf_name(self.path, self.dataset_id,
                                             self.dataset_name)
        with PdfPages(self.pdf_name) as pdf:
            self.pd_utils_3_global = PandasUtils(self.dataset_name, 'Global',
                                                 self.df_M_global, "M")
            for self.filename_index, self.filename in enumerate(
                    self.dataset_filenames()):
                print("  " + self.filename)
                self.create_pdf_pages(pdf, self.dataset_name, self.filename)
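For context, a minimal self-contained sketch of the matplotlib PdfPages pattern that created_dataset_pdf_file relies on; the output name and plotted data are placeholders, not part of the original code:

from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.pyplot as plt

# The PDF is opened once; each pdf.savefig(fig) call appends the current
# figure as a new page, mirroring the per-file/per-column pages above.
with PdfPages("example.pdf") as pdf:  # placeholder output file
    for page_number in range(3):
        fig = plt.figure()
        plt.plot([0, 1], [0, page_number])
        pdf.savefig(fig)
        plt.close(fig)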
Example #2
    def create_pdf_pages(self, pdf, dataset_name, filename):
        for self.col_index in self.column_indexes(dataset_name):
            # Create the PandasUtils instances inside this loop to avoid issues when a single dataset contains multiple data types
            df_M_local_copy_1 = PandasMethods.copy(self.df_M_local)
            df_M_local_copy_2 = PandasMethods.copy(self.df_M_local)
            pd_utils_3_local_1 = PandasUtils(
                dataset_name, filename, df_M_local_copy_1,
                "M")  # local with best LOCAL window
            pd_utils_3_local_2 = PandasUtils(
                dataset_name, filename, df_M_local_copy_2,
                "M")  # local with best GLOBAL window

            mod_pd_utils_3_local_2 = self.set_global_window(pd_utils_3_local_2)
            # TODO: change order to make Relative Difference <= 0
            self.create_pdf_page(pdf, filename, mod_pd_utils_3_local_2,
                                 pd_utils_3_local_1)
Example #3
    def __columns_iteration(self):
        self.panda_utils = PandasUtils(self.dataset_name, self.filename,
                                       self.df, ProcessResults.MM, True,
                                       self.with_gzip)
        for self.col_index in range(
                1,
                ExperimentsUtils.get_dataset_data_columns_count(
                    self.dataset_name) + 1):
            # TODO: uncomment to IGNORE SPEED
            # if self.dataset_name == 'NOAA-SPC-wind' and self.col_index == 3:
            #     continue
            if self.__local_or_single_file():
                self.threshold_compare.calculate_matching_thresholds(
                    self.dataset_name, self.filename, self.col_index)
            self.col_name = ExperimentsUtils.COLUMN_INDEXES[self.dataset_name][
                self.col_index - 1]
            self._print(self.col_name)
            self.__column_results_writer_1()
            self.__column_results_writer_2()
Example #4
    def set_global_window(self, pd_utils_3_local_2):
        new_df = pd_utils_3_local_2.df

        for coder_name in self.CODERS_ARRAY:
            # print("  " + coder_name)
            for threshold in ExperimentsUtils.THRESHOLDS:
                # print(threshold)
                best_global_window = self.pd_utils_3_global.min_value_for_threshold(
                    coder_name, self.col_index, threshold)['window']
                best_local_window = pd_utils_3_local_2.min_value_for_threshold(
                    coder_name, self.col_index, threshold)['window']
                if best_global_window != best_local_window:
                    # print(str(threshold) + " - GLOBAL = " + str(best_global_window) + " - LOCAL = " + str(best_local_window))
                    # remove every threshold value other than the one that uses the best global window
                    index_names = new_df[(new_df['coder'] == coder_name)
                                         & (new_df['threshold'] == threshold) &
                                         (new_df['window'] !=
                                          best_global_window)].index
                    new_df.drop(index_names, inplace=True)
        mod_pd_utils_3_local_2 = PandasUtils(self.dataset_name, self.filename,
                                             new_df, "M", False)
        return mod_pd_utils_3_local_2
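As a standalone illustration of the row-filtering pattern used in set_global_window, the sketch below runs on a toy DataFrame with made-up values; only the column names ('coder', 'threshold', 'window') come from the code above:

import pandas as pd

# Toy stand-in for pd_utils_3_local_2.df
new_df = pd.DataFrame({
    'coder': ['CoderPCA'] * 4,
    'threshold': [1, 1, 3, 3],
    'window': [4, 8, 4, 8],
})

best_global_window = 8  # stand-in for the window chosen on the global results

# Select the rows for this coder/threshold whose window differs from the
# best global window, then drop them in place (same pattern as above).
index_names = new_df[(new_df['coder'] == 'CoderPCA')
                     & (new_df['threshold'] == 1) &
                     (new_df['window'] != best_global_window)].index
new_df.drop(index_names, inplace=True)
print(new_df)  # for threshold 1, only the window == 8 row remains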
Example #5
class PDFS3(PDFSCommon):
    SUBPLOT_SPACING_W_H = (0.1, 0.05)
    FIG_SIZE_H_V = (10, 11)
    CODERS_ARRAY = [
        'CoderPCA', 'CoderAPCA', 'CoderCA', 'CoderPWLH', 'CoderPWLHInt',
        'CoderGAMPSLimit', 'CoderFR', 'CoderSF'
    ]
    PLOTS_ARRAY = ['window', 'relative']
    PLOTS_MATRIX = [
        [['CoderPCA', 'window'], ['CoderAPCA', 'window'],
         ['CoderPWLH', 'window'], ['CoderPWLHInt', 'window']],
        None,
        [['CoderPCA', 'relative'], ['CoderAPCA', 'relative'],
         ['CoderPWLH', 'relative'], ['CoderPWLHInt', 'relative']],
        None,
        [['CoderCA', 'window'], ['CoderSF', 'window'], ['CoderFR', 'window'],
         ['CoderGAMPSLimit', 'window']],
        None,
        [['CoderCA', 'relative'], ['CoderSF', 'relative'],
         ['CoderFR', 'relative'], ['CoderGAMPSLimit', 'relative']],
    ]
    HEIGHT_RATIOS = [30, 0, 30, 15, 30, 0, 30]
    PLOT_OPTIONS = {
        'window': {
            'title': 12,
            'labels': [r'$OWS$', r'$LOWS$']
        },
        'relative': {
            'add_data': True,
            'show_xlabel': True
        }
    }

    def __init__(self, path, datasets_names=None):
        assert (len(self.HEIGHT_RATIOS) == len(self.PLOTS_MATRIX))

        self.df_M_local = ResultsToDataframe(ResultsReader(
            'local', "M")).create_full_df()
        self.df_M_global = ResultsToDataframe(ResultsReader(
            'global', "M")).create_full_df()
        self.path = path

        self.dataset_names = datasets_names or ExperimentsUtils.datasets_with_multiple_files(
        )
        self.mode = 'local'

        # iteration variables
        self.dataset_id = None
        self.dataset_name = None
        self.filename = None
        self.pdf_name = None
        self.col_index = None
        self.pd_utils_3_global = None
        self.latex_table_data = {}

    def create_pdfs(self):
        for dataset_id, self.dataset_name in enumerate(self.dataset_names):
            print(self.dataset_name)
            self.dataset_id = dataset_id + 1
            self.created_dataset_pdf_file()

    def created_dataset_pdf_file(self):
        self.pdf_name = self.create_pdf_name(self.path, self.dataset_id,
                                             self.dataset_name)
        with PdfPages(self.pdf_name) as pdf:
            self.pd_utils_3_global = PandasUtils(self.dataset_name, 'Global',
                                                 self.df_M_global, "M")
            for self.filename_index, self.filename in enumerate(
                    self.dataset_filenames()):
                print("  " + self.filename)
                self.create_pdf_pages(pdf, self.dataset_name, self.filename)

    def create_pdf_pages(self, pdf, dataset_name, filename):
        for self.col_index in self.column_indexes(dataset_name):
            # Create the PandasUtils instances inside this loop to avoid issues when a single dataset contains multiple data types
            df_M_local_copy_1 = PandasMethods.copy(self.df_M_local)
            df_M_local_copy_2 = PandasMethods.copy(self.df_M_local)
            pd_utils_3_local_1 = PandasUtils(
                dataset_name, filename, df_M_local_copy_1,
                "M")  # local with best LOCAL window
            pd_utils_3_local_2 = PandasUtils(
                dataset_name, filename, df_M_local_copy_2,
                "M")  # local with best GLOBAL window

            mod_pd_utils_3_local_2 = self.set_global_window(pd_utils_3_local_2)
            # TODO: change order to make Relative Difference <= 0
            self.create_pdf_page(pdf, filename, mod_pd_utils_3_local_2,
                                 pd_utils_3_local_1)

    #
    # In the local results, consider the best global window instead of the best local window
    #
    def set_global_window(self, pd_utils_3_local_2):
        new_df = pd_utils_3_local_2.df

        for coder_name in self.CODERS_ARRAY:
            # print("  " + coder_name)
            for threshold in ExperimentsUtils.THRESHOLDS:
                # print(threshold)
                best_global_window = self.pd_utils_3_global.min_value_for_threshold(
                    coder_name, self.col_index, threshold)['window']
                best_local_window = pd_utils_3_local_2.min_value_for_threshold(
                    coder_name, self.col_index, threshold)['window']
                if best_global_window != best_local_window:
                    # print(str(threshold) + " - GLOBAL = " + str(best_global_window) + " - LOCAL = " + str(best_local_window))
                    # remove every threshold value other than the one that uses the best global window
                    index_names = new_df[(new_df['coder'] == coder_name)
                                         & (new_df['threshold'] == threshold) &
                                         (new_df['window'] !=
                                          best_global_window)].index
                    new_df.drop(index_names, inplace=True)
        mod_pd_utils_3_local_2 = PandasUtils(self.dataset_name, self.filename,
                                             new_df, "M", False)
        return mod_pd_utils_3_local_2

    def create_pdf_page(self, pdf, filename, pd_utils_3_local_1,
                        pd_utils_3_local_2):
        pdf_page = PdfPage(pd_utils_3_local_1, pd_utils_3_local_2, filename,
                           self)

        # IMPORTANT: resize before setting the labels to avoid this issue: https://stackoverflow.com/q/50395392/4547232
        pdf_page.plt.subplots_adjust(wspace=PDFS3.SUBPLOT_SPACING_W_H[0],
                                     hspace=PDFS3.SUBPLOT_SPACING_W_H[1])

        fig, plt = pdf_page.create(self.CODERS_ARRAY, self.PLOTS_ARRAY,
                                   self.PLOTS_MATRIX)
        pdf.savefig(fig)
        plt.savefig(self.create_image_name_(), format='pdf')
        plt.close()

    def create_image_name_(self):
        filename = self.pdf_name.replace(
            ".pdf", "-") + str(self.filename_index + 1) + "-" + str(
                self.col_index) + ".pdf"
        filename = filename.replace("PDF-", "")
        return filename

    ####################################################################################################################
    ####################################################################################################################
    ####################################################################################################################

    def add_data(self, plot_name, algorithm, values):
        if not self.PLOT_OPTIONS[plot_name].get('add_data'):
            return {}

        minimum, maximum = min(values), max(values)
        assert (minimum >= 0)
        # (1) Check that the maximum does not change and occurs in the expected dataset/coder
        expected_maximum = 10.598254581045069

        result = {}
        if self.dataset_name == "IRKIS" and algorithm == "CoderPCA" and self.filename == "vwc_1203.dat.csv":
            assert (maximum == expected_maximum)
            assert (str(round(maximum, 1)) == "10.6")
            result = {
                'keys': ["PlotMax"],
                'indexes': [values.index(maximum)],
                'color': PlotConstants.VALUE0_COLOR
            }
        else:
            if self.dataset_name == "IRKIS" and algorithm == "CoderAPCA" and self.filename == "vwc_1202.dat.csv":
                # checks that the results specified in the report (Section 4.3) do not change...
                value_for_e_3 = values[2]
                value_for_e_10 = values[4]
                assert (str(round(value_for_e_3, 2)) == "1.52")
                assert (str(round(value_for_e_10, 2)) == "1.76")
                assert (maximum < expected_maximum)
            # print("------")
            # print(maximum)
            # print(expected_maximum)
            # print(self.dataset_name)
            # print(algorithm)
            # print(self.filename)
            assert (maximum < expected_maximum)

        # (2) Add information to the latex table structure
        if not self.latex_table_data.get(algorithm):
            self.latex_table_data[algorithm] = []
        self.latex_table_data[algorithm].append({'values': values})

        return result

    def create_latex_table(self, path):
        algorithms_data = {}
        total = [0, 0, 0, 0, 0]
        for algorithm, array in self.latex_table_data.items():
            range_1, range_2, range_3, range_4, range_5 = 0, 0, 0, 0, 0
            for dictionary in array:
                for value in dictionary['values']:
                    if value == 0:
                        range_1 += 1
                        total[0] += 1
                    elif 0 < value <= 1:
                        range_2 += 1
                        total[1] += 1
                    elif 1 < value <= 2:
                        range_3 += 1
                        total[2] += 1
                    elif 2 < value <= 5:
                        range_4 += 1
                        total[3] += 1
                    elif 5 < value <= 11:
                        range_5 += 1
                        total[4] += 1
                algorithms_data[algorithm] = [
                    range_1, range_2, range_3, range_4, range_5
                ]
        algorithms_data['Total'] = total
        TableWindows(algorithms_data, path).create_table()
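A hypothetical driver for the class above, assuming the result files read by ResultsReader already exist; the './output' path is a placeholder:

pdfs = PDFS3('./output')             # defaults to datasets with multiple files
pdfs.create_pdfs()                   # one PDF per dataset, one page per file/column
pdfs.create_latex_table('./output')  # summarize the window data collected via add_data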
Example #6
    def create_pdf_pages(self, pdf, dataset_name, filename):
        panda_utils_NM = PandasUtils(dataset_name, filename, self.df_NM, "NM")
        panda_utils_M = PandasUtils(dataset_name, filename, self.df_M, "M")

        for self.col_index in self.column_indexes(dataset_name):
            self.create_pdf_page(pdf, filename, panda_utils_NM, panda_utils_M)
Example #7
class ProcessResults(object):
    CODERS = [
        'CoderBase',
        'CoderPCA',
        'CoderAPCA',
        'CoderCA',
        'CoderPWLH',
        'CoderPWLHInt',
        'CoderFR',
        'CoderSF',
        # 'CoderGAMPS', => ignore this coder
        'CoderGAMPSLimit'
    ]
    CODERS_WITHOUT_WINDOW = ['CoderBase', 'CoderSF', 'CoderGZIP']
    MM = "M"  # MASK MODE
    DEBUG_MODE = False

    #
    # mode=1  => best algorithm, considering every algorithm
    # mode=2  => best algorithm, considering every algorithm + gzip
    #
    def __init__(self,
                 global_mode,
                 path,
                 mode,
                 gzip_path=None,
                 gzip_filename=None):
        # set script settings
        self.global_mode = global_mode
        self.path = path
        self.mode = mode
        self.with_gzip = mode == 2

        # set other instances
        self.key = 'global' if self.global_mode else 'local'
        self.results_reader = ResultsReader(self.key, ProcessResults.MM)
        self.df = self.__set_df(gzip_path, gzip_filename)
        self.threshold_compare = ThresholdCompare(
            ResultsReader('local', ProcessResults.MM))

    def run(self):
        self.__write_headers()
        self.__datasets_iteration()
        self.latex_table.print_end()
        self.csv_writer_1.show_data()

    def __set_df(self, gzip_path, gzip_filename):
        if not self.with_gzip:
            return ResultsToDataframe(self.results_reader).create_full_df()

        assert (self.key == 'global')
        assert gzip_path
        assert gzip_filename
        gzip_results_reader = GzipResultsReader(gzip_path, gzip_filename)
        return ResultsToDataframe(
            self.results_reader).create_full_df(gzip_results_reader)

    def __write_headers(self):
        extra_str = 'global' if self.global_mode else 'local'
        self.csv_writer_1 = Writer1(self.path, extra_str, self.mode)
        self.csv_writer_2 = Writer2(self.path, extra_str)
        self.latex_table = TableCompression(self.path, self.mode)

    def __datasets_iteration(self):
        for dataset_id, self.dataset_name in enumerate(
                ExperimentsUtils.DATASET_NAMES):
            print(self.dataset_name)
            self._print(self.dataset_name)
            self.__set_dataset(self.dataset_name)
            self.__filenames_iteration()

    def __filenames_iteration(self):
        dataset_filenames = ProcessResults.dataset_filenames(
            self.dataset_name, self.global_mode)
        for self.filename in dataset_filenames:
            self._print(self.filename)
            self.__set_filename(self.filename)
            self.__columns_iteration()

    def __columns_iteration(self):
        self.panda_utils = PandasUtils(self.dataset_name, self.filename,
                                       self.df, ProcessResults.MM, True,
                                       self.with_gzip)
        for self.col_index in range(
                1,
                ExperimentsUtils.get_dataset_data_columns_count(
                    self.dataset_name) + 1):
            # TODO: uncomment to IGNORE SPEED
            # if self.dataset_name == 'NOAA-SPC-wind' and self.col_index == 3:
            #     continue
            if self.__local_or_single_file():
                self.threshold_compare.calculate_matching_thresholds(
                    self.dataset_name, self.filename, self.col_index)
            self.col_name = ExperimentsUtils.COLUMN_INDEXES[self.dataset_name][
                self.col_index - 1]
            self._print(self.col_name)
            self.__column_results_writer_1()
            self.__column_results_writer_2()

    def __column_results_writer_1(self):
        self.csv_writer_1.write_col_name(self.col_name)
        for self.coder_name in self.__coders_array():
            self._print(self.coder_name)
            self.__coder_results()
        self.csv_writer_1.write_data_rows()

    def __coders_array(self):
        coders = ['CoderGZIP'] if self.with_gzip else []
        coders += self.CODERS
        return coders

    #
    # Get the best Window for each <Coder, Column, Threshold> combination
    #
    def __coder_results(self):
        windows, percentages, total_bits_list = [], [], []
        previous_window, previous_percentage, previous_total_bits = None, None, None
        for threshold in ExperimentsUtils.THRESHOLDS:
            row_df = self.panda_utils.min_value_for_threshold(
                self.coder_name, self.col_index, threshold)
            window, percentage, _, total_bits = ProcessResults.get_values(
                row_df, self.col_index)
            new_window, new_percentage, new_total_bits = window, percentage, total_bits

            if self.__same_result(threshold):
                assert (threshold > 0)
                assert (window == previous_window)
                assert (percentage == previous_percentage)
                assert (total_bits == previous_total_bits)
                # TODO: uncomment to show blank cells for a repeated experiment
                # new_window, new_percentage, new_total_bits = '=', '=', '='
            elif self.coder_name in self.CODERS_WITHOUT_WINDOW:
                new_window = ''  # these coders don't have a window param

            windows.append(new_window)
            percentages.append(new_percentage)
            total_bits_list.append(new_total_bits)
            previous_window, previous_percentage, previous_total_bits = window, percentage, total_bits

        self.csv_writer_1.save_data_row(self.coder_name, windows, percentages,
                                        total_bits_list)

    #
    # Get the best <Coder, Window> combination for each <Column, Threshold> combination
    #
    def __column_results_writer_2(self):
        threshold_results = [None, None, self.col_name]
        for threshold in ExperimentsUtils.THRESHOLDS:
            row_df = self.panda_utils.min_value_for_threshold(
                None, self.col_index, threshold)
            window, percentage, coder_name, _ = ProcessResults.get_values(
                row_df, self.col_index)
            coder_name = coder_name.replace("Coder", "")

            new_coder, new_window, new_percentage = coder_name, window, percentage
            threshold_results += [new_coder, new_window, new_percentage]

        self.csv_writer_2.write_row(threshold_results)
        self.latex_table.set_threshold_results(threshold_results)

    def __set_dataset(self, dataset_name):
        self.csv_writer_1.write_dataset_name(dataset_name)
        self.csv_writer_2.write_dataset_name(dataset_name)
        self.latex_table.set_dataset(dataset_name)

    def __set_filename(self, filename):
        self.csv_writer_1.write_filename(filename)
        self.csv_writer_2.write_filename(filename)
        self.latex_table.set_filename(filename)

    def __local_or_single_file(self):
        condition1 = not self.global_mode
        condition2 = self.global_mode and self.__single_file_dataset()
        return condition1 or condition2

    def __same_result(self, threshold):
        return self.__local_or_single_file(
        ) and self.threshold_compare.matching_threshold(threshold)

    def __single_file_dataset(self):
        return ExperimentsUtils.dataset_csv_files_count(self.dataset_name) == 1

    def _print(self, value):
        if self.DEBUG_MODE:
            print(value)

    @staticmethod
    def get_values(row_df, col_index):
        window = None if pd.isnull(row_df['window']) else int(row_df['window'])
        percentage = ProcessResults.parse_percentage(row_df, col_index)
        total_bits = ProcessResults.parse_total_bits(row_df, col_index)
        coder_name = row_df['coder']
        return window, percentage, coder_name, total_bits

    @staticmethod
    def calculate_relative_diff(row_df_pca, row_df_apca, col_index):
        data_column_key = ResultsToDataframe.data_column_key(col_index)
        size_pca, size_apca = row_df_pca[data_column_key], row_df_apca[
            data_column_key]
        relative_diff = MathUtils.relative_difference(size_pca, size_apca)
        coder_name = 'PCA' if size_pca < size_apca else 'APCA'
        return round(relative_diff, 2), coder_name

    @staticmethod
    def calculate_RD(row_df_best, row_df_compare, col_index):
        data_column_key = ResultsToDataframe.data_column_key(col_index)
        size_best, size_compare = row_df_best[data_column_key], row_df_compare[
            data_column_key]
        relative_diff = MathUtils.relative_difference(size_compare, size_best)
        return round(relative_diff, 2)

    @staticmethod
    def parse_percentage(row_df, col_index):
        percentage_key = ResultsToDataframe.percentage_column_key(col_index)
        percentage = round(row_df[percentage_key] / 100, 2)
        return percentage

    @staticmethod
    def parse_total_bits(row_df, col_index):
        total_bits_key = ResultsToDataframe.data_column_key(col_index)
        total_bits = row_df[total_bits_key]
        return int(total_bits)

    @staticmethod
    def dataset_filenames(dataset_name, global_mode):
        filenames = ExperimentsUtils.dataset_csv_filenames(dataset_name)
        return ['Global'] if global_mode and len(filenames) > 1 else filenames
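A hypothetical driver for ProcessResults, assuming the result files read by ResultsReader (and, for mode 2, GzipResultsReader) already exist; all paths and filenames below are placeholders:

# mode=1: best algorithm per <column, threshold>, without gzip
ProcessResults(global_mode=False, path='./csv', mode=1).run()

# mode=2: same comparison including the gzip baseline (global results only)
ProcessResults(global_mode=True, path='./csv', mode=2,
               gzip_path='./gzip', gzip_filename='gzip-results.csv').run()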