def test_distance_mtr_control_example(self):
     """
     Check if the class DiffMatrix compute the distance matrix correctly. The code is tested with a series of
     dataset contained in the variable path. For each step print a logging message.
     """
     path = "../resources"
     datasets = self.__load_all_files__(path)
     for ds in datasets:
         logging.info("Next data frame: {}".format(ds))
         current_ds = path + "/" + ds
         logging.info("Getting header and separator")
         try:
             c_sep, has_header = ut.check_sep_n_header(current_ds)
         except Exception as ex:
             logging.error(
                 "Failed to load separator and header. Skipping test for {}"
                 .format(ds))
             continue  # skip this dataset, since c_sep and has_header are undefined
         logging.info("{} has separator '{}' and has{}header".format(
             ds, c_sep, " no " if has_header is None else " "))
         logging.info("Loading data frame")
         df = self.__load_df(current_ds,
                             sep=c_sep,
                             first_col_header=has_header)
         logging.info("Done")
         logging.info("Loading distance matrix")
         dm = distance_mtr.DiffMatrix(
             current_ds, sep=c_sep,
             first_col_header=has_header)  # create class
         self.assertIsNotNone(dm,
                              "check that dm is not null (none in python)")
         logging.info("Using a sample's splitting")
         dist_m = dm.split_sides({
             'lhs': [i for i in range(1, df.shape[1])],
             'rhs': [0]
         })  # split dm according to control RHS and LHS
         logging.info("Dataset loaded")
         logging.info("Checking shape")
         self.assertIsNotNone(dist_m,
                              "check if distance matrix is not none")
         self.assertGreater(dist_m.shape[0], 0,
                            "check that the number of rows is greater than zero")
         self.assertGreater(dist_m.shape[1], 0,
                            "check that the number of columns is greater than zero")
         max_pairs = int(df.shape[0] * (df.shape[0] - 1) / 2)
         self.assertGreaterEqual(max_pairs, dist_m.shape[0],
                                 "check if there are not too many pairs")
         logging.info("Checking rows values")
         rnd = randint(1, dist_m.shape[0] - 1)
         rand_row = dist_m.loc[rnd]
         self.assertTrue(
             all(isinstance(item, float) for item in rand_row.tolist()),
             "check if each element is a float")
         logging.info("Checking the presence of NaN values")
         self.assertFalse(dist_m.isnull().values.any(),
                          "check if same value is NaN")
         logging.info("All Ok!")
 def extract_sep_n_header(self, c_sep, csv_file, has_header):
     """
     Given a valid path to a CSV file containing the dataset, together with the separator and the header flag taken
     from the command line arguments, this function tries to infer the separator and/or the presence of the header
     in the dataset whenever they were not specified on the command line.
     :param c_sep: the separator extracted from the command line argument
     :type c_sep: str
     :param csv_file: a correct path to a CSV file containing a valid dataset
     :type csv_file: str
     :param has_header: indicates whether or not the CSV has a column header
     :type has_header: int
     :return: the separator used in the CSV and the value 0 if the CSV has a header, None otherwise
     :rtype: tuple
     """
     if c_sep == '' and has_header is None:
         c_sep, has_header = ut.check_sep_n_header(csv_file)
     elif c_sep != '' and has_header is None:
         has_header = ut.check_sep_n_header(csv_file)[1]
     elif c_sep == '' and has_header is not None:
         c_sep = ut.check_sep_n_header(csv_file)[0]
     return c_sep, has_header
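
# A compact, hedged sketch of the same fallback logic as a standalone function: only the values the
# caller left unspecified are inferred via ut.check_sep_n_header (the same helper used above); the
# function name is illustrative, not part of the project.
def resolve_csv_format(csv_file, c_sep='', has_header=None):
    if c_sep == '' or has_header is None:
        inferred_sep, inferred_header = ut.check_sep_n_header(csv_file)
        c_sep = c_sep or inferred_sep                                   # keep an explicit separator
        has_header = inferred_header if has_header is None else has_header
    return c_sep, has_header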
def plot():
    """
    Given a set of CSV files produced by the module time_counting_test in the directory resources/test, it use the content of
    the most recent file to produce four plots.
    Three plots are scatterplots that shows the relation between the elapsed times, the dataset's rows number and
    the dataset's attributes number for each dataset. Each plot place two of this attributes on the two axis,
    and use the third one as the point radius, where each point corresponds to a dataset.
    The fourth graph show the increasing of the running time respect the increasing of the RFDs found.
    """
    dirpath = os.path.abspath("../resources/test")
    files = getfiles(dirpath)
    file_path = os.path.join(dirpath, files[0])
    try:
        sep, _ = check_sep_n_header(file_path)
    except TypeError:
        print("Unable to find separator in file ", files[0])
        return

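    # the results files use a comma as the decimal mark, hence decimal=','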
    test_df = pd.read_csv(file_path, sep=sep, decimal=',')
    grouped_df = test_df.groupby(['ds_name']).mean()
    datasets = list(grouped_df.index)

    print(grouped_df)

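    # per-quantity plot parameters: axis label, scale factor for the point radius and axis limits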
    attr_param = pd.DataFrame(
        {
            "label": [
                'numero di attributi', 'numero di righe',
                'tempo impiegato in ms'
            ],
            "incr_factor": [1000, 10, 1.5],
            "limits": [(1, 7), (-500, 3000), (-5000, 40000)]
        },
        index=['ds_attr_size', 'ds_len', 'time_elapsed'])

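    # one distinct colour per dataset, sampled evenly from the RdYlGn colormap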
    ds_color = pd.DataFrame(cm.RdYlGn(np.linspace(0, 1, len(grouped_df))),
                            index=list(grouped_df.index))

    combinations = pd.DataFrame(
        [[
            'ds_attr_size', 'ds_len', 'time_elapsed',
            'numero di righe rispetto al numero di attributi'
        ],
         [
             'ds_attr_size', 'time_elapsed', 'ds_len',
             'tempo impiegato rispetto al numero di attributi'
         ],
         [
             'ds_len', 'time_elapsed', 'ds_attr_size',
             'tempo impiegato rispetto al numero di righe'
         ]],
        columns=["x", "y", "shape", "title"])

    for index in range(len(attr_param.index)):
        _, ax = plt.subplots()

        ax.set_facecolor('white')
        plt.grid(color='grey')
        ax.spines['bottom'].set_color('grey')
        ax.spines['top'].set_color('grey')
        ax.spines['right'].set_color('grey')
        ax.spines['left'].set_color('grey')

        comb = combinations.iloc[index]

        plt.xlim(attr_param["limits"][comb['x']])
        plt.xlabel(attr_param["label"][comb['x']])
        plt.ylim(attr_param["limits"][comb['y']])
        plt.ylabel(attr_param["label"][comb['y']])
        plt.title(comb["title"])

        grouped_df = grouped_df.sort_values(by=[comb['shape']],
                                            ascending=False)
        for ds_name, row in grouped_df.iterrows():
            xval = grouped_df[comb['x']][ds_name]
            yval = grouped_df[comb['y']][ds_name]
            sval = grouped_df[comb['shape']][ds_name] * attr_param[
                "incr_factor"][comb['shape']]
            ax.scatter(x=xval,
                       y=yval,
                       s=sval,
                       c=ds_color.loc[ds_name],
                       label="{}: time {} ms".format(
                           ds_name[:-4],
                           int(grouped_df["time_elapsed"][ds_name])))

        lgnd = plt.legend(scatterpoints=1, fontsize=10)
        for i in range(len(grouped_df)):
            lgnd.legendHandles[i]._sizes = [75]

    for ds in datasets:
        _, ax = plt.subplots()
        grouped_rfd = test_df[test_df.ds_name == ds][['rfd_count', 'time_elapsed']] \
            .groupby(by=['rfd_count']).mean()

        plot = grouped_rfd.plot(
            y="time_elapsed",
            marker='.',
            markersize=10,
            title="Tempo impiegato rispetto al numero di RFD trovate "
                  "nel dataset {}".format(ds[:-4]),
            ax=ax,
            legend=False)

        legend_dots = []
        for rfd_count, row in grouped_rfd.iterrows():
            legend_text = "{} RFD: tempo {} ms".format(int(rfd_count),
                                                       row['time_elapsed'])
            legend_dots.append(
                Line2D(range(1),
                       range(1),
                       linewidth=0,
                       color="white",
                       marker='o',
                       markerfacecolor="red",
                       label=legend_text))

        plot.set(xlabel="RFD trovate", ylabel='Tempo impiegato in ms')
        ax.legend(handles=legend_dots)
    plt.show()
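
# The docstring above states that the most recent results file is used, so files[0] is assumed to be
# the newest entry returned by getfiles. A hedged sketch of a helper that would guarantee this by
# sorting on modification time (the name and behaviour are illustrative, not the project's getfiles):
def newest_results_file(dirpath):
    csv_files = [f for f in os.listdir(dirpath) if f.endswith(".csv")]
    # pick the file with the latest modification time
    return max(csv_files, key=lambda f: os.path.getmtime(os.path.join(dirpath, f)))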
 def test_something(self):
     """
     This method execute the algorithm defined in the class RFDDiscovery for each dataset in the directory resources and for
     each combination of rhs and lhs of them. For each execution of the algorithm, the method saves some information:
         - the dataset's name;
         - the dataset rows' number;
         - number of column;
         - the dataset file's size;
         - the algorithm's elapsed time;
         - the number of RFDs found;
         - the combination of rhs and lhs used for the iteration;
         - the number of the iteration executed on that combination.
     When the test will end, it will save all the information described above in a CSV file with the name
     <date of test>-result-c.csv. During the test, some log information will be printed.
     """
     test_count = 1
     logging.info("Starting test")
     result_df = pd.DataFrame(columns=cols)  # data frame in which to save the results
     path = "../resources"  # path in which datasets are stored
     datasets = self.__load_all_files__(path)
     logging.info("All files loaded")
     for ds in datasets:
         logging.info("Starting test for dataset {}".format(ds))
         current_ds = path + "/" + ds                                # abs path for current dataset
         file_size = os.stat(current_ds).st_size                     # get file size
         logging.info("Checking separator and header for dataset {}".format(ds))
         try:
             c_sep, has_header = ut.check_sep_n_header(current_ds)
         except Exception as ex:
             logging.ERROR("Failed to load separator and header. Skipping test for {}".format(ds))
             pass
         logging.info("{} has separator '{}' and has {} header".format(ds, c_sep, "no" if has_header is None else ""))
         ds_shape = self.__get_ds_shape(current_ds, sep=c_sep, first_row_head=has_header)  # get df shape
         lhs_vs_rhs = ut.get_hs_combination(ds_shape['col'])     # combination for HS
         diff_matrix, elapsed_time_dist = self.__get_diff_mtx(c_sep, current_ds, has_header)
         for combination in lhs_vs_rhs:
             logging.info("Testing on combination: {}".format(str(combination)))
             dist_mtx = diff_matrix.split_sides(combination)
             for i in range(ITERATION_TIME):                         # repeat test X times
                 logging.info("Test no.{}".format(i))
                 start_time = time.time()                            # get t0
                 rfdd = RFDDiscovery(dist_mtx)
                 compiled = rfdd.is_compiled()
                 rfd_df = rfdd.get_rfds(rfdd.standard_algorithm, combination)
                 elapsed_time = time.time() - start_time             # get deltaT = now - t0
                 logging.info("RFDs discovery process finished")
                 rfd_count = rfd_df.shape[0]
                 logging.info("Discovered {} RFDs".format(rfd_count))
                 logging.info("Result added")
                 logging.info("Appending result to result's dataframe")
                 # append to result df
                 self.__append_result(ds, ds_shape['row'], ds_shape['col'], file_size, round(elapsed_time*1000,3),
                                      round(elapsed_time_dist*1000,3), rfd_count, str(combination), result_df)
                 test_count += 1
                 elapsed_time_dist = 0
         diff_matrix = None  # free unused memory
     logging.info("Saving file")
     abs_path = os.path.abspath("../resources/test/{}-results-{}.csv"
                                .format(time.strftime("%Y-%m-%d_%H-%M-%S"), "c" if compiled else "p"))
     result_df.to_csv(abs_path, sep=";", header=cols, decimal=',')
     logging.info("File saved")