def extract_hss(self, cols_count, lhs, rhs):
    """
    Build the LHS/RHS attribute combinations requested on the command line.

    Depending on which of ``lhs`` and ``rhs`` are specified, one of four
    things happens:

    - neither is specified: the result of
      ``ut.get_hs_combination(cols_count)`` is returned;
    - only ``lhs`` is specified: an error message is printed and the
      program exits with status -1;
    - only ``rhs`` is specified: ``rhs[0]`` must be a valid column index
      (otherwise an error message is printed and the program exits with
      status -1); the LHS is then every other column index;
    - both are specified: they are returned as given, without validation.

    A side counts as "not specified" when it is empty or ``None``.

    :param cols_count: the number of columns of the dataset
    :type cols_count: int
    :param lhs: indexes of the dataset's attributes placed on the LHS
    :type lhs: list
    :param rhs: index of the dataset's attribute placed on the RHS
    :type rhs: list
    :return: a list of dicts, each holding the LHS indexes under the key
        'lhs' and the RHS index list under the key 'rhs'
    :rtype: list
    """
    # NOTE: rhs cannot contain more than one element, so its length is
    # deliberately not checked here.

    # Truthiness (rather than `== []`) also treats None and empty tuples as
    # "not specified" instead of letting them fall through to the last branch.
    if not rhs and not lhs:
        # Nothing specified: delegate to the helper for every combination.
        return ut.get_hs_combination(cols_count)

    if not rhs:
        # An LHS without an RHS is not a valid request.
        print("You have to specify at least one RHS attribute")
        sys.exit(-1)

    if not lhs:
        # Only the RHS is given: the LHS becomes all the remaining columns.
        rhs_index = rhs[0]
        if rhs_index not in range(cols_count):
            print("RHS index is out of bound. Specify a valid value")
            sys.exit(-1)
        remaining = [idx for idx in range(cols_count) if idx != rhs_index]
        return [{'rhs': rhs, 'lhs': remaining}]

    # Both sides given explicitly: use them as they are.
    return [{'rhs': rhs, 'lhs': lhs}]
def test_something(self):
    """
    Run RFDDiscovery on every dataset in ``../resources`` and record timings.

    For each dataset, and for each LHS/RHS combination returned by
    ``ut.get_hs_combination``, the discovery algorithm is executed
    ``ITERATION_TIME`` times. After each execution one result row is appended
    with:

    - the dataset's name;
    - the dataset's number of rows;
    - the dataset's number of columns;
    - the dataset file's size;
    - the algorithm's elapsed time, in milliseconds;
    - the elapsed time spent building the difference matrix, in milliseconds;
    - the number of RFDs found;
    - the combination of rhs and lhs used for the iteration.

    When the test ends, the results are saved as a CSV file named
    ``<date of test>-results-<c|p>.csv`` in ``../resources/test``, where the
    suffix is "c" when ``RFDDiscovery.is_compiled()`` returned a truthy value
    and "p" otherwise. Log information is emitted throughout the test.

    A dataset whose separator/header detection fails is logged and skipped.
    """
    logging.info("Starting test")
    result_df = pd.DataFrame(columns=cols)  # data frame in which results are saved
    path = "../resources"  # directory in which the datasets are stored
    datasets = self.__load_all_files__(path)
    logging.info("All files loaded")

    # Default used for the output file name when no discovery run happens at
    # all (no datasets, or every dataset skipped); avoids an unbound name.
    compiled = False

    for ds in datasets:
        logging.info("Starting test for dataset {}".format(ds))
        current_ds = path + "/" + ds  # path of the current dataset
        file_size = os.stat(current_ds).st_size  # get file size
        logging.info("Checking separator and header for dataset {}".format(ds))
        try:
            c_sep, has_header = ut.check_sep_n_header(current_ds)
        except Exception:
            # logging.error is the function; logging.ERROR is the int level
            # constant and is not callable.
            logging.error("Failed to load separator and header. Skipping test for {}".format(ds))
            # Actually skip: without this, c_sep/has_header would be unbound
            # or left over from the previous dataset.
            continue
        logging.info("{} has separator '{}' and has {} header".format(ds, c_sep, "no" if has_header is None else ""))
        ds_shape = self.__get_ds_shape(current_ds, sep=c_sep, first_row_head=has_header)  # get df shape
        lhs_vs_rhs = ut.get_hs_combination(ds_shape['col'])  # combinations for HS
        diff_matrix, elapsed_time_dist = self.__get_diff_mtx(c_sep, current_ds, has_header)

        for combination in lhs_vs_rhs:
            logging.info("Testing on combination: {}".format(str(combination)))
            dist_mtx = diff_matrix.split_sides(combination)

            for i in range(ITERATION_TIME):  # repeat test X times
                logging.info("Test no.{}".format(i))
                start_time = time.time()  # get t0
                rfdd = RFDDiscovery(dist_mtx)
                compiled = rfdd.is_compiled()
                rfd_df = rfdd.get_rfds(rfdd.standard_algorithm, combination)
                elapsed_time = time.time() - start_time  # get deltaT = now - t0
                logging.info("RFDs discovery process finished")
                rfd_count = rfd_df.shape[0]
                logging.info("Discovered {} RFDs".format(rfd_count))
                logging.info("Result added")
                logging.info("Appending result to result's dataframe")
                # append to result df
                self.__append_result(ds, ds_shape['row'], ds_shape['col'], file_size,
                                     round(elapsed_time * 1000, 3),
                                     round(elapsed_time_dist * 1000, 3),
                                     rfd_count, str(combination), result_df)
                # The difference-matrix time is reported only on the first row
                # of each dataset; later rows record 0.
                # NOTE(review): nesting level reconstructed from a collapsed
                # source line - confirm this reset belongs in the inner loop.
                elapsed_time_dist = 0

        # Drop the reference to the matrix before the next dataset is loaded.
        diff_matrix = None

    logging.info("Saving file")
    abs_path = os.path.abspath("../resources/test/{}-results-{}.csv"
                               .format(time.strftime("%Y-%m-%d_%H-%M-%S"), "c" if compiled else "p"))
    result_df.to_csv(abs_path, sep=";", header=cols, decimal=',')
    logging.info("File saved")