def getNbrFeatures(self, *filenames):
    ''' Get the number of features directly from the data file(s)
    (in case we do not have an info file).

    The result is cached in self.info['feat_num']: when that key is already
    present it is returned as is and no file is read.

    Args:
        *filenames: paths of the data files. The format is detected from the
            first one (through self.getFormatData). For the 'dense' format
            only the first file is inspected; for 'sparse' and
            'sparse_binary' every file is scanned and the largest feature
            index found is kept.

    Returns:
        The value stored in self.info['feat_num'].

    Raises:
        IndexError: if no filename is given while 'feat_num' is not cached.
        KeyError: if the detected format is none of 'dense', 'sparse' or
            'sparse_binary' (nothing is stored in that case).
    '''
    if 'feat_num' in self.info:
        return self.info['feat_num']

    self.getFormatData(filenames[0])
    data_format = self.info['format']

    if data_format == 'dense':
        data = data_converter.file_to_array(filenames[0])
        # An empty file has no first row to measure: report zero features
        # instead of failing on data[0].
        self.info['feat_num'] = len(data[0]) if len(data) > 0 else 0
    elif data_format == 'sparse':
        # Accumulate locally and store once at the end, so that a failure
        # on a later file does not leave a partial count cached.
        feat_num = 0
        for filename in filenames:
            sparse_list = data_converter.sparse_file_to_sparse_list(filename)
            # Rows are sequences of pairs whose first element is the feature
            # index. Every entry is examined, so empty rows are harmless and
            # rows need not be sorted by index.
            for row in sparse_list:
                for feature_index, _ in row:
                    if feature_index > feat_num:
                        feat_num = feature_index
        self.info['feat_num'] = feat_num
    elif data_format == 'sparse_binary':
        feat_num = 0
        for filename in filenames:
            # Each row lists feature indices as text tokens; an empty row
            # simply contributes nothing.
            for row in data_converter.file_to_array(filename):
                for token in row:
                    feat_num = max(feat_num, int(token))
        self.info['feat_num'] = feat_num

    return self.info['feat_num']
def getTypeProblem(self, solution_filename):
    ''' Infer the type of problem from the solution file
    (in case we do not have an info file).

    When 'task' is already in self.info it is returned untouched. Otherwise
    the solution file is loaded and 'target_num', 'label_num', 'target_type'
    and 'task' are written to self.info:

    * one column, fewer distinct values than an eighth of the rows:
      classification (binary when exactly two distinct values, multiclass
      otherwise);
    * one column, otherwise: regression;
    * several columns: multilabel when some row sums to more than 1 once
      cast to int, multiclass otherwise.

    Returns:
        self.info['task']
    '''
    if 'task' in self.info:
        return self.info['task']

    targets = np.array(data_converter.file_to_array(solution_filename))
    n_columns = targets.shape[1]
    self.info['target_num'] = n_columns

    if n_columns != 1:
        # Multilabel or multiclass
        self.info['label_num'] = n_columns
        self.info['target_type'] = 'Binary'
        several_labels = any(np.sum(row) > 1 for row in targets.astype(int))
        if several_labels:
            self.info['task'] = 'multilabel.classification'
        else:
            self.info['task'] = 'multiclass.classification'
        return self.info['task']

    # Single column: flatten it and count the distinct values
    flat_targets = np.ravel(targets)
    n_distinct = len(np.unique(flat_targets))

    if n_distinct >= len(flat_targets) / 8:
        # Regression
        self.info['label_num'] = 0
        self.info['task'] = 'regression'
        self.info['target_type'] = 'Numerical'
    elif n_distinct == 2:
        # Classification with two classes
        self.info['label_num'] = n_distinct
        self.info['task'] = 'binary.classification'
        self.info['target_type'] = 'Binary'
    else:
        # Classification with more (or fewer) than two classes
        self.info['label_num'] = n_distinct
        self.info['task'] = 'multiclass.classification'
        self.info['target_type'] = 'Categorical'

    return self.info['task']
def getFormatData(self, filename):
    ''' Get the data format directly from the data file
    (in case we do not have an info file).

    The answer is cached in self.info['format'] and returned straight away
    when already present. Otherwise:

    * if self.info['is_sparse'] is known: 0 means 'dense'; any other value
      means 'sparse' when the first token of the first line contains ':',
      'sparse_binary' otherwise (only the first line is read);
    * if 'is_sparse' is unknown, the whole file is loaded: 'sparse' when the
      first token of the first non-empty row contains ':', 'sparse_binary'
      when rows do not all have the same length, 'dense' otherwise. In this
      case self.info['is_sparse'] is filled in as well (1 for both sparse
      formats, 0 for dense).

    Args:
        filename: path of the data file to inspect.

    Returns:
        self.info['format']: 'dense', 'sparse' or 'sparse_binary'.
    '''
    if 'format' in self.info:
        return self.info['format']

    if 'is_sparse' in self.info:
        if self.info['is_sparse'] == 0:
            self.info['format'] = 'dense'
        else:
            # NOTE(review): presumably read_first_line returns the tokens of
            # the first line; an empty first line would fail on [0] here,
            # exactly as before - confirm against data_converter.
            first_line = data_converter.read_first_line(filename)
            if ':' in first_line[0]:
                self.info['format'] = 'sparse'
            else:
                self.info['format'] = 'sparse_binary'
        return self.info['format']

    data = data_converter.file_to_array(filename)

    # Look at the first row that actually holds a token, so that a leading
    # empty row does not make the detection fail.
    first_filled_row = next((row for row in data if len(row) > 0), None)
    if first_filled_row is not None and ':' in first_filled_row[0]:
        self.info['is_sparse'] = 1
        self.info['format'] = 'sparse'
        return self.info['format']

    nbr_columns = len(data[0])
    # any() stops at the first row whose length differs.
    if any(len(row) != nbr_columns for row in data):
        self.info['format'] = 'sparse_binary'
        # Keep 'is_sparse' in step with the format, as the other two
        # outcomes of this branch do.
        self.info['is_sparse'] = 1
    else:
        self.info['format'] = 'dense'
        self.info['is_sparse'] = 0
    return self.info['format']
def getTypeProblem(self, solution_filename):
    ''' Infer the type of problem from the solution file
    (in case we do not have an info file).

    When 'task' is already in self.info it is returned untouched. Otherwise
    the solution file is loaded and 'target_num', 'label_num', 'target_type'
    and 'task' are written to self.info:

    * one column, fewer distinct values than an eighth of the rows:
      classification (binary when exactly two distinct values, multiclass
      otherwise);
    * one column, otherwise: regression;
    * several columns: multilabel when some row sums to more than 1 once
      cast to int, multiclass otherwise.

    Returns:
        self.info['task']
    '''
    if 'task' in self.info:
        return self.info['task']

    targets = np.array(data_converter.file_to_array(solution_filename))
    n_columns = targets.shape[1]
    self.info['target_num'] = n_columns

    if n_columns != 1:
        # Multilabel or multiclass
        self.info['label_num'] = n_columns
        self.info['target_type'] = 'Binary'
        several_labels = any(np.sum(row) > 1 for row in targets.astype(int))
        if several_labels:
            self.info['task'] = 'multilabel.classification'
        else:
            self.info['task'] = 'multiclass.classification'
        return self.info['task']

    # Single column: flatten it and count the distinct values
    flat_targets = np.ravel(targets)
    n_distinct = len(np.unique(flat_targets))

    if n_distinct >= len(flat_targets) / 8:
        # Regression
        self.info['label_num'] = 0
        self.info['task'] = 'regression'
        self.info['target_type'] = 'Numerical'
    elif n_distinct == 2:
        # Classification with two classes
        self.info['label_num'] = n_distinct
        self.info['task'] = 'binary.classification'
        self.info['target_type'] = 'Binary'
    else:
        # Classification with more (or fewer) than two classes
        self.info['label_num'] = n_distinct
        self.info['task'] = 'multiclass.classification'
        self.info['target_type'] = 'Categorical'

    return self.info['task']
def getNbrFeatures(self, *filenames):
    ''' Get the number of features directly from the data file(s)
    (in case we do not have an info file).

    The result is cached in self.info['feat_num']: when that key is already
    present it is returned as is and no file is read.

    Args:
        *filenames: paths of the data files. The format is detected from the
            first one (through self.getFormatData). For the 'dense' format
            only the first file is inspected; for 'sparse' and
            'sparse_binary' every file is scanned and the largest feature
            index found is kept.

    Returns:
        The value stored in self.info['feat_num'].

    Raises:
        IndexError: if no filename is given while 'feat_num' is not cached.
        KeyError: if the detected format is none of 'dense', 'sparse' or
            'sparse_binary' (nothing is stored in that case).
    '''
    if 'feat_num' in self.info:
        return self.info['feat_num']

    self.getFormatData(filenames[0])
    data_format = self.info['format']

    if data_format == 'dense':
        data = data_converter.file_to_array(filenames[0])
        # An empty file has no first row to measure: report zero features
        # instead of failing on data[0].
        self.info['feat_num'] = len(data[0]) if len(data) > 0 else 0
    elif data_format == 'sparse':
        # Accumulate locally and store once at the end, so that a failure
        # on a later file does not leave a partial count cached.
        feat_num = 0
        for filename in filenames:
            sparse_list = data_converter.sparse_file_to_sparse_list(filename)
            # Rows are sequences of pairs whose first element is the feature
            # index. Every entry is examined, so empty rows are harmless and
            # rows need not be sorted by index.
            for row in sparse_list:
                for feature_index, _ in row:
                    if feature_index > feat_num:
                        feat_num = feature_index
        self.info['feat_num'] = feat_num
    elif data_format == 'sparse_binary':
        feat_num = 0
        for filename in filenames:
            # Each row lists feature indices as text tokens; an empty row
            # simply contributes nothing.
            for row in data_converter.file_to_array(filename):
                for token in row:
                    feat_num = max(feat_num, int(token))
        self.info['feat_num'] = feat_num

    return self.info['feat_num']
def loadType(self, filename, verbose=True):
    ''' Get the variable types.

    Reads the types from `filename` when that file exists; otherwise
    repeats self.info['feat_type'] once per feature
    (self.info['feat_num'] times).

    Args:
        filename: path of the feature-type file.
        verbose: when true, print a banner before reading and the elapsed
            time afterwards.

    Returns:
        A flattened numpy array holding the types.
    '''
    if verbose:
        print("========= Reading " + filename)
    started_at = time.time()

    if not os.path.isfile(filename):
        # No type file: every feature gets the type recorded in the info
        feature_count = self.info['feat_num']
        raw_types = [self.info['feat_type']] * feature_count
    else:
        raw_types = data_converter.file_to_array(filename, verbose=False)

    flat_types = np.array(raw_types).ravel()

    elapsed = time.time() - started_at
    if verbose:
        print("[+] Success in %5.2f sec" % elapsed)
    return flat_types
def loadType(self, filename, verbose=True):
    ''' Get the variable types.

    Reads the types from `filename` when that file exists; otherwise
    repeats self.info['feat_type'] once per feature
    (self.info['feat_num'] times).

    Args:
        filename: path of the feature-type file.
        verbose: when true, print a banner before reading and the elapsed
            time afterwards.

    Returns:
        A flattened numpy array holding the types.
    '''
    if verbose:
        print("========= Reading " + filename)
    started_at = time.time()

    if not os.path.isfile(filename):
        # No type file: every feature gets the type recorded in the info
        feature_count = self.info['feat_num']
        raw_types = [self.info['feat_type']] * feature_count
    else:
        raw_types = data_converter.file_to_array(filename, verbose=False)

    flat_types = np.array(raw_types).ravel()

    elapsed = time.time() - started_at
    if verbose:
        print("[+] Success in %5.2f sec" % elapsed)
    return flat_types