예제 #1
0
 def getNbrFeatures(self, *filenames):
     '''Infer the number of features from the data file(s) and cache it.

     Used when no info file supplied ``feat_num``. If ``self.info`` already
     holds a ``'feat_num'`` entry, it is returned as is and no file is read.

     Args:
         *filenames: Paths of the data files. The first one is passed to
             ``getFormatData`` to determine the format and, for the
             ``'dense'`` format, is the only file inspected. For the
             ``'sparse'`` and ``'sparse_binary'`` formats every file is
             scanned.

     Returns:
         The value stored in ``self.info['feat_num']``.

     Raises:
         IndexError: If ``feat_num`` is unknown and no filename is given.
         KeyError: If the format is none of ``'dense'``, ``'sparse'`` or
             ``'sparse_binary'``, because nothing gets stored in that case.
     '''
     if 'feat_num' not in self.info:
         self.getFormatData(filenames[0])
         data_format = self.info['format']
         if data_format == 'dense':
             # Dense data: the width of the first row is the feature count.
             data = data_converter.file_to_array(filenames[0])
             self.info['feat_num'] = len(data[0])
         elif data_format == 'sparse':
             # Only the last entry of each row is inspected; this presumes
             # entries are ordered by increasing feature index
             # -- TODO confirm against data_converter.
             feat_num = 0
             for filename in filenames:
                 sparse_rows = data_converter.sparse_file_to_sparse_list(
                     filename)
                 # Rows without any entry carry no index and are skipped.
                 last_indices = [
                     row[-1][0] for row in sparse_rows if len(row) > 0
                 ]
                 if last_indices:
                     feat_num = max(feat_num, max(last_indices))
             # Stored only after every file was scanned, so a failure while
             # reading does not leave a partial value cached.
             self.info['feat_num'] = feat_num
         elif data_format == 'sparse_binary':
             # Same "last entry of each row" presumption as for 'sparse'
             # -- TODO confirm.
             feat_num = 0
             for filename in filenames:
                 rows = data_converter.file_to_array(filename)
                 last_indices = [
                     int(row[-1]) for row in rows if len(row) > 0
                 ]
                 if last_indices:
                     feat_num = max(feat_num, max(last_indices))
             self.info['feat_num'] = feat_num
     return self.info['feat_num']
예제 #2
0
 def getTypeProblem(self, solution_filename):
     '''Infer the task type from the solution file and cache it.

     Used when no info file supplied ``task``. If ``self.info`` already
     holds a ``'task'`` entry, it is returned and the file is not read.
     Otherwise ``target_num``, ``label_num``, ``task`` and ``target_type``
     are written to ``self.info``.

     Args:
         solution_filename: Path of the solution file, read through
             ``data_converter.file_to_array``.

     Returns:
         The value stored in ``self.info['task']``.
     '''
     if 'task' in self.info:
         return self.info['task']

     targets = np.array(data_converter.file_to_array(solution_filename))
     n_columns = targets.shape[1]
     self.info['target_num'] = n_columns

     if n_columns != 1:
         # Several columns: multilabel as soon as one row sums to more
         # than 1, otherwise multiclass.
         self.info['label_num'] = n_columns
         self.info['target_type'] = 'Binary'
         several_per_row = any(
             np.sum(row) > 1 for row in targets.astype(int))
         if several_per_row:
             self.info['task'] = 'multilabel.classification'
         else:
             self.info['task'] = 'multiclass.classification'
         return self.info['task']

     # Single column: flatten it and count the distinct values.
     flat_targets = np.ravel(targets)
     n_distinct = len(np.unique(flat_targets))
     # Heuristic: fewer distinct values than an eighth of the number of
     # samples is treated as classification, anything else as regression.
     looks_categorical = n_distinct < len(flat_targets) / 8
     if looks_categorical and n_distinct == 2:
         label_num = n_distinct
         task = 'binary.classification'
         target_type = 'Binary'
     elif looks_categorical:
         label_num = n_distinct
         task = 'multiclass.classification'
         target_type = 'Categorical'
     else:
         label_num = 0
         task = 'regression'
         target_type = 'Numerical'
     self.info['label_num'] = label_num
     self.info['task'] = task
     self.info['target_type'] = target_type
     return self.info['task']
예제 #3
0
 def getFormatData(self, filename):
     '''Determine the data format of ``filename`` and cache it.

     Used when no info file supplied ``format``. The result is one of
     ``'dense'``, ``'sparse'`` or ``'sparse_binary'`` and is stored in
     ``self.info['format']``:

     * If ``self.info`` already has ``'format'``, it is returned and the
       file is not read.
     * If ``self.info`` has ``'is_sparse'``: a value of 0 means
       ``'dense'``; any other value means ``'sparse'`` when the first
       element of ``data_converter.read_first_line(filename)`` contains
       a ``':'``, else ``'sparse_binary'``.
     * Otherwise the whole file is loaded: a ``':'`` in the first element
       of the first row means ``'sparse'``, rows of unequal length mean
       ``'sparse_binary'``, and anything else is ``'dense'``. In this
       case ``self.info['is_sparse']`` is recorded as well (0 for dense,
       1 for both sparse formats).

     Args:
         filename: Path of the data file to inspect.

     Returns:
         The value stored in ``self.info['format']``.
     '''
     if 'format' in self.info:
         return self.info['format']
     if 'is_sparse' in self.info:
         if self.info['is_sparse'] == 0:
             self.info['format'] = 'dense'
         else:
             first_line = data_converter.read_first_line(filename)
             if ':' in first_line[0]:
                 self.info['format'] = 'sparse'
             else:
                 self.info['format'] = 'sparse_binary'
     else:
         data = data_converter.file_to_array(filename)
         if ':' in data[0][0]:
             self.info['is_sparse'] = 1
             self.info['format'] = 'sparse'
         else:
             nbr_columns = len(data[0])
             # any() stops at the first row whose length differs.
             if any(len(row) != nbr_columns for row in data):
                 self.info['format'] = 'sparse_binary'
                 # Keep 'is_sparse' in step with the format, as the other
                 # two outcomes do; the branch above treats any non-zero
                 # value as one of the sparse formats.
                 self.info['is_sparse'] = 1
             else:
                 self.info['format'] = 'dense'
                 self.info['is_sparse'] = 0
     return self.info['format']
예제 #4
0
 def getTypeProblem(self, solution_filename):
     '''Infer the task type from the solution file and cache it.

     Used when no info file supplied ``task``. If ``self.info`` already
     holds a ``'task'`` entry, it is returned and the file is not read.
     Otherwise ``target_num``, ``label_num``, ``task`` and ``target_type``
     are written to ``self.info``.

     Args:
         solution_filename: Path of the solution file, read through
             ``data_converter.file_to_array``.

     Returns:
         The value stored in ``self.info['task']``.
     '''
     if 'task' in self.info:
         return self.info['task']

     targets = np.array(data_converter.file_to_array(solution_filename))
     n_columns = targets.shape[1]
     self.info['target_num'] = n_columns

     if n_columns != 1:
         # Several columns: multilabel as soon as one row sums to more
         # than 1, otherwise multiclass.
         self.info['label_num'] = n_columns
         self.info['target_type'] = 'Binary'
         several_per_row = any(
             np.sum(row) > 1 for row in targets.astype(int))
         if several_per_row:
             self.info['task'] = 'multilabel.classification'
         else:
             self.info['task'] = 'multiclass.classification'
         return self.info['task']

     # Single column: flatten it and count the distinct values.
     flat_targets = np.ravel(targets)
     n_distinct = len(np.unique(flat_targets))
     # Heuristic: fewer distinct values than an eighth of the number of
     # samples is treated as classification, anything else as regression.
     looks_categorical = n_distinct < len(flat_targets) / 8
     if looks_categorical and n_distinct == 2:
         label_num = n_distinct
         task = 'binary.classification'
         target_type = 'Binary'
     elif looks_categorical:
         label_num = n_distinct
         task = 'multiclass.classification'
         target_type = 'Categorical'
     else:
         label_num = 0
         task = 'regression'
         target_type = 'Numerical'
     self.info['label_num'] = label_num
     self.info['task'] = task
     self.info['target_type'] = target_type
     return self.info['task']
예제 #5
0
 def getFormatData(self, filename):
     '''Determine the data format of ``filename`` and cache it.

     Used when no info file supplied ``format``. The result is one of
     ``'dense'``, ``'sparse'`` or ``'sparse_binary'`` and is stored in
     ``self.info['format']``:

     * If ``self.info`` already has ``'format'``, it is returned and the
       file is not read.
     * If ``self.info`` has ``'is_sparse'``: a value of 0 means
       ``'dense'``; any other value means ``'sparse'`` when the first
       element of ``data_converter.read_first_line(filename)`` contains
       a ``':'``, else ``'sparse_binary'``.
     * Otherwise the whole file is loaded: a ``':'`` in the first element
       of the first row means ``'sparse'``, rows of unequal length mean
       ``'sparse_binary'``, and anything else is ``'dense'``. In this
       case ``self.info['is_sparse']`` is recorded as well (0 for dense,
       1 for both sparse formats).

     Args:
         filename: Path of the data file to inspect.

     Returns:
         The value stored in ``self.info['format']``.
     '''
     if 'format' in self.info:
         return self.info['format']
     if 'is_sparse' in self.info:
         if self.info['is_sparse'] == 0:
             self.info['format'] = 'dense'
         else:
             first_line = data_converter.read_first_line(filename)
             if ':' in first_line[0]:
                 self.info['format'] = 'sparse'
             else:
                 self.info['format'] = 'sparse_binary'
     else:
         data = data_converter.file_to_array(filename)
         if ':' in data[0][0]:
             self.info['is_sparse'] = 1
             self.info['format'] = 'sparse'
         else:
             nbr_columns = len(data[0])
             # any() stops at the first row whose length differs.
             if any(len(row) != nbr_columns for row in data):
                 self.info['format'] = 'sparse_binary'
                 # Keep 'is_sparse' in step with the format, as the other
                 # two outcomes do; the branch above treats any non-zero
                 # value as one of the sparse formats.
                 self.info['is_sparse'] = 1
             else:
                 self.info['format'] = 'dense'
                 self.info['is_sparse'] = 0
     return self.info['format']
예제 #6
0
 def getNbrFeatures(self, *filenames):
     '''Infer the number of features from the data file(s) and cache it.

     Used when no info file supplied ``feat_num``. If ``self.info`` already
     holds a ``'feat_num'`` entry, it is returned as is and no file is read.

     Args:
         *filenames: Paths of the data files. The first one is passed to
             ``getFormatData`` to determine the format and, for the
             ``'dense'`` format, is the only file inspected. For the
             ``'sparse'`` and ``'sparse_binary'`` formats every file is
             scanned.

     Returns:
         The value stored in ``self.info['feat_num']``.

     Raises:
         IndexError: If ``feat_num`` is unknown and no filename is given.
         KeyError: If the format is none of ``'dense'``, ``'sparse'`` or
             ``'sparse_binary'``, because nothing gets stored in that case.
     '''
     if 'feat_num' not in self.info:
         self.getFormatData(filenames[0])
         data_format = self.info['format']
         if data_format == 'dense':
             # Dense data: the width of the first row is the feature count.
             data = data_converter.file_to_array(filenames[0])
             self.info['feat_num'] = len(data[0])
         elif data_format == 'sparse':
             # Only the last entry of each row is inspected; this presumes
             # entries are ordered by increasing feature index
             # -- TODO confirm against data_converter.
             feat_num = 0
             for filename in filenames:
                 sparse_rows = data_converter.sparse_file_to_sparse_list(
                     filename)
                 # Rows without any entry carry no index and are skipped.
                 last_indices = [
                     row[-1][0] for row in sparse_rows if len(row) > 0
                 ]
                 if last_indices:
                     feat_num = max(feat_num, max(last_indices))
             # Stored only after every file was scanned, so a failure while
             # reading does not leave a partial value cached.
             self.info['feat_num'] = feat_num
         elif data_format == 'sparse_binary':
             # Same "last entry of each row" presumption as for 'sparse'
             # -- TODO confirm.
             feat_num = 0
             for filename in filenames:
                 rows = data_converter.file_to_array(filename)
                 last_indices = [
                     int(row[-1]) for row in rows if len(row) > 0
                 ]
                 if last_indices:
                     feat_num = max(feat_num, max(last_indices))
             self.info['feat_num'] = feat_num
     return self.info['feat_num']
예제 #7
0
 def loadType(self, filename, verbose=True):
     '''Load the variable types as a flat array.

     When ``filename`` is an existing file, its content is read with
     ``data_converter.file_to_array``. Otherwise ``self.info['feat_type']``
     is repeated ``self.info['feat_num']`` times.

     Args:
         filename: Path of the types file.
         verbose: When true, print a banner before reading and the elapsed
             time afterwards.

     Returns:
         The types as a flattened numpy array.
     '''
     if verbose:
         print("========= Reading " + filename)
     started_at = time.time()
     if not os.path.isfile(filename):
         # No types file: every feature gets the type from the info dict.
         repeat_count = self.info['feat_num']
         raw_types = [self.info['feat_type']] * repeat_count
     else:
         raw_types = data_converter.file_to_array(filename, verbose=False)
     flat_types = np.array(raw_types).ravel()
     finished_at = time.time()
     if verbose:
         print("[+] Success in %5.2f sec" % (finished_at - started_at))
     return flat_types
예제 #8
0
 def loadType(self, filename, verbose=True):
     '''Load the variable types as a flat array.

     When ``filename`` is an existing file, its content is read with
     ``data_converter.file_to_array``. Otherwise ``self.info['feat_type']``
     is repeated ``self.info['feat_num']`` times.

     Args:
         filename: Path of the types file.
         verbose: When true, print a banner before reading and the elapsed
             time afterwards.

     Returns:
         The types as a flattened numpy array.
     '''
     if verbose:
         print("========= Reading " + filename)
     started_at = time.time()
     if not os.path.isfile(filename):
         # No types file: every feature gets the type from the info dict.
         repeat_count = self.info['feat_num']
         raw_types = [self.info['feat_type']] * repeat_count
     else:
         raw_types = data_converter.file_to_array(filename, verbose=False)
     flat_types = np.array(raw_types).ravel()
     finished_at = time.time()
     if verbose:
         print("[+] Success in %5.2f sec" % (finished_at - started_at))
     return flat_types