def getInfo(self, filename, verbose=True):
    """Get all information {attribute = value} pairs from the filename
    (public.info file), if it exists; otherwise output default values.

    :param filename: path to the public.info file, or None to fall back to
        self.basename / self.input_dir. NOTE(review): when None, the
        os.path.exists(filename) call below raises TypeError on Python 3 —
        callers appear to always pass a real path; confirm before relying
        on the None branch.
    :param verbose: print progress messages when True.
    :return: the populated self.info dict.
    """
    if filename is None:  # fix: identity comparison with None (PEP 8), not ==
        basename = self.basename
        input_dir = self.input_dir
    else:
        basename = os.path.basename(filename).split("_")[0]
        input_dir = os.path.dirname(filename)
    if os.path.exists(filename):
        self.getInfoFromFile(filename)
        vprint(verbose, "Info file found : " + os.path.abspath(filename))
        # Finds the data format ('dense', 'sparse', or 'sparse_binary')
        self.getFormatData(os.path.join(input_dir, basename + "_train.data"))
    else:
        vprint(verbose, "Info file NOT found : " + os.path.abspath(filename))
        # Hopefully this never happens because this is done in a very inefficient way
        # reading the data multiple times...
        self.info["usage"] = "No Info File"
        self.info["name"] = basename
        # Get the data format and sparsity
        self.getFormatData(os.path.join(input_dir, basename + "_train.data"))
        # Assume no categorical variable and no missing value (we'll deal with that later)
        self.info["has_categorical"] = 0
        self.info["has_missing"] = 0
        # Get the target number, label number, target type and task
        self.getTypeProblem(os.path.join(input_dir, basename + "_train.solution"))
        # Default metric is chosen from the inferred task type
        if self.info["task"] == "regression":
            self.info["metric"] = "r2_metric"
        else:
            self.info["metric"] = "auc_metric"
        # Feature type: Numerical, Categorical, or Binary
        # Can also be determined from [filename].type
        self.info["feat_type"] = "Mixed"
        # Get the number of features and patterns
        self.getNbrFeatures(
            os.path.join(input_dir, basename + "_train.data"),
            os.path.join(input_dir, basename + "_test.data"),
            os.path.join(input_dir, basename + "_valid.data"),
        )
        self.getNbrPatterns(basename, input_dir, "train")
        self.getNbrPatterns(basename, input_dir, "valid")
        self.getNbrPatterns(basename, input_dir, "test")
        # Set default time budget
        self.info["time_budget"] = 600
    return self.info
def loadLabel(self, filename, verbose=True):
    """Get the solution/truth values.

    Reads the labels from *filename* (format chosen by self.info["task"]),
    using a pickle cache in self.tmp_dir when self.use_pickle is set.

    :param filename: path to the .solution file.
    :param verbose: print progress messages when True.
    :return: the label data (2-D for multilabel/multiclass tasks,
        raveled 1-D array otherwise).
    """
    if verbose:
        print(("========= Reading " + filename))
    start = time.time()
    # Hoisted: the cache path was recomputed four times in the original.
    pickle_path = os.path.join(self.tmp_dir, os.path.basename(filename) + ".pickle")
    if self.use_pickle and os.path.exists(pickle_path):
        # BUG FIX: the cache is written below in binary mode ("wb"), so it
        # must be read back in binary too — mode "r" makes pickle.load fail
        # on Python 3.
        with open(pickle_path, "rb") as pickle_file:
            vprint(verbose, "Loading pickle file : " + pickle_path)
            return pickle.load(pickle_file)
    if "task" not in list(self.info.keys()):
        self.getTypeProblem(filename)
    # IG: Here change to accommodate the new multiclass label format
    if self.info["task"] == "multilabel.classification":
        label = autokit.data_io.data(filename)
    elif self.info["task"] == "multiclass.classification":
        label = autokit.data_converter.convert_to_num(autokit.data_io.data(filename))
    else:
        label = np.ravel(autokit.data_io.data(filename))  # get a column vector
        # label = np.array([np.ravel(data_io.data(filename))]).transpose() # get a column vector
    if self.use_pickle:
        with open(pickle_path, "wb") as pickle_file:
            vprint(verbose, "Saving pickle file : " + pickle_path)
            p = pickle.Pickler(pickle_file)
            p.fast = True  # trade memo-table memory for speed
            p.dump(label)
    end = time.time()
    if verbose:
        print(("[+] Success in %5.2f sec" % (end - start)))
    return label
def loadData(self, filename, verbose=True, replace_missing=True):
    """Get the data from a text file in one of 3 formats: matrix, sparse,
    binary_sparse.

    Uses a pickle cache in self.tmp_dir when self.use_pickle is set, and
    lazily infers the data format / feature count when not yet known.

    :param filename: path to the .data file.
    :param verbose: print progress messages when True.
    :param replace_missing: for the dense format, replace NaNs by 0.
    :return: the loaded data matrix (dense or sparse, per self.info["format"]).
    """
    if verbose:
        print(("========= Reading " + filename))
    start = time.time()
    # Hoisted: the cache path was recomputed four times in the original.
    pickle_path = os.path.join(self.tmp_dir, os.path.basename(filename) + ".pickle")
    if self.use_pickle and os.path.exists(pickle_path):
        # BUG FIX: the cache is written below in binary mode ("wb"), so it
        # must be read back in binary too — mode "r" makes pickle.load fail
        # on Python 3.
        with open(pickle_path, "rb") as pickle_file:
            vprint(verbose, "Loading pickle file : " + pickle_path)
            return pickle.load(pickle_file)
    if "format" not in list(self.info.keys()):
        self.getFormatData(filename)
    if "feat_num" not in list(self.info.keys()):
        self.getNbrFeatures(filename)
    # Dispatch table: one reader per data format.
    data_func = {
        "dense": autokit.data_io.data,
        "sparse": autokit.data_io.data_sparse,
        "sparse_binary": autokit.data_io.data_binary_sparse,
    }
    data = data_func[self.info["format"]](filename, self.info["feat_num"])
    # IMPORTANT: when we replace missing values we double the number of variables
    if self.info["format"] == "dense" and replace_missing and np.any(list(map(np.isnan, data))):
        vprint(verbose, "Replace missing values by 0 (slow, sorry)")
        data = autokit.data_converter.replace_missing(data)
    if self.use_pickle:
        with open(pickle_path, "wb") as pickle_file:
            vprint(verbose, "Saving pickle file : " + pickle_path)
            p = pickle.Pickler(pickle_file)
            p.fast = True  # trade memo-table memory for speed
            p.dump(data)
    end = time.time()
    if verbose:
        print(("[+] Success in %5.2f sec" % (end - start)))
    return data
#### INVENTORY DATA (and sort dataset names alphabetically) datanames = data_io.inventory_data(input_dir) #### DEBUG MODE: Show dataset list and STOP if debug_mode>=3: data_io.show_io(input_dir, output_dir) print(('\n****** Sample code version ' + str(version) + ' ******\n\n' + '========== DATASETS ==========\n')) data_io.write_list(datanames) datanames = [] # Do not proceed with learning and testing # ==================== @RESULT SUBMISSION (KEEP THIS) ===================== # Always keep this code to enable result submission of pre-calculated results # deposited in the res/ subdirectory. if len(datanames)>0: vprint( verbose, "************************************************************************") vprint( verbose, "****** Attempting to copy files (from res/) for RESULT submission ******") vprint( verbose, "************************************************************************") OK = data_io.copy_results(datanames, res_dir, output_dir, verbose) # DO NOT REMOVE! if OK: vprint( verbose, "[+] Success") datanames = [] # Do not proceed with learning and testing else: vprint( verbose, "======== Some missing results on current datasets!") vprint( verbose, "======== Proceeding to train/test:\n") # =================== End @RESULT SUBMISSION (KEEP THIS) ================== # ================ @CODE SUBMISSION (SUBTITUTE YOUR CODE) ================= overall_time_budget = 0 for basename in datanames: # Loop over datasets