Exemple #1
0
 def getInfo(self, filename, verbose=True):
     """Populate and return ``self.info`` for a dataset.

     If ``filename`` (the ``public.info`` file) exists, the attribute/value
     pairs are loaded through ``getInfoFromFile`` and the data format is
     detected from the training data file. Otherwise default values are
     filled in by inspecting the ``*_train``/``*_valid``/``*_test`` files.

     Args:
         filename: Path to the info file, or ``None`` to fall back on
             ``self.basename`` and ``self.input_dir`` and use defaults.
         verbose: Forwarded to ``vprint`` to control progress messages.

     Returns:
         The ``self.info`` dictionary.
     """
     if filename is None:
         basename = self.basename
         input_dir = self.input_dir
     else:
         # The dataset name is the part of the file name before the first "_".
         basename = os.path.basename(filename).split("_")[0]
         input_dir = os.path.dirname(filename)
     train_data = os.path.join(input_dir, basename + "_train.data")

     # Guard against filename=None: os.path.exists(None) raises TypeError,
     # which used to make the "no info file given" case unusable.
     if filename is not None and os.path.exists(filename):
         self.getInfoFromFile(filename)
         vprint(verbose, "Info file found : " + os.path.abspath(filename))
         # Finds the data format ('dense', 'sparse', or 'sparse_binary')
         self.getFormatData(train_data)
         return self.info

     if filename is None:
         vprint(verbose, "Info file NOT found : (no filename given)")
     else:
         vprint(verbose, "Info file NOT found : " + os.path.abspath(filename))
     # Hopefully this never happens because this is done in a very inefficient way
     # reading the data multiple times...
     self.info["usage"] = "No Info File"
     self.info["name"] = basename
     # Get the data format and sparsity
     self.getFormatData(train_data)
     # Assume no categorical variable and no missing value (we'll deal with that later)
     self.info["has_categorical"] = 0
     self.info["has_missing"] = 0
     # Get the target number, label number, target type and task
     # (presumably getTypeProblem sets self.info["task"], which is read below)
     self.getTypeProblem(os.path.join(input_dir, basename + "_train.solution"))
     if self.info["task"] == "regression":
         self.info["metric"] = "r2_metric"
     else:
         self.info["metric"] = "auc_metric"
     # Feature type: Numerical, Categorical, or Binary
     # Can also be determined from [filename].type
     self.info["feat_type"] = "Mixed"
     # Get the number of features and patterns
     self.getNbrFeatures(
         train_data,
         os.path.join(input_dir, basename + "_test.data"),
         os.path.join(input_dir, basename + "_valid.data"),
     )
     for subset in ("train", "valid", "test"):
         self.getNbrPatterns(basename, input_dir, subset)
     # Set default time budget
     self.info["time_budget"] = 600
     return self.info
Exemple #2
0
    def loadLabel(self, filename, verbose=True):
        """Get the solution/truth values stored in ``filename``.

        When ``self.use_pickle`` is set, a cached copy named
        ``<basename>.pickle`` in ``self.tmp_dir`` is returned if present, and
        written after a fresh load otherwise.

        Args:
            filename: Path to the solution file.
            verbose: Print progress and timing messages when true.

        Returns:
            The labels. The raw result of ``autokit.data_io.data`` for
            multilabel tasks, that result passed through
            ``autokit.data_converter.convert_to_num`` for multiclass tasks,
            and a flattened (``np.ravel``) array for any other task.
        """
        if verbose:
            print("========= Reading " + filename)
        start = time.time()
        if self.use_pickle:
            cache_path = os.path.join(self.tmp_dir, os.path.basename(filename) + ".pickle")
            if os.path.exists(cache_path):
                # Pickles are binary: text mode ("r") makes pickle.load fail on Python 3.
                # NOTE: unpickling executes arbitrary code; tmp_dir must be trusted.
                with open(cache_path, "rb") as pickle_file:
                    vprint(verbose, "Loading pickle file : " + cache_path)
                    return pickle.load(pickle_file)
        if "task" not in self.info:
            self.getTypeProblem(filename)

        # IG: Here change to accommodate the new multiclass label format
        task = self.info["task"]
        if task == "multilabel.classification":
            label = autokit.data_io.data(filename)
        elif task == "multiclass.classification":
            label = autokit.data_converter.convert_to_num(autokit.data_io.data(filename))
        else:
            # Flatten to a 1-D vector for single-target tasks.
            label = np.ravel(autokit.data_io.data(filename))

        if self.use_pickle:
            cache_path = os.path.join(self.tmp_dir, os.path.basename(filename) + ".pickle")
            with open(cache_path, "wb") as pickle_file:
                vprint(verbose, "Saving pickle file : " + cache_path)
                p = pickle.Pickler(pickle_file)
                p.fast = True
                p.dump(label)
        end = time.time()
        if verbose:
            print("[+] Success in %5.2f sec" % (end - start))
        return label
Exemple #3
0
    def loadData(self, filename, verbose=True, replace_missing=True):
        """Get the data from a text file in one of 3 formats.

        The format is taken from ``self.info["format"]`` ('dense', 'sparse'
        or 'sparse_binary'); it and ``self.info["feat_num"]`` are detected
        from the file first if they are not already known. When
        ``self.use_pickle`` is set, a cached copy named ``<basename>.pickle``
        in ``self.tmp_dir`` is returned if present, and written after a
        fresh load otherwise.

        Args:
            filename: Path to the data file.
            verbose: Print progress and timing messages when true.
            replace_missing: For dense data containing NaN values, pass the
                data through ``autokit.data_converter.replace_missing``.

        Returns:
            The loaded data, as produced by the matching ``autokit.data_io``
            reader (after missing-value replacement, if it was applied).
        """
        if verbose:
            print("========= Reading " + filename)
        start = time.time()
        if self.use_pickle:
            cache_path = os.path.join(self.tmp_dir, os.path.basename(filename) + ".pickle")
            if os.path.exists(cache_path):
                # Pickles are binary: text mode ("r") makes pickle.load fail on Python 3.
                # NOTE: unpickling executes arbitrary code; tmp_dir must be trusted.
                with open(cache_path, "rb") as pickle_file:
                    vprint(verbose, "Loading pickle file : " + cache_path)
                    return pickle.load(pickle_file)
        if "format" not in self.info:
            self.getFormatData(filename)
        if "feat_num" not in self.info:
            self.getNbrFeatures(filename)

        # One reader per supported on-disk format.
        data_func = {
            "dense": autokit.data_io.data,
            "sparse": autokit.data_io.data_sparse,
            "sparse_binary": autokit.data_io.data_binary_sparse,
        }

        data = data_func[self.info["format"]](filename, self.info["feat_num"])

        # IMPORTANT (original author's note): replacing missing values doubles
        # the number of variables -- TODO confirm against replace_missing.
        if self.info["format"] == "dense" and replace_missing and np.any(list(map(np.isnan, data))):
            vprint(verbose, "Replace missing values by 0 (slow, sorry)")
            data = autokit.data_converter.replace_missing(data)
        if self.use_pickle:
            cache_path = os.path.join(self.tmp_dir, os.path.basename(filename) + ".pickle")
            with open(cache_path, "wb") as pickle_file:
                vprint(verbose, "Saving pickle file : " + cache_path)
                p = pickle.Pickler(pickle_file)
                p.fast = True
                p.dump(data)
        end = time.time()
        if verbose:
            print("[+] Success in %5.2f sec" % (end - start))
        return data
Exemple #4
0
    
    #### INVENTORY DATA (and sort dataset names alphabetically)
    datanames = data_io.inventory_data(input_dir)
    
    #### DEBUG MODE: Show dataset list and STOP
    if debug_mode>=3:
        data_io.show_io(input_dir, output_dir)
        print(('\n****** Sample code version ' + str(version) + ' ******\n\n' + '========== DATASETS ==========\n'))        	
        data_io.write_list(datanames)      
        datanames = [] # Do not proceed with learning and testing
        
    # ==================== @RESULT SUBMISSION (KEEP THIS) =====================
    # Always keep this code to enable result submission of pre-calculated results
    # deposited in the res/ subdirectory.
    if len(datanames)>0:
        vprint( verbose,  "************************************************************************")
        vprint( verbose,  "****** Attempting to copy files (from res/) for RESULT submission ******")
        vprint( verbose,  "************************************************************************")
        OK = data_io.copy_results(datanames, res_dir, output_dir, verbose) # DO NOT REMOVE!
        if OK: 
            vprint( verbose,  "[+] Success")
            datanames = [] # Do not proceed with learning and testing
        else:
            vprint( verbose, "======== Some missing results on current datasets!")
            vprint( verbose, "======== Proceeding to train/test:\n")
    # =================== End @RESULT SUBMISSION (KEEP THIS) ==================

    # ================ @CODE SUBMISSION (SUBSTITUTE YOUR CODE) =================
    overall_time_budget = 0
    for basename in datanames: # Loop over datasets