Ejemplo n.º 1
0
    def loadLabel(self, filename, verbose=True):
        """Get the solution/truth values from *filename*.

        The result is cached as a pickle in ``self.tmp_dir`` when
        ``self.use_pickle`` is set; a cached copy is returned on later calls.
        Returns a matrix for multilabel tasks, a numeric class vector for
        multiclass tasks, and a flat column vector otherwise.
        """
        if verbose:  print("========= Reading " + filename)
        start = time.time()
        # Build the cache path once instead of re-joining it at every use.
        pickle_path = os.path.join(
            self.tmp_dir, os.path.basename(filename) + ".pickle")
        if self.use_pickle and os.path.exists(pickle_path):
            # BUG FIX: pickle data is binary; mode "r" (text) makes
            # pickle.load fail on Python 3 and corrupts reads on Windows.
            # Open with "rb" to match the "wb" used when saving below.
            with open(pickle_path, "rb") as pickle_file:
                vprint(verbose, "Loading pickle file : " + pickle_path)
                return pickle.load(pickle_file)
        if 'task' not in self.info.keys():
            self.getTypeProblem(filename)

        # IG: Here change to accommodate the new multiclass label format
        if self.info['task'] == 'multilabel.classification':
            label = data_io.data(filename)
        elif self.info['task'] == 'multiclass.classification':
            label = data_converter.convert_to_num(data_io.data(filename))
        else:
            label = np.ravel(data_io.data(filename))  # get a column vector

        if self.use_pickle:
            with open(pickle_path, "wb") as pickle_file:
                vprint(verbose, "Saving pickle file : " + pickle_path)
                p = pickle.Pickler(pickle_file)
                p.fast = True  # fast mode: no memo, fine for plain arrays
                p.dump(label)
        end = time.time()
        if verbose:  print("[+] Success in %5.2f sec" % (end - start))
        return label
Ejemplo n.º 2
0
    def loadData(self, filename, verbose=True, replace_missing=True):
        """
        Get the data from a text file in one of 3 formats: matrix, sparse, binary_sparse
        Potentially does not load the data if it is too large
        """
        logger.info("Reading %s", filename)
        start = time.time()

        if not os.path.exists(filename):
            return None
        # Lazily determine file format and feature count on first use.
        if 'format' not in self.info.keys():
            self.getFormatData(filename)
        if 'feat_num' not in self.info.keys():
            self.getNbrFeatures(filename)

        # Dispatch table: pick the reader matching the detected format.
        data_func = {
            'dense': data_io.data,
            'sparse': data_io.data_sparse,
            'sparse_binary': data_io.data_binary_sparse
        }

        data = data_func[self.info['format']](filename, self.info['feat_num'])

        # IMPORTANT: when we replace missing values we double the number of variables

        # BUG FIX: on Python 3, ``np.any(map(np.isnan, data))`` is always
        # truthy because ``map`` returns an iterator object, so missing-value
        # replacement ran unconditionally. Test the array contents directly.
        if self.info['format'] == 'dense' and replace_missing and np.any(
                np.isnan(data)):
            vprint(verbose, "Replace missing values by 0 (slow, sorry)")
            data = data_converter.replace_missing(data)

        end = time.time()
        if verbose: print("[+] Success in %5.2f sec" % (end - start))
        return data
Ejemplo n.º 3
0
    def loadData(self, filename, verbose=True, replace_missing=True):
        """
        Get the data from a text file in one of 3 formats: matrix, sparse, binary_sparse
        Potentially does not load the data if it is too large
        """
        logger.info("Reading %s", filename)
        start = time.time()

        if not os.path.exists(filename):
            return None
        # Lazily determine file format and feature count on first use.
        if 'format' not in self.info.keys():
            self.getFormatData(filename)
        if 'feat_num' not in self.info.keys():
            self.getNbrFeatures(filename)

        # Dispatch table: pick the reader matching the detected format.
        data_func = {'dense': data_io.data, 'sparse': data_io.data_sparse, 'sparse_binary': data_io.data_binary_sparse}

        data = data_func[self.info['format']](filename, self.info['feat_num'])

        # IMPORTANT: when we replace missing values we double the number of variables

        # BUG FIX: on Python 3, ``np.any(map(np.isnan, data))`` is always
        # truthy because ``map`` returns an iterator object, so missing-value
        # replacement ran unconditionally. Test the array contents directly.
        if self.info['format'] == 'dense' and replace_missing and np.any(np.isnan(data)):
            vprint(verbose, "Replace missing values by 0 (slow, sorry)")
            data = data_converter.replace_missing(data)

        end = time.time()
        if verbose:  print( "[+] Success in %5.2f sec" % (end - start))
        return data
Ejemplo n.º 4
0
 def getInfo(self, filename, verbose=True):
     ''' Get all information {attribute = value} pairs from the filename (public.info file),
               if it exists, otherwise, output default values.
               Fills and returns the ``self.info`` dict.'''
     if filename is None:  # idiom fix: compare to None with ``is`` (PEP 8)
         basename = self.basename
         input_dir = self.input_dir
     else:
         basename = os.path.basename(filename).rsplit('_')[0]
         input_dir = os.path.dirname(filename)
     # Training data path is needed in both branches; build it once.
     train_data = os.path.join(input_dir, basename + '_train.data')
     # NOTE(review): os.path.exists(None) raises TypeError on Python 3 —
     # presumably callers never pass filename=None here; verify upstream.
     if os.path.exists(filename):
         self.getInfoFromFile(filename)
         vprint(verbose, "Info file found : " + os.path.abspath(filename))
         # Finds the data format ('dense', 'sparse', or 'sparse_binary')
         self.getFormatData(train_data)
     else:
         vprint(verbose,
                "Info file NOT found : " + os.path.abspath(filename))
         # Hopefully this never happens because this is done in a very inefficient way
         # reading the data multiple times...
         self.info['usage'] = 'No Info File'
         self.info['name'] = basename
         # Get the data format and sparsity
         self.getFormatData(train_data)
         # Assume no categorical variable and no missing value (we'll deal with that later)
         self.info['has_categorical'] = 0
         self.info['has_missing'] = 0
         # Get the target number, label number, target type and task
         self.getTypeProblem(
             os.path.join(input_dir, basename + '_train.solution'))
         # Regression scores with R^2; all classification tasks default to AUC.
         if self.info['task'] == 'regression':
             self.info['metric'] = 'r2_metric'
         else:
             self.info['metric'] = 'auc_metric'
         # Feature type: Numerical, Categorical, or Binary
         # Can also be determined from [filename].type
         self.info['feat_type'] = 'Mixed'
         # Get the number of features and patterns
         self.getNbrFeatures(
             train_data,
             os.path.join(input_dir, basename + '_test.data'),
             os.path.join(input_dir, basename + '_valid.data'))
         self.getNbrPatterns(basename, input_dir, 'train')
         self.getNbrPatterns(basename, input_dir, 'valid')
         self.getNbrPatterns(basename, input_dir, 'test')
         # Set default time budget (seconds)
         self.info['time_budget'] = 600
     return self.info
Ejemplo n.º 5
0
 def getInfo(self, filename, verbose=True):
     ''' Get all information {attribute = value} pairs from the filename (public.info file),
               if it exists, otherwise, output default values'''
     if filename == None:
         basename = self.basename
         input_dir = self.input_dir
     else:
         basename = os.path.basename(filename).rsplit('_')[0]
         input_dir = os.path.dirname(filename)
     if os.path.exists(filename):
         self.getInfoFromFile(filename)
         vprint(verbose, "Info file found : " + os.path.abspath(filename))
         # Finds the data format ('dense', 'sparse', or 'sparse_binary')
         self.getFormatData(os.path.join(input_dir, basename + '_train.data'))
     else:
         vprint(verbose, "Info file NOT found : " + os.path.abspath(filename))
         # Hopefully this never happens because this is done in a very inefficient way
         # reading the data multiple times...
         self.info['usage'] = 'No Info File'
         self.info['name'] = basename
         # Get the data format and sparsity
         self.getFormatData(os.path.join(input_dir, basename + '_train.data'))
         # Assume no categorical variable and no missing value (we'll deal with that later)
         self.info['has_categorical'] = 0
         self.info['has_missing'] = 0
         # Get the target number, label number, target type and task
         self.getTypeProblem(os.path.join(input_dir, basename + '_train.solution'))
         if self.info['task'] == 'regression':
             self.info['metric'] = 'r2_metric'
         else:
             self.info['metric'] = 'auc_metric'
         # Feature type: Numerical, Categorical, or Binary
         # Can also be determined from [filename].type
         self.info['feat_type'] = 'Mixed'
         # Get the number of features and patterns
         self.getNbrFeatures(os.path.join(input_dir, basename + '_train.data'),
                             os.path.join(input_dir, basename + '_test.data'),
                             os.path.join(input_dir, basename + '_valid.data'))
         self.getNbrPatterns(basename, input_dir, 'train')
         self.getNbrPatterns(basename, input_dir, 'valid')
         self.getNbrPatterns(basename, input_dir, 'test')
         # Set default time budget
         self.info['time_budget'] = 600
     return self.info
Ejemplo n.º 6
0
    def loadLabel(self, filename, verbose=True):
        """Get the solution/truth values from *filename*.

        The result is cached as a pickle in ``self.tmp_dir`` when
        ``self.use_pickle`` is set; a cached copy is returned on later calls.
        Returns a matrix for multilabel tasks, a numeric class vector for
        multiclass tasks, and a flat column vector otherwise.
        """
        if verbose: print("========= Reading " + filename)
        start = time.time()
        # Build the cache path once instead of re-joining it at every use.
        pickle_path = os.path.join(
            self.tmp_dir, os.path.basename(filename) + ".pickle")
        if self.use_pickle and os.path.exists(pickle_path):
            # BUG FIX: pickle data is binary; mode "r" (text) makes
            # pickle.load fail on Python 3 and corrupts reads on Windows.
            # Open with "rb" to match the "wb" used when saving below.
            with open(pickle_path, "rb") as pickle_file:
                vprint(verbose, "Loading pickle file : " + pickle_path)
                return pickle.load(pickle_file)
        if 'task' not in self.info.keys():
            self.getTypeProblem(filename)

        # IG: Here change to accommodate the new multiclass label format
        if self.info['task'] == 'multilabel.classification':
            label = data_io.data(filename)
        elif self.info['task'] == 'multiclass.classification':
            label = data_converter.convert_to_num(data_io.data(filename))
        else:
            label = np.ravel(data_io.data(filename))  # get a column vector

        if self.use_pickle:
            with open(pickle_path, "wb") as pickle_file:
                vprint(verbose, "Saving pickle file : " + pickle_path)
                p = pickle.Pickler(pickle_file)
                p.fast = True  # fast mode: no memo, fine for plain arrays
                p.dump(label)
        end = time.time()
        if verbose: print("[+] Success in %5.2f sec" % (end - start))
        return label
Ejemplo n.º 7
0
    else:
        input_dir = argv[1]
        output_dir = os.path.abspath(argv[2])
    # Move old results and create a new output directory
    data_io.mvdir(output_dir, output_dir + '_' + the_date)
    data_io.mkdir(output_dir)

    # ### INVENTORY DATA (and sort dataset names alphabetically)
    datanames = data_io.inventory_data(input_dir)

    # ==================== @RESULT SUBMISSION (KEEP THIS) =====================
    # Always keep this code to enable result submission of pre-calculated results
    # deposited in the res/ subdirectory.
    if len(datanames) > 0:
        vprint(
            verbose,
            "************************************************************************"
        )
        vprint(
            verbose,
            "****** Attempting to copy files (from res/) for RESULT submission ******"
        )
        vprint(
            verbose,
            "************************************************************************"
        )
        OK = data_io.copy_results(datanames, res_dir, output_dir,
                                  verbose)  # DO NOT REMOVE!
        if OK:
            vprint(verbose, "[+] Success")
            datanames = []  # Do not proceed with learning and testing
        else:
Ejemplo n.º 8
0
        output_dir = default_output_dir
    else:
        input_dir = argv[1]
        output_dir = os.path.abspath(argv[2])
    # Move old results and create a new output directory 
    data_io.mvdir(output_dir, output_dir+'_'+the_date) 
    data_io.mkdir(output_dir) 
    
    # ### INVENTORY DATA (and sort dataset names alphabetically)
    datanames = data_io.inventory_data(input_dir)

    # ==================== @RESULT SUBMISSION (KEEP THIS) =====================
    # Always keep this code to enable result submission of pre-calculated results
    # deposited in the res/ subdirectory.
    if len(datanames) > 0:
        vprint(verbose,  "************************************************************************")
        vprint(verbose,  "****** Attempting to copy files (from res/) for RESULT submission ******")
        vprint(verbose,  "************************************************************************")
        OK = data_io.copy_results(datanames, res_dir, output_dir, verbose)  # DO NOT REMOVE!
        if OK: 
            vprint(verbose,  "[+] Success")
            datanames = []  # Do not proceed with learning and testing
        else:
            vprint(verbose, "======== Some missing results on current datasets!")
            vprint(verbose, "======== Proceeding to train/test:\n")
    # =================== End @RESULT SUBMISSION (KEEP THIS) ==================

    if zipme and not running_on_codalab:
        vprint(verbose,  "========= Zipping this directory to prepare for submit ==============")
        ignoredirs = [os.path.abspath(x) for x in glob.glob('./output_*')]
Ejemplo n.º 9
0
    datanames = data_io.inventory_data(input_dir)

    #### DEBUG MODE: Show dataset list and STOP
    if debug_mode >= 3:
        data_io.show_io(input_dir, output_dir)
        print('\n****** Sample code version ' + str(version) + ' ******\n\n' +
              '========== DATASETS ==========\n')
        data_io.write_list(datanames)
        datanames = []  # Do not proceed with learning and testing

    # ==================== @RESULT SUBMISSION (KEEP THIS) =====================
    # Always keep this code to enable result submission of pre-calculated results
    # deposited in the res/ subdirectory.
    if len(datanames) > 0:
        vprint(
            verbose,
            "************************************************************************"
        )
        vprint(
            verbose,
            "****** Attempting to copy files (from res/) for RESULT submission ******"
        )
        vprint(
            verbose,
            "************************************************************************"
        )
        OK = data_io.copy_results(datanames, res_dir, output_dir,
                                  verbose)  # DO NOT REMOVE!
        if OK:
            vprint(verbose, "[+] Success")
            datanames = []  # Do not proceed with learning and testing
        else: