def loadLabel(self, filename, verbose=True):
    ''' Get the solution/truth values.

    filename: path of the solution file to read.
    verbose: print progress messages when True.

    Returns the labels read through data_io.data(): unchanged for the
    'multilabel.classification' and 'multiclass.classification' tasks,
    flattened with np.ravel for every other task.
    When self.use_pickle is set, a cache named <basename>.pickle in
    self.tmp_dir is returned if present and (re)written after a fresh read.
    '''
    if verbose:
        print("========= Reading " + filename)
    start = time.time()
    pickle_path = None
    if self.use_pickle:
        # Only touch self.tmp_dir when caching is enabled
        pickle_path = os.path.join(self.tmp_dir, os.path.basename(filename) + ".pickle")
        if os.path.exists(pickle_path):
            # The cache is written in binary mode below, so read it in binary mode.
            # NOTE: pickle.load must only be used on cache files this program wrote.
            with open(pickle_path, "rb") as pickle_file:
                vprint(verbose, "Loading pickle file : " + pickle_path)
                return pickle.load(pickle_file)
    if 'task' not in self.info:
        self.getTypeProblem(filename)
    # IG: Here change to accommodate the new multiclass label format
    if self.info['task'] in ('multilabel.classification', 'multiclass.classification'):
        label = data_io.data(filename)
    else:
        label = np.ravel(data_io.data(filename))  # flatten to a 1-D vector
    if self.use_pickle:
        with open(pickle_path, "wb") as pickle_file:
            vprint(verbose, "Saving pickle file : " + pickle_path)
            p = pickle.Pickler(pickle_file)
            p.fast = True
            p.dump(label)
    end = time.time()
    if verbose:
        print("[+] Success in %5.2f sec" % (end - start))
    return label
def loadData(self, filename, verbose=True, replace_missing=True):
    ''' Get the data from a text file in one of 3 formats: matrix, sparse, sparse_binary.

    filename: path of the data file to read.
    verbose: print progress messages when True.
    replace_missing: kept for interface compatibility; missing-value
        replacement is disabled in this loader, so the flag has no effect.

    Returns whatever the data_io reader selected by self.info['format']
    returns. When self.use_pickle is set, a cache named <basename>.pickle in
    self.tmp_dir is returned if present and (re)written after a fresh read.
    '''
    if verbose:
        print("========= Reading " + filename)
    start = time.time()
    pickle_path = None
    if self.use_pickle:
        # Only touch self.tmp_dir when caching is enabled
        pickle_path = os.path.join(self.tmp_dir, os.path.basename(filename) + ".pickle")
        if os.path.exists(pickle_path):
            # The cache is written in binary mode below, so read it in binary mode.
            # NOTE: pickle.load must only be used on cache files this program wrote.
            with open(pickle_path, "rb") as pickle_file:
                vprint(verbose, "Loading pickle file : " + pickle_path)
                return pickle.load(pickle_file)
    if 'format' not in self.info:
        self.getFormatData(filename)
    if 'feat_num' not in self.info:
        self.getNbrFeatures(filename)
    data_func = {'dense': data_io.data,
                 'sparse': data_io.data_sparse,
                 'sparse_binary': data_io.data_binary_sparse}
    data = data_func[self.info['format']](filename, self.info['feat_num'])
    # Missing values are deliberately left untouched here: the former
    # replacement branch had an empty body, so the NaN scan was dropped.
    if self.use_pickle:
        with open(pickle_path, "wb") as pickle_file:
            vprint(verbose, "Saving pickle file : " + pickle_path)
            p = pickle.Pickler(pickle_file)
            p.fast = True
            p.dump(data)
    end = time.time()
    if verbose:
        print("[+] Success in %5.2f sec" % (end - start))
    return data
def __init__(self, datatype="input", data_file="", verbose=False, cache_file=""):
    '''Constructor: start with empty data and optionally load data_file.'''
    self.version = "1"
    self.datatype = datatype
    self.verbose = verbose
    # A non-empty cache_file enables saving/reloading the data in binary format
    self.cache_file = cache_file
    self.use_pickle = bool(cache_file)
    # Data matrix and time index start out empty
    self.X, self.t = np.array([]), np.array([])
    self.col_names = []
    self.ycol0 = self.t0 = self.now = self.stride = self.horizon = 0
    vprint(self.verbose, "DataManager :: Version = " + self.version)
    if not data_file:
        return
    self.loadData(data_file)
def load(self, path=""):
    ''' Reload model.

    path: directory containing '_model.pickle'; defaults to self.model_dir
        when empty.

    Returns the object unpickled from that file (the current instance is
    not modified).
    '''
    if not path:
        path = self.model_dir
    vprint(self.verbose, "Model :: ========= Loading model from " + path)
    # Read in binary mode: opening with "w" would truncate the saved model
    # before it could be read.
    # NOTE: pickle.load must only be used on model files from a trusted source.
    with open(os.path.join(path, '_model.pickle'), "rb") as model_file:
        return pickle.load(model_file)
def loadLabel(self, filename, verbose=True):
    ''' Get the solution/truth values.

    filename: path of the solution file to read.
    verbose: print progress messages when True.

    Returns the labels read through data_io.data(): unchanged for
    'multilabel.classification', passed through
    data_converter.convert_to_num for 'multiclass.classification', and
    flattened with np.ravel for every other task.
    When self.use_pickle is set, a cache named <basename>.pickle in
    self.tmp_dir is returned if present and (re)written after a fresh read.
    '''
    if verbose:
        print("========= Reading " + filename)
    start = time.time()
    pickle_path = None
    if self.use_pickle:
        # Only touch self.tmp_dir when caching is enabled
        pickle_path = os.path.join(self.tmp_dir, os.path.basename(filename) + ".pickle")
        if os.path.exists(pickle_path):
            # The cache is written in binary mode below, so read it in binary mode.
            # NOTE: pickle.load must only be used on cache files this program wrote.
            with open(pickle_path, "rb") as pickle_file:
                vprint(verbose, "Loading pickle file : " + pickle_path)
                return pickle.load(pickle_file)
    if 'task' not in self.info:
        self.getTypeProblem(filename)
    # IG: Here change to accommodate the new multiclass label format
    task = self.info['task']
    if task == 'multilabel.classification':
        label = data_io.data(filename)
    elif task == 'multiclass.classification':
        label = data_converter.convert_to_num(data_io.data(filename))
    else:
        label = np.ravel(data_io.data(filename))  # flatten to a 1-D vector
    if self.use_pickle:
        with open(pickle_path, "wb") as pickle_file:
            vprint(verbose, "Saving pickle file : " + pickle_path)
            p = pickle.Pickler(pickle_file)
            p.fast = True
            p.dump(label)
    end = time.time()
    if verbose:
        print("[+] Success in %5.2f sec" % (end - start))
    return label
def reloadData(self, filename="", data_dir=""):
    ''' Reload X and t from an h5 or pickle file.

    Returns True when the data was loaded, False for an unsupported
    extension or when any exception occurred while reading.
    '''
    vprint(self.verbose, "Data Manager :: ========= Reloading data from " + filename)
    start = time.time()
    success = True
    try:
        if filename.endswith('h5'):
            with h5py.File(os.path.join(data_dir, filename), 'r') as h5_file:
                self.X = h5_file['X'][:]
                self.t = h5_file['t'][:]
        elif filename.endswith('pickle'):
            with open(os.path.join(data_dir, filename), 'rb') as pickle_file:
                stored = pickle.load(pickle_file)
                self.X = stored['X']
                self.t = stored['t']
            vprint(self.verbose, filename)
        else:
            success = False
            vprint(self.verbose, "[-] No such file extension." + filename)
    except Exception as err:
        # Best effort: report the problem and signal failure to the caller
        vprint(self.verbose, err)
        success = False
    elapsed = time.time() - start
    if success:
        vprint(self.verbose, "[+] Success in %5.2f sec" % elapsed)
    return success
def __init__(self, hyper_param=None, path="", verbose=True):
    ''' Define whatever data member you need (model parameters and hyper-parameters).

    hyper_param: hyper-parameters of the model (documented as a tuple);
        when omitted, a fresh empty list is stored so instances never
        share one default object.
    path: directory where models are saved/loaded.
    verbose: print progress messages when True.
    '''
    self.version = "Persitent"
    self.hyper_param = [] if hyper_param is None else hyper_param
    self.model_dir = path
    self.verbose = verbose
    vprint(self.verbose, "Version = " + self.version)
def predict(self, Xtest, num_predicted_frames=8):
    ''' Predict the next num_predicted_frames frames.

    Persistence baseline: the last frame of Xtest is repeated
    num_predicted_frames times and returned as an array.
    '''
    vprint(self.verbose, "Model :: ========= Making predictions =========")
    start = time.time()
    last_frame = Xtest[-1]
    Ytest = np.array([last_frame for _ in range(num_predicted_frames)])
    elapsed = time.time() - start
    vprint(self.verbose, "[+] Success, predictions made in %5.2f sec" % elapsed)
    return Ytest
def train(self, Xtrain, Ttrain=[]):
    ''' Adjust parameters with training data (placeholder: nothing is learned).

    Xtrain: matrix of frames (frames in lines, features/variables in columns),
        typically thousands of lines.
    Ttrain: optional time index; it may not be continuous (e.g. jumps or resets).
    '''
    vprint(self.verbose, "Model :: ========= Training model =========")
    start = time.time()
    # Training logic goes here
    elapsed = time.time() - start
    vprint(self.verbose, "[+] Success, model trained in %5.2f sec" % elapsed)
def getInfo(self, filename, verbose=True):
    ''' Get all information {attribute = value} pairs from the filename
    (public.info file), if it exists, otherwise, output default values.

    filename: path of the info file, or None to fall back on
        self.basename / self.input_dir and the default values.
    verbose: print progress messages when True.

    Returns self.info.
    '''
    if filename is None:
        basename = self.basename
        input_dir = self.input_dir
    else:
        basename = os.path.basename(filename).split('_')[0]
        input_dir = os.path.dirname(filename)
    train_data = os.path.join(input_dir, basename + '_train1.data')
    # os.path.exists(None) raises TypeError, so test for None first
    if filename is not None and os.path.exists(filename):
        self.getInfoFromFile(filename)
        vprint(verbose, "Info file found : " + os.path.abspath(filename))
        # Finds the data format ('dense', 'sparse', or 'sparse_binary')
        self.getFormatData(train_data)
    else:
        missing = "None" if filename is None else os.path.abspath(filename)
        vprint(verbose, "Info file NOT found : " + missing)
        # Hopefully this never happens because this is done in a very inefficient way
        # reading the data multiple times...
        self.info['usage'] = 'No Info File'
        self.info['name'] = basename
        # Get the data format and sparsity
        self.getFormatData(train_data)
        # Assume no categorical variable and no missing value (we'll deal with that later)
        self.info['has_categorical'] = 0
        self.info['has_missing'] = 0
        # Get the target number, label number, target type and task
        self.getTypeProblem(os.path.join(input_dir, basename + '_train1.solution'))
        if self.info['task'] == 'regression':
            self.info['metric'] = 'r2_metric'
        else:
            self.info['metric'] = 'auc_metric'
        # Feature type: Numerical, Categorical, or Binary
        # Can also be determined from [filename].type
        self.info['feat_type'] = 'Mixed'
        # Get the number of features and patterns
        self.getNbrFeatures(os.path.join(input_dir, basename + '_train.data'),
                            os.path.join(input_dir, basename + '_test.data'),
                            os.path.join(input_dir, basename + '_valid.data'))
        for subset in ('train', 'valid', 'test'):
            self.getNbrPatterns(basename, input_dir, subset)
        # Set default time budget
        self.info['time_budget'] = 600
    return self.info
def adapt(self, Xadapt, Tadapt=[]):
    ''' Adjust parameters and hyper-parameters with short-term adaptation
    data (placeholder: nothing is adapted).

    Xadapt: matrix of frames (frames in lines, features/variables in columns).
    Tadapt: optional time index; typically it has no cuts/jumps and the
        number of frames is of the order of 100.
    '''
    vprint(self.verbose, "Model :: ========= Adapting model =========")
    start = time.time()
    # Adaptation logic goes here
    elapsed = time.time() - start
    vprint(self.verbose, "[+] Success, model adapted in %5.2f sec" % elapsed)
def predict(self, Xtest, num_predicted_frames=8, ycol0=0):
    ''' Predict the next num_predicted_frames frames.

    Only variables from column ycol0 onward are predicted (the first
    0 to ycol0-1 variables are left out). Persistence baseline: the last
    frame of Xtest is repeated num_predicted_frames times.
    '''
    vprint(self.verbose, "Model :: ========= Making predictions =========")
    start = time.time()
    last_frame = Xtest[-1, ycol0:]
    Ytest = np.array([last_frame for _ in range(num_predicted_frames)])
    elapsed = time.time() - start
    vprint(self.verbose, "[+] Success, predictions made in %5.2f sec" % elapsed)
    return Ytest
def reloadData(self, filename, format="pickle"):
    ''' Reload data in pickle or csv format.

    Warning: csv format will not reload metadata, suitable only for
    predictions (it is accepted only when self.datatype is 'output').
    Returns True on success, False on a wrong format or any read error.
    '''
    if not filename.endswith(format):
        filename = filename + '.' + format
    vprint(self.verbose, "DataManager :: ========= Attempting to reload data from " + filename)
    start = time.time()
    success = True
    try:
        if format == 'pickle':
            with open(filename, 'rb') as pickle_file:
                stored = pickle.load(pickle_file)
                # Restore every attribute the instance currently has
                state = self.__dict__
                for attr_name in state.keys():
                    state[attr_name] = stored[attr_name]
        elif format == 'csv' and self.datatype == 'output':
            table = np.genfromtxt(filename, delimiter=',', skip_header=1)
            self.t = table[:, 0]
            self.X = table[:, 1:]
        else:
            vprint(self.verbose, "[-] Wrong file format " + format + " for " + self.datatype)
            success = False
    except Exception as err:
        # Best effort: report the problem and signal failure to the caller
        vprint(self.verbose, err)
        success = False
    if success:
        elapsed = time.time() - start
        vprint(self.verbose, "[+] Success in %5.2f sec" % elapsed)
    # NOTE(review): the time reset is assumed to run whether or not the reload succeeded - confirm
    self.resetTime()
    return success
def appendData(self, X, t):
    ''' Append a data sample (useful for predictions).

    X: 2-D array of frames, one frame per row.
    t: time index, expected to have one entry per row of X; a length
        mismatch is reported in verbose mode but does not stop the append.

    For 'output' data only the columns from self.ycol0 onward are kept;
    otherwise all columns of X are kept.
    '''
    vprint(self.verbose, "DataManager :: ========= Appending {} frame(s)".format(X.shape[0]))
    if X.shape[0] != t.shape[0]:
        vprint(self.verbose, "[-] Inconsistent dimensions X.len={} t.len={}".format(X.shape[0], t.shape[0]))
    self.t = np.append(self.t, t)
    if self.datatype == 'output':
        rng = range(self.ycol0, X.shape[1])
    else:
        rng = range(X.shape[1])
    new_rows = X[:, rng]
    if self.X.shape[0] == 0:
        self.X = new_rows
    else:
        # self.X already holds only the selected columns; selecting them a
        # second time raised IndexError whenever ycol0 > 0.
        self.X = np.append(self.X, new_rows, axis=0)
    return
def __init__(self, datatype="unknown", data_file="", verbose=False, max_samples=float('inf'), cache_file=""):
    '''Constructor: start with empty data and optionally load data_file.'''
    self.version = "1"
    self.datatype = datatype
    self.verbose = verbose
    self.max_samples = max_samples
    # A non-empty cache_file enables saving/reloading the data in binary format
    self.cache_file = cache_file
    self.use_pickle = bool(cache_file)
    # Data matrix and time index start out empty
    self.X, self.t = np.array([]), np.array([])
    vprint(self.verbose, "Data Manager :: Version = " + self.version)
    if not data_file:
        return
    self.loadData(data_file)
def loadData(self, filename, feat_type, verbose=True):
    ''' Get the data from a text file in one of 3 formats: matrix, sparse, binary_sparse.

    filename: path of the data file to read.
    feat_type: passed as second argument to the reader selected by
        self.info['format'].
    verbose: print progress messages when True.

    Returns whatever the selected reader returns. When self.use_pickle is
    set, a cache named <basename>.pickle in self.tmp_dir is returned if
    present and (re)written after a fresh read.
    '''
    if verbose:
        print("========= Reading " + filename)
    start = time.time()
    pickle_path = None
    if self.use_pickle:
        # Only touch self.tmp_dir when caching is enabled
        pickle_path = os.path.join(self.tmp_dir, os.path.basename(filename) + ".pickle")
        if os.path.exists(pickle_path):
            # The cache is written in binary mode below, so read it in binary mode.
            # NOTE: pickle.load must only be used on cache files this program wrote.
            with open(pickle_path, "rb") as pickle_file:
                vprint(verbose, "Loading pickle file : " + pickle_path)
                return pickle.load(pickle_file)
    if 'format' not in self.info:
        self.getFormatData(filename)
    if 'feat_num' not in self.info:
        self.getNbrFeatures(filename)
    data_func = {
        'dense': input_routines.convert_file_to_array,
        'sparse': data_io.data_sparse,
        'sparse_binary': data_io.data_binary_sparse,
    }
    data = data_func[self.info['format']](filename, feat_type)
    if self.use_pickle:
        with open(pickle_path, "wb") as pickle_file:
            vprint(verbose, "Saving pickle file : " + pickle_path)
            p = pickle.Pickler(pickle_file)
            p.fast = True
            p.dump(data)
    end = time.time()
    if verbose:
        print("[+] Success in %5.2f sec" % (end - start))
    return data
def getInfo(self, filename, verbose=True):
    ''' Get all information {attribute = value} pairs from the filename
    (public.info file), if it exists, otherwise, output default values.

    filename: path of the info file, or None to fall back on
        self.basename / self.input_dir and the default values.
    verbose: print progress messages when True.

    Returns self.info.
    '''
    if filename is None:
        basename = self.basename
        input_dir = self.input_dir
    else:
        basename = os.path.basename(filename).split('_')[0]
        input_dir = os.path.dirname(filename)
    train_data = os.path.join(input_dir, basename + '_train.data')
    # os.path.exists(None) raises TypeError, so test for None first
    if filename is not None and os.path.exists(filename):
        self.getInfoFromFile(filename)
        vprint(verbose, "Info file found : " + os.path.abspath(filename))
        # Finds the data format ('dense', 'sparse', or 'sparse_binary')
        self.getFormatData(train_data)
    else:
        missing = "None" if filename is None else os.path.abspath(filename)
        vprint(verbose, "Info file NOT found : " + missing)
        # Hopefully this never happens because this is done in a very inefficient way
        # reading the data multiple times...
        self.info['usage'] = 'No Info File'
        self.info['name'] = basename
        # Get the data format and sparsity
        self.getFormatData(train_data)
        # Assume no categorical variable and no missing value (we'll deal with that later)
        self.info['has_categorical'] = 0
        self.info['has_missing'] = 0
        # Get the target number, label number, target type and task
        self.getTypeProblem(os.path.join(input_dir, basename + '_train.solution'))
        if self.info['task'] == 'regression':
            self.info['metric'] = 'r2_metric'
        else:
            self.info['metric'] = 'auc_metric'
        # Feature type: Numerical, Categorical, or Binary
        # Can also be determined from [filename].type
        self.info['feat_type'] = 'Mixed'
        # Get the number of features and patterns
        self.getNbrFeatures(train_data,
                            os.path.join(input_dir, basename + '_test.data'),
                            os.path.join(input_dir, basename + '_valid.data'))
        for subset in ('train', 'valid', 'test'):
            self.getNbrPatterns(basename, input_dir, subset)
        # Set default time budget
        self.info['time_budget'] = 600
    return self.info
def loadData(self, filename, verbose=True, replace_missing=True):
    ''' Get the data from a text file in one of 3 formats: matrix, sparse, binary_sparse.

    filename: path of the data file to read.
    verbose: print progress messages when True.
    replace_missing: when True and the data is 'dense' and contains NaN,
        pass it through data_converter.replace_missing.

    Returns the loaded (and possibly converted) data. When self.use_pickle
    is set, a cache named <basename>.pickle in self.tmp_dir is returned if
    present and (re)written after a fresh read.
    '''
    if verbose:
        print("========= Reading " + filename)
    start = time.time()
    pickle_path = None
    if self.use_pickle:
        # Only touch self.tmp_dir when caching is enabled
        pickle_path = os.path.join(self.tmp_dir, os.path.basename(filename) + ".pickle")
        if os.path.exists(pickle_path):
            # The cache is written in binary mode below, so read it in binary mode.
            # NOTE: pickle.load must only be used on cache files this program wrote.
            with open(pickle_path, "rb") as pickle_file:
                vprint(verbose, "Loading pickle file : " + pickle_path)
                return pickle.load(pickle_file)
    if 'format' not in self.info:
        self.getFormatData(filename)
    if 'feat_num' not in self.info:
        self.getNbrFeatures(filename)
    data_func = {'dense': data_io.data,
                 'sparse': data_io.data_sparse,
                 'sparse_binary': data_io.data_binary_sparse}
    data = data_func[self.info['format']](filename, self.info['feat_num'])
    # IMPORTANT: when we replace missing values we double the number of variables
    # Test the array directly: on Python 3 map() returns an iterator that
    # np.any() treats as a single truthy object, so the old check was always true.
    if self.info['format'] == 'dense' and replace_missing and np.any(np.isnan(data)):
        vprint(verbose, "Replace missing values by 0 (slow, sorry)")
        data = data_converter.replace_missing(data)
    if self.use_pickle:
        with open(pickle_path, "wb") as pickle_file:
            vprint(verbose, "Saving pickle file : " + pickle_path)
            p = pickle.Pickler(pickle_file)
            p.fast = True
            p.dump(data)
    end = time.time()
    if verbose:
        print("[+] Success in %5.2f sec" % (end - start))
    return data
def loadTrainData(self, data_dir="", max_samples=float('inf')):
    ''' Get the data from hdf5 files.

    Tries the pickle cache first (when enabled); otherwise walks the
    sub-directories of data_dir and appends every 'h5' file found.
    Returns True when self.X is non-empty afterwards, False otherwise.
    '''
    success = True
    data_reloaded = False
    vprint(self.verbose, "Data Manager :: ========= Reading training data from " + data_dir)
    start = time.time()
    vid = 0
    if self.use_pickle and self.reloadData(self.cache_file):
        # The data came back from the pickle cache
        data_reloaded = True
    else:
        # Collect the sub-directories, sorted in decreasing order of the
        # text that follows the last 'm' in their name
        dir_list = [entry for entry in os.listdir(data_dir)
                    if os.path.isdir(os.path.join(data_dir, entry))]
        dir_list.sort(key=lambda name: name.split('m')[-1], reverse=True)
        vprint(self.verbose, dir_list)
        # Re-initialize from scratch
        self.X = np.array([])
        self.t = np.array([])
        for subdir in dir_list:
            subdir_path = os.path.join(data_dir, subdir)
            h5_files = [fname for fname in os.listdir(subdir_path) if fname.endswith('h5')]
            # Order the files by the integer that follows 'X' in their base name
            h5_files.sort(key=lambda fname: int(fname.split('.')[0].split('X')[-1]))
            for data_file in h5_files:
                self.appendSamples(data_file, os.path.join(data_dir, subdir), verbose=False)
                vid += 1
    if self.use_pickle and not data_reloaded:
        # Save data as a pickle for "faster" later reload
        self.saveData(self.cache_file, format='pickle')
    end = time.time()
    if len(self.X) != 0:
        vprint(self.verbose, "[+] Success, loaded %d videos in %5.2f sec" % (vid, end - start))
    else:
        success = False
        vprint(self.verbose, "[-] Loading failed")
    return success
def saveData(self, data_file, data_dir="", frames=[], format='pickle'):
    ''' Save data in pickle / h5 format.

    Parameters:
        data_file: save data under this filename (the format is appended
            as extension when missing)
        data_dir: where to save data
        frames: specify which lines in the video matrix to be saved,
            e.g. frames=(start_frame, end_frame)=(10,15)
            default = entire video matrix
        format: 'pickle' or 'h5', default = 'pickle'

    The h5 file holds datasets 'X' and 't'; the pickle holds all instance
    attributes, with 'X' and 't' restricted to the selected frames.
    Returns True on success, False when any exception occurred.
    '''
    if not data_file.endswith(format):
        data_file = data_file + '.' + format
    success = True
    # Started before the try block so the timing is always defined
    start = time.time()
    try:
        filename = os.path.join(data_dir, data_file)
        vprint(self.verbose, "Data Manager :: ========= Saving data to " + filename)
        if frames:
            X_out = self.X[frames[0]:frames[1]]
            t_out = self.t[frames[0]:frames[1]]
        else:
            # Save the entire matrix
            X_out, t_out = self.X, self.t
        if format == 'h5':
            with h5py.File(filename, 'w') as f:
                f.create_dataset(name='X', shape=X_out.shape, data=X_out)
                f.create_dataset(name='t', shape=t_out.shape, data=t_out)
        else:
            dict_to_save = dict(self.__dict__)
            dict_to_save['X'] = X_out
            dict_to_save['t'] = t_out
            with open(filename, 'wb') as f:
                pickle.dump(dict_to_save, f, 2)
    except Exception as e:
        # Best effort: report the problem and signal failure to the caller
        vprint(self.verbose, e)
        success = False
    if success:
        end = time.time()
        vprint(self.verbose, "[+] Success in %5.2f sec" % (end - start))
    return success
def loadData(self, data_file, data_dir=""):
    ''' Erase previous data and load data from a given data file.

    data_file: number n of the 'chunk' or 'step' (appearing in the file
        name); alternatively, the full file name Xn can be supplied as a
        string instead of the chunk number.
    Returns True when the loaded X is non-empty, False otherwise.
    '''
    start = time.time()
    if isinstance(data_file, int):
        # A chunk number is turned into the corresponding "Xn" name
        data_file = "X" + str(data_file)
    vprint(self.verbose, "Data Manager :: ========= Loading data from " + data_file)
    self.X, self.t = self.getOneSample(data_file, data_dir)
    elapsed = time.time() - start
    if len(self.X) == 0:
        vprint(self.verbose, "[-] Loading failed")
        return False
    vprint(self.verbose, "[+] Success in %5.2f sec" % elapsed)
    return True
#### INVENTORY DATA (and sort dataset names alphabetically) datanames = data_io.inventory_data(input_dir) #### DEBUG MODE: Show dataset list and STOP if debug_mode>=3: data_io.show_io(input_dir, output_dir) print('\n****** Sample code version ' + str(version) + ' ******\n\n' + '========== DATASETS ==========\n') data_io.write_list(datanames) datanames = [] # Do not proceed with learning and testing # ==================== @RESULT SUBMISSION (KEEP THIS) ===================== # Always keep this code to enable result submission of pre-calculated results # deposited in the res/ subdirectory. if len(datanames)>0: vprint( verbose, "************************************************************************") vprint( verbose, "****** Attempting to copy files (from res/) for RESULT submission ******") vprint( verbose, "************************************************************************") OK = data_io.copy_results(datanames, res_dir, output_dir, verbose) # DO NOT REMOVE! if OK: vprint( verbose, "[+] Success") datanames = [] # Do not proceed with learning and testing else: vprint( verbose, "======== Some missing results on current datasets!") vprint( verbose, "======== Proceeding to train/test:\n") # =================== End @RESULT SUBMISSION (KEEP THIS) ================== # ================ @CODE SUBMISSION (SUBTITUTE YOUR CODE) ================= overall_time_budget = 0 for basename in datanames: # Loop over datasets vprint( verbose, "************************************************")
#### INVENTORY DATA (and sort dataset names alphabetically) datanames = data_io.inventory_data(input_dir) #### DEBUG MODE: Show dataset list and STOP if debug_mode>=3: data_io.show_io(input_dir, output_dir) print('\n****** Sample code version ' + str(version) + ' ******\n\n' + '========== DATASETS ==========\n') data_io.write_list(datanames) datanames = [] # Do not proceed with learning and testing # ==================== @RESULT SUBMISSION (KEEP THIS) ===================== # Always keep this code to enable result submission of pre-calculated results # deposited in the res/ subdirectory. if len(datanames)>0: vprint( verbose, "************************************************************************") vprint( verbose, "****** Attempting to copy files (from res/) for RESULT submission ******") vprint( verbose, "************************************************************************") OK = data_io.copy_results(datanames, res_dir, output_dir, verbose) # DO NOT REMOVE! if OK: vprint( verbose, "[+] Success") datanames = [] # Do not proceed with learning and testing else: vprint( verbose, "======== Some missing results on current datasets!") vprint( verbose, "======== Proceeding to train/test:\n") # =================== End @RESULT SUBMISSION (KEEP THIS) ================== # ================ @CODE SUBMISSION (SUBTITUTE YOUR CODE) ================= overall_time_budget = 0 for basename in datanames: # Loop over datasets
def saveData(self, filename, format="pickle"):
    ''' Save data in pickle format or csv format.

    filename: destination file; the format is appended as extension when
        missing (a notice is printed in verbose mode).
    format: 'pickle' writes all instance attributes; any other value
        writes a csv with a "Date" column followed by self.col_names.

    Returns True on success, False when any exception occurred.
    '''
    if not filename.endswith(format):
        filename = filename + '.' + format
        vprint(self.verbose, "[-] filename must end with " + format)
    vprint(self.verbose, "DataManager :: ========= Saving data to " + filename)
    start = time.time()
    try:
        if format == 'pickle':
            with open(filename, 'wb') as f:
                vprint(self.verbose, "DataManager :: Saving as pickle")
                dict_to_save = {
                    key: value for key, value in self.__dict__.items()
                    if key not in ('X', 't')
                }
                dict_to_save['X'] = self.X
                dict_to_save['t'] = self.t
                pickle.dump(dict_to_save, f, 2)
        else:
            with open(filename, 'w') as f:
                vprint(self.verbose, "DataManager :: Saving as csv")
                f.write("Date" + "".join("," + nm for nm in self.col_names) + "\n")
                for i in range(self.t.shape[0]):
                    cells = ["{:g}".format(self.t[i])]
                    cells.extend("{:g}".format(self.X[i, j]) for j in range(self.X.shape[1]))
                    f.write(",".join(cells) + "\n")
        success = True
    except Exception as e:
        # Best effort: report the problem and signal failure to the caller
        vprint(self.verbose, e)
        success = False
    if success:
        # Only announce success when the file was actually written
        end = time.time()
        vprint(self.verbose, "[+] Success in %5.2f sec" % (end - start))
    return success
output_dir = os.path.abspath(argv[2]); #### INVENTORY DATA (and sort dataset names alphabetically) datanames = data_io.inventory_data(input_dir) #### DEBUG MODE: Show dataset list and STOP if debug_mode>=3: data_io.show_io(input_dir, output_dir) data_io.write_list(datanames) datanames = [] # Do not proceed with learning and testing for basename in datanames: # Loop over datasets if basename not in ["robert"]: continue vprint( verbose, "************************************************") vprint( verbose, "******** Processing dataset " + basename.capitalize() + " ********") vprint( verbose, "************************************************") # ======== Learning on a time budget: # Keep track of time not to exceed your time budget. Time spent to inventory data neglected. start = time.time() # ======== Creating a data object with data, informations about it vprint( verbose, "======== Reading and converting data ==========") D = DataManager(basename, input_dir, replace_missing=True, filter_features=True, verbose=verbose) print D # ======== Keeping track of time time_spent = time.time() - start vprint( verbose, "time spent %5.2f sec" %time_spent)
def loadData(self, data_dir=""):
    ''' Get the data from csv files.

    Tries the pickle cache first (when enabled). Otherwise reads
    'stride', 'horizon' and 'ycol0' from the 'metadata' file in data_dir,
    then loads every csv under 'training' followed by every csv under
    'evaluation'; the first column of each file goes to self.t, the other
    columns to self.X. self.t0 is the number of training samples.

    Returns True when self.X is non-empty afterwards, False otherwise.
    '''
    success = True
    data_reloaded = False
    vprint(self.verbose, "DataManager :: ========= Reading training data from " + data_dir)
    start = time.time()
    if self.use_pickle and self.reloadData(self.cache_file):
        # The data came back from the pickle cache
        data_reloaded = True
    else:
        # Load metadata. safe_load avoids arbitrary object construction and
        # works with PyYAML versions where load() requires a Loader; the
        # with-block closes the file.
        with open(join(data_dir, 'metadata'), 'r') as metadata_file:
            metadata = yaml.safe_load(metadata_file)
        self.stride = metadata['stride']
        self.horizon = metadata['horizon']
        self.ycol0 = metadata['ycol0']
        # Load the training data into X and t.
        data_file_list = sorted(ls(join(data_dir, "training", "*.csv")))
        vprint(self.verbose, "DataManager :: ========= Load data from files:")
        vprint(self.verbose, data_file_list)
        # Column names come from the header of the first training file,
        # without the leading time column
        header = np.genfromtxt(data_file_list[0], delimiter=',', max_rows=1, names=True)
        self.col_names = header.dtype.names[1:]
        for data_file in data_file_list:
            data = np.genfromtxt(data_file, delimiter=',', skip_header=1)
            self.t = np.append(self.t, data[:, 0])
            if self.X.shape[0] == 0:
                self.X = data[:, 1:]
            else:
                self.X = np.append(self.X, data[:, 1:], axis=0)
        self.t0 = self.t.shape[0]
        # Append the evaluation data to X and t.
        data_file_list = sorted(ls(join(data_dir, "evaluation", "*.csv")))
        vprint(self.verbose, data_file_list)
        for data_file in data_file_list:
            data = np.genfromtxt(data_file, delimiter=',', skip_header=1)
            self.t = np.append(self.t, data[:, 0])
            self.X = np.append(self.X, data[:, 1:], axis=0)
    if self.use_pickle and not data_reloaded:
        # Save data as a pickle for "faster" later reload
        self.saveData(self.cache_file)
    end = time.time()
    if len(self.X) == 0:
        success = False
        vprint(self.verbose, "[-] Loading failed")
    else:
        vprint(self.verbose,
               "[+] Success, loaded %d samples in %5.2f sec" % (self.t.shape[0], end - start))
    # NOTE(review): the time reset is assumed to run whether or not loading succeeded - confirm
    self.resetTime()
    return success
# Overwrite the "natural" order #### DEBUG MODE: Show dataset list and STOP if debug_mode>=3: data_io.show_version() data_io.show_io(input_dir, output_dir) print('\n****** Ingestion program version ' + str(version) + ' ******\n\n' + '========== DATASETS ==========\n') data_io.write_list(datanames) datanames = [] # Do not proceed with learning and testing #### MAIN LOOP OVER DATASETS: overall_time_budget = 0 time_left_over = 0 for basename in datanames: # Loop over datasets vprint( verbose, "\n========== Ingestion program version " + str(version) + " ==========\n") vprint( verbose, "************************************************") vprint( verbose, "******** Processing dataset " + basename.capitalize() + " ********") vprint( verbose, "************************************************") # ======== Learning on a time budget: # Keep track of time not to exceed your time budget. Time spent to inventory data neglected. start = time.time() # ======== Creating a data object with data, informations about it vprint( verbose, "========= Reading and converting data ==========") D = DataManager(basename, input_dir, replace_missing=True, filter_features=True, max_samples=max_samples, verbose=verbose) print(D) vprint( verbose, "[+] Size of uploaded data %5.2f bytes" % data_io.total_size(D)) # ======== Keeping track of time
datanames = data_io.inventory_data(input_dir) # Overwrite the "natural" order # DEBUG MODE: Show dataset list and STOP if debug_mode >= 3: data_io.show_version() data_io.show_io(input_dir, output_dir) print('\n****** Ingestion program version ' + str(version) + ' ******\n\n' + '========== DATASETS ==========\n') data_io.write_list(datanames) datanames = [] # Do not proceed with learning and testing # MAIN LOOP OVER DATASETS: overall_time_budget = 0 time_left_over = 0 for basename in datanames: # Loop over datasets vprint(verbose, "\n========== Ingestion program version " + str(version) + " ==========\n") vprint(verbose, "************************************************") vprint(verbose, "******** Processing dataset " + basename.capitalize() + " ********") vprint(verbose, "************************************************") tmp_valid = os.path.join(program_dir, 'output', basename + '_valid.predict') if os.path.isfile(tmp_valid): os.link(tmp_valid, os.path.join(output_dir, basename + '_valid.predict')) tmp_test = os.path.join(program_dir, 'output', basename + '_test.predict') if os.path.isfile(tmp_test): os.link(tmp_test, os.path.join(output_dir, basename + '_test.predict')) vprint(verbose, "[+] Results saved using cache") continue # ======== Learning on a time budget: # Keep track of time not to exceed your time budget. Time spent to inventory data neglected. start = time.time()
# Overwrite the "natural" order #### DEBUG MODE: Show dataset list and STOP if debug_mode>=3: data_io.show_version() data_io.show_io(input_dir, output_dir) print('\n****** Ingestion program version ' + str(version) + ' ******\n\n' + '========== DATASETS ==========\n') data_io.write_list(datanames) datanames = [] # Do not proceed with learning and testing #### MAIN LOOP OVER DATASETS: overall_time_budget = 0 time_left_over = 0 for basename in datanames: # Loop over datasets vprint( verbose, "\n========== Ingestion program version " + str(version) + " ==========\n") vprint( verbose, "************************************************") vprint( verbose, "******** Processing dataset " + basename.capitalize() + " ********") vprint( verbose, "************************************************") # ======== Learning on a time budget: # Keep track of time not to exceed your time budget. Time spent to inventory data neglected. start = time.time() # ======== Creating a data object with data, informations about it vprint( verbose, "========= Reading and converting data ==========") D = DataManager(basename, input_dir, replace_missing=True, filter_features=True, max_samples=max_samples, verbose=verbose) print (D) vprint( verbose, "[+] Size of uploaded data %5.2f bytes" % data_io.total_size(D)) # ======== Keeping track of time
datanames = data_io.inventory_data(input_dir) #### DEBUG MODE: Show dataset list and STOP if debug_mode >= 3: data_io.show_io(input_dir, output_dir) print('\n****** Sample code version ' + str(version) + ' ******\n\n' + '========== DATASETS ==========\n') data_io.write_list(datanames) datanames = [] # Do not proceed with learning and testing # ==================== @RESULT SUBMISSION (KEEP THIS) ===================== # Always keep this code to enable result submission of pre-calculated results # deposited in the res/ subdirectory. if len(datanames) > 0: vprint( verbose, "************************************************************************" ) vprint( verbose, "****** Attempting to copy files (from res/) for RESULT submission ******" ) vprint( verbose, "************************************************************************" ) OK = data_io.copy_results(datanames, res_dir, output_dir, verbose) # DO NOT REMOVE! if OK: vprint(verbose, "[+] Success") datanames = [] # Do not proceed with learning and testing else:
#### INVENTORY DATA (and sort dataset names alphabetically) datanames = data_io.inventory_data(input_dir) #### DEBUG MODE: Show dataset list and STOP if debug_mode>=3: data_io.show_io(input_dir, output_dir) print('\n****** Sample code version ' + str(version) + ' ******\n\n' + '========== DATASETS ==========\n') data_io.write_list(datanames) datanames = [] # Do not proceed with learning and testing # ================ @CODE SUBMISSION (SUBTITUTE YOUR CODE) ================= overall_time_budget = 0 time_left_over = 0 for basename in datanames: vprint( verbose, "************************************************") vprint( verbose, "******** Processing dataset " + basename.capitalize() + " ********") vprint( verbose, "************************************************") # ======== Learning on a time budget: # Keep track of time not to exceed your time budget. Time spent to inventory data neglected. start = time.time() # ======== Creating a data object with data, informations about it vprint( verbose, "========= Reading and converting data ==========") D = DataManager(basename, input_dir, replace_missing=True, filter_features=True, max_samples=max_samples, verbose=verbose) print D vprint( verbose, "[+] Size of uploaded data %5.2f bytes" % data_io.total_size(D)) # ======== Keeping track of time if debug_mode<1:
def save(self, path=""):
    ''' Save model.

    Pickle this model object to the file "_model.pickle" inside a directory.

    path: directory that receives the pickle file. When empty (the
          default), self.model_dir is used instead.
    '''
    if not path:
        path = self.model_dir
    vprint(self.verbose, "Model :: ========= Saving model to " + path)
    # Pickle streams are binary data: write in "wb" mode (text mode fails on
    # Python 3) and let the context manager flush and close the file even if
    # pickling raises.
    with open(os.path.join(path, '_model.pickle'), "wb") as model_file:
        pickle.dump(self, model_file)
# The modules imported below live in the program and submission directories,
# so both must be on the import path first.
path.extend((program_dir, submission_dir))

import data_io                          # general purpose input/output functions
from data_io import vprint              # print only in verbose mode
from data_manager import DataManager    # load/save data and get info about them
from model import Model                 # example model, in scikit-learn style

# Highest debug level: show library version and directory structure.
if debug_mode >= 4:
    data_io.show_dir(".")

# Optionally move the previous results aside, then (always) create the
# output directory (useful if you run locally).
if save_previous_results:
    data_io.mvdir(output_dir, '_'.join((output_dir, the_date)))
data_io.mkdir(output_dir)

vprint(verbose,
       "\n========== Ingestion program version " + str(version) + " ==========\n")

# Debug mode: show the environment and the I/O directories, then stop.
if debug_mode >= 3:
    data_io.show_version()
    data_io.show_io(input_dir, output_dir)
    exit(0)

for _banner_line in (
        "****************************************************",
        "******** Processing spatio-temporal dataset ********",
        "****************************************************"):
    vprint(verbose, _banner_line)

# Instantiate the input data manager and load the data.
vprint(verbose, "========= Reading and converting data ==========")
Din = DataManager(datatype="input", verbose=verbose)
Din.loadData(input_dir)
vprint(verbose, Din)
def predict(self, Xtest, num_predicted_frames=8, ycol0=0):
    ''' Make predictions of the next num_predicted_frames frames.
    Start at variable ycol0 only (do not predict the values of the
    first 0 to ycol0-1 variables; their output columns are left at zero).

    Each column of Xtest is treated as a running total: it is differenced
    into per-frame increments, an ARIMA model is fitted to the increments
    with R's "forecast" package (driven through rpy2), and the forecast
    increments are summed back up starting from the last observed value.

    Xtest: 2-D numpy array, one row per frame and one column per variable.
           It is not modified.
    num_predicted_frames: number of future frames (rows) to predict.
    ycol0: index of the first column to predict.
    Returns: float array of shape (num_predicted_frames, Xtest.shape[1]).
    Raises: ImportError when rpy2 is not available. Errors raised by R while
            fitting or forecasting the finally selected model propagate.
    '''
    vprint(self.verbose, "Model :: ========= Making predictions =========")
    vprint(self.verbose, "===============================================")
    start = time.time()

    # rpy2 is imported here, as in the rest of this method's history, so the
    # module can still be imported on machines without R.
    import rpy2
    import rpy2.robjects as robjects
    import rpy2.robjects.packages as rpackages
    from rpy2.robjects import pandas2ri

    # The R "forecast" package is looked up in (and, when missing, installed
    # into) the rpy2 package directory.
    r_lib_dir = rpy2.__path__[0]
    if not rpackages.isinstalled('forecast', lib_loc=r_lib_dir):
        utils = rpackages.importr('utils')
        utils.chooseCRANmirror(ind=1)  # select the first mirror in the list
        utils.install_packages('forecast', lib=r_lib_dir)
    forecast = rpackages.importr('forecast', lib_loc=r_lib_dir)
    ts = robjects.r('ts')
    pandas2ri.activate()
    r_null = robjects.r("NULL")

    def fit_arima(series, order, include_mean, include_drift):
        # Fit one ARIMA model by maximum likelihood, no external regressors.
        return forecast.Arima(series, order=order, xreg=r_null,
                              include_mean=include_mean,
                              include_drift=include_drift,
                              biasadj=False, method="ML", model=r_null)

    def training_error(fit):
        # NOTE(review): the original code labels this value "RMSE". In the
        # forecast package's documented accuracy() column order (ME, RMSE,
        # MAE, ...) 0-based column 2 would be MAE -- confirm which one is
        # intended before changing the index.
        return forecast.accuracy(fit)[0][2]

    # Size the output from the arguments instead of a fixed (7, 57), so the
    # forecast horizon, the number of rows and the number of columns agree.
    n_cols = Xtest.shape[1]
    Ytest = np.zeros((num_predicted_frames, n_cols))

    # Code assumes daily data (not aggregated). Arima will break if it's run
    # on aggregated data, so every column is de-aggregated before fitting and
    # the predictions are re-aggregated afterwards.
    for col in range(ycol0, n_cols):
        # Undo aggregation on a copy: np.diff leaves the caller's array alone.
        increments = np.diff(Xtest[:, col])
        last_level = Xtest[-1, col]
        series = ts(increments, frequency=1, start=1, end=len(increments))

        # Grid search over the (p, d, q) order, keeping the lowest error.
        best_order = robjects.IntVector([0, 0, 0])
        best_error = 1000000
        for p in range(1, 5):
            for q in range(0, 5):
                for d in range(0, 3):
                    order = robjects.IntVector([p, d, q])
                    try:
                        error = training_error(
                            fit_arima(series, order, True, False))
                    except Exception:
                        # R could not fit this order; try the next one.
                        continue
                    if error < best_error:
                        best_error = error
                        best_order = order

        # With the order fixed, try every mean/drift combination (the former
        # range(0, 1) loops only ever tried a single combination).
        best_opts = (True, False)
        for mean_opt in (True, False):
            for drift_opt in (True, False):
                try:
                    error = training_error(
                        fit_arima(series, best_order, mean_opt, drift_opt))
                except Exception:
                    continue
                if error < best_error:
                    best_error = error
                    best_opts = (mean_opt, drift_opt)

        fit = fit_arima(series, best_order, best_opts[0], best_opts[1])
        # Element 3 of the R forecast object is taken as the point forecasts,
        # exactly as before.
        future_increments = np.asarray(
            forecast.forecast(fit, num_predicted_frames)[3],
            dtype=float).ravel()

        # Reconstruct aggregated predictions: running total of the forecast
        # increments on top of this column's own last observed value (the
        # start value used to be looked up with the wrong index when
        # ycol0 > 0, and the last row was left un-aggregated).
        Ytest[:, col] = last_level + np.cumsum(future_increments)

    end = time.time()
    vprint(self.verbose, "[+] Success, predictions made in %5.2f sec" % (end - start))
    vprint(self.verbose, "Model :: ======== Predictions finished ========")
    return Ytest
# Debug mode: show versions, I/O directories and the dataset list, then empty
# the list so that no learning or testing takes place.
if debug_mode >= 3:
    data_io.show_version()
    data_io.show_io(input_dir, output_dir)
    print(''.join(['\n****** Ingestion program version ', str(version),
                   ' ******\n\n', '========== DATASETS ==========\n']))
    data_io.write_list(datanames)
    datanames = []

# Main processing: reset the time accounting and announce the dataset.
overall_time_budget = 0
time_left_over = 0

vprint(verbose,
       "\n========== Ingestion program version " + str(version) + " ==========\n")
for _banner_line in (
        "************************************************",
        "******** Processing dataset " + data_name.capitalize() + " ********",
        "************************************************"):
    vprint(verbose, _banner_line)

# Learning on a time budget: remember when processing started so the budget
# is not exceeded. Time spent to inventory data is neglected.
start = time.time()

# Load the images from the input directory.
vprint(verbose, "========= Reading and converting data ==========")
data = read_images(input_dir, numerical_labels=True)
#### INVENTORY DATA (and sort dataset names alphabetically) datanames = data_io.inventory_data(input_dir) # Overwrite the "natural" order #### DEBUG MODE: Show dataset list and STOP if debug_mode>=3: data_io.show_io(input_dir, output_dir) print('\n****** Sample code version ' + str(version) + ' ******\n\n' + '========== DATASETS ==========\n') data_io.write_list(datanames) datanames = [] # Do not proceed with learning and testing # ==================== @RESULT SUBMISSION (KEEP THIS) ===================== # Always keep this code to enable result submission of pre-calculated results # deposited in the res/ subdirectory. if len(datanames)>0: vprint( verbose, "************************************************************************") vprint( verbose, "****** Attempting to copy files (from res/) for RESULT submission ******") vprint( verbose, "************************************************************************") datanames = data_io.copy_results(datanames, res_dir, output_dir, verbose) # DO NOT REMOVE! if not datanames: vprint( verbose, "[+] Success") else: vprint( verbose, "======== Some missing results on current datasets!") vprint( verbose, "======== Proceeding to train/test:\n") # =================== End @RESULT SUBMISSION (KEEP THIS) ================== # ================ @CODE SUBMISSION (SUBTITUTE YOUR CODE) ================= overall_time_budget = 0 time_left_over = 0 for basename in datanames: # Loop over datasets
# =========================== BEGIN PROGRAM ================================

if __name__ == "__main__" and debug_mode < 4:
    # Flag used to check whether everything went well (no time exceeded).
    execution_success = True

    # Input/output directories: taken from the command line when arguments
    # are given, otherwise the defaults are used.
    if len(argv) != 1:
        input_dir = argv[1]
        output_dir = os.path.abspath(argv[2])
    else:
        input_dir = default_input_dir
        output_dir = default_output_dir

    vprint(verbose, "Using input_dir: " + input_dir)
    vprint(verbose, "Using output_dir: " + output_dir)

    # Move old results aside (local runs only), then create a new output
    # directory.
    if save_previous_results and not running_on_codalab:
        data_io.mvdir(output_dir, '_'.join((output_dir, the_date)))
    data_io.mkdir(output_dir)

    # Inventory the datasets (sorted alphabetically); this overwrites the
    # "natural" order.
    datanames = data_io.inventory_data(input_dir)

    # Debug mode: show the I/O directories and the dataset list header.
    if debug_mode >= 3:
        data_io.show_io(input_dir, output_dir)
        print(''.join(['\n****** Sample code version ', str(version),
                       ' ******\n\n', '========== DATASETS ==========\n']))
data_io.show_io(input_dir, output_dir) print('\n****** Ingestion program version ' + str(version) + ' ******\n\n' + '========== DATASETS ==========\n') data_io.write_list(datanames) datanames = [] # Do not proceed with learning and testing #### MAIN LOOP OVER DATASETS: overall_time_budget = 0 time_left_over = 0 #for basename in datanames: # Loop over datasets ######################################################### CLASSIFICATION ######################################################################################### basename = datanames[0] vprint( verbose, "************************************************************************" ) vprint( verbose, "******** Processing dataset " + basename.capitalize() + " for patch classification ********") vprint( verbose, "************************************************************************" ) # ======== Learning on a time budget: # Keep track of time not to exceed your time budget. Time spent to inventory data neglected. start = time.time() # ======== Creating a data object with data, informations about it vprint(verbose, "========= Reading and converting data ==========")