def loadData(self, filename, verbose=True, replace_missing=True):
    '''Load the data stored in a text file, with optional pickle caching.

    The file is parsed in one of 3 formats (matrix, sparse, sparse_binary),
    selected by ``self.info['format']``.

    Args:
        filename: Path of the text file to read.
        verbose: If true, print progress and timing messages.
        replace_missing: If true and the format is 'dense', data containing
            NaN is passed through ``data_converter.replace_missing``.

    Returns:
        The object produced by the selected ``data_io`` reader (after the
        optional missing-value replacement), or the unpickled object when
        ``self.use_pickle`` is set and a cached pickle exists in
        ``self.tmp_dir``.
    '''
    if verbose:
        print("========= Reading " + filename)
    start = time.time()

    # Only touch self.tmp_dir when caching is enabled.
    pickle_path = None
    if self.use_pickle:
        pickle_path = os.path.join(self.tmp_dir,
                                   os.path.basename(filename) + ".pickle")

    if self.use_pickle and os.path.exists(pickle_path):
        # Pickle streams are binary: the file must be opened in "rb" mode.
        with open(pickle_path, "rb") as pickle_file:
            vprint(verbose, "Loading pickle file : " + pickle_path)
            # NOTE(review): pickle.load can execute arbitrary code; this is
            # only safe as long as tmp_dir is written by this program alone.
            return pickle.load(pickle_file)

    # presumably these two helpers fill self.info -- TODO confirm
    if 'format' not in self.info:
        self.getFormatData(filename)
    if 'feat_num' not in self.info:
        self.getNbrFeatures(filename)

    data_func = {'dense': data_io.data,
                 'sparse': data_io.data_sparse,
                 'sparse_binary': data_io.data_binary_sparse}
    data = data_func[self.info['format']](filename, self.info['feat_num'])

    # IMPORTANT (original author's note): when we replace missing values we
    # double the number of variables.
    # np.isnan(...).any() is used instead of np.any(map(...)): on Python 3
    # map() returns an iterator object, which is always truthy.
    if (self.info['format'] == 'dense' and replace_missing
            and np.isnan(data).any()):
        vprint(verbose, "Replace missing values by 0 (slow, sorry)")
        data = data_converter.replace_missing(data)

    if self.use_pickle:
        with open(pickle_path, "wb") as pickle_file:
            vprint(verbose, "Saving pickle file : " + pickle_path)
            p = pickle.Pickler(pickle_file)
            p.fast = True
            p.dump(data)

    end = time.time()
    if verbose:
        print("[+] Success in %5.2f sec" % (end - start))
    return data
def loadData(self, filename, verbose=True, replace_missing=True):
    '''Get the data from a text file, with optional pickle caching.

    The file is parsed in one of 3 formats (matrix, sparse, binary_sparse);
    the reader is chosen by ``self.info['format']`` ('dense', 'sparse' or
    'sparse_binary').

    Args:
        filename: Path of the text file to read.
        verbose: If true, print progress and timing messages.
        replace_missing: If true and the format is 'dense', data containing
            NaN is passed through ``data_converter.replace_missing``.

    Returns:
        The object produced by the selected ``data_io`` reader (after the
        optional missing-value replacement), or the unpickled object when
        ``self.use_pickle`` is set and a cached pickle exists in
        ``self.tmp_dir``.
    '''
    if verbose:
        print("========= Reading " + filename)
    start = time.time()

    # Build the cache path once, and only when caching is enabled so that
    # self.tmp_dir is not required otherwise.
    pickle_path = None
    if self.use_pickle:
        pickle_path = os.path.join(self.tmp_dir,
                                   os.path.basename(filename) + ".pickle")

    if self.use_pickle and os.path.exists(pickle_path):
        # "rb", not "r": pickle.load needs a binary stream.
        with open(pickle_path, "rb") as pickle_file:
            vprint(verbose, "Loading pickle file : " + pickle_path)
            # NOTE(review): unpickling runs arbitrary code from the file;
            # acceptable only if tmp_dir is trusted.
            return pickle.load(pickle_file)

    # presumably these two helpers fill self.info -- TODO confirm
    if 'format' not in self.info:
        self.getFormatData(filename)
    if 'feat_num' not in self.info:
        self.getNbrFeatures(filename)

    readers = {'dense': data_io.data,
               'sparse': data_io.data_sparse,
               'sparse_binary': data_io.data_binary_sparse}
    data = readers[self.info['format']](filename, self.info['feat_num'])

    # IMPORTANT (original author's note): when we replace missing values we
    # double the number of variables.
    # Vectorised NaN test; np.any(map(...)) is always truthy on Python 3
    # because map() returns an iterator object.
    has_nan = (self.info['format'] == 'dense' and replace_missing
               and np.isnan(data).any())
    if has_nan:
        vprint(verbose, "Replace missing values by 0 (slow, sorry)")
        data = data_converter.replace_missing(data)

    if self.use_pickle:
        with open(pickle_path, "wb") as pickle_file:
            vprint(verbose, "Saving pickle file : " + pickle_path)
            p = pickle.Pickler(pickle_file)
            p.fast = True
            p.dump(data)

    end = time.time()
    if verbose:
        print("[+] Success in %5.2f sec" % (end - start))
    return data
def fit(self, F, y, datainfo, timeinfo):
    '''Train ``self.clf`` on the numerical features of ``F``.

    The data is split 75/25 into a training and a validation part; the
    validation part is passed to the classifier as ``eval_set`` together
    with ``early_stopping_rounds=10``.

    Args:
        F: Mapping of feature groups; only ``F['numerical']`` is used.
        y: Training labels, indexed along the first axis like the rows of
            ``F['numerical']``.
        datainfo: Unused here.
        timeinfo: Sequence whose items 0 and 1 are the start timestamps used
            for the "overall" and "dataset" elapsed-time log lines.

    Side effects:
        Sets ``num_train_samples``, ``num_feat``, ``DataX``, ``DataY`` and
        ``is_trained`` on ``self`` and fits ``self.clf``.
    '''
    overall_spenttime = time.time() - timeinfo[0]
    dataset_spenttime = time.time() - timeinfo[1]
    logging.info("[***] Overall time spent %5.2f sec", overall_spenttime)
    logging.info("[***] Dataset time spent %5.2f sec", dataset_spenttime)

    # only get numerical variables, then hand NaN entries to the converter
    X = data_converter.replace_missing(F['numerical'])

    self.num_train_samples = X.shape[0]
    self.num_feat = X.shape[1]
    num_train_samples = y.shape[0]

    self.DataX = X
    self.DataY = y
    logging.info("The whole available data is: ")
    logging.info("Real-FIT: dim(X)= [{:d}, {:d}]".format(
        self.DataX.shape[0], self.DataX.shape[1]))
    logging.info("Real-FIT: dim(y)= [{:d}, {:d}]".format(
        self.DataY.shape[0], self.num_labels))

    # Report a sample-count mismatch BEFORE splitting/training, so the
    # diagnostic is logged even if the split or the fit then fails on the
    # inconsistent inputs.
    if self.num_train_samples != num_train_samples:
        logging.info("ARRGH: number of samples in X and y do not match!")

    X_trn, X_val, y_trn, y_val = train_test_split(
        X, y, test_size=.25, random_state=SEED)
    self.clf.fit(X_trn, y_trn,
                 eval_set=(X_val, y_val),
                 early_stopping_rounds=10,
                 verbose=10)

    self.is_trained = True
def predict(self, F, datainfo, timeinfo):
    '''Predict labels for (test) data with ``self.clf``.

    Make sure that the predicted values are in the correct format for the
    scoring metric. For example, binary classification problems often expect
    predictions in the form of a discriminant value (if the area under the
    ROC curve is the metric) rather than predictions of the class labels
    themselves.

    Args:
        F: Mapping of feature groups; only ``F['numerical']`` is used.
        datainfo: Unused here.
        timeinfo: Sequence whose items 0 and 1 are the start timestamps used
            for the "overall" and "dataset" elapsed-time messages.

    Returns:
        ``np.transpose`` of the output of ``self.clf.predict``.
    '''
    overall_spenttime = time.time() - timeinfo[0]
    dataset_spenttime = time.time() - timeinfo[1]
    print("[***] Overall time spent %5.2f sec" % overall_spenttime)
    print("[***] Dataset time spent %5.2f sec" % dataset_spenttime)

    # only get numerical variables, then hand NaN entries to the converter
    X = data_converter.replace_missing(F['numerical'])

    num_test_samples = X.shape[0]
    # A 1-D X is treated as a single feature column so that num_feat is
    # always bound before it is printed and compared below.
    num_feat = X.shape[1] if X.ndim > 1 else 1
    print(("PREDICT: dim(X)= [{:d}, {:d}]").format(num_test_samples,
                                                   num_feat))
    if self.num_feat != num_feat:
        print("ARRGH: number of features in X does not match training data!")
    print(("PREDICT: dim(y)= [{:d}, {:d}]").format(num_test_samples,
                                                   self.num_labels))

    y = self.clf.predict(X)
    print("Y", y)
    return np.transpose(y)
def predict(self, F, data_info, time_info):
    '''Preprocess one batch of features and return its predictions.

    The batch counter is advanced, the feature groups in ``F`` are run
    through ``self.missing_value_preprocess.transform`` (in place, i.e. the
    entries of ``F`` are overwritten), and the actual prediction is
    delegated to ``self.transferPredict``.

    Args:
        F: Mapping with the feature groups 'numerical', 'CAT' and, when
            ``self.use_mv`` is set, 'MV'.
        data_info: Forwarded unchanged to ``self.transferPredict``.
        time_info: Forwarded unchanged to ``self.transferPredict``.

    Returns:
        The value returned by ``self.transferPredict`` (also stored in
        ``self.y_pred``).
    '''
    self.batch_num += 1

    # Time elapsed since the end of the last fit is booked as "read_data".
    self.predict_start_time = time.time()
    elapsed_since_fit = self.predict_start_time - self.fit_end_time
    self.module_time['read_data'][self.batch_num] = elapsed_since_fit

    module_start = time.time()

    # (feature group, input_type) pairs, processed in this order.
    groups = [('numerical', 'ndarray'), ('CAT', 'dataframe')]
    if self.use_mv:
        groups.append(('MV', 'dataframe'))
    for group, input_type in groups:
        F[group] = self.missing_value_preprocess.transform(
            F[group], group, input_type=input_type)

    numerical = data_converter.replace_missing(F['numerical'])
    F['numerical'] = numerical.astype('float32')
    F['CAT'] = F['CAT'].fillna('-1')

    module_end = time.time()
    self.module_time['preprocess'][self.batch_num] = (module_end
                                                      - module_start)

    self.F = F
    self.y_pred = self.transferPredict(0, data_info, time_info)
    self.predict_end_time = time.time()
    return self.y_pred
def predict(self, F, datainfo, timeinfo):
    '''Predict positive-class scores for (test) data with ``self.clf``.

    Make sure that the predicted values are in the correct format for the
    scoring metric. For example, binary classification problems often expect
    predictions in the form of a discriminant value (if the area under the
    ROC curve is the metric) rather than predictions of the class labels
    themselves.

    Args:
        F: Mapping of feature groups; only ``F['numerical']`` is used.
        datainfo: Unused here.
        timeinfo: Sequence whose items 0 and 1 are the start timestamps used
            for the "overall" and "dataset" elapsed-time log lines.

    Returns:
        ``np.transpose`` of column 1 of ``self.clf.predict_proba(X)``.
    '''
    overall_spenttime = time.time() - timeinfo[0]
    dataset_spenttime = time.time() - timeinfo[1]
    logging.info("[***] Overall time spent %5.2f sec", overall_spenttime)
    logging.info("[***] Dataset time spent %5.2f sec", dataset_spenttime)

    # only get numerical variables, then hand NaN entries to the converter
    X = data_converter.replace_missing(F['numerical'])

    num_test_samples = X.shape[0]
    # A 1-D X is treated as a single feature column so that num_feat is
    # always bound before it is logged and compared below.
    num_feat = X.shape[1] if X.ndim > 1 else 1
    logging.info("PREDICT: dim(X)= [{:d}, {:d}]".format(num_test_samples,
                                                        num_feat))
    if self.num_feat != num_feat:
        logging.info(
            "ARRGH: number of features in X does not match training data!")
    logging.info("PREDICT: dim(y)= [{:d}, {:d}]".format(num_test_samples,
                                                        self.num_labels))

    y = self.clf.predict_proba(X)[:, 1]
    return np.transpose(y)
def fit(self, F, y, datainfo, timeinfo):
    '''Train ``self.clf`` on a random subsample of the numerical features.

    90% of the samples are dropped at random before fitting. When the model
    has already been trained, ``n_estimators`` is increased by one and
    ``warm_start`` is enabled on the classifier before it is fitted again.

    Args:
        F: Mapping of feature groups; only ``F['numerical']`` is used.
        y: Training labels, 1-D or 2-D, indexed along the first axis like
            the rows of ``F['numerical']``.
        datainfo: Unused here.
        timeinfo: Sequence whose items 0 and 1 are the start timestamps used
            for the "overall" and "dataset" elapsed-time messages.

    Side effects:
        Sets ``num_train_samples``, ``DataX``, ``DataY`` and ``is_trained``
        (plus ``num_feat`` / ``num_labels`` for 2-D inputs) on ``self`` and
        fits ``self.clf``.
    '''
    # only get numerical variables
    X = F['numerical']

    overall_spenttime = time.time() - timeinfo[0]
    dataset_spenttime = time.time() - timeinfo[1]
    print("[***] Overall time spent %5.2f sec" % overall_spenttime)
    print("[***] Dataset time spent %5.2f sec" % dataset_spenttime)

    # hand NaN entries to the converter
    X = data_converter.replace_missing(X)

    self.num_train_samples = X.shape[0]
    if X.ndim > 1:
        self.num_feat = X.shape[1]
    num_train_samples = y.shape[0]
    if y.ndim > 1:
        self.num_labels = y.shape[1]

    # subsample the data for efficient processing
    removeperc = 0.9
    if removeperc > 0:
        rem_samples = int(num_train_samples * removeperc)
        # Sorted row indices that are KEPT.
        keep = sorted(
            random.sample(range(num_train_samples),
                          num_train_samples - rem_samples))
        num_train_samples = num_train_samples - rem_samples
        X = X[keep, :]
        # First-axis indexing works for both 1-D and 2-D label arrays.
        y = y[keep]
        self.num_train_samples = X.shape[0]

    if self.is_trained:
        self.clf.set_params(n_estimators=self.clf.n_estimators + 1,
                            warm_start=True)
    self.DataX = X
    self.DataY = y

    print("The whole available data is: ")
    print(("Real-FIT: dim(X)= [{:d}, {:d}]").format(self.DataX.shape[0],
                                                    self.DataX.shape[1]))
    print(("Real-FIT: dim(y)= [{:d}, {:d}]").format(self.DataY.shape[0],
                                                    self.num_labels))

    self.clf.fit(self.DataX, np.ravel(self.DataY))

    if self.num_train_samples != num_train_samples:
        print("ARRGH: number of samples in X and y do not match!")
    self.is_trained = True
def fit(self, F, y, data_info, time_info):
    '''Register one labelled batch and update the timing bookkeeping.

    On the first call (``self.is_trained`` false) the time-budget score and
    the ``TimeManager`` are set up, it is decided whether multi-value
    features are used, and the feature groups of ``F`` are preprocessed in
    place. On every call the pair ``[self.F, y]`` is appended to
    ``self.data_memory`` (dropping the oldest entry once the window holds
    ``self.batch_window_size`` items) and per-module timings are printed.

    Args:
        F: Mapping with the feature groups 'numerical', 'CAT' and 'MV'.
        y: Labels of the batch; flattened with ``ravel()``.
        data_info: Mapping providing 'loaded_feat_types' (4 counts) and
            'time_budget'.
        time_info: Sequence whose item 1 is the start timestamp of the
            first batch.
    '''
    first_call = not self.is_trained

    # time budget score
    if first_call:
        weights = [1., 1., 2., 4.]
        counts = data_info['loaded_feat_types']
        feat_weight = sum((w * counts[i] for i, w in enumerate(weights)),
                          0.0)
        per_sample_cost = y.shape[0] * 10. / 1e6
        self.budget_score = (float(data_info['time_budget'])
                             / per_sample_cost / feat_weight)
        self.time_manager = TimeManager(self.budget_score,
                                        self.batch_window_size)
        print('budget score: %.2f' % (self.budget_score))

        # Multi-value features are only used when some were loaded and the
        # budget score exceeds 3.5.
        self.use_mv = bool(data_info['loaded_feat_types'][3] != 0
                           and self.budget_score > 3.5)

    # read data
    if first_call:
        self.module_time['read_data'][self.batch_num] = (time.time()
                                                         - time_info[1])

    y = y.ravel()

    # preprocessing
    if first_call:
        module_start = time.time()

        groups = [('numerical', 'ndarray'), ('CAT', 'dataframe')]
        if self.use_mv:
            groups.append(('MV', 'dataframe'))
        for group, input_type in groups:
            F[group] = self.missing_value_preprocess.fit_transform(
                F[group], group, input_type=input_type)

        numerical = data_converter.replace_missing(F['numerical'])
        F['numerical'] = numerical.astype('float32')
        F['CAT'] = F['CAT'].fillna('-1')

        module_end = time.time()
        self.module_time['preprocess'][self.batch_num] = (module_end
                                                          - module_start)
        self.F = F

    # store current batch of data (sliding window)
    if len(self.data_memory) == self.batch_window_size:
        del self.data_memory[0]
    self.data_memory.append([self.F, y])

    self.batch_end_time = time.time()
    if first_call:
        self.batch_start_time = time_info[1]
    else:
        self.batch_start_time = self.next_batch_start_time

    batch_total = self.batch_end_time - self.batch_start_time
    self.overall_time[self.batch_num] = batch_total
    print('overall time spent on batch %d: %.2f seconds' %
          (self.batch_num, self.overall_time[self.batch_num]))

    # `module` is a name defined outside this function.
    for m in module:
        t = self.module_time[m][self.batch_num]
        ratio = t / self.overall_time[self.batch_num]
        print('%s: %.2f seconds, %.2f%%' % (m, t, ratio * 100.))

    if not first_call:
        print('time spent ratio: %.2f%%' % (self.time_spent_ratio))

    self.fit_end_time = time.time()
    self.next_batch_start_time = time.time()
def loadDataMV(self, filename, verbose=True, replace_missing=True):
    '''Load a whitespace-delimited text file, split by feature type.

    The column types are taken from ``self.feat_type`` ('Time',
    'Numerical', 'Categorical', 'Multi-value'). Time and numerical columns
    are concatenated (in that order) into one float64 matrix; categorical
    and multi-value columns are returned as object-dtype DataFrames.

    Args:
        filename: Path of the text file to read.
        verbose: If true, print progress and timing messages.
        replace_missing: If true and ``self.info['format']`` is 'dense', a
            numerical matrix containing NaN is passed through
            ``data_converter.replace_missing``.

    Returns:
        A dict with keys 'numerical' (float64 ndarray), 'MV' and 'CAT'
        (DataFrames, or ``[]`` when absent or when loading failed).
        NOTE(review): when ``self.use_pickle`` is set and a cached pickle
        exists, the unpickled object is returned instead; the cache written
        below holds only the numerical matrix, not this dict.

    Side effects:
        Sets ``self.info['loaded_feat_types']`` to
        ``[ntime, nnum, ncat, nmvc]`` and overwrites the entries of
        ``self.feat_type`` with their string index (see note in the body).
    '''
    if verbose:
        print("========= Reading " + filename)
    ntime = nnum = ncat = nmvc = 0
    start = time.time()

    # find the type of features for the data set
    dictfeats = self.feat_type
    usetime = np.array(np.where(self.feat_type == 'Time'))[0]
    usenum = np.array(np.where(self.feat_type == 'Numerical'))[0]
    usecat = np.array(np.where(self.feat_type == 'Categorical'))[0]
    usemulticat = np.array(np.where(self.feat_type == 'Multi-value'))[0]
    if verbose:
        print("=== Detected %d Numerical Features" % len(usenum))
        print("=== Detected %d Categorical Features" % len(usecat))
        print("=== Detected %d Multi-valued Categorical Features" %
              len(usemulticat))
        print("=== Detected %d Time Features" % len(usetime))

    # artificial headers for features
    # NOTE(review): dictfeats aliases self.feat_type, so this overwrites the
    # stored feature types in place. The column names passed to read_csv
    # below rely on it; a later call on the same object would no longer
    # find any typed feature unless feat_type is reloaded first.
    for i in range(len(dictfeats)):
        dictfeats[i] = str(i)

    # read the first column to identify the number of samples
    df = pd.read_csv(filename,
                     header=None,
                     names=dictfeats,
                     sep=r'\s+',
                     usecols=[0],
                     parse_dates=True,
                     na_values='NaN')
    n_samples = len(df.index)
    if verbose:
        print("=== %d Samples will be loaded " % len(df.index))
    # Placeholder column so blocks can be concatenated; removed further down.
    concadat = np.zeros((n_samples, 1))
    del df

    # Check the available types of features.
    # len(...) > 0 is used for every group: comparing a NumPy index array
    # with [] is element-wise and does not reliably tell whether the array
    # is empty.
    if verbose:
        print("========================")
    if len(usetime) > 0:
        if verbose:
            print("=== Processing %d Time features " % len(usetime))
        try:
            dftime = pd.read_csv(filename,
                                 header=None,
                                 names=self.feat_type[usetime],
                                 usecols=usetime,
                                 sep=r'\s+',
                                 parse_dates=True,
                                 na_values='NaN')
            ddt = np.array(dftime)
            ntime = ddt.shape[1]
            concadat = np.concatenate((concadat, ddt), axis=1)
            del dftime
            del ddt
        except Exception:
            # Best effort: the group is skipped and loading continues.
            print("Failed to load time variables")

    if len(usenum) > 0:
        if verbose:
            print("=== Processing %d Numerical features " % len(usenum))
        try:
            dfnum = pd.read_csv(filename,
                                header=None,
                                names=self.feat_type[usenum],
                                usecols=usenum,
                                sep=r'\s+',
                                na_values='NaN')
            dd = np.array(dfnum)
            nnum = dd.shape[1]
            concadat = np.concatenate((concadat, dd), axis=1)
            del dfnum
            del dd
        except Exception:
            print("Failed to load numerical variables")

    if len(usecat) > 0:
        # categorical features are kept as raw object values
        if verbose:
            print("=== Processing %d Categorical features " % len(usecat))
        try:
            dfcat = pd.read_csv(filename,
                                header=None,
                                names=self.feat_type[usecat],
                                usecols=usecat,
                                dtype=object,
                                sep=r'\s+',
                                na_values='NaN')
            ncat = dfcat.shape[1]
            CAT = dfcat
            del dfcat
        except Exception:
            print("Failed to load Categorical variables")
            CAT = []
    else:
        CAT = []

    if len(usemulticat) > 0:
        if verbose:
            print("=== Processing %d Multi Valued Categorical features " %
                  len(usemulticat))
        try:
            dfmvc = pd.read_csv(filename,
                                header=None,
                                names=self.feat_type[usemulticat],
                                usecols=usemulticat,
                                dtype=object,
                                sep=r'\s+',
                                na_values='NaN')
            nmvc = dfmvc.shape[1]
            MV = dfmvc
            del dfmvc
        except Exception:
            print("Failed to load Multi-Valued Categorical variables")
            MV = []
    else:
        MV = []

    # drop the placeholder column
    concadat = np.delete(concadat, 0, 1)
    self.info['loaded_feat_types'] = [ntime, nnum, ncat, nmvc]

    # Only touch self.tmp_dir when caching is enabled.
    pickle_path = None
    if self.use_pickle:
        pickle_path = os.path.join(self.tmp_dir,
                                   os.path.basename(filename) + ".pickle")

    if self.use_pickle and os.path.exists(pickle_path):
        # Pickle streams are binary: the file must be opened in "rb" mode.
        with open(pickle_path, "rb") as pickle_file:
            vprint(verbose, "Loading pickle file : " + pickle_path)
            # NOTE(review): pickle.load can execute arbitrary code; only
            # safe as long as tmp_dir is written by this program alone.
            return pickle.load(pickle_file)

    if 'format' not in self.info:
        self.getFormatData(filename)
        print("not in self")
    if 'feat_num' not in self.info:
        self.getNbrFeatures(filename)

    dataX = concadat.astype(np.float64).copy(order='C')

    # IMPORTANT (original author's note): when we replace missing values we
    # double the number of variables.
    # np.isnan(...).any() is used instead of np.any(map(...)): on Python 3
    # map() returns an iterator object, which is always truthy.
    if (self.info['format'] == 'dense' and replace_missing
            and np.isnan(dataX).any()):
        vprint(verbose, "Replace missing values by 0 (slow, sorry)")
        dataX = data_converter.replace_missing(dataX)

    if self.use_pickle:
        with open(pickle_path, "wb") as pickle_file:
            vprint(verbose, "Saving pickle file : " + pickle_path)
            p = pickle.Pickler(pickle_file)
            p.fast = True
            p.dump(dataX)

    end = time.time()
    if verbose:
        print("Loaded %d Samples and %d Features" %
              (dataX.shape[0], dataX.shape[1]))
        print("[+] Success in %5.2f sec" % (end - start))

    data = {}
    data['numerical'] = dataX
    data['MV'] = MV
    data['CAT'] = CAT
    return data
def fit(self, F, y, datainfo, timeinfo):
    '''Train or incrementally update ``self.clf`` on a random subsample.

    60% of the samples are dropped at random. On the first call the
    classifier is fitted on the remaining samples. On later calls the
    samples are processed one at a time: the current model predicts the
    sample, the resulting accuracy is fed to ``self.adwin2.insertInput``,
    and the model is updated with ``partial_fit``; when a change is
    reported, the classifier is first replaced by an unfitted clone.

    Args:
        F: Mapping of feature groups; only ``F['numerical']`` is used.
        y: Training labels as a 2-D array (rows are indexed with
            ``[i, :]``).
        datainfo: Unused here.
        timeinfo: Sequence whose items 0 and 1 are the start timestamps used
            for the "overall" and "dataset" elapsed-time messages.

    Side effects:
        Sets ``num_train_samples``, ``DataX``, ``DataY`` and ``is_trained``
        (plus ``num_feat`` / ``num_labels`` for 2-D inputs) on ``self`` and
        updates (possibly replaces) ``self.clf``.
    '''
    # only get numerical variables
    X = F['numerical']

    overall_spenttime = time.time() - timeinfo[0]
    dataset_spenttime = time.time() - timeinfo[1]
    print("[***] Overall time spent %5.2f sec" % overall_spenttime)
    print("[***] Dataset time spent %5.2f sec" % dataset_spenttime)

    # hand NaN entries to the converter
    X = data_converter.replace_missing(X)

    self.num_train_samples = X.shape[0]
    if X.ndim > 1:
        self.num_feat = X.shape[1]
    num_train_samples = y.shape[0]
    if y.ndim > 1:
        self.num_labels = y.shape[1]

    # subsample the data for efficient processing
    removeperc = 0.6
    if removeperc > 0:
        rem_samples = int(num_train_samples * removeperc)
        # Sorted row indices that are KEPT.
        keep = sorted(
            random.sample(range(num_train_samples),
                          num_train_samples - rem_samples))
        num_train_samples = num_train_samples - rem_samples
        X = X[keep, :]
        y = y[keep, :]
        self.num_train_samples = X.shape[0]

    self.DataX = X
    self.DataY = y

    print("The whole available data is: ")
    print(("Real-FIT: dim(X)= [{:d}, {:d}]").format(self.DataX.shape[0],
                                                    self.DataX.shape[1]))
    print(("Real-FIT: dim(y)= [{:d}, {:d}]").format(self.DataY.shape[0],
                                                    self.num_labels))
    print("np.unique(self.DataY)",
          np.unique(self.DataY, return_counts=True))

    if not self.is_trained:
        self.clf.fit(self.DataX, np.ravel(self.DataY))
    else:
        # Test-then-train, one sample at a time.
        for i in range(self.num_train_samples):
            x_row = self.DataX[i, :].reshape(1, -1)
            y_row = self.DataY[i, :]
            print("dim(X)", self.DataX[i, :].shape)
            predictY = self.clf.predict(x_row)
            print("y,predictY:", y_row, predictY)
            print("accuracy score", accuracy_score(predictY, y_row))
            changedetected = self.adwin2.insertInput(
                accuracy_score(predictY, y_row.reshape(1, -1)))
            print("Change Detected:", changedetected)
            if changedetected:
                # Restart from an unfitted copy; the first partial_fit of a
                # fresh estimator is given the class list explicitly.
                self.clf = clone(self.clf)
                self.clf.partial_fit(x_row,
                                     y_row.ravel(),
                                     classes=np.unique(self.DataY))
            else:
                self.clf.partial_fit(x_row, y_row.ravel())

    if self.num_train_samples != num_train_samples:
        print("ARRGH: number of samples in X and y do not match!")
    self.is_trained = True