def train_and_get_result(_df, _dft, store_item_nbrs, model, total_features):
    df = _df.copy()
    df_t = _dft.copy()
    RES = []
    total = 0
    # Fit one regression model per (store_nbr, item_nbr) pair and predict on the test frame.
    for sno, ino in store_item_nbrs:
        if sno == 35:
            continue
        res = pd.DataFrame()
        df1 = df[(df.store_nbr == sno) & (df.item_nbr == ino)]
        X_train, y_train = ut.get_train_data(df1)
        X_train = X_train.drop(['store_nbr', 'item_nbr'], axis=1)
        y_train = y_train[X_train.index.values]
        df2 = df_t[(df_t.store_nbr == sno) & (df_t.item_nbr == ino)]
        X_predict = ut.get_test_data(df2)
        res['date'] = X_predict['date']
        res['store_nbr'] = X_predict['store_nbr']
        res['item_nbr'] = X_predict['item_nbr']
        X_predict = X_predict.drop(['date', 'store_nbr', 'item_nbr'], axis=1)
        X_train = X_train[ut.get_features()]
        X_predict = X_predict[ut.get_features()]
        regr = ut.get_regression_model(model, len(X_train.values))
        regr.fit(X_train.values.astype(float), y_train.values)
        # Clip negative predictions to zero; the log1p target cannot be negative.
        res['log1p'] = np.maximum(regr.predict(X_predict.values.astype(float)), 0.)
        RES.append(res)
        total += 1
        print('done', total)
    result = pd.concat(RES)
    return result

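# Hedged usage sketch for train_and_get_result above. The file names, the 'linear' model key,
# and the way store/item pairs are collected are illustrative assumptions; `ut` is the helper
# module already referenced by the snippet.
import pandas as pd

def run_train_and_get_result_example():
    df_train = pd.read_csv('train_processed.csv')   # hypothetical preprocessed training frame
    df_test = pd.read_csv('test_processed.csv')     # hypothetical preprocessed test frame
    store_item_nbrs = sorted(set(zip(df_train.store_nbr, df_train.item_nbr)))
    result = train_and_get_result(df_train, df_test, store_item_nbrs,
                                  model='linear', total_features=None)
    result.to_csv('predictions_log1p.csv', index=False)
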
def execute(data, training_data_ratio=2.0 / 3.0, k=1):
    """
    Execute the "Locally-Weighted" Linear Regression (using Closed-Form Linear Regression)
    :param data: Raw Data frame parsed from CSV
    :param training_data_ratio: The fraction (0.0 to 1.0) of input data to use in training.
    :param k: Smoothing parameter for local weight computation
    :return: Root mean squared error (RMSE) over the test set
    """
    # 2. Randomize the data
    randomized_data = util.randomize_data(data)

    # 3. Select the first 2/3 (round up) of the data for training and the remaining for testing
    training_data, test_data = util.split_data(randomized_data, training_data_ratio)
    training_outputs = util.get_output(training_data)

    # 4. Standardize the data (except for the last column, of course) using the training data
    standardized_training_data, mean, std = util.standardize_data(util.get_features(training_data))

    # Add offset column at the front
    standardized_training_data.insert(0, "Bias", 1)

    std_test_data, _, _ = util.standardize_data(util.get_features(test_data), mean, std)
    std_test_data.insert(0, "Bias", 1)

    squared_errors = []

    # 5. Then for each testing sample
    for i in xrange(0, len(std_test_data)):
        testing_sample = std_test_data.iloc[i]
        expected_output = test_data.loc[testing_sample.name][-1]

        # (a) Compute the locally-weighted model parameters for this query point.
        theta_query = compute_theta_query(testing_sample, standardized_training_data, training_outputs, k)

        # (b) Evaluate the testing sample using the local model.
        actual_output = np.dot(testing_sample, theta_query)

        # (c) Compute the squared error of the testing sample.
        squared_errors.append(util.compute_se(expected_output, actual_output))

    # 6. Compute the root mean squared error (RMSE)
    sum_of_squared_errors = sum(squared_errors)
    mean_squared_error = sum_of_squared_errors / len(squared_errors)
    rmse = math.sqrt(mean_squared_error)

    return rmse

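# Minimal sketch of a compute_theta_query helper consistent with the call above, assuming the
# standard locally-weighted closed form theta = (X^T W X)^{-1} X^T W y with Gaussian weights
# w_i = exp(-||x_query - x_i||^2 / k^2). Illustrative only; not the author's implementation.
import numpy as np

def compute_theta_query_sketch(query, train_inputs, train_outputs, k):
    X = np.asarray(train_inputs, dtype=float)    # (n, d) standardized training inputs incl. bias column
    y = np.asarray(train_outputs, dtype=float)   # (n,) training targets
    q = np.asarray(query, dtype=float)           # (d,) standardized query sample
    sq_dists = np.sum((X - q) ** 2, axis=1)      # squared distance of each training sample to the query
    weights = np.exp(-sq_dists / float(k ** 2))  # local Gaussian weights
    W = np.diag(weights)
    # Closed-form weighted least squares solution for this query point
    return np.linalg.pinv(X.T.dot(W).dot(X)).dot(X.T).dot(W).dot(y)
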
def _evaluate (indata):
    if 'clustering_k' in indata:
        first_arg=indata['clustering_k']
    elif 'clustering_merges' in indata:
        first_arg=indata['clustering_merges']
    else:
        return False

    feats=util.get_features(indata, 'distance_')
    dfun=eval(indata['distance_name'])
    distance=dfun(feats['train'], feats['train'])

    cfun=eval(indata['clustering_name'])
    clustering=cfun(first_arg, distance)
    clustering.train()

    if 'clustering_radi' in indata:
        radi=max(abs(clustering.get_radiuses()-indata['clustering_radi']))
        centers=max(abs(clustering.get_cluster_centers().flatten() - \
            indata['clustering_centers'].flat))
        return util.check_accuracy(indata['clustering_accuracy'],
            radi=radi, centers=centers)
    elif 'clustering_merge_distance' in indata:
        merge_distance=max(abs(clustering.get_merge_distances()- \
            indata['clustering_merge_distance']))
        pairs=max(abs(clustering.get_cluster_pairs()- \
            indata['clustering_pairs']).flat)
        return util.check_accuracy(indata['clustering_accuracy'],
            merge_distance=merge_distance, pairs=pairs)
    else:
        return util.check_accuracy(indata['clustering_accuracy'])

def train_and_get_test(_df, store_item_nbrs, model, total_features):
    df = _df.copy()
    regrs = []
    tests = []
    total = 0
    score_total = []
    for sno, ino in store_item_nbrs:
        if sno == 35:
            continue
        df1 = df[(df.store_nbr == sno) & (df.item_nbr == ino)]
        df_test, df_train = ut.get_random_test_and_train(df1)
        X_train, y_train = ut.get_train_data(df_train)
        X_train = X_train.drop(['store_nbr', 'item_nbr'], axis=1)
        y_train = y_train[X_train.index.values]
        X_train = X_train[ut.get_features()]
        regr = ut.get_regression_model(model, len(X_train))
        # regr.fit(ut.get_processed_X(X_train.values), y_train.values)
        # cross_val_score returns negated MSE under this scoring convention, hence the sign flip below.
        scores = cross_val_score(regr, X_train.values, y_train.values,
                                 scoring="mean_squared_error", cv=10)
        print('done, ', total)
        print(-np.mean(scores))
        score_total.append(-np.mean(scores))
        regrs.append(regr)
        tests.append(df_test)
        total += 1
    print('total_score: {}'.format(np.mean(score_total)))
    return regrs, tests

def _evaluate(indata):
    if indata.has_key('clustering_k'):
        first_arg = indata['clustering_k']
    elif indata.has_key('clustering_merges'):
        first_arg = indata['clustering_merges']
    else:
        return False

    feats = util.get_features(indata, 'distance_')
    dfun = eval(indata['distance_name'])
    distance = dfun(feats['train'], feats['train'])

    cfun = eval(indata['clustering_name'])
    clustering = cfun(first_arg, distance)
    clustering.train()

    if indata.has_key('clustering_radi'):
        radi = max(abs(clustering.get_radiuses() - indata['clustering_radi']))
        centers = max(abs(clustering.get_cluster_centers().flatten() - \
            indata['clustering_centers'].flat))
        return util.check_accuracy(indata['clustering_accuracy'],
            radi=radi, centers=centers)
    elif indata.has_key('clustering_merge_distance'):
        merge_distance = max(abs(clustering.get_merge_distances() - \
            indata['clustering_merge_distance']))
        pairs = max(abs(clustering.get_cluster_pairs() - \
            indata['clustering_pairs']).flat)
        return util.check_accuracy(indata['clustering_accuracy'],
            merge_distance=merge_distance, pairs=pairs)
    else:
        return util.check_accuracy(indata['clustering_accuracy'])

def execute(data, num_folds=5):
    """
    Compute the Root Mean Squared Error using num_folds for cross validation
    :param data: Raw Data frame parsed from CSV
    :param num_folds: The number of folds to use
    :return: Root Mean Squared Error
    """
    assert data is not None, "data must be a valid DataFrame"
    assert num_folds > 1, "num_folds must be greater than one."

    # 2. Randomizes the data
    randomized_data = util.randomize_data(data)

    # 3. Creates S folds (for our purposes S = 5, but make your code generalizable,
    #    that is it should work for any legal value of S)
    folds = divide_data(randomized_data, num_folds)

    squared_errors = []

    # 4. For i = 1 to S
    for i in xrange(0, num_folds):
        # (a) Select fold i as your testing data and the remaining (S - 1) folds as your training data
        test_data = folds[i]
        training_data = select_training_data(folds, i)

        # (b) Standardizes the data (except for the last column of course) based on the training data
        standardized_train_data, mean, std = util.standardize_data(util.get_features(training_data))

        # Add offset column at the front
        standardized_train_data.insert(0, "Bias", 1)

        # (c) Train a closed-form linear regression model
        training_outputs = util.get_output(training_data)
        weights = cflr.find_weights(standardized_train_data, training_outputs)

        # (d) Compute the squared error for each sample in the current testing fold
        expected = util.get_output(test_data)
        actual = cflr.apply_solution(util.get_features(test_data), mean, std, weights)
        squared_error = (expected - actual) ** 2
        squared_errors.append(squared_error)

    # 5. Compute the RMSE using all the errors.
    rmse = compute_rmse(len(data), squared_errors)
    return rmse

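# Minimal sketch of the compute_rmse helper used in step 5 above, assuming it pools the squared
# errors from every fold and divides by the total number of samples before taking the square root.
# Hypothetical helper for illustration; the project's own divide_data/select_training_data are not shown.
import math

def compute_rmse_sketch(num_samples, squared_errors_per_fold):
    total = sum(float(fold_errors.sum()) for fold_errors in squared_errors_per_fold)
    return math.sqrt(total / num_samples)
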
def _evaluate(indata):
    prefix = 'classifier_'
    ctype = indata[prefix + 'type']
    if indata[prefix + 'name'] == 'KNN':
        feats = util.get_features(indata, 'distance_')
    elif ctype == 'kernel':
        feats = util.get_features(indata, 'kernel_')
    else:
        feats = util.get_features(indata, prefix)

    machine = _get_machine(indata, prefix, feats)

    try:
        fun = eval(indata[prefix + 'name'])
    except NameError, e:
        print "%s is disabled/unavailable!" % indata[prefix + 'name']
        return False

def _evaluate (indata):
    prefix='classifier_'
    ctype=indata[prefix+'type']
    if indata[prefix+'name']=='KNN':
        feats=util.get_features(indata, 'distance_')
    elif ctype=='kernel':
        feats=util.get_features(indata, 'kernel_')
    else:
        feats=util.get_features(indata, prefix)

    machine=_get_machine(indata, prefix, feats)

    try:
        fun=eval(indata[prefix+'name'])
    except NameError, e:
        print "%s is disabled/unavailable!"%indata[prefix+'name']
        return False

def _evaluate(indata): prefix = "classifier_" ctype = indata[prefix + "type"] if indata[prefix + "name"] == "KNN": feats = util.get_features(indata, "distance_") elif ctype == "kernel": feats = util.get_features(indata, "kernel_") else: feats = util.get_features(indata, prefix) machine = _get_machine(indata, prefix, feats) try: fun = eval(indata[prefix + "name"]) except NameError, e: print "%s is disabled/unavailable!" % indata[prefix + "name"] return False
def _evaluate(indata):
    prefix = 'kernel_'
    feats = util.get_features(indata, prefix)
    kargs = util.get_args(indata, prefix)
    fun = eval(indata[prefix + 'name'] + 'Kernel')
    kernel = fun(feats['train'], feats['train'], *kargs)

    prefix = 'regression_'
    kernel.parallel.set_num_threads(indata[prefix + 'num_threads'])

    try:
        name = indata[prefix + 'name']
        if (name == 'KERNELRIDGEREGRESSION'):
            name = 'KernelRidgeRegression'
        rfun = eval(name)
    except NameError as e:
        print("%s is disabled/unavailable!" % indata[prefix + 'name'])
        return False

    labels = RegressionLabels(double(indata[prefix + 'labels']))
    if indata[prefix + 'type'] == 'svm':
        regression = rfun(indata[prefix + 'C'], indata[prefix + 'epsilon'], kernel, labels)
    elif indata[prefix + 'type'] == 'kernelmachine':
        regression = rfun(indata[prefix + 'tau'], kernel, labels)
    else:
        return False

    regression.parallel.set_num_threads(indata[prefix + 'num_threads'])
    if prefix + 'tube_epsilon' in indata:
        regression.set_tube_epsilon(indata[prefix + 'tube_epsilon'])

    regression.train()

    alphas = 0
    bias = 0
    sv = 0
    if prefix + 'bias' in indata:
        bias = abs(regression.get_bias() - indata[prefix + 'bias'])
    if prefix + 'alphas' in indata:
        for item in regression.get_alphas().tolist():
            alphas += item
        alphas = abs(alphas - indata[prefix + 'alphas'])
    if prefix + 'support_vectors' in indata:
        for item in regression.get_support_vectors().tolist():
            sv += item
        sv = abs(sv - indata[prefix + 'support_vectors'])

    kernel.init(feats['train'], feats['test'])
    classified = max(
        abs(regression.apply().get_labels() - indata[prefix + 'classified']))

    return util.check_accuracy(indata[prefix + 'accuracy'],
                               alphas=alphas, bias=bias,
                               support_vectors=sv, classified=classified)

def _evaluate (indata):
    prefix='kernel_'
    feats=util.get_features(indata, prefix)
    kargs=util.get_args(indata, prefix)
    fun=eval(indata[prefix+'name']+'Kernel')
    kernel=fun(feats['train'], feats['train'], *kargs)

    prefix='regression_'
    kernel.parallel.set_num_threads(indata[prefix+'num_threads'])

    try:
        name = indata[prefix+'name']
        if (name=='KERNELRIDGEREGRESSION'):
            name = 'KernelRidgeRegression'
        rfun=eval(name)
    except NameError as e:
        print("%s is disabled/unavailable!"%indata[prefix+'name'])
        return False

    labels=RegressionLabels(double(indata[prefix+'labels']))
    if indata[prefix+'type']=='svm':
        regression=rfun(
            indata[prefix+'C'], indata[prefix+'epsilon'], kernel, labels)
    elif indata[prefix+'type']=='kernelmachine':
        regression=rfun(indata[prefix+'tau'], kernel, labels)
    else:
        return False

    regression.parallel.set_num_threads(indata[prefix+'num_threads'])
    if prefix+'tube_epsilon' in indata:
        regression.set_tube_epsilon(indata[prefix+'tube_epsilon'])

    regression.train()

    alphas=0
    bias=0
    sv=0
    if prefix+'bias' in indata:
        bias=abs(regression.get_bias()-indata[prefix+'bias'])
    if prefix+'alphas' in indata:
        for item in regression.get_alphas().tolist():
            alphas+=item
        alphas=abs(alphas-indata[prefix+'alphas'])
    if prefix+'support_vectors' in indata:
        for item in regression.get_support_vectors().tolist():
            sv+=item
        sv=abs(sv-indata[prefix+'support_vectors'])

    kernel.init(feats['train'], feats['test'])
    classified=max(abs(
        regression.apply().get_labels()-indata[prefix+'classified']))

    return util.check_accuracy(indata[prefix+'accuracy'], alphas=alphas,
        bias=bias, support_vectors=sv, classified=classified)

def execute(data):
    """
    :param data: Raw Data frame parsed from CSV
    :return: Tuple of (weights, RMSE of the test data)
    """
    # 2. Randomizes the data
    randomized_data = util.randomize_data(data)

    # 3. Selects the first 2/3 (round up) of the data for training and the remaining for testing
    training_data_size = 2.0 / 3.0
    training_data, test_data = util.split_data(randomized_data, training_data_size)

    # Capture the target outputs for training
    training_outputs = training_data[training_data.columns[-1]]

    # 4. Standardizes the data (except for the last column of course) using the training data
    training_inputs, training_mean, training_std = util.standardize_data(util.get_features(training_data))

    # Add offset column at the front
    training_inputs.insert(0, "Bias", 1)

    # 5. Computes the closed-form solution of linear regression
    weights = find_weights(training_inputs, training_outputs)

    # 6. Applies the solution to the testing samples
    test_input = util.get_features(test_data)
    expected = util.get_output(test_data)
    actual = apply_solution(test_input, training_mean, training_std, weights)

    # 7. Computes the root mean squared error (RMSE)
    rmse = util.compute_rmse(expected, actual)

    return weights, rmse

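# Hedged sketch of the closed-form helpers used above, assuming ordinary least squares via the
# normal equations (w = (X^T X)^{-1} X^T y) and that apply_solution standardizes test inputs with
# the training mean/std and prepends the bias column. Illustrative helpers, not the project's own.
import numpy as np

def find_weights_sketch(inputs, outputs):
    X = np.asarray(inputs, dtype=float)
    y = np.asarray(outputs, dtype=float)
    return np.linalg.pinv(X.T.dot(X)).dot(X.T).dot(y)

def apply_solution_sketch(test_inputs, train_mean, train_std, weights):
    X = (np.asarray(test_inputs, dtype=float) - np.asarray(train_mean, dtype=float)) \
        / np.asarray(train_std, dtype=float)
    X = np.column_stack([np.ones(len(X)), X])  # bias column at the front, matching training
    return X.dot(weights)
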
def _evaluate (indata):
    prefix='kernel_'
    feats=util.get_features(indata, prefix)
    kargs=util.get_args(indata, prefix)
    fun=eval(indata[prefix+'name']+'Kernel')
    kernel=fun(feats['train'], feats['train'], *kargs)

    prefix='regression_'
    kernel.parallel.set_num_threads(indata[prefix+'num_threads'])

    try:
        rfun=eval(indata[prefix+'name'])
    except NameError, e:
        print "%s is disabled/unavailable!"%indata[prefix+'name']
        return False

def _evaluate(indata):
    prefix = 'kernel_'
    feats = util.get_features(indata, prefix)
    kargs = util.get_args(indata, prefix)
    fun = eval(indata[prefix + 'name'] + 'Kernel')
    kernel = fun(feats['train'], feats['train'], *kargs)

    prefix = 'regression_'
    kernel.parallel.set_num_threads(indata[prefix + 'num_threads'])

    try:
        rfun = eval(indata[prefix + 'name'])
    except NameError, e:
        print "%s is disabled/unavailable!" % indata[prefix + 'name']
        return False

def _evaluate(indata): prefix = "kernel_" feats = util.get_features(indata, prefix) kargs = util.get_args(indata, prefix) fun = eval(indata[prefix + "name"] + "Kernel") kernel = fun(feats["train"], feats["train"], *kargs) prefix = "regression_" kernel.parallel.set_num_threads(indata[prefix + "num_threads"]) try: rfun = eval(indata[prefix + "name"]) except NameError, e: print "%s is disabled/unavailable!" % indata[prefix + "name"] return False
def _evaluate(indata):
    prefix = 'distribution_'
    feats = util.get_features(indata, prefix)

    if indata[prefix + 'name'] == 'HMM':
        distribution = HMM(feats['train'], indata[prefix + 'N'],
                           indata[prefix + 'M'], indata[prefix + 'pseudo'])
        distribution.train()
        distribution.baum_welch_viterbi_train(BW_NORMAL)
    else:
        dfun = eval(indata[prefix + 'name'])
        distribution = dfun(feats['train'])
        distribution.train()

    likelihood = distribution.get_log_likelihood_sample()
    num_examples = feats['train'].get_num_vectors()
    num_param = distribution.get_num_model_parameters()
    derivatives = 0
    for i in xrange(num_param):
        for j in xrange(num_examples):
            val = distribution.get_log_derivative(i, j)
            if val != -inf and val == val:  # only consider sparse matrix! (val == val filters out NaN)
                derivatives += val

    derivatives = abs(derivatives - indata[prefix + 'derivatives'])
    likelihood = abs(likelihood - indata[prefix + 'likelihood'])

    if indata[prefix + 'name'] == 'HMM':
        best_path = 0
        best_path_state = 0
        for i in xrange(indata[prefix + 'num_examples']):
            best_path += distribution.best_path(i)
            for j in xrange(indata[prefix + 'N']):
                best_path_state += distribution.get_best_path_state(i, j)

        best_path = abs(best_path - indata[prefix + 'best_path'])
        best_path_state = abs(best_path_state -
                              indata[prefix + 'best_path_state'])

        return util.check_accuracy(indata[prefix + 'accuracy'],
                                   derivatives=derivatives,
                                   likelihood=likelihood,
                                   best_path=best_path,
                                   best_path_state=best_path_state)
    else:
        return util.check_accuracy(indata[prefix + 'accuracy'],
                                   derivatives=derivatives,
                                   likelihood=likelihood)

def _evaluate (indata):
    prefix='distance_'
    feats=util.get_features(indata, prefix)

    dfun=eval(indata[prefix+'name'])
    dargs=util.get_args(indata, prefix)
    distance=dfun(feats['train'], feats['train'], *dargs)

    dm_train=max(abs(
        indata[prefix+'matrix_train']-distance.get_distance_matrix()).flat)
    distance.init(feats['train'], feats['test'])
    dm_test=max(abs(
        indata[prefix+'matrix_test']-distance.get_distance_matrix()).flat)

    return util.check_accuracy(
        indata[prefix+'accuracy'], dm_train=dm_train, dm_test=dm_test)

def _evaluate (indata, prefix):
    feats=util.get_features(indata, prefix)
    kfun=eval(indata[prefix+'name']+'Kernel')
    kargs=util.get_args(indata, prefix)
    kernel=kfun(*kargs)
    if indata.has_key(prefix+'normalizer'):
        kernel.set_normalizer(eval(indata[prefix+'normalizer']+'()'))

    kernel.init(feats['train'], feats['train'])
    km_train=max(abs(
        indata[prefix+'matrix_train']-kernel.get_kernel_matrix()).flat)
    kernel.init(feats['train'], feats['test'])
    km_test=max(abs(
        indata[prefix+'matrix_test']-kernel.get_kernel_matrix()).flat)

    return util.check_accuracy(
        indata[prefix+'accuracy'], km_train=km_train, km_test=km_test)

def _evaluate(indata): prefix = "kernel_" feats = util.get_features(indata, prefix) kfun = eval(indata[prefix + "name"] + "Kernel") kargs = util.get_args(indata, prefix) prefix = "preproc_" pargs = util.get_args(indata, prefix) feats = util.add_preproc(indata[prefix + "name"], feats, *pargs) prefix = "kernel_" kernel = kfun(feats["train"], feats["train"], *kargs) km_train = max(abs(indata[prefix + "matrix_train"] - kernel.get_kernel_matrix()).flat) kernel.init(feats["train"], feats["test"]) km_test = max(abs(indata[prefix + "matrix_test"] - kernel.get_kernel_matrix()).flat) return util.check_accuracy(indata[prefix + "accuracy"], km_train=km_train, km_test=km_test)
def _evaluate (indata):
    prefix='distribution_'
    feats=util.get_features(indata, prefix)

    if indata[prefix+'name']=='HMM':
        distribution=HMM(feats['train'], indata[prefix+'N'], indata[prefix+'M'],
            indata[prefix+'pseudo'])
        distribution.train()
        distribution.baum_welch_viterbi_train(BW_NORMAL)
    else:
        dfun=eval(indata[prefix+'name'])
        distribution=dfun(feats['train'])
        distribution.train()

    likelihood=distribution.get_log_likelihood_sample()
    num_examples=feats['train'].get_num_vectors()
    num_param=distribution.get_num_model_parameters()
    derivatives=0
    for i in xrange(num_param):
        for j in xrange(num_examples):
            val=distribution.get_log_derivative(i, j)
            if val!=-inf and val==val: # only consider sparse matrix! (val==val filters out NaN)
                derivatives+=val

    derivatives=abs(derivatives-indata[prefix+'derivatives'])
    likelihood=abs(likelihood-indata[prefix+'likelihood'])

    if indata[prefix+'name']=='HMM':
        best_path=0
        best_path_state=0
        for i in xrange(indata[prefix+'num_examples']):
            best_path+=distribution.best_path(i)
            for j in xrange(indata[prefix+'N']):
                best_path_state+=distribution.get_best_path_state(i, j)

        best_path=abs(best_path-indata[prefix+'best_path'])
        best_path_state=abs(best_path_state-\
            indata[prefix+'best_path_state'])

        return util.check_accuracy(indata[prefix+'accuracy'],
            derivatives=derivatives, likelihood=likelihood,
            best_path=best_path, best_path_state=best_path_state)
    else:
        return util.check_accuracy(indata[prefix+'accuracy'],
            derivatives=derivatives, likelihood=likelihood)

def main():
    t1 = time.time()
    X, y, raw = util.get_data('../data/subset.csv')
    new_features = util.get_features(raw)  # get homegrown features
    vect = TfidfVectorizer(min_df=2)
    X_dtm = vect.fit_transform(X)
    info_gains = np.apply_along_axis(util.info_gain, 0, X_dtm.toarray(), y, 0.00001)
    num_features = 2000
    max_cols = info_gains.argsort()[-num_features:][::-1]
    # print_vocab(vect, max_cols)
    X = X_dtm[:, max_cols].toarray()  # turn X from sparse matrix to numpy array
    for new_feature in new_features:  # add our features as columns to X
        X = np.append(X, new_feature.reshape(-1, 1), axis=1)
    print("data matrix shape", X.shape)
    print("preprocessing took", str(time.time() - t1), "seconds")
    tune_rbf(X, y)

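# Hedged sketch of what the tune_rbf step above might do: the original tune_rbf is not shown,
# so this assumes a scikit-learn grid search over C and gamma for an RBF-kernel SVM. The
# parameter grid and cv=5 are illustrative assumptions.
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

def tune_rbf_sketch(X, y):
    param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [1e-4, 1e-3, 1e-2, 1e-1]}
    search = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=5, n_jobs=-1)
    search.fit(X, y)
    print("best params:", search.best_params_, "best CV accuracy:", search.best_score_)
    return search.best_estimator_
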
def _evaluate(indata, prefix):
    feats = util.get_features(indata, prefix)
    kfun = eval(indata[prefix + 'name'] + 'Kernel')
    kargs = util.get_args(indata, prefix)
    kernel = kfun(*kargs)
    if indata.has_key(prefix + 'normalizer'):
        kernel.set_normalizer(eval(indata[prefix + 'normalizer'] + '()'))

    kernel.init(feats['train'], feats['train'])
    km_train = max(abs(
        indata[prefix + 'matrix_train'] - kernel.get_kernel_matrix()).flat)
    kernel.init(feats['train'], feats['test'])
    km_test = max(abs(
        indata[prefix + 'matrix_test'] - kernel.get_kernel_matrix()).flat)

    return util.check_accuracy(indata[prefix + 'accuracy'],
                               km_train=km_train, km_test=km_test)

def _evaluate(indata):
    prefix = 'distance_'
    feats = util.get_features(indata, prefix)

    dfun = eval(indata[prefix + 'name'])
    dargs = util.get_args(indata, prefix)
    distance = dfun(feats['train'], feats['train'], *dargs)

    dm_train = max(abs(
        indata[prefix + 'matrix_train'] - distance.get_distance_matrix()).flat)
    distance.init(feats['train'], feats['test'])
    dm_test = max(abs(
        indata[prefix + 'matrix_test'] - distance.get_distance_matrix()).flat)

    return util.check_accuracy(indata[prefix + 'accuracy'],
                               dm_train=dm_train, dm_test=dm_test)

def _evaluate_combined (indata, prefix):
    kernel=CombinedKernel()
    feats={'train':CombinedFeatures(), 'test':CombinedFeatures()}

    subkernels=_get_subkernels(indata, prefix)
    for subk in subkernels.itervalues():
        feats_subk=util.get_features(subk, '')
        feats['train'].append_feature_obj(feats_subk['train'])
        feats['test'].append_feature_obj(feats_subk['test'])
        kernel.append_kernel(subk['kernel'])

    kernel.init(feats['train'], feats['train'])
    km_train=max(abs(
        indata['kernel_matrix_train']-kernel.get_kernel_matrix()).flat)
    kernel.init(feats['train'], feats['test'])
    km_test=max(abs(
        indata['kernel_matrix_test']-kernel.get_kernel_matrix()).flat)

    return util.check_accuracy(indata[prefix+'accuracy'],
        km_train=km_train, km_test=km_test)

def extractor(cap, dim):
    # Read and scale the first frame; its grayscale version seeds the optical flow computation.
    _, frame = cap.read()
    frame = scale(frame, dim)
    prev_gray = cv.cvtColor(frame, cv.COLOR_BGR2GRAY)
    prev_flow = None
    for i in count(0):
        _, frame = cap.read()
        if frame is None:
            break
        frame = scale(frame, dim)
        gray = cv.cvtColor(frame, cv.COLOR_BGR2GRAY)
        # Dense optical flow between the previous and current frame
        flow = cv.calcOpticalFlowFarneback(prev_gray, gray, None,
                                           0.5, 3, 15, 3, 5, 1.2, 0)
        mag, ang = cv.cartToPolar(flow[..., 0], flow[..., 1])
        if i > 1:
            # Yield features computed from two consecutive flow fields
            yield get_features(prev_flow, flow)
        prev_gray = gray
        prev_flow = flow

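# Hedged usage example for the extractor generator above. The video path and frame size are
# hypothetical; scale() and get_features() are the helpers already referenced by the snippet,
# and `cv` is the OpenCV alias it uses.
def run_extractor_example(video_path='clip.mp4', dim=(320, 240)):
    cap = cv.VideoCapture(video_path)
    features = list(extractor(cap, dim))  # one feature vector per consecutive pair of flow fields
    cap.release()
    return features
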
def _evaluate_auc (indata, prefix):
    subk=_get_subkernels(indata, prefix)['0']
    feats_subk=util.get_features(subk, '')
    subk['kernel'].init(feats_subk['train'], feats_subk['test'])

    feats={
        'train': WordFeatures(indata[prefix+'data_train'].astype(ushort)),
        'test': WordFeatures(indata[prefix+'data_test'].astype(ushort))
    }
    kernel=AUCKernel(10, subk['kernel'])

    kernel.init(feats['train'], feats['train'])
    km_train=max(abs(
        indata[prefix+'matrix_train']-kernel.get_kernel_matrix()).flat)
    kernel.init(feats['train'], feats['test'])
    km_test=max(abs(
        indata[prefix+'matrix_test']-kernel.get_kernel_matrix()).flat)

    return util.check_accuracy(indata[prefix+'accuracy'],
        km_train=km_train, km_test=km_test)

def _evaluate (indata):
    prefix='kernel_'
    feats=util.get_features(indata, prefix)
    kfun=eval(indata[prefix+'name']+'Kernel')
    kargs=util.get_args(indata, prefix)

    prefix='preprocessor_'
    pargs=util.get_args(indata, prefix)
    feats=util.add_preprocessor(indata[prefix+'name'], feats, *pargs)

    prefix='kernel_'
    kernel=kfun(feats['train'], feats['train'], *kargs)

    km_train=max(abs(
        indata[prefix+'matrix_train']-kernel.get_kernel_matrix()).flat)
    kernel.init(feats['train'], feats['test'])
    km_test=max(abs(
        indata[prefix+'matrix_test']-kernel.get_kernel_matrix()).flat)

    return util.check_accuracy(
        indata[prefix+'accuracy'], km_train=km_train, km_test=km_test)

def _evaluate_top_fisher(indata, prefix):
    feats = {}
    wordfeats = util.get_features(indata, prefix)

    pos_train = HMM(wordfeats['train'], indata[prefix + 'N'],
                    indata[prefix + 'M'], indata[prefix + 'pseudo'])
    pos_train.train()
    pos_train.baum_welch_viterbi_train(BW_NORMAL)
    neg_train = HMM(wordfeats['train'], indata[prefix + 'N'],
                    indata[prefix + 'M'], indata[prefix + 'pseudo'])
    neg_train.train()
    neg_train.baum_welch_viterbi_train(BW_NORMAL)
    pos_test = HMM(pos_train)
    pos_test.set_observations(wordfeats['test'])
    neg_test = HMM(neg_train)
    neg_test.set_observations(wordfeats['test'])

    if indata[prefix + 'name'] == 'TOP':
        feats['train'] = TOPFeatures(10, pos_train, neg_train, False, False)
        feats['test'] = TOPFeatures(10, pos_test, neg_test, False, False)
    else:
        feats['train'] = FKFeatures(10, pos_train, neg_train)
        feats['train'].set_opt_a(-1)  # estimate prior
        feats['test'] = FKFeatures(10, pos_test, neg_test)
        feats['test'].set_a(feats['train'].get_a())  # use prior from training data

    prefix = 'kernel_'
    args = util.get_args(indata, prefix)
    kernel = PolyKernel(feats['train'], feats['train'], *args)
    # kernel = PolyKernel(*args)
    # kernel.init(feats['train'], feats['train'])
    km_train = max(abs(
        indata[prefix + 'matrix_train'] - kernel.get_kernel_matrix()).flat)
    kernel.init(feats['train'], feats['test'])
    km_test = max(abs(
        indata[prefix + 'matrix_test'] - kernel.get_kernel_matrix()).flat)

    return util.check_accuracy(indata[prefix + 'accuracy'],
                               km_train=km_train, km_test=km_test)

def _evaluate_combined(indata, prefix):
    kernel = CombinedKernel()
    feats = {'train': CombinedFeatures(), 'test': CombinedFeatures()}

    subkernels = _get_subkernels(indata, prefix)
    for subk in subkernels.itervalues():
        feats_subk = util.get_features(subk, '')
        feats['train'].append_feature_obj(feats_subk['train'])
        feats['test'].append_feature_obj(feats_subk['test'])
        kernel.append_kernel(subk['kernel'])

    kernel.init(feats['train'], feats['train'])
    km_train = max(abs(
        indata['kernel_matrix_train'] - kernel.get_kernel_matrix()).flat)
    kernel.init(feats['train'], feats['test'])
    km_test = max(abs(
        indata['kernel_matrix_test'] - kernel.get_kernel_matrix()).flat)

    return util.check_accuracy(indata[prefix + 'accuracy'],
                               km_train=km_train, km_test=km_test)

def _evaluate_auc(indata, prefix):
    subk = _get_subkernels(indata, prefix)['0']
    feats_subk = util.get_features(subk, '')
    subk['kernel'].init(feats_subk['train'], feats_subk['test'])

    feats = {
        'train': WordFeatures(indata[prefix + 'data_train'].astype(ushort)),
        'test': WordFeatures(indata[prefix + 'data_test'].astype(ushort))
    }
    kernel = AUCKernel(10, subk['kernel'])

    kernel.init(feats['train'], feats['train'])
    km_train = max(abs(
        indata[prefix + 'matrix_train'] - kernel.get_kernel_matrix()).flat)
    kernel.init(feats['train'], feats['test'])
    km_test = max(abs(
        indata[prefix + 'matrix_test'] - kernel.get_kernel_matrix()).flat)

    return util.check_accuracy(indata[prefix + 'accuracy'],
                               km_train=km_train, km_test=km_test)

def _evaluate(indata):
    prefix = 'kernel_'
    feats = util.get_features(indata, prefix)
    kfun = eval(indata[prefix + 'name'] + 'Kernel')
    kargs = util.get_args(indata, prefix)

    prefix = 'preproc_'
    pargs = util.get_args(indata, prefix)
    feats = util.add_preproc(indata[prefix + 'name'], feats, *pargs)

    prefix = 'kernel_'
    kernel = kfun(feats['train'], feats['train'], *kargs)

    km_train = max(abs(
        indata[prefix + 'matrix_train'] - kernel.get_kernel_matrix()).flat)
    kernel.init(feats['train'], feats['test'])
    km_test = max(abs(
        indata[prefix + 'matrix_test'] - kernel.get_kernel_matrix()).flat)

    return util.check_accuracy(indata[prefix + 'accuracy'],
                               km_train=km_train, km_test=km_test)

def _evaluate_pie (indata, prefix):
    pie=PluginEstimate()
    feats=util.get_features(indata, prefix)
    labels=BinaryLabels(double(indata['classifier_labels']))
    pie.set_labels(labels)
    pie.set_features(feats['train'])
    pie.train()

    fun=eval(indata[prefix+'name']+'Kernel')
    kernel=fun(feats['train'], feats['train'], pie)
    km_train=max(abs(
        indata[prefix+'matrix_train']-kernel.get_kernel_matrix()).flat)
    kernel.init(feats['train'], feats['test'])
    pie.set_features(feats['test'])
    km_test=max(abs(
        indata[prefix+'matrix_test']-kernel.get_kernel_matrix()).flat)
    classified=max(abs(
        pie.apply().get_values()-indata['classifier_classified']))

    return util.check_accuracy(indata[prefix+'accuracy'],
        km_train=km_train, km_test=km_test, classified=classified)

def _evaluate_top_fisher (indata, prefix):
    feats={}
    wordfeats=util.get_features(indata, prefix)

    pos_train=HMM(wordfeats['train'], indata[prefix+'N'], indata[prefix+'M'],
        indata[prefix+'pseudo'])
    pos_train.train()
    pos_train.baum_welch_viterbi_train(BW_NORMAL)
    neg_train=HMM(wordfeats['train'], indata[prefix+'N'], indata[prefix+'M'],
        indata[prefix+'pseudo'])
    neg_train.train()
    neg_train.baum_welch_viterbi_train(BW_NORMAL)
    pos_test=HMM(pos_train)
    pos_test.set_observations(wordfeats['test'])
    neg_test=HMM(neg_train)
    neg_test.set_observations(wordfeats['test'])

    if indata[prefix+'name']=='TOP':
        feats['train']=TOPFeatures(10, pos_train, neg_train, False, False)
        feats['test']=TOPFeatures(10, pos_test, neg_test, False, False)
    else:
        feats['train']=FKFeatures(10, pos_train, neg_train)
        feats['train'].set_opt_a(-1) #estimate prior
        feats['test']=FKFeatures(10, pos_test, neg_test)
        feats['test'].set_a(feats['train'].get_a()) #use prior from training data

    prefix='kernel_'
    args=util.get_args(indata, prefix)
    kernel=PolyKernel(feats['train'], feats['train'], *args)
    # kernel=PolyKernel(*args)
    # kernel.init(feats['train'], feats['train'])
    km_train=max(abs(
        indata[prefix+'matrix_train']-kernel.get_kernel_matrix()).flat)
    kernel.init(feats['train'], feats['test'])
    km_test=max(abs(
        indata[prefix+'matrix_test']-kernel.get_kernel_matrix()).flat)

    return util.check_accuracy(indata[prefix+'accuracy'],
        km_train=km_train, km_test=km_test)

def _evaluate_pie(indata, prefix):
    pie = PluginEstimate()
    feats = util.get_features(indata, prefix)
    labels = BinaryLabels(double(indata['classifier_labels']))
    pie.set_labels(labels)
    pie.set_features(feats['train'])
    pie.train()

    fun = eval(indata[prefix + 'name'] + 'Kernel')
    kernel = fun(feats['train'], feats['train'], pie)
    km_train = max(abs(
        indata[prefix + 'matrix_train'] - kernel.get_kernel_matrix()).flat)
    kernel.init(feats['train'], feats['test'])
    pie.set_features(feats['test'])
    km_test = max(abs(
        indata[prefix + 'matrix_test'] - kernel.get_kernel_matrix()).flat)
    classified = max(abs(
        pie.apply().get_confidences() - indata['classifier_classified']))

    return util.check_accuracy(indata[prefix + 'accuracy'],
                               km_train=km_train, km_test=km_test,
                               classified=classified)

def execute(data, learning_rate=0.001, training_data_ratio=2.0 / 3, max_iterations=1000000):
    """
    Perform Batch Gradient Descent
    :param data: Raw Data frame parsed from CSV
    :param learning_rate: The rate at which to advance along the gradient
    :param training_data_ratio: The fraction of given data to use for training (remaining fraction is used for testing)
    :param max_iterations: The maximum number of iterations to execute before exiting
    :return: Tuple of (theta, RMSE of the test data)
    """
    # 2. Randomizes the data
    print "Randomizing Data"
    randomized_data = util.randomize_data(data)

    # 3. Selects the first 2/3 (round up) of the data for training and the remaining for testing
    print "Selecting Training Data"
    training_data, test_data = util.split_data(randomized_data, training_data_ratio)

    # 4. Standardizes the data (except for the last column of course) based on the training data
    print "Standardizing Data"
    std_training_data, mean, std = util.standardize_data(util.get_features(training_data))
    std_training_data.insert(0, "Bias", 1)

    std_test_data, _, _ = util.standardize_data(util.get_features(test_data), mean, std)
    std_test_data.insert(0, "Bias", 1)

    iteration = 0
    prior_rmse = 0
    current_rmse = 100  # Doesn't matter what this value is, so long as it doesn't equal prior_rmse
    eps = np.spacing(1)
    N = len(std_training_data)

    # Start with randomized values for theta (bias plus two features for this data set)
    theta = np.array([random.uniform(-1, 1) for _ in xrange(0, 3)])

    # Capture our expected values for the training data
    expected = util.get_output(training_data)
    test_data_expected = util.get_output(test_data)

    # Capture the RMSE for test and training over all iterations
    test_rmse_values = []
    training_rmse_values = []

    print "Performing Gradient Descent Linear Regression"

    # 5. While the termination criteria (mentioned above in the implementation details) hasn't been met
    while iteration <= max_iterations and abs(current_rmse - prior_rmse) >= eps:
        prior_rmse = current_rmse

        # (a) Compute the RMSE of the training data
        #     by applying the current theta values to the training set and comparing results
        actual = std_training_data.dot(theta)
        current_rmse = util.compute_rmse(expected, actual)

        # (b) While we can't let the testing set affect our training process, also compute the RMSE of
        #     the testing error at each iteration of the algorithm (it'll be interesting to see).
        #     Same thing as (a), but use test inputs / outputs
        test_data_actual = std_test_data.dot(theta)
        test_data_rmse = util.compute_rmse(test_data_expected, test_data_actual)

        # (c) Update each parameter using batch gradient descent
        #     by use of the learning rate
        for i in xrange(len(theta)):
            # The length of theta equals the number of columns in std_training_data
            errors = (actual - expected) * std_training_data[std_training_data.columns[i]]
            cumulative_error = errors.sum()
            theta[i] -= learning_rate / N * cumulative_error

        iteration += 1
        test_rmse_values.append(test_data_rmse)
        training_rmse_values.append(current_rmse)

    print "Completed in {0} iterations".format(iteration)

    print "Plotting Errors"
    image_path = plot_rmse_values(test_rmse_values, training_rmse_values, learning_rate)
    print "Saved Image to '{0}'".format(image_path)

    # 6. Compute the RMSE of the testing data.
    print "Computing RMSE of Test Data"
    test_data_actual = std_test_data.dot(theta)
    test_data_rmse = util.compute_rmse(test_data_expected, test_data_actual)

    return theta, test_data_rmse

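# Hedged usage example for the batch gradient descent routine above. The CSV path is
# hypothetical; the only assumption about the file is the one the code already makes
# (feature columns followed by the target in the last column).
import pandas as pd

if __name__ == "__main__":
    raw_data = pd.read_csv("data.csv")
    theta, test_rmse = execute(raw_data, learning_rate=0.001)
    print "Learned parameters: {0}".format(theta)
    print "Test RMSE: {0}".format(test_rmse)
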
hidden = 100
most_common = 1600
filename = './data/test.en'
epsilon_std = 1.0
window_size = 5
context_sz = window_size * 2
emb_sz = 100  # assumed: emb_sz is not set in this fragment; 100 matches the related script below

tr_word2idx, tr_idx2word, sent_train = util.read_input(filename, most_common=most_common)
tst_word2idx, tst_idx2word, sent_test = util.read_input('./data/test.en')
corpus_dim = len(tr_word2idx)
original_dim = corpus_dim

x_train = util.get_features(sent_train, tr_word2idx, window_size, emb_sz)
corpus_sz = len(tr_word2idx)
flatten_sz = x_train.shape[0] * x_train.shape[1]
emb_sz_2 = emb_sz * 2
#x_train_hat = np.reshape(x_train, (flatten_sz,emb_sz_2))
#print('shape x_train_hat=', x_train_hat.shape)

# ENCODER
x = Input(shape=(2, ))
R = Embedding(input_dim=original_dim, output_dim=emb_sz)(x)
R = Reshape((-1, emb_sz_2))(R)
print('shape R=', R.shape)
M = Dense(hidden)(R)

def _evaluate(indata):
    prefix = 'classifier_'
    ctype = indata[prefix + 'type']
    if indata[prefix + 'name'] == 'KNN':
        feats = util.get_features(indata, 'distance_')
    elif ctype == 'kernel':
        feats = util.get_features(indata, 'kernel_')
    else:
        feats = util.get_features(indata, prefix)

    machine = _get_machine(indata, prefix, feats)

    try:
        fun = eval(indata[prefix + 'name'])
    except NameError as e:
        print("%s is disabled/unavailable!" % indata[prefix + 'name'])
        return False

    # cannot refactor into function, because labels is unrefed otherwise
    if prefix + 'labels' in indata:
        labels = BinaryLabels(double(indata[prefix + 'labels']))
        if ctype == 'kernel':
            classifier = fun(indata[prefix + 'C'], machine, labels)
        elif ctype == 'linear':
            classifier = fun(indata[prefix + 'C'], feats['train'], labels)
        elif ctype == 'knn':
            classifier = fun(indata[prefix + 'k'], machine, labels)
        elif ctype == 'lda':
            classifier = fun(indata[prefix + 'gamma'], feats['train'], labels)
        elif ctype == 'perceptron':
            classifier = fun(feats['train'], labels)
        elif ctype == 'wdsvmocas':
            classifier = fun(indata[prefix + 'C'], indata[prefix + 'degree'],
                             indata[prefix + 'degree'], feats['train'], labels)
        else:
            return False
    else:
        classifier = fun(indata[prefix + 'C'], machine)

    if classifier.get_name() == 'LibLinear':
        print(classifier.get_name(), "yes")
        classifier.set_liblinear_solver_type(L2R_LR)

    classifier.parallel.set_num_threads(indata[prefix + 'num_threads'])
    if ctype == 'linear':
        if prefix + 'bias' in indata:
            classifier.set_bias_enabled(True)
        else:
            classifier.set_bias_enabled(False)
    if ctype == 'perceptron':
        classifier.set_learn_rate(indata[prefix + 'learn_rate'])
        classifier.set_max_iter(indata[prefix + 'max_iter'])
    if prefix + 'epsilon' in indata:
        try:
            classifier.set_epsilon(indata[prefix + 'epsilon'])
        except AttributeError:
            pass
    if prefix + 'max_train_time' in indata:
        classifier.set_max_train_time(indata[prefix + 'max_train_time'])
    if prefix + 'linadd_enabled' in indata:
        classifier.set_linadd_enabled(indata[prefix + 'linadd_enabled'])
    if prefix + 'batch_enabled' in indata:
        classifier.set_batch_computation_enabled(indata[prefix + 'batch_enabled'])

    classifier.train()

    res = _get_results(indata, prefix, classifier, machine, feats)
    return util.check_accuracy(res['accuracy'],
                               alphas=res['alphas'], bias=res['bias'],
                               sv=res['sv'], classified=res['classified'])

def _evaluate (indata):
    prefix='classifier_'
    ctype=indata[prefix+'type']
    if indata[prefix+'name']=='KNN':
        feats=util.get_features(indata, 'distance_')
    elif ctype=='kernel':
        feats=util.get_features(indata, 'kernel_')
    else:
        feats=util.get_features(indata, prefix)

    machine=_get_machine(indata, prefix, feats)

    try:
        fun=eval(indata[prefix+'name'])
    except NameError as e:
        print("%s is disabled/unavailable!"%indata[prefix+'name'])
        return False

    # cannot refactor into function, because labels is unrefed otherwise
    if prefix+'labels' in indata:
        labels=BinaryLabels(double(indata[prefix+'labels']))
        if ctype=='kernel':
            classifier=fun(indata[prefix+'C'], machine, labels)
        elif ctype=='linear':
            classifier=fun(indata[prefix+'C'], feats['train'], labels)
        elif ctype=='knn':
            classifier=fun(indata[prefix+'k'], machine, labels)
        elif ctype=='lda':
            classifier=fun(indata[prefix+'gamma'], feats['train'], labels)
        elif ctype=='perceptron':
            classifier=fun(feats['train'], labels)
        elif ctype=='wdsvmocas':
            classifier=fun(indata[prefix+'C'], indata[prefix+'degree'],
                indata[prefix+'degree'], feats['train'], labels)
        else:
            return False
    else:
        classifier=fun(indata[prefix+'C'], machine)

    if classifier.get_name() == 'LibLinear':
        print(classifier.get_name(), "yes")
        classifier.set_liblinear_solver_type(L2R_LR)

    classifier.parallel.set_num_threads(indata[prefix+'num_threads'])
    if ctype=='linear':
        if prefix+'bias' in indata:
            classifier.set_bias_enabled(True)
        else:
            classifier.set_bias_enabled(False)
    if ctype=='perceptron':
        classifier.set_learn_rate(indata[prefix+'learn_rate'])
        classifier.set_max_iter(indata[prefix+'max_iter'])
    if prefix+'epsilon' in indata:
        try:
            classifier.set_epsilon(indata[prefix+'epsilon'])
        except AttributeError:
            pass
    if prefix+'max_train_time' in indata:
        classifier.set_max_train_time(indata[prefix+'max_train_time'])
    if prefix+'linadd_enabled' in indata:
        classifier.set_linadd_enabled(indata[prefix+'linadd_enabled'])
    if prefix+'batch_enabled' in indata:
        classifier.set_batch_computation_enabled(indata[prefix+'batch_enabled'])

    classifier.train()

    res=_get_results(indata, prefix, classifier, machine, feats)
    return util.check_accuracy(res['accuracy'],
        alphas=res['alphas'], bias=res['bias'],
        sv=res['sv'], classified=res['classified'])

emb_sz=100
hidden=100
most_common = 1600
filename = './data/test.en'
epsilon_std = 1.0
window_size=5
context_sz=window_size*2

tr_word2idx, tr_idx2word, sent_train, corpus = util.read_input(filename, most_common=most_common)
tst_word2idx, tst_idx2word, sent_test, corpus = util.read_input('./data/test.en')
corpus_dim = len(tr_word2idx)
original_dim = corpus_dim

contexts, targets = util.get_features(sent_train, tr_word2idx, window_size, emb_sz)
corpus_sz = len(tr_word2idx)
emb_sz_2 = emb_sz*2

def concat(input):
    return(K.concatenate([input[0], input[1]]))

def sampling(args):
    # Reparametrization trick
    z_mean, z_log_var = args
    print('shape z_mean sampling=', z_mean.shape, 'shape z_log_var=', z_log_var.shape)
    epsilon = K.random_normal(shape=(K.shape(z_mean)[0], emb_sz),
                              mean=0., stddev=epsilon_std)

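# Hedged sketch of how the sampling() function above is typically wired into a Keras VAE,
# assuming it ends with the standard return z_mean + K.exp(z_log_var / 2) * epsilon (the
# fragment above is cut off before the return). Layer names and the emb_sz-wide latent
# heads are assumptions, not the author's model.
from keras.layers import Dense, Lambda

def build_latent_sketch(hidden_layer):
    z_mean = Dense(emb_sz, name='z_mean')(hidden_layer)        # hypothetical latent mean head
    z_log_var = Dense(emb_sz, name='z_log_var')(hidden_layer)  # hypothetical latent log-variance head
    # Lambda draws z via the reparametrization trick implemented by sampling()
    z = Lambda(sampling, name='z')([z_mean, z_log_var])
    return z_mean, z_log_var, z
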
import mex
from keras.utils import np_utils

import util

path = '/Users/anjanawijekoon/MEx_wtpm/'

# read all data
all_data = mex.read_all(path)
# extract windows from all three sensors
all_features = mex.extract_features(all_data)

# get features by sensor index
acw_features = util.get_features(all_features, 0)
act_features = util.get_features(all_features, 1)
pm_features = util.get_features(all_features, 2)

# get all people ids
all_people = list(all_features.keys())

# pm
# to make sure all windows have same length
padded_pm_features = util.pad_features(pm_features)
# to reduce the frame rate to mex.frames_per_second rate
reduced_pm_features = util.frame_reduce(padded_pm_features)

# leave-one-person-out split over the pressure-mat features
for i in range(len(all_people)):
    test_persons = [all_people[i]]
    pm_train_features, pm_test_features = util.train_test_split(
        reduced_pm_features, test_persons)
    pm_train_features, pm_train_labels = util.flatten(pm_train_features)
