import copy


def run_model(args, X, y, ensembler=False):
    model = None
    if args['model'] == 'logistic':
        logistic = Logistic(X, y, model)
        model = logistic.train_model()
    elif args['model'] == 'knn':
        knn = KNN(X, y, model)
        model = knn.train_model()
    elif args['model'] == 'svm':
        svm = SVM(X, y, model)
        model = svm.train_model()
    elif args['model'] == 'rfa':
        rfa = RandomForest(X, y, model)
        model = rfa.train_model(ensembler)
    elif args['model'] == 'xgb':
        xgb = XGB(X, y, model)
        model = xgb.train_model(ensembler)
    elif args['model'] == 'lgbm':
        lgbm = LightGBM(X, y, model)
        model = lgbm.train_model(ensembler)
    elif args['model'] == 'catboost':
        catboost = CatBoost(X, y, model)
        model = catboost.train_model(ensembler)
    elif len(args['models']) > 1:
        # Recursively train each base model, then fit the ensembler on top.
        models = [('', None)] * len(args['models'])
        for i in range(len(args['models'])):
            model_name = args['models'][i]
            temp_args = copy.deepcopy(args)
            temp_args['model'] = model_name
            models[i] = (model_name, run_model(temp_args, X, y, True))
        ensembler = Ensembler(X, y, model, args['ensembler_type'])
        model = ensembler.train_model(models)
        return model
    else:
        print('\nInvalid model name :-|\n')
        exit()
    return model
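# Hedged usage sketch (not from the source): the dicts below mirror the keys
# run_model reads ('model', 'models', 'ensembler_type'); the valid values come
# from the surrounding CLI parser, which is not shown here.
#
# Single model:
#   model = run_model({'model': 'knn', 'models': []}, X, y)
#
# Ensemble: 'model' must not match any known name so the elif chain falls
# through to the ensemble branch, and 'models' lists the base learners:
#   args = {'model': 'ensemble',
#           'models': ['rfa', 'xgb', 'lgbm'],
#           'ensembler_type': 'stacking'}
#   model = run_model(args, X, y)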
def test_knn():
    from models.knn import KNN
    x, y = np.random.randn(3, 200, 2), np.zeros([3, 200])
    x[0] += np.array([2, 2])   # shift right by 2, up by 2
    x[1] += np.array([2, -2])  # shift right by 2, down by 2
    y[1] = 1
    y[2] = 2
    plot_scatter(x, 'Real')
    x = x.reshape(-1, 2)
    y = y.flatten()

    # train
    knn = KNN(3)
    knn.fit(x, y)
    pred = knn.predict(x)
    plot_scatter([x[pred == i] for i in [0, 1, 2]], 'Pred')

    # print accuracy
    acc = np.sum(pred == y) / len(pred)
    print(f'Acc = {100 * acc:.2f}%')
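# A minimal sketch (not the repo's models/knn.py) of the interface test_knn
# assumes: KNN(k) with fit(x, y) memorising the data and predict(x) doing a
# majority vote over Euclidean nearest neighbours.
import numpy as np


class SketchKNN:
    def __init__(self, k):
        self.k = k

    def fit(self, x, y):
        # Lazy learner: just store the training set.
        self.x, self.y = np.asarray(x), np.asarray(y)

    def predict(self, x):
        x = np.asarray(x)
        # Pairwise Euclidean distances, shape (n_queries, n_train).
        d = np.linalg.norm(x[:, None, :] - self.x[None, :, :], axis=-1)
        # Indices of the k nearest training points per query.
        nn = np.argsort(d, axis=1)[:, :self.k]
        # Majority vote over neighbour labels (labels must be non-negative ints).
        votes = self.y[nn].astype(int)
        return np.array([np.bincount(v).argmax() for v in votes])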
def __init__(
    self,
    root,
    train=True,
    transform=None,
    augment_transform=None,
    target_transform=None,
    download=False,
):
    super(AugmentedMNIST, self).__init__(root, train, transform, target_transform, download)
    self.augment_transform = augment_transform
    self.knn = KNN()
def walk_forward_cv(self):
    """
    Runs walk-forward cross-validation, and saves cross-validation metrics.
    """
    for output_name in self.output_names:
        print('\t\t\t|--Prediction type: {}'.format(output_name))
        optimal_params_by_model = {}
        cv_metadata_by_model = {}
        cv_predictions_by_model = {}

        print('\t\t\t\t|--KNN Model')
        knn = KNN()
        knn.cv_params = self.cv_params
        knn.test_name = self.test_name
        knn.full_df = self.full_df
        knn.feature_names = self.feature_names
        knn.output_name = output_name
        knn.run_knn_cv()
        optimal_params_by_model['KNN'] = knn.knn_optimal_params
        cv_predictions_by_model['KNN'] = knn.knn_cv_predictions

        print('\t\t\t\t|--Elastic Net Model')
        elastic_net = ElasticNet()
        elastic_net.cv_params = self.cv_params
        elastic_net.test_name = self.test_name
        elastic_net.full_df = self.full_df
        elastic_net.feature_names = self.feature_names
        elastic_net.feature_dict = self.feature_dict
        elastic_net.output_name = output_name
        elastic_net.run_elastic_net_cv()
        optimal_params_by_model['Elastic_Net'] = elastic_net.elastic_net_optimal_params
        cv_metadata_by_model['Elastic_Net'] = elastic_net.metadata
        cv_predictions_by_model['Elastic_Net'] = elastic_net.elastic_net_cv_predictions

        print('\t\t\t\t|--Naive Bayes Model')
        naive_bayes = NaiveBayes()
        naive_bayes.cv_params = self.cv_params
        naive_bayes.test_name = self.test_name
        naive_bayes.full_df = self.full_df
        naive_bayes.feature_names = self.feature_names
        naive_bayes.feature_dict = self.feature_dict
        naive_bayes.output_name = output_name
        naive_bayes.run_bayes_cv()
        cv_predictions_by_model['Naive_Bayes'] = naive_bayes.bayes_cv_predictions
        optimal_params_by_model['Naive_Bayes'] = naive_bayes.bayes_optimal_params

        print('\t\t\t\t|--SVM Model')
        svm = SupportVectorMachine()
        svm.cv_params = self.cv_params
        svm.test_name = self.test_name
        svm.full_df = self.full_df
        svm.feature_names = self.feature_names
        svm.output_name = output_name
        svm.run_svm_cv()
        optimal_params_by_model['SVM'] = svm.svm_optimal_params
        cv_metadata_by_model['SVM'] = svm.metadata
        cv_predictions_by_model['SVM'] = svm.svm_cv_predictions

        print('\t\t\t\t|--Gaussian Process Model')
        gauss = GaussianProcess()
        gauss.cv_params = self.cv_params
        gauss.test_name = self.test_name
        gauss.full_df = self.full_df
        gauss.feature_names = self.feature_names
        gauss.feature_dict = self.feature_dict
        gauss.output_name = output_name
        gauss.run_gauss_cv()
        cv_predictions_by_model['Gaussian_Process'] = gauss.gauss_cv_predictions
        cv_metadata_by_model['Gaussian_Process'] = gauss.metadata
        optimal_params_by_model['Gaussian_Process'] = gauss.gauss_optimal_params

        print('\t\t\t\t|--XGBoost Model')
        xgboost = XGBoost()
        xgboost.cv_params = self.cv_params
        xgboost.test_name = self.test_name
        xgboost.full_df = self.full_df
        xgboost.feature_names = self.feature_names
        xgboost.feature_dict = self.feature_dict
        xgboost.output_name = output_name
        xgboost.run_xgboost_cv()
        optimal_params_by_model['XGBoost'] = xgboost.xgboost_optimal_params
        cv_metadata_by_model['XGBoost'] = xgboost.metadata
        cv_predictions_by_model['XGBoost'] = xgboost.xgboost_cv_predictions

        self.optimal_params_by_output[output_name] = optimal_params_by_model
        self.cv_metadata_by_output[output_name] = cv_metadata_by_model
        self.cv_predictions_by_output[output_name] = cv_predictions_by_model
def walk_forward_prediction(self):
    """
    Runs walk-forward prediction, and saves prediction metrics.
    """
    for output_name in self.output_names:
        print('\t\t\t|--Prediction type: {}'.format(output_name))
        prediction_errors_by_model = {}
        predictions_by_model = {}
        pred_metadata_by_model = {}

        print('\t\t\t\t|--KNN Model')
        knn = KNN()
        knn.pred_indices = self.pred_indices
        knn.full_df = self.full_df
        knn.feature_names = self.feature_names
        knn.output_name = output_name
        knn.knn_optimal_params = self.optimal_params_by_output[output_name]['KNN']
        knn.run_knn_prediction()
        prediction_errors_by_model['KNN'] = knn.knn_pred_error
        predictions_by_model['KNN'] = knn.knn_predictions

        print('\t\t\t\t|--Elastic Net Model')
        elastic_net = ElasticNet()
        elastic_net.pred_indices = self.pred_indices
        elastic_net.full_df = self.full_df
        elastic_net.feature_names = self.feature_names
        elastic_net.feature_dict = self.feature_dict
        elastic_net.output_name = output_name
        elastic_net.elastic_net_optimal_params = self.optimal_params_by_output[output_name]['Elastic_Net']
        elastic_net.run_elastic_net_prediction()
        prediction_errors_by_model['Elastic_Net'] = elastic_net.elastic_net_pred_error
        predictions_by_model['Elastic_Net'] = elastic_net.elastic_net_predictions
        pred_metadata_by_model['Elastic_Net'] = elastic_net.metadata

        print('\t\t\t\t|--Naive Bayes Model')
        naive_bayes = NaiveBayes()
        naive_bayes.pred_indices = self.pred_indices
        naive_bayes.full_df = self.full_df
        naive_bayes.feature_names = self.feature_names
        naive_bayes.output_name = output_name
        naive_bayes.run_bayes_prediction()
        prediction_errors_by_model['Naive_Bayes'] = naive_bayes.bayes_pred_error
        predictions_by_model['Naive_Bayes'] = naive_bayes.bayes_predictions

        print('\t\t\t\t|--SVM Model')
        svm = SupportVectorMachine()
        svm.pred_indices = self.pred_indices
        svm.full_df = self.full_df
        svm.feature_names = self.feature_names
        svm.output_name = output_name
        svm.svm_optimal_params = self.optimal_params_by_output[output_name]['SVM']
        svm.run_svm_prediction()
        prediction_errors_by_model['SVM'] = svm.svm_pred_error
        predictions_by_model['SVM'] = svm.svm_predictions
        pred_metadata_by_model['SVM'] = svm.metadata

        print('\t\t\t\t|--Gaussian Process Model')
        gauss = GaussianProcess()
        gauss.pred_indices = self.pred_indices
        gauss.full_df = self.full_df
        gauss.feature_names = self.feature_names
        gauss.output_name = output_name
        gauss.run_gauss_prediction()
        prediction_errors_by_model['Gaussian_Process'] = gauss.gauss_pred_error
        predictions_by_model['Gaussian_Process'] = gauss.gauss_predictions
        pred_metadata_by_model['Gaussian_Process'] = gauss.metadata

        print('\t\t\t\t|--XGBoost Model')
        xgboost = XGBoost()
        xgboost.pred_indices = self.pred_indices
        xgboost.full_df = self.full_df
        xgboost.feature_names = self.feature_names
        xgboost.feature_dict = self.feature_dict
        xgboost.output_name = output_name
        xgboost.xgboost_optimal_params = self.optimal_params_by_output[output_name]['XGBoost']
        xgboost.run_xgboost_prediction()
        prediction_errors_by_model['XGBoost'] = xgboost.xgboost_pred_error
        predictions_by_model['XGBoost'] = xgboost.xgboost_predictions
        pred_metadata_by_model['XGBoost'] = xgboost.metadata

        print('\t\t\t\t|--Weighted Average Model')
        weighted_average = WeightedAverage()
        weighted_average.model_names = self.model_names
        weighted_average.cv_results = self.optimal_params_by_output[output_name]
        weighted_average.predictions_by_model = predictions_by_model
        weighted_average.run_weighted_average_prediction()
        predictions_by_model['Weighted_Average'] = weighted_average.weighted_average_predictions
        pred_metadata_by_model['Weighted_Average'] = weighted_average.metadata

        self.prediction_errors_by_output[output_name] = prediction_errors_by_model
        self.predictions_by_output[output_name] = predictions_by_model
        self.pred_metadata_by_output[output_name] = pred_metadata_by_model
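# A hedged illustration (not from the source) of the walk-forward scheme the two
# methods above implement: every evaluation point is predicted by a model fit
# only on strictly earlier rows, so no future information leaks into training.
def walk_forward_splits(pred_indices):
    """Yield (train_slice, test_index) pairs for walk-forward evaluation."""
    for t in pred_indices:
        yield slice(0, t), t  # fit on rows [0, t), predict row t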
from models.utils import Dataset
from models.knn import KNN, SimilarityMetrics

if __name__ == '__main__':
    dataset = Dataset.from_csv_file('data/ratings.csv')
    trainset, testset = dataset.split_into_train_and_test_sets(0.7)
    knn = KNN()
    knn.fit(trainset)
    print(knn.predict(1, 1))
class ModelTests(unittest.TestCase):
    def setUp(self):
        self.train_df, self.test_df = get_train_test_split()
        self.classes = constants["classes"]
        self.KNN = KNN(k=4, classes=self.classes)
        self.KNN.fit(self.train_df)
        self.NaiveBayes = NaiveBayes(n=3, classes=self.classes)
        self.NaiveBayes.fit(self.train_df)
        self.Linear = Linear(classes=self.classes, max_len=40)
        self.Linear.fit(self.train_df, epochs=1)
        self.W2V = W2V(classes=self.classes)

    def test_knn_io(self):
        """Test that the KNN model takes the right inputs and outputs a
        dictionary covering all possible classes."""
        pred, output = self.KNN("BREST")
        self.assertIsInstance(output, dict)
        self.assertIn(pred, self.classes)
        for label in self.classes:
            self.assertIn(label, output.keys())

    def test_knn_output_probabilities(self):
        """Test that the KNN model returns probabilities for each possible class."""
        _, output = self.KNN("RADE DE BREST")
        # sums up to one
        self.assertLess(abs(sum(output.values()) - 1), 1e-3)
        # all values between 0 and 1
        for value in output.values():
            self.assertGreaterEqual(value, 0)
            self.assertLessEqual(value, 1)

    def test_knn_case_insensitive(self):
        pred_upper, output_upper = self.KNN("BREST")
        pred_lower, output_lower = self.KNN("brest")
        self.assertEqual(pred_upper, pred_lower)
        self.assertListEqual(list(output_upper.items()), list(output_lower.items()))

    def test_naive_bayes_io(self):
        """Test that the Naive Bayes model takes the right inputs and outputs a
        dictionary covering all possible classes."""
        pred, output = self.NaiveBayes("BREST")
        self.assertIn(pred, self.classes)
        self.assertIsInstance(output, dict)

    # def test_naive_bayes_output_probabilities(self):
    #     _, output = self.NaiveBayes("BREST")
    #     self.assertLess(abs(sum(output.values()) - 1), 1e-3)
    #     for label in self.classes:
    #         self.assertIn(label, output.keys())

    def test_linear_io(self):
        """Test that the Linear model takes the right inputs and outputs a
        dictionary covering all possible classes."""
        pred, output = self.Linear("BREST")
        self.assertIn(pred, self.classes)
        self.assertIsInstance(output, dict)

    def test_linear_output_probabilities(self):
        _, output = self.Linear("BREST")
        self.assertLess(abs(sum(output.values()) - 1), 1e-3)
        for label in self.classes:
            self.assertIn(label, output.keys())

    def test_w2v_io(self):
        """Test that the Word2Vec model takes the right inputs and outputs a
        dictionary covering all possible classes."""
        pred, output = self.W2V("BREST")
        self.assertIn(pred, self.classes)
        self.assertIsInstance(output, dict)

    def test_w2v_output_probabilities(self):
        _, output = self.W2V("BREST")
        self.assertLess(abs(sum(output.values()) - 1), 1e-3)
        for label in self.classes:
            self.assertIn(label, output.keys())
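# To run this suite with the standard library runner (assuming the usual
# project layout where this file is discoverable as a test module):
#   python -m unittest discover -v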
from flask import Flask, request, jsonify, render_template
from flask_cors import CORS

from models.knn import KNN
from src.constants import constants
from src.data import get_train_test_split, regexp_processing

model = KNN(classes=constants['classes'], k=3)
train, _ = get_train_test_split('../10_ports.csv', split=1)
model.fit(train)

app = Flask(__name__)
CORS(app)


@app.route('/predict', methods=['POST'])
def predict():
    if request.method == "POST":
        destination = request.data.decode('utf-8')
        # Exact match against known destinations first; otherwise fall back to the model.
        if destination.upper() in model.destinations.keys():
            return model.destinations[destination.upper()]
        pred = model(destination)
        return pred[0]


if __name__ == '__main__':
    # Threaded option to enable multiple instances for multiple user access support
    app.run(threaded=True, port=5000)
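# Hedged client sketch (not from the source): exercising the /predict endpoint
# above with the third-party `requests` package, assuming the app is serving
# locally on port 5000 as configured in __main__.
import requests

resp = requests.post('http://localhost:5000/predict', data='RADE DE BREST')
print(resp.text)  # predicted port class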
def main(args):
    # Import business dataset
    # business_df = get_POI_df(args.path + 'yelp_dataset_business.json')
    # Import Cleaned_Toronto_Business
    business_df = get_POI_df_toronto(args.path + 'Cleaned_Toronto_Business.json')

    # Filter business dataset by city
    # business_df = filter_by_city(business_df, city=args.city)  # commented out since all Toronto restaurants
    # Filter business dataset by restaurants
    # business_df = select_restaurants(business_df)  # commented out since already cleaned

    # Import review dataset
    # review_df = get_review_df(args.path + 'yelp_dataset_review.json')
    # Import Toronto review dataset
    review_df = get_review_df_toronto(args.path + 'Cleaned_Toronto_Reviews.json')

    # Binarize review stars, adding a new column called review_stars_binarized
    review_df = binarized_star(review_df)
    print('review df columns', review_df.columns)
    print('business_df columns', business_df.columns)

    # Merge business df and review df
    rating_df = merge_df(review_df, business_df, on_column='business_id',
                         how_merge='inner',
                         columns=["user_id", "business_id", "date",
                                  "review_stars", "review_text",
                                  "review_stars_binary", "categories",
                                  "latitude", "longitude"],
                         sort_column='date')

    num_cols = rating_df.business_id.nunique()
    num_rows = rating_df.user_id.nunique()
    print('unique businesses:', num_cols, 'unique users:', num_rows)
    print('unique user id:', rating_df.user_id.nunique())

    # Assign numbers to user_id -> user_num_id
    rating_df['user_num_id'] = rating_df.user_id.astype('category').\
        cat.rename_categories(range(0, rating_df.user_id.nunique()))
    rating_df['user_num_id'] = rating_df['user_num_id'].astype('int')

    # Encode business_num_id
    rating_df['business_num_id'] = rating_df.business_id.astype('category').\
        cat.rename_categories(range(0, rating_df.business_id.nunique()))
    rating_df['business_num_id'] = rating_df['business_num_id'].astype('int')
    rating_df = rating_df.reset_index()

    # Get all restaurants' latitude and longitude df
    POI_lat_long_df = return_POI_lat_long_df(rating_df)
    # Export all restaurants' latitude and longitude df
    save_POI_lat_long_df(args.path + "POI_lat_long_df", POI_lat_long_df)

    # Get pandas user_id and business_id dict
    user_id_dict = pandas_to_dict(rating_df, "user_id", "user_num_id")
    # Export dict to disk as json
    save_user_id_dict_pickle(args.path, user_id_dict, 'user_id_dict')

    # Split into train and test dataset
    train_df, test_df = train_test_split(rating_df)

    # Build the user-item (UI) matrices:
    #   - binary ratings for train and test
    #   - raw ratings for train and test
    #   - combine both for the entire dataset
    train_set_binary = df_to_sparse(train_df, num_rows, num_cols)
    test_set_binary = df_to_sparse(test_df, num_rows, num_cols)
    train_set_rawRating = df_to_sparse(train_df, num_rows, num_cols, binarySetting=False)
    test_set_rawRating = df_to_sparse(test_df, num_rows, num_cols, binarySetting=False)
    entire_set_binary = train_set_binary + test_set_binary
    entire_set_raw = train_set_rawRating + test_set_rawRating

    # Save the binary, raw-rating, and entire UI matrices
    save_npz_data(args.path + "toronto_train_set_binary.npz", train_set_binary)
    save_npz_data(args.path + "toronto_test_set_binary.npz", test_set_binary)
    save_npz_data(args.path + "toronto_train_set_rawRating.npz", train_set_rawRating)
    save_npz_data(args.path + "toronto_test_set_rawRating.npz", test_set_rawRating)
    save_npz_data(args.path + "toronto_entire_set_binary.npz", entire_set_binary)
    save_npz_data(args.path + "toronto_entire_set_rawRating.npz", entire_set_raw)

    # To compute item similarity using IK
    IK_matrix_train = get_I_K(train_df)
    IK_matrix_entire = get_I_K(rating_df)

    # Get item similarity
    item_IK_model_train = KNN()
    item_IK_model_train.fit(X=IK_matrix_train.T)
    sparse_item_similarity_train = item_IK_model_train.get_weights()
    save_npz_data(args.path + "item_similarity_train.npz", sparse_item_similarity_train)

    item_IK_model_entire = KNN()
    item_IK_model_entire.fit(X=IK_matrix_entire.T)
    sparse_item_similarity_entire = item_IK_model_entire.get_weights()
    save_npz_data(args.path + "item_similarity_entire.npz", sparse_item_similarity_entire)

    # Get user similarity for train set
    user_model_trainBinary = KNN()
    user_model_trainBinary.fit(X=train_set_binary)
    sparse_user_similarity_train = user_model_trainBinary.get_weights()
    save_npz_data(args.path + "user_similarity_trainSet.npz", sparse_user_similarity_train)

    user_model_entireBinary = KNN()
    user_model_entireBinary.fit(X=entire_set_binary)
    sparse_user_similarity_entire = user_model_entireBinary.get_weights()
    save_npz_data(args.path + "user_similarity_entireSet.npz", sparse_user_similarity_entire)
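# Hedged sketch (the repo's KNN.get_weights() is not shown): for binary or raw
# user-item matrices, a common similarity weight matrix is row-wise cosine
# similarity kept in sparse form, e.g. via scikit-learn:
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity


def sketch_get_weights(X):
    """Row-by-row cosine similarity, returned sparse (analogue of get_weights)."""
    return cosine_similarity(csr_matrix(X), dense_output=False)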
def classifier_train(model, device, train_dataset, optimizer, criterion, epoch, batch_size=100):
    total_loss = 0
    model.to(device)
    knn = KNN().to(device)
    for b in range(1):
        batch_dist = torch.zeros(10).to(device)
        for item_idx, (img, _) in enumerate(random.sample(list(train_dataset), batch_size)):
            # send to device
            img = img.to(device).unsqueeze(0)

            # forward pass
            optimizer.zero_grad()
            img_embedding, _, img_class = model(img)
            print(img_class)

            # get samples
            samples = random.sample(list(train_dataset), 1000)
            for s in range(len(samples)):
                s_img, _ = samples[s]
                embedding, _, s_class = model(s_img.to(device).unsqueeze(0))
                samples[s] = (embedding, s_class)

            # calculate loss: initial target is the one-hot of the model's own prediction
            desired_one_hot = torch.zeros(img_class.shape, dtype=torch.float).to(device)
            desired_one_hot[0][torch.argmax(img_class, dim=1).item()] = 1
            loss = 0

            # generate stochastic nearest KNN for closest encodings
            neighbors = knn(img_embedding, samples, direction=1)
            n_loss = 0
            for (n_embed, n_class) in neighbors:
                # pull nearest neighbours toward the predicted class
                n_loss += torch.nn.functional.binary_cross_entropy(n_class, desired_one_hot)
            print(n_loss.item())

            # generate stochastic farthest KNN for distant encodings
            neighbors = knn(img_embedding, samples, direction=-1)
            f_loss = 0

            def random_nn():
                # rejection-sample a class index different from the prediction
                exclude = [torch.argmax(img_class, dim=1).item()]
                randInt = random.randint(0, 9)
                return random_nn() if randInt in exclude else randInt

            out = torch.zeros(img_class.shape).to(device)
            out[0][random_nn()] = 1
            for (n_embed, n_class) in neighbors:
                # push far neighbours away from the predicted class ...
                f_loss -= 2 * torch.nn.functional.binary_cross_entropy(n_class, desired_one_hot)
                # ... and toward a random other class
                f_loss += torch.nn.functional.binary_cross_entropy(n_class, out)
            print(f_loss.item())

            loss += f_loss + n_loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

            # batch entropy
            batch_dist[torch.argmax(img_class, dim=1).item()] += 1
            print(f"Epoch: {epoch} - Episode: {(b * 10) + item_idx} - Loss: {loss.item()}")

        optimizer.zero_grad()
        print(batch_dist)
        # note: F.kl_div expects log-probabilities as its first argument
        loss = -100 * batch_size * torch.nn.functional.kl_div(
            batch_dist / batch_size,
            torch.ones(batch_dist.shape).to(device) / batch_size)
        print(loss.item())
        loss.requires_grad = True
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    return total_loss / len(train_dataset)
# 2. Pretrain Model Encoder on Augmentations
model = MNIST_Classifier().to("cuda")
optimizer = torch.optim.RMSprop(
    list(model.encoder.parameters()) + list(model.decoder.parameters()), lr=1e-3)
criterion = torch.nn.MSELoss()
model.load_state_dict(torch.load("results/encoder_pretraining_100.pth"))
model.eval()

# 3. Generate KNN for random sample
x, _ = random.sample(list(test_def), 1)[0]
x_embed, _, s_class = model(x.to("cuda").unsqueeze(0))

knn = KNN()
samples = random.sample(list(test_def), 10000)
with torch.no_grad():
    for s in range(len(samples)):
        s_img, _ = samples[s]
        embedding, _, s_class = model(s_img.to("cuda").unsqueeze(0))
        samples[s] = (embedding, s_class, s_img)

# generate stochastic KNN for encodings
neighbors = knn(x_embed, samples)

fig, ax = plt.subplots(1, 6, constrained_layout=True)
np.vectorize(lambda ax: ax.axis('off'))(ax)
ax[0].imshow(x.squeeze(0))
from sklearn.datasets import load_boston

from validation import classification as val
from models.decision_tree import DecisionTree
from models.knn import KNN
from models.random_forest import RandomForest
from preprocessing.features_enginering import normalize_dataset
from preprocessing.split import train_test_split
from validation.regression import sqrderr

# %%
X, y = load_boston(return_X_y=True)
normalize_dataset(X)
x_train, y_train, x_test, y_test = train_test_split(X, y, .8)

kclass = KNN(5, mode="regression")
kclass.fit(x_train, y_train)
res = kclass.predict(x_test)
knn_err = sqrderr(res, y_test)

kclass_w = KNN(5, mode="regression", method="weighted")
kclass_w.fit(x_train, y_train)
res = kclass_w.predict(x_test)
knn_w_err = sqrderr(res, y_test)

## %%
forest_w = RandomForest(mode="regression", errfun="mse",
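# Hedged sketch of what KNN(5, mode="regression", method="weighted") plausibly
# computes (the repo's implementation is not shown): an inverse-distance-weighted
# mean of the k nearest targets instead of a plain mean.
import numpy as np


def weighted_knn_predict(x_train, y_train, x_query, k=5, eps=1e-9):
    # pairwise Euclidean distances, shape (n_queries, n_train)
    d = np.linalg.norm(x_query[:, None, :] - x_train[None, :, :], axis=-1)
    nn = np.argsort(d, axis=1)[:, :k]                    # k nearest indices
    w = 1.0 / (np.take_along_axis(d, nn, axis=1) + eps)  # inverse-distance weights
    return (w * y_train[nn]).sum(axis=1) / w.sum(axis=1)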
def main():
    # Read file names
    parser = argparse.ArgumentParser()
    parser.add_argument("xTrain", help="filename for features of the training data")
    parser.add_argument("yTrain", help="filename for labels associated with training data")
    parser.add_argument("xTest", help="filename for features of the test data")
    args = parser.parse_args()

    # load the train and test data; assumes you'll use numpy
    xTrain = pd.read_csv(args.xTrain)
    yTrain = pd.read_csv(args.yTrain)
    xTest = pd.read_csv(args.xTest)
    colNames = list(xTrain.keys())
    # visualize(xTrain, yTrain, colNames)

    models = {
        'boost': Boost(5, .2, 5),
        'dt': DT(25, 1, 'entropy'),
        'knn': KNN(1),
        'nb': NB(),
        'rf': RF(51, 25, 'gini', 25, 1),
        'svm': SVM(.1, 'poly', 3, .01)
    }

    X = xTrain.to_numpy()
    Y = yTrain.to_numpy()

    # Collect base-model predictions on the training set.
    basePreds = []
    for k in models:
        models[k].train(X, Y)
        basePreds.append(list(models[k].predict(xTrain.to_numpy())))
    basePreds = np.array(basePreds)
    basePreds = np.transpose(basePreds)

    metalearner = Boost(5, .2, 5)
    nfolds = 3
    kf = KFold(nfolds)
    trIndices = []
    tsIndices = []
    for tr, ts in kf.split(X):
        trIndices.append(tr)
        tsIndices.append(ts)

    # Cross-validated accuracy of the metalearner.
    total = 0
    for i in range(nfolds):
        metalearner.train(X[trIndices[i], :], Y[trIndices[i], :])
        acc = metalearner.predAcc(X[tsIndices[i], :], Y[tsIndices[i], :])
        total += acc / nfolds
    print("ACC: ", total)

    metalearner.train(X, Y)
    testPreds = metalearner.predict(xTest.to_numpy())
    finalPreds = np.array([list(range(len(xTest))), testPreds]).transpose()
    finalPreds = pd.DataFrame(finalPreds, columns=['Id', 'Cover_Type'])
    finalPreds.to_csv('finalPredictions.csv', index=False)
    # print(finalPreds)

    # Plot class frequencies in the predictions.
    freq = Counter(list(testPreds))
    labelMap = {
        1: 'Spruce/Fir',
        2: 'Lodgepole Pine',
        3: 'Ponderosa Pine',
        4: 'Cottonwood/Willow',
        5: 'Aspen',
        6: 'Douglas-fir',
        7: 'Krummholz'
    }
    label = [labelMap[k] for k in freq.keys()]
    no_trees = [freq[k] for k in freq.keys()]
    index = np.arange(len(label))
    plt.bar(index, no_trees)
    plt.xlabel('Cover type', fontsize=12)
    plt.ylabel('Number of samples', fontsize=12)
    plt.xticks(index, label, fontsize=12, rotation=30)
    plt.title('Class Frequency in prediction')
    plt.show()
    return