def cross_validate(self, dataset, num_folds): self.rho_list = [0.1] * 12 self.alpha_list = [7.3476104736328125, 6.8925933837890625, 8.2190093994140625, 8.2137908935546875, 8.2814483642578125, 7.9523162841796875, 7.9453582763671875, 6.6873321533203125, 7.2606353759765625, 7.0821990966796875, 7.1634979248046875, 8.4375] if 1: dataset.createFolds(num_folds) for rho in GlmNet.rho_values: print "rho: %f" % (rho) best_rmsle_list = [float("inf")] * 12 min_alpha_list = np.array([self.min_alpha] * 12) max_alpha_list = np.array([self.max_alpha] * 12) rho_list = [rho] * 12 self.search(dataset, num_folds, self.max_depth, self.max_width, min_alpha_list, max_alpha_list, rho_list) def train(self, dataset): if self.debug: print "Training elastic net..." self._train(dataset, self.alpha_list, self.rho_list) def predict(self, dataset): features = dataset.getFeatures() num_samples, num_features = features.shape predictions = np.zeros((num_samples, 12)) for month_ind in range(12): predictions[:, month_ind] = self.regressor_list[month_ind].predict(features) return predictions LearnerBase.register(GlmNet)
# NearestNeighbor learner. This chunk begins inside a method whose `def` line is not visible.
# NOTE(review): line breaks and indentation of this chunk have been lost, so the nesting of
# the statements below cannot be confirmed from here; the code is left untouched.
# Visible content, in order:
#   * tail of a k-selection routine: trains on `fold_train` with `k`, adds the test fold's
#     sales and predictions to `cur_score`, then for each of 12 months stores `k` in
#     `best_k_list` when `cur_score.getRMSLE(month_ind)` beats the stored best; the result
#     is copied to `self.k_list` and printed when `self.debug` is set.
#   * train(dataset): calls `self._train(dataset, None, k_list=self.k_list)`.
#   * predict(dataset): fills a (num_samples, 12) array; column `month_ind` comes from
#     `self.knn_list[month_ind].predict(features)`.
#   * module-level registration: `LearnerBase.register(NearestNeighbor)`.
self._train(fold_train, k) cur_score.addFold(fold_test.getSales(), self.predict(fold_test)) for month_ind in range(12): cur_rmsle = cur_score.getRMSLE(month_ind) if cur_rmsle < best_rmsle_list[month_ind]: best_rmsle_list[month_ind] = cur_rmsle best_k_list[month_ind] = k self.k_list = list(best_k_list) if self.debug: print "Best k-values by month: %s" % str(self.k_list) def train(self, dataset): self._train(dataset, None, k_list=self.k_list) def predict(self, dataset): features = dataset.getFeatures() num_samples, num_features = features.shape predictions = np.zeros((num_samples, 12)) for month_ind in range(12): predictions[:, month_ind] = self.knn_list[month_ind].predict(features) return predictions LearnerBase.register(NearestNeighbor)
def predict(self, x): """ TODO fix this somehow. """ return 0.0 def grade(self, features, essay_set, domain, options={}): """Return an integer grade for each feature vector in the specified array.""" f = open('output/ds.set%d.dom%d.%s.matOut' % (essay_set, domain, options['postfix']), 'r') scores = list() for line in f.readlines(): scores.append(float(line)) if options['round']: grades = [int(round(score)) for score in scores] else: grades = [self.curve.curve(score) for score in scores] return np.asarray(grades) def set_curve(self, scores, grade_counts): """Set curve with histogram.""" self.curve = Curve(scores, histogram=grade_counts) LearnerBase.register(MatlabExample)
# RandomForest (classifier) learner. This chunk begins inside a cross-validation routine
# whose `def` line and loop headers are not visible.
# NOTE(review): line breaks and indentation of this chunk have been lost, so the loop nesting
# below cannot be confirmed from here; the code is left untouched.
# Visible content, in order:
#   * per fold: fits a RandomForestClassifier(n_estimators=n, min_split=split) on the train
#     fold and writes this learner's predictions for the test fold into `learner_predictions`.
#     NOTE(review): `fold_score` is assigned but not read in the visible code, and
#     `self.predict(fold_test.getFeatures())` is evaluated twice.
#   * scores all predictions with `Score.Score(...).getLogLoss()`; a lower score replaces
#     `best_params` / `best_score`; finally `best_params` is unpacked into
#     `self.n_estimators, self.min_split`.
#   * train(features, labels): refits `self.rf` with the stored parameters.
#   * predict(features): takes element 1 of each `predict_proba` row and clamps it to
#     the range [0.01, 0.99].
#   * module-level registration: `LearnerBase.register(RandomForest)`.
# NOTE(review): `min_split` looks like a keyword from an old scikit-learn release
# (newer ones use `min_samples_split`) -- confirm against the installed version.
fold_train = dataset.getTrainFold(fold_ind) fold_test = dataset.getTestFold(fold_ind) self.rf = RandomForestClassifier(n_estimators=n, min_split=split) self.rf.fit(fold_train.getFeatures(), fold_train.getLabels()) fold_score = Score.Score(fold_test.getLabels(), self.predict(fold_test.getFeatures())) prediction_inds = dataset.getTestFoldInds(fold_ind) learner_predictions[prediction_inds] = self.predict(fold_test.getFeatures()) cur_score = Score.Score(dataset.getLabels(), learner_predictions).getLogLoss() if cur_score < best_score: if self.debug: print "Achieved new best score %f" % cur_score best_params = (n, split) best_score = cur_score self.n_estimators, self.min_split = best_params def train(self, features, labels): if self.debug: print "Training random forest with n_estimators=%d, min_split=%d" % (self.n_estimators, self.min_split) self.rf = RandomForestClassifier(n_estimators=self.n_estimators, min_split=self.min_split) self.rf.fit(features, labels) def predict(self, features): num_samples, num_features = features.shape probs = [prob[1] for prob in self.rf.predict_proba(features)] return np.minimum(np.maximum(probs, 0.01 * np.ones(num_samples)), 0.99 * np.ones(num_samples)) LearnerBase.register(RandomForest)
print "Learning on month %d of 12 with %d samples..." %(month_ind+1, num_samples) A = np.hstack((month_features, np.ones((num_samples,1)))) month_params, residues, rank, s = linalg.lstsq(A, dataset.getSalesForMonth(month_ind)) params[:, month_ind] = month_params self.params = params def predict(self, dataset): features = dataset.getQuantitativeFeatures() num_samples, num_features = features.shape A = np.hstack((features, np.ones((num_samples,1)))) sales = A.dot(self.params) return np.maximum(sales, np.zeros(sales.shape)) LearnerBase.register(QuantLinearRegression)
if cur_score < best_score: if self.debug: print "Achieved new best score %f" %cur_score best_k = k best_score = cur_score self.k = best_k def train(self, features, labels): self._train_with_k(features, labels, self.k) def predict(self, features): num_samples, num_features = features.shape A = np.hstack((features, np.ones((num_samples,1)))) probs = A.dot(self.params) return np.minimum(np.maximum(probs, 0.01*np.ones(num_samples)), 0.99*np.ones(num_samples)) LearnerBase.register(RidgeRegression)
if self.debug: print "Train SuportVectorMachines with %d features..." % ( dataset.getNumFeatures()) self.svr_list = [] for month_ind in range(12): month_features = dataset.getFeaturesForMonth(month_ind) if self.debug: num_samples = month_features.shape[0] print "Learning on month %d of 12 with %d samples..." % ( month_ind + 1, num_samples) svr = SVR() svr.fit(month_features, dataset.getSalesForMonth(month_ind)) self.svr_list.append(svr) def predict(self, dataset): features = dataset.getFeatures() num_samples, num_features = features.shape predictions = np.zeros((num_samples, 12)) for month_ind in range(12): predictions[:, month_ind] = self.svr_list[month_ind].predict(features) return predictions LearnerBase.register(SupportVectorMachines)
self.best_rmsle_list = [float("inf")] * 12 self.min_n_estimators_list = np.array([self.min_n_estimators] * 12) self.max_n_estimators_list = np.array([self.max_n_estimators] * 12) self.search( dataset, num_folds, self.max_depth, self.max_width, self.min_n_estimators_list, self.max_n_estimators_list, ) def train(self, dataset): if self.debug: print "Training GradientBoosting model with %d features..." % (dataset.getNumFeatures()) self._train(dataset, self.n_estimators_list) def predict(self, dataset): features = dataset.getFeatures() num_samples, num_features = features.shape predictions = np.zeros((num_samples, 12)) for month_ind in range(12): predictions[:, month_ind] = self.regressor_list[month_ind].predict(features) return predictions LearnerBase.register(GradientBoosting)
def cross_validate(self, dataset, num_folds): pass def train(self, dataset): if self.debug: print "Train SuportVectorMachines with %d features..." %(dataset.getNumFeatures()) self.svr_list = [] for month_ind in range(12): month_features = dataset.getFeaturesForMonth(month_ind) if self.debug: num_samples = month_features.shape[0] print "Learning on month %d of 12 with %d samples..." %(month_ind+1, num_samples) svr = SVR() svr.fit(month_features, dataset.getSalesForMonth(month_ind)) self.svr_list.append(svr) def predict(self, dataset): features = dataset.getFeatures() num_samples, num_features = features.shape predictions = np.zeros((num_samples, 12)) for month_ind in range(12): predictions[:, month_ind] = self.svr_list[month_ind].predict(features) return predictions LearnerBase.register(SupportVectorMachines)
if self.debug: print "Running linear regression with %d quantitative features..." % ( num_features) for month_ind in range(12): month_features = dataset.getQuantitativeFeaturesForMonth(month_ind) num_samples = month_features.shape[0] if self.debug: print "Learning on month %d of 12 with %d samples..." % ( month_ind + 1, num_samples) A = np.hstack((month_features, np.ones((num_samples, 1)))) month_params, residues, rank, s = linalg.lstsq( A, dataset.getSalesForMonth(month_ind)) params[:, month_ind] = month_params self.params = params def predict(self, dataset): features = dataset.getQuantitativeFeatures() num_samples, num_features = features.shape A = np.hstack((features, np.ones((num_samples, 1)))) sales = A.dot(self.params) return np.maximum(sales, np.zeros(sales.shape)) LearnerBase.register(QuantLinearRegression)
for split in RandomForest.split_values: print "Split: %d" % (split) self.min_n_estimators_list = np.array([0] * 12) self.max_n_estimators_list = np.array([150] * 12) self.search(dataset, num_folds, self.max_depth, self.max_width, self.min_n_estimators_list, self.max_n_estimators_list, [split] * 12) def train(self, dataset): if self.debug: print "Training random forest with n_estimators_list: %s, min_samples_split_list: %s" % ( str(self.n_estimators_list), str(self.min_samples_split_list)) self._train(dataset, self.n_estimators_list, self.min_samples_split_list) def predict(self, dataset): features = dataset.getFeatures() num_samples, num_features = features.shape predictions = np.zeros((num_samples, 12)) for month_ind in range(12): predictions[:, month_ind] = self.regressor_list[month_ind].predict( features) return predictions LearnerBase.register(RandomForest)
print "Training SVM regression with C=%d, poly_degree=%d" %(self.C, self.poly_degree) self._train_with_values(dataset, poly_degree=self.poly_degree, C=self.C) def predict(self, dataset): assert self.svm_list is not None self._format_test_data(dataset) num_samples = dataset.getNumSamples() num_features = dataset.getNumFeatures() predictions = np.zeros((num_samples, 12)) for month_ind in range(12): # import pdb;pdb.set_trace() predictions[:, month_ind] = svmlight.classify(self.svm_list[month_ind], self.formatted_data) return predictions LearnerBase.register(SVMRegression)
# NearestNeighbor learner. This chunk begins inside a method whose `def` line is not visible.
# NOTE(review): line breaks and indentation of this chunk have been lost, so it cannot be
# confirmed from here whether the month loop runs inside or after the fold loop; the code
# is left untouched.
# Visible content, in order:
#   * fold loop: for each of `num_folds` folds, trains on the train fold with `k` and adds
#     the test fold's sales and predictions to `cur_score`.
#   * month loop: for each of 12 months stores `k` in `best_k_list` when
#     `cur_score.getRMSLE(month_ind)` beats the stored best; the result is copied to
#     `self.k_list` and printed when `self.debug` is set.
#   * train(dataset): calls `self._train(dataset, None, k_list=self.k_list)`.
#   * predict(dataset): fills a (num_samples, 12) array; column `month_ind` comes from
#     `self.knn_list[month_ind].predict(features)`.
#   * module-level registration: `LearnerBase.register(NearestNeighbor)`.
for fold_ind in range(num_folds): fold_train = dataset.getTrainFold(fold_ind) fold_test = dataset.getTestFold(fold_ind) self._train(fold_train, k) cur_score.addFold(fold_test.getSales(), self.predict(fold_test)) for month_ind in range(12): cur_rmsle = cur_score.getRMSLE(month_ind) if cur_rmsle < best_rmsle_list[month_ind]: best_rmsle_list[month_ind] = cur_rmsle best_k_list[month_ind] = k self.k_list = list(best_k_list) if self.debug: print "Best k-values by month: %s" %str(self.k_list) def train(self, dataset): self._train(dataset, None, k_list=self.k_list) def predict(self, dataset): features = dataset.getFeatures() num_samples, num_features = features.shape predictions = np.zeros((num_samples, 12)) for month_ind in range(12): predictions[:, month_ind] = self.knn_list[month_ind].predict(features) return predictions LearnerBase.register(NearestNeighbor)
def cross_validate(self, dataset, num_folds): pass def train(self, dataset): if self.debug: print "Train SuportVectorMachines with %d features..." %(dataset.getNumFeatures()) self.regressor_list = [] for month_ind in range(12): month_features = dataset.getFeaturesForMonth(month_ind) if self.debug: num_samples = month_features.shape[0] print "Learning on month %d of 12 with %d samples..." %(month_ind+1, num_samples) regressor = DecisionTreeRegressor() regressor.fit(month_features, dataset.getSalesForMonth(month_ind)) self.regressor_list.append(regressor) def predict(self, dataset): features = dataset.getFeatures() num_samples, num_features = features.shape predictions = np.zeros((num_samples, 12)) for month_ind in range(12): predictions[:, month_ind] = self.regressor_list[month_ind].predict(features) return predictions LearnerBase.register(DecisionTree)
] if 1: dataset.createFolds(num_folds) self.best_rmsle_list = [float("inf")] * 12 self.min_n_estimators_list = np.array([self.min_n_estimators] * 12) self.max_n_estimators_list = np.array([self.max_n_estimators] * 12) self.search(dataset, num_folds, self.max_depth, self.max_width, self.min_n_estimators_list, self.max_n_estimators_list) def train(self, dataset): if self.debug: print "Training GradientBoosting model with %d features..." % ( dataset.getNumFeatures()) self._train(dataset, self.n_estimators_list) def predict(self, dataset): features = dataset.getFeatures() num_samples, num_features = features.shape predictions = np.zeros((num_samples, 12)) for month_ind in range(12): predictions[:, month_ind] = self.regressor_list[month_ind].predict( features) return predictions LearnerBase.register(GradientBoosting)
print "Learning on month %d of 12 with %d samples..." %(month_ind+1, num_samples) A = np.hstack((month_features, np.ones((num_samples,1)))) month_params, residues, rank, s = linalg.lstsq(A, dataset.getSalesForMonth(month_ind)) params[:, month_ind] = month_params self.params = params def predict(self, dataset): features = dataset.getFeatures() num_samples, num_features = features.shape A = np.hstack((features, np.ones((num_samples,1)))) sales = A.dot(self.params) return np.maximum(sales, np.zeros(sales.shape)) LearnerBase.register(LinearRegression)
if 1: dataset.createFolds(num_folds) for rho in GlmNet.rho_values: print "rho: %f" % (rho) best_rmsle_list = [float("inf")] * 12 min_alpha_list = np.array([self.min_alpha] * 12) max_alpha_list = np.array([self.max_alpha] * 12) rho_list = [rho] * 12 self.search(dataset, num_folds, self.max_depth, self.max_width, min_alpha_list, max_alpha_list, rho_list) def train(self, dataset): if self.debug: print "Training elastic net..." self._train(dataset, self.alpha_list, self.rho_list) def predict(self, dataset): features = dataset.getFeatures() num_samples, num_features = features.shape predictions = np.zeros((num_samples, 12)) for month_ind in range(12): predictions[:, month_ind] = self.regressor_list[month_ind].predict( features) return predictions LearnerBase.register(GlmNet)