import pandas
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn import model_selection
from sklearn import svm

# Load dataset
names = ['QIN', 'TIN', 'pHIN', 'CondIN', 'CODIN', 'SSIN', 'BOD5IN']
dataset = pandas.read_csv('aritma.csv', names=names)

# Split-out validation dataset
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 1].values
test_rate = 0.25
xTrain, xTest, yTrain, yTest = model_selection.train_test_split(
    x, y, test_size=test_rate, random_state=0)

classifiers = [
    ('SVM', svm.SVR()),
    ('SGDRegressor', linear_model.SGDRegressor()),
    ('PassiveAggressiveRegressor', linear_model.PassiveAggressiveRegressor())
]

for name, model in classifiers:
    clf = model
    clf.fit(xTrain, yTrain)
    yPredicted = clf.predict(xTest)
    plt.figure()
    print("yPredicted", yPredicted)
    # Note: the "Test Score" shown in the title is actually computed on the training set.
    plt.title(name + ' Test' + ' Score: ' + str(clf.score(xTrain, yTrain)))
    plt.plot(yPredicted, color='red', label='predicted', marker='.')
    plt.plot(yTest, color='blue', label='Actual', marker='*')
    plt.legend()
    plt.savefig(name + '_Test' + '_plot.svg', dpi=300)
    with open('modeller.txt', 'a') as file:
        file.write("\n\nModel name: {}, \n {}".format(name, model))
              X_train=train, y=target, X_test=test, nfolds=5, seed=rnd,
              category="classifier", filename="PasAggC", setused=setused, tag="2")

#%%
# Level 2 Score:
# Note: n_iter is only accepted by old scikit-learn releases; newer versions
# use max_iter instead.
clf = linear_model.PassiveAggressiveRegressor(n_iter=100, random_state=rnd, verbose=0)
model_sum = blend_proba(clf=clf, X_train=train, y=target, X_test=test, nfolds=5,
                        seed=rnd, category="regressor", filename="PasAggR",
                        setused=setused, tag="1")
#%%
def generate_prediction(cls, race):
    """Generate a prediction for the specified race"""

    prediction = {
        'race_id': race['_id'],
        'earliest_date': cls.get_earliest_date(),
        'prediction_version': cls.PREDICTION_VERSION,
        'seed_version': Seed.SEED_VERSION,
        'results': None,
        'score': None,
        'train_seeds': None,
        'test_seeds': None,
        'estimator': None
    }

    predictor = None
    generate_predictor = False

    segment = tuple(race['entry_conditions']) + tuple([race['track_condition']])

    with cls.predictor_cache_lock:
        if segment in cls.predictor_cache:
            predictor = cls.predictor_cache[segment]
        else:
            cls.predictor_cache[segment] = None
            generate_predictor = True

    if generate_predictor:

        similar_races = pyracing.Race.find({
            'entry_conditions': race['entry_conditions'],
            'track_condition': race['track_condition'],
            'start_time': {'$lt': race.meet['date']}
        })

        if len(similar_races) >= (1 / cls.TEST_SIZE):

            try:
                # Note: sklearn.cross_validation is the pre-0.18 module name;
                # newer releases provide train_test_split in model_selection.
                train_races, test_races = cross_validation.train_test_split(
                    similar_races, test_size=cls.TEST_SIZE)

                train_X = []
                train_y = []
                for train_race in train_races:
                    for seed in train_race.seeds:
                        if seed['result'] is not None:
                            train_X.append(seed.normalized_data)
                            train_y.append(seed['result'])

                test_X = []
                test_y = []
                for test_race in test_races:
                    for seed in test_race.seeds:
                        if seed['result'] is not None:
                            test_X.append(seed.normalized_data)
                            test_y.append(seed['result'])

                predictor = {
                    'classifier': None,
                    'score': None,
                    'train_seeds': len(train_y),
                    'test_seeds': len(test_y),
                    'estimator': None
                }

                dual = len(train_X) < len(train_X[0])
                kernel = 'linear'
                loss = 'epsilon_insensitive'
                if not dual:
                    loss = 'squared_epsilon_insensitive'

                for estimator in (
                        linear_model.BayesianRidge(),
                        linear_model.ElasticNet(),
                        linear_model.LinearRegression(),
                        linear_model.LogisticRegression(),
                        linear_model.OrthogonalMatchingPursuit(),
                        linear_model.PassiveAggressiveRegressor(),
                        linear_model.Perceptron(),
                        linear_model.Ridge(),
                        linear_model.SGDRegressor(),
                        svm.SVR(kernel=kernel),
                        svm.LinearSVR(dual=dual, loss=loss),
                        svm.NuSVR(kernel=kernel),
                        tree.DecisionTreeRegressor(),
                        tree.ExtraTreeRegressor()):

                    logging.debug('Trying {estimator} for {segment}'.format(
                        estimator=estimator.__class__.__name__, segment=segment))

                    try:
                        classifier = pipeline.Pipeline([
                            ('feature_selection',
                             feature_selection.SelectFromModel(estimator, 'mean')),
                            ('regression', estimator)
                        ])
                        classifier.fit(train_X, train_y)
                        score = classifier.score(test_X, test_y)
                        if (predictor['classifier'] is None
                                or predictor['score'] is None
                                or score > predictor['score']):
                            logging.debug(
                                'Using {estimator} ({score}) for {segment}'.format(
                                    estimator=estimator.__class__.__name__,
                                    score=score,
                                    segment=segment))
                            predictor['classifier'] = classifier
                            predictor['score'] = score
                            predictor['estimator'] = estimator.__class__.__name__

                    except BaseException as e:
                        logging.debug(
                            'Caught exception while trying {estimator} for {segment}: {exception}'.format(
                                estimator=estimator.__class__.__name__,
                                segment=segment,
                                exception=e))
                        continue

                cls.predictor_cache[segment] = predictor

            except:
                del cls.predictor_cache[segment]
                raise

        else:
            del cls.predictor_cache[segment]

    else:
        # Another thread is building the predictor; poll the cache until it appears.
        while predictor is None:
            try:
                predictor = cls.predictor_cache[segment]
                time.sleep(10)
            except KeyError:
                break

    if predictor is not None:

        reverse = False
        if 'score' in predictor and predictor['score'] is not None:
            reverse = predictor['score'] < 0
            prediction['score'] = abs(predictor['score'])

        if 'classifier' in predictor and predictor['classifier'] is not None:
            raw_results = {}
            for seed in race.seeds:
                raw_result = predictor['classifier'].predict(
                    numpy.array(seed.normalized_data).reshape(1, -1))[0]
                if raw_result is not None:
                    if raw_result not in raw_results:
                        raw_results[raw_result] = []
                    raw_results[raw_result].append(seed.runner['number'])
            for key in sorted(raw_results.keys(), reverse=reverse):
                if prediction['results'] is None:
                    prediction['results'] = []
                prediction['results'].append(
                    sorted([number for number in raw_results[key]]))

        if 'train_seeds' in predictor:
            prediction['train_seeds'] = predictor['train_seeds']
        if 'test_seeds' in predictor:
            prediction['test_seeds'] = predictor['test_seeds']
        if 'estimator' in predictor:
            prediction['estimator'] = predictor['estimator']

    return prediction
    regression(linear_model.ARDRegression()),
    regression(linear_model.BayesianRidge()),
    regression(linear_model.ElasticNet(random_state=RANDOM_SEED)),
    regression(linear_model.ElasticNetCV(random_state=RANDOM_SEED)),
    regression(linear_model.HuberRegressor()),
    regression(linear_model.Lars()),
    regression(linear_model.LarsCV()),
    regression(linear_model.Lasso(random_state=RANDOM_SEED)),
    regression(linear_model.LassoCV(random_state=RANDOM_SEED)),
    regression(linear_model.LassoLars()),
    regression(linear_model.LassoLarsCV()),
    regression(linear_model.LassoLarsIC()),
    regression(linear_model.LinearRegression()),
    regression(linear_model.OrthogonalMatchingPursuit()),
    regression(linear_model.OrthogonalMatchingPursuitCV()),
    regression(linear_model.PassiveAggressiveRegressor(random_state=RANDOM_SEED)),
    regression(linear_model.Ridge(random_state=RANDOM_SEED)),
    regression(linear_model.RidgeCV()),
    regression(linear_model.SGDRegressor(random_state=RANDOM_SEED)),
    regression(linear_model.TheilSenRegressor(random_state=RANDOM_SEED)),

    # Logistic Regression
    classification(linear_model.LogisticRegression(random_state=RANDOM_SEED)),
    classification(linear_model.LogisticRegressionCV(random_state=RANDOM_SEED)),
    classification(linear_model.RidgeClassifier(random_state=RANDOM_SEED)),
    classification(linear_model.RidgeClassifierCV()),
    classification(linear_model.SGDClassifier(random_state=RANDOM_SEED)),
    classification_binary(linear_model.LogisticRegression(
def run_simple_model(train_x, train_y, dev_x, dev_y, test_x, test_y, model_type,
                     out_dir=None, class_weight=None):
    from sklearn import datasets, neighbors, linear_model, svm

    totalTime = 0
    startTrainTime = time()
    logger.info("Start training...")

    # One factory per supported model_type. This keeps the behaviour of the
    # original if/elif chain (including which estimators receive class_weight)
    # in a more compact, readable form.
    builders = {
        'ARDRegression': lambda: linear_model.ARDRegression(),
        'BayesianRidge': lambda: linear_model.BayesianRidge(),
        'ElasticNet': lambda: linear_model.ElasticNet(),
        'ElasticNetCV': lambda: linear_model.ElasticNetCV(),
        'HuberRegressor': lambda: linear_model.HuberRegressor(),
        'Lars': lambda: linear_model.Lars(),
        'LarsCV': lambda: linear_model.LarsCV(),
        'Lasso': lambda: linear_model.Lasso(),
        'LassoCV': lambda: linear_model.LassoCV(),
        'LassoLars': lambda: linear_model.LassoLars(),
        'LassoLarsCV': lambda: linear_model.LassoLarsCV(),
        'LassoLarsIC': lambda: linear_model.LassoLarsIC(),
        'LinearRegression': lambda: linear_model.LinearRegression(),
        'LogisticRegression': lambda: linear_model.LogisticRegression(class_weight=class_weight),
        'LogisticRegressionCV': lambda: linear_model.LogisticRegressionCV(class_weight=class_weight),
        'MultiTaskLasso': lambda: linear_model.MultiTaskLasso(),
        'MultiTaskElasticNet': lambda: linear_model.MultiTaskElasticNet(),
        'MultiTaskLassoCV': lambda: linear_model.MultiTaskLassoCV(),
        'MultiTaskElasticNetCV': lambda: linear_model.MultiTaskElasticNetCV(),
        'OrthogonalMatchingPursuit': lambda: linear_model.OrthogonalMatchingPursuit(),
        'OrthogonalMatchingPursuitCV': lambda: linear_model.OrthogonalMatchingPursuitCV(),
        'PassiveAggressiveClassifier': lambda: linear_model.PassiveAggressiveClassifier(class_weight=class_weight),
        'PassiveAggressiveRegressor': lambda: linear_model.PassiveAggressiveRegressor(),
        'Perceptron': lambda: linear_model.Perceptron(class_weight=class_weight),
        # RandomizedLasso and RandomizedLogisticRegression were removed from
        # recent scikit-learn releases; these entries only work on old versions.
        'RandomizedLasso': lambda: linear_model.RandomizedLasso(),
        'RandomizedLogisticRegression': lambda: linear_model.RandomizedLogisticRegression(),
        'RANSACRegressor': lambda: linear_model.RANSACRegressor(),
        'Ridge': lambda: linear_model.Ridge(),
        'RidgeClassifier': lambda: linear_model.RidgeClassifier(class_weight=class_weight),
        'RidgeClassifierCV': lambda: linear_model.RidgeClassifierCV(class_weight=class_weight),
        'RidgeCV': lambda: linear_model.RidgeCV(),
        'SGDClassifier': lambda: linear_model.SGDClassifier(class_weight=class_weight),
        'SGDRegressor': lambda: linear_model.SGDRegressor(),
        'TheilSenRegressor': lambda: linear_model.TheilSenRegressor(),
        # The next entries mirror the original code but cannot actually work:
        # these names are functions that return arrays, not estimators, so the
        # subsequent .fit() call would fail.
        'lars_path': lambda: linear_model.lars_path(),
        'lasso_path': lambda: linear_model.lasso_path(),
        'lasso_stability_path': lambda: linear_model.lasso_stability_path(),
        'logistic_regression_path': lambda: linear_model.logistic_regression_path(class_weight=class_weight),
        'orthogonal_mp': lambda: linear_model.orthogonal_mp(),
        'orthogonal_mp_gram': lambda: linear_model.orthogonal_mp_gram(),
        'LinearSVC': lambda: svm.LinearSVC(class_weight=class_weight),
        'SVC': lambda: svm.SVC(class_weight=class_weight, degree=3),
    }
    if model_type not in builders:
        raise NotImplementedError('Model not implemented')
    model = builders[model_type]().fit(train_x, train_y)
    logger.info("Finished training.")

    endTrainTime = time()
    trainTime = endTrainTime - startTrainTime
    logger.info("Training time : %d seconds" % trainTime)

    logger.info("Start predicting train set...")
    train_pred_y = model.predict(train_x)
    logger.info("Finished predicting train set.")
    logger.info("Start predicting test set...")
    test_pred_y = model.predict(test_x)
    logger.info("Finished predicting test set.")
    endTestTime = time()
    testTime = endTestTime - endTrainTime
    logger.info("Testing time : %d seconds" % testTime)
    totalTime += trainTime + testTime

    train_pred_y = np.round(train_pred_y)
    test_pred_y = np.round(test_pred_y)
    np.savetxt(out_dir + '/preds/best_test_pred' + '.txt', test_pred_y, fmt='%i')

    logger.info('[TRAIN] Acc: %.3f' % (accuracy_score(train_y, train_pred_y)))
    logger.info('[TEST] Acc: %.3f' % (accuracy_score(test_y, test_pred_y)))
    return accuracy_score(test_y, test_pred_y)
def predict(train_list, train_result, test_list, method_list, **kwargs):

    def fit_predict_each_output(model, target):
        # Fit one copy of `model` per output column and stack the predictions.
        __predict_result = []
        for idx in range(np.size(target, 1)):
            model.fit(train_list, target[:, idx])
            __predict_result.append(model.predict(test_list))
        return np.transpose(np.asarray(__predict_result))

    def fit_predict(model, target):
        model.fit(train_list, target)
        return model.predict(test_list)

    from_bins_idx = kwargs["from_bins_idx"]
    to_bins_idx = kwargs["to_bins_idx"]
    _binned_train_result = to_bins_idx(train_result)

    _predict_result = []
    if "current" in method_list:
        # NOTE: this branch looks unfinished in the original code: it fits the
        # RBM twice and then appends `__predict_result`, which is not defined
        # in this scope, so taking this branch raises a NameError.
        rbm = neural_network.BernoulliRBM(n_components=512, verbose=False, n_iter=100,
                                          learning_rate=1e-2, random_state=0)
        rbm.fit(train_list)
        rbm.fit(test_list)
        _predict_result.append(np.transpose(np.asarray(__predict_result)))
    elif "knn" in method_list:
        _ = knn_predict(train_list, _binned_train_result, test_list, k=kwargs["k"])
        _predict_result.append(from_bins_idx(np.asarray(_, dtype=int)))
    elif "dt" in method_list:
        _ = fit_predict(tree.DecisionTreeClassifier(max_depth=kwargs["max_depth"]),
                        _binned_train_result)
        _predict_result.append(from_bins_idx(np.asarray(_, dtype=int)))
    elif "rf" in method_list:
        _ = fit_predict(ensemble.RandomForestClassifier(n_estimators=kwargs["n_estimators"],
                                                        max_depth=kwargs["max_depth"],
                                                        n_jobs=kwargs["n_jobs"]),
                        _binned_train_result)
        _predict_result.append(from_bins_idx(np.asarray(_, dtype=int)))
    elif "average" in method_list:
        _ = average_predict(train_result, test_list)
        _predict_result.append(from_bins_idx(np.asarray(_, dtype=int)))
    elif "adaboost" in method_list:
        _ = fit_predict_each_output(ensemble.AdaBoostClassifier(), _binned_train_result)
        _predict_result.append(from_bins_idx(np.asarray(_, dtype=int)))
    elif "ridge" in method_list:
        _ = fit_predict_each_output(linear_model.RidgeClassifier(), _binned_train_result)
        _predict_result.append(from_bins_idx(np.asarray(_, dtype=int)))
    elif "linear" in method_list:
        _predict_result.append(fit_predict_each_output(linear_model.LinearRegression(), train_result))
    elif "huber" in method_list:
        _predict_result.append(fit_predict_each_output(linear_model.HuberRegressor(), train_result))
    elif "theilsen" in method_list:
        _predict_result.append(fit_predict_each_output(linear_model.TheilSenRegressor(), train_result))
    elif "lasso" in method_list:
        _predict_result.append(fit_predict_each_output(linear_model.Lasso(), train_result))
    elif "par" in method_list:
        _predict_result.append(fit_predict_each_output(
            linear_model.PassiveAggressiveRegressor(C=kwargs["par_C"], epsilon=kwargs["par_eps"]),
            train_result))
    elif "ridge_reg" in method_list:
        _predict_result.append(fit_predict_each_output(linear_model.Ridge(), train_result))
    elif "dt_reg" in method_list:
        _predict_result.append(fit_predict(
            tree.DecisionTreeRegressor(max_depth=kwargs["max_depth"]), train_result))
    elif "rf_reg" in method_list:
        _predict_result.append(fit_predict(
            ensemble.RandomForestRegressor(max_depth=kwargs["max_depth"],
                                           n_jobs=kwargs['n_jobs'],
                                           n_estimators=kwargs['n_estimators']),
            train_result))
    elif "xgboost" in method_list:
        _predict_result.append(fit_predict_each_output(
            xgb.XGBClassifier(max_depth=kwargs["max_depth"],
                              n_estimators=kwargs['n_estimators'],
                              nthread=kwargs["nthread"]),
            _binned_train_result))
    elif "xgboost_reg" in method_list:
        _predict_result.append(fit_predict_each_output(
            xgb.XGBRegressor(max_depth=kwargs["max_depth"],
                             n_estimators=kwargs['n_estimators'],
                             nthread=kwargs["nthread"]),
            train_result))
    elif "svr" in method_list:
        _predict_result.append(fit_predict_each_output(
            svm.SVR(C=kwargs["C"], epsilon=kwargs["epsilon"]), train_result))
    elif "linear_svr" in method_list:
        _predict_result.append(fit_predict_each_output(
            svm.LinearSVR(C=kwargs["C"], epsilon=kwargs["epsilon"]), train_result))
    else:
        assert False, "invalid method"

    return np.asarray(_predict_result)
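# A minimal usage sketch for the predict() helper above; the toy data and the
# binning helpers below are assumptions for illustration, not part of the
# original project.
import numpy as np

bin_edges = np.linspace(0.0, 1.0, 11)
to_bins_idx = lambda values: np.digitize(values, bin_edges) - 1              # values -> bin indices
from_bins_idx = lambda idx: bin_edges[np.clip(idx, 0, len(bin_edges) - 1)]   # bin indices -> values

train_list = np.random.rand(50, 8)      # toy features
train_result = np.random.rand(50, 3)    # toy multi-output targets
test_list = np.random.rand(10, 8)

preds = predict(train_list, train_result, test_list, ["par"],
                from_bins_idx=from_bins_idx, to_bins_idx=to_bins_idx,
                par_C=1.0, par_eps=0.1)
print(preds.shape)   # (1, 10, 3): one method, 10 test rows, 3 outputs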
regr.fit(trainingInstances, trainingLables)
predictedRewards = regr.predict(testInstances)
predictedRewardsList[2].append(predictedRewards)
# printer.PrintProsentageOfFail(predictedRewards, actualDuration)

# Fourth Test - Passive Aggressive
# --------------------------------
testName4Reg = "Fourth Test - Passive Aggressive"
regr = linear_model.PassiveAggressiveRegressor(C=1.0, fit_intercept=True, max_iter=1000,
                                               tol=0.002, early_stopping=False,
                                               validation_fraction=0.1, n_iter_no_change=100,
                                               shuffle=True, verbose=0,
                                               loss='epsilon_insensitive', epsilon=0.1,
                                               random_state=None, warm_start=False,
                                               average=False)
regr.fit(trainingInstances, trainingLables)
predictedRewards = regr.predict(testInstances)
predictedRewardsList[3].append(predictedRewards)
# printer.PrintProsentageOfFail(predictedRewards, actualDuration)

# Fifth Test - DecisionTree
# -------------------------
testName5Reg = "Fifth Test - DecisionTree"
r3 = ensemble.AdaBoostRegressor(random_state=0, loss='linear', learning_rate=3.0, n_estimators=700)
r4 = ensemble.GradientBoostingRegressor()
r5 = ensemble.BaggingRegressor()  # overfitting
r6 = ensemble.ExtraTreesRegressor()  # overfitting
r7 = linear_model.BayesianRidge(normalize=True)
r8 = linear_model.ARDRegression(normalize=True)
r9 = linear_model.HuberRegressor()
r10 = linear_model.Lasso(random_state=0, selection='cyclic', normalize=False)
r11 = svm.LinearSVR(random_state=0, loss='squared_epsilon_insensitive', dual=True)
r12 = gaussian_process.GaussianProcessRegressor()  # overfitting
r13 = linear_model.PassiveAggressiveRegressor()  # takes an acceptable amount of time
r14 = linear_model.RANSACRegressor()  # overfitting?
r15 = linear_model.SGDRegressor(shuffle=True, penalty='l1', loss='squared_epsilon_insensitive',
                                learning_rate='invscaling', epsilon=0.1, early_stopping=False,
                                average=True)
r16 = linear_model.TheilSenRegressor()  # tends to make results worse
# r17 = neural_network.MLPRegressor()

# # Unoptimized
# r1 = linear_model.LinearRegression()
# r2 = ensemble.RandomForestRegressor(max_depth=3, min_samples_split=2, random_state=0, n_estimators=700)
# r3 = ensemble.AdaBoostRegressor(random_state=0, n_estimators=100)
print ("\n\nARD Regression 4: accuracy=", acc, "\ncoef_=", ARDReg.coef_ ) ############ Stochastic Gradient Descent Regressor ############ #paramDict = {'loss': ['squared_loss', 'epsilon_insensitive', 'squared_epsilon_insensitive'], 'penalty': ['l2', 'elasticnet', None], 'alpha': [.00001, .0001 ], 'epsilon': [.1, .05, .01], 'learning_rate': ['optimal', 'invscaling'] } SGDReg = linear_model.SGDRegressor(alpha=1e-05, penalty='l2', loss='squared_loss', learning_rate='invscaling', epsilon=0.01) SGDReg.fit(features_train, labels_train) pred = SGDReg.predict(features_test) acc = SGDReg.score(features_test, labels_test) print ("\n\nSGD Regression: accuracy=", acc, "\ncoef_=", SGDReg.coef_ ) ####### BestScore= 0.787316849183 with these best Parameters= {'alpha': 1e-05, 'penalty': 'l2', 'loss': 'squared_loss', 'learning_rate': 'invscaling', 'epsilon': 0.01} ############ Passive-Aggressive Regressor ############ #paramDict = {'C': [.1, 1, 10, 25, 50], 'epsilon': [.02, .05, .01, .001], 'n_iter': [3, 5, 10, 20], 'loss': ['epsilon_insensitive', 'squared_epsilon_insensitive'] } passReg = linear_model.PassiveAggressiveRegressor(C=25, loss='epsilon_insensitive', n_iter=10, epsilon=0.02) passReg.fit(features_train, labels_train) pred = passReg.predict(features_test) acc = passReg.score(features_test, labels_test) print ("\n\npass Regression 3: accuracy=", acc, "\ncoef_=", passReg.coef_ ) ######## BestScore= 0.79440546561 with these best Parameters= {'C': 50, 'loss': 'epsilon_insensitive', 'n_iter': 10, 'epsilon': 0.02} ############ Polynomial ############ from sklearn.preprocessing import PolynomialFeatures from sklearn.pipeline import Pipeline pipe2 = Pipeline([('poly', PolynomialFeatures(degree=2)), ('linear', linear_model.LinearRegression(fit_intercept=False))]) pipe3 = Pipeline([('poly', PolynomialFeatures(degree=3)), ('linear', linear_model.LinearRegression(fit_intercept=False))]) pipe2.fit(features_train, labels_train) pred = pipe2.predict(features_test) acc = pipe2.score(features_test, labels_test)
def regress_sys(folder, all_videos, y, training_size, have_output=True):
    """
    Uses regression techniques to select the best tracking parameters.

    Regression against intensities of input images.

    Parameters
    ----------
    all_videos: list
        Contains prefixes of video filenames of entire video set to be
        tracked. Training dataset will be some subset of these videos.
    y: numpy array
        Contains manually acquired quality levels using Trackmate for the
        files contained in the training dataset.
    training_size: int
        Number of files in training dataset.
    have_output: boolean
        If you have already acquired the quality values (y) for the training
        dataset, set to True. If False, it will output the files the user
        will need to acquire quality values for.

    Returns
    -------
    regress_object: list of sklearn regression objects.
        Contains list of regression objects assembled from the training
        datasets. Uses the mean, 10th percentile, 90th percentile, and
        standard deviation intensities to predict the quality parameter in
        Trackmate.
    """

    tprefix = []
    for i in range(0, training_size):
        random.seed(i + 1)
        # random.randint is inclusive of both endpoints, so subtract 1 to
        # avoid an occasional IndexError on the last element.
        tprefix.append(all_videos[random.randint(0, len(all_videos) - 1)])
        if have_output is False:
            print("Get parameters for: {}".format(tprefix[i]))

    if have_output is True:
        # Define descriptors
        descriptors = np.zeros((training_size, 4))
        counter = 0
        for name in tprefix:
            pup = name.split('_')[0]
            local_im = name + '.tif'
            remote_im = "{}/{}/{}".format(folder, pup, local_im)
            aws.download_s3(remote_im, local_im)
            test_image = sio.imread(local_im)
            descriptors[counter, 0] = np.mean(test_image[0, :, :])
            descriptors[counter, 1] = np.std(test_image[0, :, :])
            descriptors[counter, 2] = np.percentile(test_image[0, :, :], 10)
            # Use the same first frame as the other descriptors (the original
            # indexed test_image[0:, :, :] here, which spans all frames).
            descriptors[counter, 3] = np.percentile(test_image[0, :, :], 90)
            counter = counter + 1

        # Define regression techniques
        X = descriptors
        classifiers = [
            svm.SVR(),
            linear_model.SGDRegressor(),
            linear_model.BayesianRidge(),
            linear_model.LassoLars(),
            linear_model.ARDRegression(),
            linear_model.PassiveAggressiveRegressor(),
            linear_model.TheilSenRegressor(),
            linear_model.LinearRegression()
        ]

        regress_object = []
        for item in classifiers:
            clf = item
            regress_object.append(clf.fit(X, y))

        return regress_object
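# A short usage sketch for regress_sys() above; the video prefixes, S3 folder
# and quality values are placeholders (the call still expects the S3 download
# helper used inside the function to be available).
import numpy as np

all_videos = ['P1_video{}'.format(i) for i in range(20)]
quality = np.array([4.5, 3.9, 5.1, 4.2, 4.8])   # manually measured TrackMate quality levels
models = regress_sys('my-s3-folder', all_videos, quality, training_size=5,
                     have_output=True)
# Each fitted model maps (mean, std, 10th percentile, 90th percentile) image
# intensities to a predicted quality cutoff.
example_descriptors = np.array([[120.0, 15.0, 100.0, 140.0]])
print([m.predict(example_descriptors)[0] for m in models])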
def run_specific_combination(test_frame, reg_type, column_list):
    target_feature = test_frame['Endurance_Score']
    test_df = test_frame.filter(column_list, axis=1)

    X_train, X_test, y_train, y_test = train_test_split(
        test_df, target_feature.values.reshape(-1, 1), test_size=0.20, random_state=0)

    if reg_type == 'dt':
        regr = DecisionTreeRegressor(max_depth=2)
    elif reg_type == 'lin':
        regr = linear_model.LinearRegression()
    elif reg_type == 'ridge':
        regr = linear_model.Ridge(alpha=1500.0)
    elif reg_type == 'lasso':
        regr = linear_model.Lasso(alpha=10.0)
    elif reg_type == 'bayridge':
        regr = linear_model.BayesianRidge()
    elif reg_type == 'sgd':
        regr = linear_model.SGDRegressor(loss='huber')
    elif reg_type == 'lars':
        regr = linear_model.Lars(n_nonzero_coefs=np.inf)
    elif reg_type == 'pasagv':
        regr = linear_model.PassiveAggressiveRegressor(random_state=0)
    elif reg_type == 'kernelridge':
        regr = kernel_ridge.KernelRidge()
    elif reg_type == 'svr':
        regr = svm.SVR()
    elif reg_type == 'kneigh':
        regr = neighbors.KNeighborsRegressor(algorithm='kd_tree')
    elif reg_type == 'gauss':
        regr = gaussian_process.GaussianProcessRegressor()
    elif reg_type == 'gbr':
        params = {'n_estimators': 760, 'max_depth': 4, 'min_samples_split': 3,
                  'learning_rate': 0.026, 'loss': 'huber'}
        regr = GradientBoostingRegressor(**params)
    elif reg_type == 'ran':
        regr = RandomForestRegressor(n_estimators=300, max_depth=8)
    elif reg_type == 'et':
        regr = ExtraTreesRegressor()
    else:
        return

    x_train_frame = X_train.copy()
    del x_train_frame['Title']
    del x_train_frame['Artist']
    regr.fit(x_train_frame, y_train.ravel())

    x_test_frame = X_test.copy()
    del x_test_frame['Title']
    del x_test_frame['Artist']
    y_pred = regr.predict(x_test_frame)

    rmse = mean_squared_error(y_test, y_pred)
    score = r2_score(y_test, y_pred)
    print("R2-score: {}, RMSE: {}".format(score, math.sqrt(rmse)))

    result_df = pd.DataFrame(columns=[
        'Song', 'Artist', 'Endurance_Score', 'Predicted_Endurance_Score'
    ])
    result_df['Song'] = X_test['Title']
    result_df['Artist'] = X_test['Artist']

    # Columns copied over from the unnormalized base dataset, keyed by song title.
    extra_columns = [
        'Entry_Position', 'Peak_Position', 'Total_Weeks', 'danceability',
        'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness',
        'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms',
        'time_signature', 'Movies_TV_feature_count', 'Oscars_won',
        'Artist_lifetime_grammy_achievement', 'Artist_grammy_wins',
        'Artist_grammy_nominations', 'artist popularity', 'TopSongsArtist',
        'TopSongsArtist10', 'TopSongsArtist100', 'Entry_Year',
        'days_before_charting', 'Age_Percentage_15_30'
    ]
    for col in extra_columns:
        result_df[col] = ""

    result_df['Endurance_Score'] = y_test.ravel()
    result_df['Predicted_Endurance_Score'] = y_pred

    base_df = pd.read_csv('{0}/{1}.csv'.format(path_final_csv,
                                                'final_unnormalized_dataset'),
                          encoding='latin-1')

    # The original code repeated the same lookup block once per column;
    # a single loop over the column names does the same work.
    for col in extra_columns:
        results = dict()
        for i, row in base_df.iterrows():
            results[row.Title] = row[col]
        for i, row in result_df.iterrows():
            result_df.loc[i, col] = results[row.Song]

    result_df.to_csv('{0}/{1}.csv'.format(path_final_csv, 'predicted_finaldata'),
                     index=False)
explained_variances, mean_absolute_errors, mean_squared_errors, mean_squared_log_errors, median_absolute_errors, r2_scores = update_list(explained_variances, mean_absolute_errors, mean_squared_errors, mean_squared_log_errors, median_absolute_errors, r2_scores)

##################################################
##         Passive-aggressive algorithms        ##
##################################################
'''
The passive-aggressive algorithms are a family of algorithms for large-scale
learning. They are similar to the Perceptron in that they do not require a
learning rate. However, contrary to the Perceptron, they include a
regularization parameter C.

Example: http://jmlr.csail.mit.edu/papers/volume7/crammer06a/crammer06a.pdf

(A short online-learning sketch with partial_fit follows this section.)
'''
try:
    pa_regr = linear_model.PassiveAggressiveRegressor(random_state=0)
    pa_regr.fit(X_train, y_train)
    predictions = cross_val_predict(pa_regr, X_test, y_test, cv=6)
    f = open('pa_regr.pickle', 'wb')
    pickle.dump(pa_regr, f)
    f.close()
except:
    print('error - PASSIVE-AGGRESSIVE')

# get stats
modeltypes.append('passive-agressive algorithm')
explained_variances, mean_absolute_errors, mean_squared_errors, mean_squared_log_errors, median_absolute_errors, r2_scores = update_list(explained_variances, mean_absolute_errors, mean_squared_errors, mean_squared_log_errors, median_absolute_errors, r2_scores)

##################################################
##                    RANSAC                    ##
##################################################
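# (This sketch belongs to the passive-aggressive section above, not to the
# RANSAC section that follows.) Because the passive-aggressive family targets
# large-scale / online learning, the regressor can also be trained
# incrementally with partial_fit; a minimal illustration on synthetic data:
import numpy as np
from sklearn import linear_model

rng = np.random.RandomState(0)
online_pa = linear_model.PassiveAggressiveRegressor(max_iter=1000, tol=1e-3, random_state=0)
for _ in range(10):                              # stream of mini-batches
    X_batch = rng.rand(32, 5)
    y_batch = X_batch @ np.arange(1.0, 6.0) + rng.randn(32) * 0.1
    online_pa.partial_fit(X_batch, y_batch)
print(online_pa.coef_)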
    'ardregression': lm.ARDRegression(),
    'bayesianridge': lm.BayesianRidge(),
    'elasticnet': lm.ElasticNet(),
    'elasticnetcv': lm.ElasticNetCV(),
    'huberregression': lm.HuberRegressor(),
    'lars': lm.Lars(),
    'larscv': lm.LarsCV(),
    'lasso': lm.Lasso(),
    'lassocv': lm.LassoCV(),
    'lassolars': lm.LassoLars(),
    'lassolarscv': lm.LassoLarsCV(),
    'lassolarsic': lm.LassoLarsIC(),
    'linearregression': lm.LinearRegression(),
    'orthogonalmatchingpursuit': lm.OrthogonalMatchingPursuit(),
    'orthogonalmatchingpursuitcv': lm.OrthogonalMatchingPursuitCV(),
    'passiveagressiveregressor': lm.PassiveAggressiveRegressor(),
    'ridge': lm.Ridge(),
    'ridgecv': lm.RidgeCV(),
    'sgdregressor': lm.SGDRegressor(),
    'theilsenregressor': lm.TheilSenRegressor(),
    'decisiontreeregressor': DecisionTreeRegressor(),
    'randomforestregressor': RandomForestRegressor(),
    'adaboostregressor': AdaBoostRegressor(),
    'baggingregressor': BaggingRegressor(),
    'extratreeregressor': ExtraTreeRegressor(),
    'linearsvr': LinearSVR(),
    'nusvr': NuSVR(),
    'svr': SVR(),
}
])

# The same feature list is passed to each of the following models, so define it once.
swath_features = [
    "Coh_Swath", "Coh_SwathOverPoca", "DayInYear_Swath", "Dist_SwathToPoca",
    "Heading_Swath", "LeadEdgeS_Poca", "LeadEdgeW_Poca", "PhaseConfidence_Swath",
    "PhaseSSegment_Swath", "PowerScaled_Swath", "PowerScaled_SwathOverPoca",
    "SampleNb_Swath", "SampleNb_SwathMinusLeadEdgeS", "Phase_Swath",
    "Phase_SwathOverPoca"
]

results2 = superfunk(linear_model.RANSACRegressor(), ndata[0], swath_features)
results3 = superfunk(linear_model.BayesianRidge(), ndata[0], swath_features)
results4 = superfunk(linear_model.PassiveAggressiveRegressor(), ndata[0], swath_features)
results5 = superfunk(linear_model.SGDRegressor(), ndata[0], swath_features)
results6 = superfunk(svm.SVR(), ndata[0], [
    "Coh_Swath", "Coh_SwathOverPoca", "DayInYear_Swath", "Dist_SwathToPoca",
testRegressor(train, linear_model.Ridge(), target, 'RidgeRegression')
testRegressor(train, linear_model.RidgeCV(alphas=[0.01, 0.1, 1.0, 2, 4, 8, 16, 32]), target, 'RidgeRegressionCV')
testRegressor(train, linear_model.Lasso(), target, 'Lasso')
testRegressor(train, linear_model.LassoLars(), target, 'LassoLars')
testRegressor(train, OrthogonalMatchingPursuit(), target, 'OMP')

# Stochastic gradient descent
testRegressor(train, linear_model.SGDRegressor(loss='squared_loss'), target, 'SGDRegressor squared loss')

# Bayesian approaches
testRegressor(train, linear_model.BayesianRidge(), target, 'BayesianRidgeRegression')
# testRegressor(train, ARDRegression(), target, 'ARDRegression')

testRegressor(train, linear_model.PassiveAggressiveRegressor(loss='epsilon_insensitive'), target, 'PassiveAggressiveRegressor')
testRegressor(train, linear_model.PassiveAggressiveRegressor(loss='squared_epsilon_insensitive'), target, 'PassiveAggressiveRegressor squared loss')

# Support Vector machines
testRegressor(train, svm.SVR(kernel='poly'), target, 'SVM poly')
testRegressor(train, svm.SVR(kernel='rbf'), target, 'SVM rbf')
testRegressor(train, svm.SVR(kernel='sigmoid'), target, 'SVM sigmoid')

# Nearest neighbors
testRegressor(train, neighbors.KNeighborsRegressor(n_neighbors=1), target, 'NearestNeighbor 1')
testRegressor(train, neighbors.KNeighborsRegressor(n_neighbors=2), target, 'NearestNeighbor 2')
testRegressor(train, neighbors.KNeighborsRegressor(n_neighbors=3), target, 'NearestNeighbor 3')
testRegressor(train, neighbors.KNeighborsRegressor(n_neighbors=4), target, 'NearestNeighbor 4')
testRegressor(train, neighbors.KNeighborsRegressor(n_neighbors=8), target, 'NearestNeighbor 8')
testRegressor(train, neighbors.KNeighborsRegressor(n_neighbors=16), target, 'NearestNeighbor 16')
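# testRegressor() itself is not shown in this snippet. A minimal sketch of
# what such a helper might look like (an assumption, scoring by
# cross-validated mean absolute error; not the original implementation):
from sklearn.model_selection import cross_val_score

def testRegressor(train, estimator, target, label):
    scores = -cross_val_score(estimator, train, target, cv=5,
                              scoring='neg_mean_absolute_error')
    print('{}: MAE {:.4f} (+/- {:.4f})'.format(label, scores.mean(), scores.std()))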
target_train = target[:int(.9 * n_samples)]
data_test = data[int(.9 * n_samples):]
target_test = target[int(.9 * n_samples):]

# classification scores
print('# Classification scores:')
print('KNN: %f' % neighbors.KNeighborsClassifier().fit(data_train, target_train).score(data_test, target_test))
print('linear_model.ElasticNet: %f' % linear_model.ElasticNet().fit(data_train, target_train).score(data_test, target_test))
print('linear_model.ElasticNetCV: %f' % linear_model.ElasticNetCV().fit(data_train, target_train).score(data_test, target_test))
print('linear_model.Lars: %f' % linear_model.Lars().fit(data_train, target_train).score(data_test, target_test))
print('linear_model.Lasso: %f' % linear_model.Lasso().fit(data_train, target_train).score(data_test, target_test))
print('linear_model.LassoCV: %f' % linear_model.LassoCV().fit(data_train, target_train).score(data_test, target_test))
print('linear_model.LassoLars: %f' % linear_model.LassoLars().fit(data_train, target_train).score(data_test, target_test))
print('linear_model.LassoLarsIC: %f' % linear_model.LassoLarsIC().fit(data_train, target_train).score(data_test, target_test))
print('linear_model.LinearRegression: %f' % linear_model.LinearRegression().fit(data_train, target_train).score(data_test, target_test))
print('linear_model.LogisticRegression: %f' % linear_model.LogisticRegression().fit(data_train, target_train).score(data_test, target_test))
print('linear_model.OrthogonalMatchingPursuit: %f' % linear_model.OrthogonalMatchingPursuit().fit(data_train, target_train).score(data_test, target_test))
print('linear_model.PassiveAggressiveClassifier: %f' % linear_model.PassiveAggressiveClassifier().fit(data_train, target_train).score(data_test, target_test))
print('linear_model.PassiveAggressiveRegressor: %f' % linear_model.PassiveAggressiveRegressor().fit(data_train, target_train).score(data_test, target_test))
print('linear_model.Perceptron: %f' % linear_model.Perceptron().fit(data_train, target_train).score(data_test, target_test))
print('linear_model.Ridge: %f' % linear_model.Ridge().fit(data_train, target_train).score(data_test, target_test))
print('linear_model.RidgeClassifier: %f' % linear_model.RidgeClassifier().fit(data_train, target_train).score(data_test, target_test))
print('linear_model.RidgeClassifierCV: %f' % linear_model.RidgeClassifierCV().fit(data_train, target_train).score(data_test, target_test))
print('linear_model.RidgeCV: %f' % linear_model.RidgeCV().fit(data_train, target_train).score(data_test, target_test))
print('linear_model.SGDClassifier: %f' % linear_model.SGDClassifier().fit(data_train, target_train).score(data_test, target_test))
print('linear_model.SGDRegressor: %f' % linear_model.SGDRegressor().fit(data_train, target_train).score(data_test, target_test))
print('naive_bayes.MultinomialNB: %f' % naive_bayes.MultinomialNB().fit(data_train, target_train).score(data_test, target_test))
# Note: the lda.LDA import path only exists in scikit-learn < 0.17; newer
# releases provide discriminant_analysis.LinearDiscriminantAnalysis instead.
print('lda.LDA: %f' % lda.LDA().fit(data_train, target_train).score(data_test, target_test))
print('svm.SVR: %f' % svm.SVR().fit(data_train, target_train).score(data_test, target_test))
print('svm.SVC: %f' % svm.SVC(kernel='linear').fit(data_train, target_train).score(data_test, target_test))
print('svm.LinearSVC: %f' % svm.LinearSVC().fit(data_train, target_train).score(data_test, target_test))
def train_passive_aggressive_regressor():
    # Picking model
    return (mp.ModelProperties(regression=True, online=True),
            linear_model.PassiveAggressiveRegressor())
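# A usage sketch for the factory above. mp.ModelProperties comes from the
# surrounding project and is treated as opaque here; the data is synthetic
# and purely illustrative.
import numpy as np

properties, pa_model = train_passive_aggressive_regressor()
X = np.random.rand(100, 4)
y = X.sum(axis=1)
pa_model.fit(X, y)              # batch fit; the online=True property hints partial_fit is also supported
print(pa_model.predict(X[:3]))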
def models(self) -> Dict[str, LinearModel]:
    return {
        "LinearRegression": linear_model.LinearRegression(),  # ordinary least squares linear regression
        "ARDRegression": linear_model.ARDRegression(),  # Bayesian ARD regression
        "BayesianRidge": linear_model.BayesianRidge(),  # Bayesian ridge regression
        "HuberRegressor": linear_model.HuberRegressor(),  # linear regression robust to outliers
        "OrthogonalMatchingPursuitCV": linear_model.OrthogonalMatchingPursuitCV(cv=5),  # cross-validated OMP
        "Perceptron": linear_model.Perceptron(max_iter=1000, tol=1e-3),
        "RANSACRegressor": linear_model.RANSACRegressor(),  # RANSAC (RANdom SAmple Consensus) algorithm
        "SGDRegressor": linear_model.SGDRegressor(max_iter=1000, tol=1e-3),  # linear model fitted by SGD on a regularized loss
        "TheilSenRegressor": linear_model.TheilSenRegressor(),  # Theil-Sen: robust multivariate regression
        "PassiveAggressiveRegressor": linear_model.PassiveAggressiveRegressor(max_iter=1000, tol=1e-3),
        "Lars": linear_model.Lars(eps=0.01),  # least angle regression
        "LarsCV": linear_model.LarsCV(cv=5, eps=0.01),  # cross-validated least angle regression
        "Lasso": linear_model.Lasso(alpha=1, max_iter=1000),  # linear model trained with an L1 prior
        "LassoCV": linear_model.LassoCV(cv=5),  # Lasso with iterative fitting along a regularization path
        "LassoLars": linear_model.LassoLars(eps=0.01),  # Lasso fit with least angle regression
        "LassoLarsCV": linear_model.LassoLarsCV(cv=5, eps=0.01, max_iter=100),  # cross-validated Lasso via LARS
        "LassoLarsIC": linear_model.LassoLarsIC(eps=0.01),  # Lasso via LARS with BIC/AIC model selection
        "Ridge": linear_model.Ridge(),  # linear least squares with l2 regularization
        "RidgeClassifier": linear_model.RidgeClassifier(),  # classifier using ridge regression
        "RidgeClassifierCV": linear_model.RidgeClassifierCV(cv=5),  # ridge classifier with built-in cross-validation
        "RidgeCV": linear_model.RidgeCV(cv=5),  # ridge regression with built-in cross-validation
        "SGDClassifier": linear_model.SGDClassifier(max_iter=1000, tol=1e-3),  # linear classifiers with SGD training
        "ElasticNet": linear_model.ElasticNet(),  # linear regression with combined L1/L2 priors
        "ElasticNetCV": linear_model.ElasticNetCV(cv=5),  # elastic net with iterative fitting along a regularization path

        ### Ignore These
        # "LogisticRegression": linear_model.LogisticRegression(),
        # "LogisticRegressionCV": linear_model.LogisticRegressionCV(cv=5),
        # "MultiTaskLasso": linear_model.MultiTaskLasso(),
        # "MultiTaskElasticNet": linear_model.MultiTaskElasticNet(),
        # "MultiTaskLassoCV": linear_model.MultiTaskLassoCV(cv=5),
        # "MultiTaskElasticNetCV": linear_model.MultiTaskElasticNetCV(cv=5),
        # "OrthogonalMatchingPursuit": linear_model.OrthogonalMatchingPursuit(),
        # "PassiveAggressiveClassifier": linear_model.PassiveAggressiveClassifier(),

        ### Normalization seems to make the score worse!
        # "LinearRegressionNormalize": linear_model.LinearRegression(normalize=True),
        # "RidgeCVNormalize": linear_model.RidgeCV(cv=5, normalize=True),
        # "LassoLarsNormalize": linear_model.LassoLars(eps=0.01, normalize=True),
        # "LassoLarsICNormalize": linear_model.LassoLarsIC(eps=0.01, normalize=True),
        # "ARDRegressionNormalize": linear_model.ARDRegression(normalize=True),
        # "BayesianRidgeNormalize": linear_model.BayesianRidge(normalize=True),
    }
def model_comparison():
    data, target = load_train()
    pipeline = create_pipeline()
    data = pipeline.fit_transform(data)

    MLA = [
        # Ensemble Methods
        ensemble.AdaBoostRegressor(),
        ensemble.BaggingRegressor(),
        ensemble.ExtraTreesRegressor(),
        ensemble.GradientBoostingRegressor(),
        ensemble.RandomForestRegressor(),

        # Gaussian Processes
        gaussian_process.GaussianProcessRegressor(),

        # GLM
        linear_model.PassiveAggressiveRegressor(),
        linear_model.Ridge(),
        linear_model.Lasso(),
        linear_model.ElasticNet(),
        linear_model.SGDRegressor(),

        # Nearest Neighbor
        neighbors.KNeighborsRegressor(),

        # SVM
        svm.SVR(),
        svm.NuSVR(),
        svm.LinearSVR(),

        # Trees
        tree.DecisionTreeRegressor(),
        tree.ExtraTreeRegressor(),

        # xgboost: http://xgboost.readthedocs.io/en/latest/model.html
        XGBRegressor(),
        lgb.LGBMRegressor()
    ]

    # Split dataset in cross-validation with this splitter class:
    # http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ShuffleSplit.html
    # Note: this is an alternative to train_test_split.
    # Run the model 10x with a 60/30 split, intentionally leaving out 10%.
    cv_split = model_selection.ShuffleSplit(n_splits=10, test_size=.3, train_size=.6,
                                            random_state=0)

    # Create table to compare MLA metrics
    MLA_columns = ['MLA Name', 'MLA Parameters', 'MLA Train Accuracy Mean',
                   'MLA Test Accuracy Mean']
    MLA_compare = pd.DataFrame(columns=MLA_columns)

    # Index through MLA and save performance to table
    row_index = 0
    for alg in MLA:
        # Set name and parameters
        MLA_name = alg.__class__.__name__
        MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
        MLA_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params())

        # Score model with cross validation:
        # http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html
        rmse_scorer = make_scorer(rmse)
        cv_results = model_selection.cross_validate(alg, data, target, cv=cv_split,
                                                    scoring=rmse_scorer)
        MLA_compare.loc[row_index, 'MLA Time'] = cv_results['fit_time'].mean()
        MLA_compare.loc[row_index, 'MLA Train Accuracy Mean'] = cv_results['train_score'].mean()
        MLA_compare.loc[row_index, 'MLA Test Accuracy Mean'] = cv_results['test_score'].mean()
        # If this is a non-biased random sample, then +/-3 standard deviations (std)
        # from the mean should statistically capture 99.7% of the subsets.
        MLA_compare.loc[row_index, 'MLA Test Accuracy 3*STD'] = cv_results['test_score'].std() * 3  # the worst that can happen!
        row_index += 1

    # Print and sort table:
    # https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.sort_values.html
    MLA_compare.sort_values(by=['MLA Test Accuracy Mean'], inplace=True)
    MLA_compare.to_csv('mla_comparison.csv', index=True)
    print(MLA_compare)
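# The rmse() metric wrapped by make_scorer() in model_comparison() is not
# shown; a minimal definition consistent with how it is used there (an
# assumption, not the original helper):
import numpy as np
from sklearn.metrics import mean_squared_error

def rmse(y_true, y_pred):
    """Root mean squared error, suitable for sklearn.metrics.make_scorer."""
    return np.sqrt(mean_squared_error(y_true, y_pred))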
def optimalRegression(x_train, x_test, y_train, y_test):
    # metrics
    mean_absolute_errors = []

    # regression models
    models = [
        linear_model.LinearRegression(),
        linear_model.Ridge(fit_intercept=True, alpha=0.0, random_state=0, normalize=True),
        linear_model.Lasso(alpha=0.1),
        linear_model.ElasticNet(),
        linear_model.Lars(n_nonzero_coefs=1),
        linear_model.LassoLars(),
        linear_model.OrthogonalMatchingPursuit(),
        linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6),
        linear_model.SGDRegressor(),
        MLPRegressor(solver='lbfgs'),
        linear_model.PassiveAggressiveRegressor(random_state=0),
        linear_model.RANSACRegressor(),
        linear_model.TheilSenRegressor(random_state=42),
        linear_model.HuberRegressor(fit_intercept=True, alpha=0.0, max_iter=100),
        Pipeline([('poly', PolynomialFeatures(degree=5, include_bias=False)),
                  ('linreg', linear_model.LinearRegression(normalize=True))])
    ]

    # model names
    names = [
        'Linear_Regression', 'Ridge_Regression', 'Lasso', 'Elastic_Net',
        'Least_Angle_Regression', 'LARS_Lasso', 'Orthogonal_Matching_Pursuit',
        'Logistic_Regression', 'Stochastic_Gradient_Descent',
        'Perceptron_Algorithms', 'Passive-aggressive_Algorithms', 'RANSAC',
        'Theil_SEN', 'Huber_Regression', 'Polynomial_Regression'
    ]

    for model in models:
        try:
            model.fit(x_train, y_train)
            predictions = cross_val_predict(model, x_test, y_test, cv=5)
            mean_absolute_errors.append(
                metrics.mean_absolute_error(y_test, predictions))
        except:
            mean_absolute_errors.append('n/a')

    df = pd.DataFrame({
        'Model_reference': models,
        'Model_name': names,
        'Mean_absolute_err': mean_absolute_errors
    })
    df.sort_values(by='Mean_absolute_err', ascending=True, inplace=True)
    df = df.reset_index(drop=True)
    print(df[['Model_name', 'Mean_absolute_err']])
    print("Optimal model is " + str(df['Model_name'][0]) + " with error " +
          str(df['Mean_absolute_err'][0]))
    print("Second Optimal model is " + str(df['Model_name'][1]) + " with error " +
          str(df['Mean_absolute_err'][1]))

    name1 = str(df['Model_name'][0])
    name2 = str(df['Model_name'][1])
    model1 = None
    model2 = None
    for model, name in zip(models, names):
        if name == str(df['Model_name'][0]):
            model1 = model
        elif name == str(df['Model_name'][1]):
            model2 = model

    return model1, name1, model2, name2
# denoise discount using wavelet transform
# Y = pd.Series(denoise_signal(Y))

# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=1 - 365 / len(df), shuffle=False)
X_train = load('Data/regression_train_X.npy', allow_pickle=True)
X_test = load('Data/regression_test_X.npy', allow_pickle=True)
Y_train = load('Data/regression_train_y.npy', allow_pickle=True)
Y_test = load('Data/regression_test_y.npy', allow_pickle=True)

## train regressors
regressors = [
    svm.SVR(),
    # linear_model.SGDRegressor(),
    linear_model.BayesianRidge(),
    linear_model.LassoLars(),
    linear_model.ARDRegression(),
    linear_model.PassiveAggressiveRegressor(),
    linear_model.TheilSenRegressor(),
    linear_model.LinearRegression()
]
name = [
    'svm.SVR',
    # 'SGDRegressor',
    'BayesianRidge',
    'LassoLars',
    'ARDRegression',
    'PassiveAggressiveRegressor',
    'TheilSenRegressor',
    'LinearRegression'
]
def get_regression_estimators(r, regression_models):
    # Same lookup as the original if/elif chain, expressed as a mapping from
    # name to estimator class so new entries are a one-line change.
    estimators = {
        'ARDRegression': linear_model.ARDRegression,
        'BayesianRidge': linear_model.BayesianRidge,
        'ElasticNet': linear_model.ElasticNet,
        'ElasticNetCV': linear_model.ElasticNetCV,
        'HuberRegressor': linear_model.HuberRegressor,
        'Lars': linear_model.Lars,
        'LarsCV': linear_model.LarsCV,
        'Lasso': linear_model.Lasso,
        'LassoCV': linear_model.LassoCV,
        'LassoLars': linear_model.LassoLars,
        'LassoLarsCV': linear_model.LassoLarsCV,
        'LassoLarsIC': linear_model.LassoLarsIC,
        'LinearRegression': linear_model.LinearRegression,
        'LogisticRegression': linear_model.LogisticRegression,
        'LogisticRegressionCV': linear_model.LogisticRegressionCV,
        'MultiTaskElasticNet': linear_model.MultiTaskElasticNet,
        'MultiTaskElasticNetCV': linear_model.MultiTaskElasticNetCV,
        'MultiTaskLasso': linear_model.MultiTaskLasso,
        'MultiTaskLassoCV': linear_model.MultiTaskLassoCV,
        'OrthogonalMatchingPursuit': linear_model.OrthogonalMatchingPursuit,
        'OrthogonalMatchingPursuitCV': linear_model.OrthogonalMatchingPursuitCV,
        'PassiveAggressiveClassifier': linear_model.PassiveAggressiveClassifier,
        'PassiveAggressiveRegressor': linear_model.PassiveAggressiveRegressor,
        'Perceptron': linear_model.Perceptron,
        'RANSACRegressor': linear_model.RANSACRegressor,
        'Ridge': linear_model.Ridge,
        'RidgeClassifier': linear_model.RidgeClassifier,
        'RidgeClassifierCV': linear_model.RidgeClassifierCV,
        'RidgeCV': linear_model.RidgeCV,
        'SGDClassifier': linear_model.SGDClassifier,
        'SGDRegressor': linear_model.SGDRegressor,
        'TheilSenRegressor': linear_model.TheilSenRegressor,
    }
    if r in estimators:
        regression_models[r] = estimators[r]()
    else:
        print(r + " is an unsupported regression type. Check if you have misspelled the name.")
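# A small usage sketch for get_regression_estimators(); the requested names
# are illustrative only.
regression_models = {}
for name in ['PassiveAggressiveRegressor', 'Ridge', 'NotARealModel']:
    get_regression_estimators(name, regression_models)
print(sorted(regression_models))   # -> ['PassiveAggressiveRegressor', 'Ridge']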
def regress_sys(folder, all_videos, yfit, training_size, randselect=True,
                trainingdata=[], frame=0, have_output=True, download=True,
                bucket_name='ccurtis.data'):
    """Uses regression based on image intensities to select tracking parameters.

    This function uses regression methods from the scikit-learn module to
    predict the lower quality cutoff values for particle filtering in
    TrackMate based on the intensity distributions of input images. Currently
    only uses the first frame of videos for analysis, and is limited to
    predicting quality values.

    In practice, users will run regress_sys twice in different modes to build
    a regression system. First, set have_output to False. The function will
    return a list of randomly selected videos to include in the training
    dataset. The user should then manually track particles using the
    TrackMate GUI, and enter these values during the next round as the input
    yfit variable.

    Parameters
    ----------
    folder : str
        S3 directory containing video files specified in all_videos.
    all_videos : list of str
        Contains prefixes of video filenames of entire video set to be
        tracked. Training dataset will be some subset of these videos.
    yfit : numpy.ndarray
        Contains manually acquired quality levels using Trackmate for the
        files contained in the training dataset.
    training_size : int
        Number of files in training dataset.
    randselect : bool
        If True, will randomly select training videos from all_videos. If
        False, will use trainingdata as input training dataset.
    trainingdata : list of str
        Optional manually selected prefixes of video filenames to be used as
        training dataset.
    have_output : bool
        If you have already acquired the quality values (yfit) for the
        training dataset, set to True. If False, it will output the files the
        user will need to acquire quality values for.
    bucket_name : str
        S3 bucket containing videos to be downloaded for regression
        calculations.

    Returns
    -------
    regress_object : list of sklearn.svm.classes
        Contains list of regression objects assembled from the training
        datasets. Uses the mean, 10th percentile, 90th percentile, and
        standard deviation intensities to predict the quality parameter in
        Trackmate.
    tprefix : list of str
        Contains randomly selected images from all_videos to be included in
        training dataset.
    """

    if randselect:
        tprefix = []
        for i in range(0, training_size):
            random.seed(i + 1)
            # random.randint is inclusive of both endpoints, so subtract 1 to
            # avoid an occasional IndexError on the last element.
            tprefix.append(all_videos[random.randint(0, len(all_videos) - 1)])
            if have_output is False:
                print("Get parameters for: {}".format(tprefix[i]))
    else:
        tprefix = trainingdata

    if have_output is True:
        # Define descriptors
        descriptors = np.zeros((training_size, 4))
        counter = 0
        for name in tprefix:
            local_im = name + '.tif'
            remote_im = "{}/{}".format(folder, local_im)
            if download:
                aws.download_s3(remote_im, local_im, bucket_name=bucket_name)
            test_image = sio.imread(local_im)
            descriptors[counter, 0] = np.mean(test_image[frame, :, :])
            descriptors[counter, 1] = np.std(test_image[frame, :, :])
            descriptors[counter, 2] = np.percentile(test_image[frame, :, :], 10)
            descriptors[counter, 3] = np.percentile(test_image[frame, :, :], 90)
            counter = counter + 1

        # Define regression techniques
        xfit = descriptors
        classifiers = [
            svm.SVR(),
            linear_model.SGDRegressor(),
            linear_model.BayesianRidge(),
            linear_model.LassoLars(),
            linear_model.ARDRegression(),
            linear_model.PassiveAggressiveRegressor(),
            linear_model.TheilSenRegressor(),
            linear_model.LinearRegression()
        ]

        regress_object = []
        for item in classifiers:
            clf = item
            regress_object.append(clf.fit(xfit, yfit))

        return regress_object
    else:
        return tprefix
def __init__(self):
    '''
    Class constructor or initialization method.
    '''
    # keys and tokens from the Twitter Dev Console
    # (the hard-coded credentials in the original have been redacted;
    # supply your own keys here)
    consumer_key = 'YOUR_CONSUMER_KEY'
    consumer_secret = 'YOUR_CONSUMER_SECRET'
    access_token = 'YOUR_ACCESS_TOKEN'
    access_token_secret = 'YOUR_ACCESS_TOKEN_SECRET'

    # attempt authentication
    try:
        # create OAuthHandler object
        self.auth = OAuthHandler(consumer_key, consumer_secret)
        # set access token and secret
        self.auth.set_access_token(access_token, access_token_secret)
        # create tweepy API object to fetch tweets
        self.api = tweepy.API(self.auth)
    except:
        print("Error: Authentication Failed")

    # creating object of TwitterClient Class
    # api = TwitterClient()

    # calling function to get tweets
    wSent = ["WSENT"]
    aSent = ["ASENT"]
    for index in range(3, 8):
        day = datetime.date.today() - datetime.timedelta(days=index)
        wTweets = self.get_tweets(query='weather', count=100,
                                  geocode='41.2565,-96.05,5mi', until=day)
        aTweets = self.get_tweets(query='', count=100,
                                  geocode='41.2565,-96.05,5mi', until=day)

        ptweets = [tweet for tweet in wTweets if tweet['sentiment'] == 'positive']
        ntweets = [tweet for tweet in wTweets if tweet['sentiment'] == 'negative']
        netPosSent = (len(ptweets) / len(wTweets)) - (len(ntweets) / len(wTweets))
        wSent.append(netPosSent)

        ptweets = [tweet for tweet in aTweets if tweet['sentiment'] == 'positive']
        ntweets = [tweet for tweet in aTweets if tweet['sentiment'] == 'negative']
        netPosSent = (len(ptweets) / len(aTweets)) - (len(ntweets) / len(aTweets))
        aSent.append(netPosSent)
    # print(wSent)
    # print(aSent)

    url = "https://www.ncei.noaa.gov/orders/cdo/2069913.csv"
    dataset = pandas.read_csv(url)
    dataset = dataset.drop(['STATION', 'NAME', 'DATE'], axis=1)
    dataset['WSENT'] = wSent[1:]
    # dataset['ASENT'] = aSent[1:]
    dataset = dataset.dropna()
    # print(dataset.shape)

    classifiers = [
        svm.SVR(),
        linear_model.SGDRegressor(),
        linear_model.BayesianRidge(),
        linear_model.LassoLars(),
        linear_model.ARDRegression(),
        linear_model.PassiveAggressiveRegressor(),
        linear_model.TheilSenRegressor(),
        linear_model.LinearRegression()
    ]

    trainingData = dataset.drop(['WSENT'], axis=1)
    trainingScores = dataset['WSENT']
    predictionData = dataset.drop(['WSENT'], axis=1)

    global clf
    for item in classifiers:
        # print(item)
        clf = item
        clf.fit(trainingData, trainingScores)
        print(clf.predict(predictionData), '\n')