def _feature_selection(self, model, train_data, test_data, target_count):
    features = list(util.get_predictors(train_data).columns)
    selected_features = dict()
    while len(features) > target_count:
        logging.info('Checking %d features', len(features))  # DEBUG
        error = dict(application=[], err_type=[], model=[], cand=[], pred=[], actual=[])
        # Remove a single feature at a time
        for i in range(0, len(features)):
            candidates = list(features)
            candidates.pop(i)
            # logging.info('Candidates: %s', str(candidates))
            inserted = 0  # DEBUG
            # Build the models based on the data
            for app, data in train_data.groupby('application'):
                test = test_data[test_data.application == app]
                samples = {'train': {'X': util.get_predictors(data)[candidates],
                                     'y': data.time},
                           'test': {'X': util.get_predictors(test)[candidates],
                                    'y': test.time}}
                model.fit(samples['train']['X'], samples['train']['y'])
                pred = model.predict(samples['test']['X'])
                actual = samples['test']['y'].values
                for j in range(0, len(pred)):
                    error['application'].append(app)
                    error['model'].append(str(model))
                    error['err_type'].append('test')
                    error['pred'].append(pred[j])
                    error['actual'].append(actual[j])
                    error['cand'].append(i)
                    inserted = inserted + 1  # DEBUG
            logging.info('Inserted %d for candidate %d', inserted, i)

        # Now post-process to find the best candidate for removal
        data = pd.DataFrame(error)
        errs = data[data.err_type == 'test']
        candidate_errors = []
        for candidate in range(0, len(features)):
            logging.info('Checking candidate %d', candidate)
            d = errs[errs.cand == candidate]
            if len(d.actual) == 0:
                logging.error('Candidate %d for model %s has no values',
                              candidate, str(model))
                continue
            logging.info('In _feature_selection: len(d.actual) = %d, len(d.pred) = %d',
                         len(d.actual), len(d.pred))
            candidate_error = self._cost_function(d.actual.values, d.pred.values)
            candidate_errors.append(candidate_error)
        print("Errors: %s" % str(candidate_errors))
        candidate = np.argmax(candidate_errors)
        args = (features[candidate], candidate_errors[candidate])
        print("Selected candidate %s with error %f" % args)
        features.pop(candidate)
        selected_features[len(features)] = list(features)
    return selected_features
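
# Hedged usage sketch (not part of the original module): _feature_selection returns a
# dict mapping the number of surviving features to the feature list kept at that step.
# A small helper like the one below could pick the recorded subset closest to a
# requested size; the helper name and its use are assumptions for illustration only.
def pick_feature_subset(selected_features, wanted_count):
    """Return the recorded feature subset whose size is closest to wanted_count.

    selected_features: {remaining_feature_count: [feature names]} as produced
    by _feature_selection.
    """
    closest = min(selected_features, key=lambda count: abs(count - wanted_count))
    return selected_features[closest]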
def use_models(self, train_data, test_data):
    y_train = train_data['time']
    X_train = util.get_predictors(train_data)
    y_test = test_data['time']
    X_test = util.get_predictors(test_data)
    errors = {}
    for model in self.models:
        model = model()
        model.fit(X_train, y_train)
        errors[str(model)] = {}
        errors[str(model)]['test'] = abs(util.relative_error(y_test, model.predict(X_test)))
        errors[str(model)]['train'] = abs(util.relative_error(y_train, model.predict(X_train)))
    return errors
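
# Hedged usage sketch (an assumption, not part of the original class): use_models
# returns {model_name: {'train': errors, 'test': errors}}, where each entry holds
# per-sample absolute relative errors. A caller could collapse that into a mean-error
# summary per model and split; the helper below is illustrative only.
import numpy as np

def summarize_model_errors(errors):
    """Reduce the nested error structure from use_models to mean errors."""
    return {name: {split: float(np.mean(vals)) for split, vals in splits.items()}
            for name, splits in errors.items()}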
def analyze(self, train_data, test_data, models):
    keys = ['application']
    errors = dict(application=[], model=[], error=[], reps=[], error_type=[])
    indexes = range(0, len(self.models))
    model_idx = {str(model()): idx for (model, idx) in zip(self.models, indexes)}
    self.test_error = np.zeros((len(self.models), len(self.nreps)))
    self.train_error = np.zeros((len(self.models), len(self.nreps)))
    for reps in self.nreps:
        for app, group in train_data.groupby(keys):
            app_data = group[group['rep'] <= reps]
            test = test_data[test_data.application == app]
            data = {'train': (util.get_predictors(app_data), app_data['time']),
                    'test': (util.get_predictors(test), test['time'])}
            for model in self.models:
                model = model()
                model.fit(data['train'][0], data['train'][1])
                # Record predictions over both the train and test sets
                for t, (X, y) in data.items():
                    pred = model.predict(X)
                    error = abs(util.relative_error(y, pred))
                    for err in error.values:
                        errors['application'].append(app)
                        errors['model'].append(str(model))
                        errors['error'].append(err)
                        errors['reps'].append(reps)
                        errors['error_type'].append(t)
    self.errors = pd.DataFrame(errors)
    return self
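
# Hedged aggregation sketch (not in the original analyzer): self.errors holds one row
# per prediction with columns application, model, error, reps, and error_type, so a
# learning-curve style summary falls out of a plain pandas groupby. The helper below
# is illustrative only; errors_df is the DataFrame stored in self.errors.
def error_by_reps(errors_df):
    """Mean relative error per model, repetition budget, and train/test split."""
    return (errors_df
            .groupby(['model', 'reps', 'error_type'])['error']
            .mean()
            .unstack('error_type'))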
def cross_validation(k, X, y, random_forest=False, use_confidence=False, num_of_trees=1):
    accuracies = []
    y_pred = []
    y_true = []
    predictors = get_predictors()
    emotion_values = get_emotion_values()
    X_splits = np.array_split(X, k)
    y_splits = np.array_split(y, k)
    for i in range(k):
        X_train, X_test, y_train, y_test = get_train_test_split(
            X_splits, y_splits, i)
        emotion_predictor = EmotionPredictor(predictors, random_forest,
                                             use_confidence, num_of_trees)
        emotion_predictor.fit(emotion_values, X_train, y_train)
        predictions = emotion_predictor.predict(X_test)
        y_pred.extend(predictions)
        y_true.extend(y_test)
        correct = sum([1 for j, prediction in enumerate(predictions)
                       if prediction == y_test[j]])
        accuracy = float(correct * 100) / len(y_test)
        accuracies.append(accuracy)
        print("Accuracy for round {0} is {1:.2f}".format(i + 1, accuracy))
    print("Cross Validation accuracy has a mean of {0:.2f} and a std of {1:.2f}".format(
        np.mean(accuracies), np.std(accuracies)))
    print(" prec, rec, f1")
    for emotion_number in emotion_values:
        print("Emotion {0}: {1:.2f}, {2:.2f}, {3:.2f}".format(
            emotion_number,
            get_precision(y_true, y_pred, emotion_number),
            get_recall(y_true, y_pred, emotion_number),
            get_f1_score(y_true, y_pred, emotion_number)))
    plt.figure()
    cfm = confusion_matrix(y_true, y_pred) / k
    plot_confusion_matrix(cfm, classes=["1", "2", "3", "4", "5", "6"])
    plt.show()
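
# Hedged sketch of the helper assumed above: get_train_test_split is defined elsewhere
# in the project, but from its call site it appears to take the k folds plus an index i,
# returning fold i as the test set and the remaining folds concatenated as the training
# set. A minimal version consistent with that usage might look like this (an
# assumption, not necessarily the project's actual implementation):
import numpy as np

def get_train_test_split(X_splits, y_splits, i):
    X_train = np.concatenate([s for j, s in enumerate(X_splits) if j != i])
    y_train = np.concatenate([s for j, s in enumerate(y_splits) if j != i])
    return X_train, X_splits[i], y_train, y_splits[i]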
def analyze(self, train_data, test_data, models):
    keys = ['application']
    error = dict(application=[], model=[], model_nice_name=[], error=[])
    grouped = test_data.groupby(keys)
    for app, group in grouped:
        y = group['time']
        X = util.get_predictors(group).values
        for model_name in models[app]:
            model = models[app][model_name]
            pred = model.predict(X)
            res = util.relative_error(y, pred)
            for err in res.values:
                error['error'].append(err)
                error['model'].append(model_name)
                error['model_nice_name'].append(str(model))
                error['application'].append(app)
    self.error = pd.DataFrame(error)
    return self
def analyze(self, train_data, test_data, models):
    errors = dict(application=[], error=[], model=[])
    grouped = test_data.groupby('application')
    for app, group in grouped:
        for model_name in models[app]:
            model = models[app][model_name]
            # Only want the predictors, drop everything else
            y = group['time']
            X = util.get_predictors(group).values
            pred = model.predict(X)
            # Parse and combine data
            res = abs(util.relative_error(y, pred))
            for err in res.values:
                errors['error'].append(err)
                errors['application'].append(app)
                errors['model'].append(str(model))
    self.errors = pd.DataFrame(errors)
    return self
def analyze(self, train_data, test_data, models):
    keys = ['application', 'interference', 'coloc', 'nice']
    error = dict(application=[], interference=[], model=[], coloc=[], nice=[],
                 pred_rmse=[], base_rmse=[])
    # models maps application -> {model_name: model}; any application's models
    # carry the plotting colors
    self.colors = {str(model): model._color
                   for name, model in next(iter(models.values())).items()}
    for (app, thread, coloc, nice), group in test_data.groupby(keys):
        y = group['time']
        X = util.get_predictors(group).values
        mean = np.mean(y)
        base_rmse = util.rmse_error(y, mean)
        for model_name in models[app]:
            if model_name == 'mean':
                continue
            model = models[app][model_name]
            pred = model.predict(X)
            pred_rmse = util.rmse_error(y, pred)
            error['model'].append(str(model))
            error['pred_rmse'].append(pred_rmse)
            error['base_rmse'].append(base_rmse)
            error['application'].append(app)
            error['interference'].append(thread)
            error['coloc'].append(coloc)
            error['nice'].append(nice)
    self.error = pd.DataFrame(error)
    return self
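
# Hedged sketch of the util error helpers used throughout these analyzers. They are
# defined elsewhere in the project; the versions below only illustrate the behaviour
# implied by the call sites (relative_error returning a per-sample pandas Series,
# rmse_error returning a scalar, possibly against a scalar baseline) and are
# assumptions, not the project's real code.
import numpy as np
import pandas as pd

def relative_error(actual, predicted):
    actual = pd.Series(np.asarray(actual, dtype=float))
    predicted = pd.Series(np.asarray(predicted, dtype=float))
    return (predicted - actual) / actual

def rmse_error(actual, predicted):
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)
    return float(np.sqrt(np.mean((actual - predicted) ** 2)))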
import pickle

from emotion_predictor import EmotionPredictor
from util import get_clean_data, get_predictors, get_emotion_values

X, y = get_clean_data()
predictors = get_predictors()
emotion_values = get_emotion_values()

emotion_predictor = EmotionPredictor(predictors, random_forest=True,
                                     use_confidence=True, num_of_trees=200)
emotion_predictor.fit(emotion_values, X, y)

with open('emotion_predictor.pickle', 'wb') as f:
    pickle.dump(emotion_predictor, f, pickle.HIGHEST_PROTOCOL)
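
# Hedged usage sketch (not part of the original script): the saved predictor can be
# restored later with pickle.load; re-loading it here just demonstrates the round trip.
# Only the file name comes from the script above; calling predict on X is illustrative.
with open('emotion_predictor.pickle', 'rb') as f:
    restored_predictor = pickle.load(f)
print(restored_predictor.predict(X))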
def analyze(self, train_data, test_data, models):
    # models = [{'model': linear_model.LinearRegression(),
    #            'grid': {},
    #            'name': 'Linear',
    #            'color': 'blue'},
    #           {'model': linear_model.Ridge(),
    #            'grid': [{'regressor__alpha': util.frange(0, 10, 0.2)}],
    #            'name': 'Ridge',
    #            'color': 'red'},
    #           {'model': ensemble.GradientBoostingRegressor(),
    #            'grid': [{'regressor__learning_rate': util.frange(0.05, 1, 0.05),
    #                      'regressor__n_estimators': range(20, 300, 20),
    #                      'regressor__max_depth': range(2, 7)}],
    #            'name': 'GBM',
    #            'color': 'yellow'},
    #           # {'model': svm.SVR(kernel='poly'),
    #           #  'grid': [{'regressor__degree': range(1, 4),
    #           #            'regressor__C': [10**i for i in range(-5, 6)]}],
    #           #  'name': 'SVMPoly'},
    #           {'model': svm.SVR(kernel='linear'),
    #            'grid': [{'regressor__C': [10**i for i in range(-5, 6)]}],
    #            'name': 'SVMLinear',
    #            'color': 'green'}]
    #
    # errors = dict(application=[], model=[], feature_count=[], error=[], error_type=[])
    # features = dict(application=[], model=[], feature=[], count=[])
    #
    # max_feature_count = len(util.get_predictors(train_data).columns)
    # for feature_count in range(4, (max_feature_count / 2) * 2, 2):
    #     for app, group in train_data.groupby('application'):
    #         for model_params in models:
    #             model = model_params['model']
    #             grid = model_params['grid']
    #             name = model_params['name']
    #
    #             pipeline = build_pipeline(model)
    #             rfe = RFE(pipeline, feature_count, step=1)
    #             cv = GridSearchCV(rfe, grid, cv=10)
    #             test = test_data[test_data['application'] == app]
    #
    #             X_train = util.get_predictors(group)
    #             y_train = group['time']
    #             X_test = util.get_predictors(test)
    #             y_test = test['time']
    #
    #             cv.fit(X_train, y_train)
    #
    #             # Build feature heatmap
    #             for feature in self._extract_features(rfe, X_train):
    #                 features['application'].append(app)
    #                 features['model'].append(name)
    #                 features['feature'].append(feature)
    #                 features['count'].append(feature)
    #
    #             types = {'train': (X_train, y_train), 'test': (X_test, y_test)}
    #             for err_type, (X, y) in types:
    #                 pred = rfe.predict(X)
    #                 for error in util.relative_error(y, pred):
    #                     errors['application'].append(app)
    #                     errors['model'].append(str(model))
    #                     errors['feature_count'].append(feature_count)
    #                     errors['error'].append(error)
    #                     errors['error_type'].append('train')
    # self.errors = pd.DataFrame(errors)
    #
    # # Fetch minimum count for each feature, application, and model
    # features = pd.DataFrame(features)
    # self.features = dict(application=[], model=[], feature=[], count=[])
    # for model, model_group in features.groupby('model'):
    #     for app, app_group in model_group.groupby('application'):
    #         for feature, feature_group in app_group.groupby('feature'):
    #             min_count = feature_group.feature_count.min()
    #             self.features['application'].append(app)
    #             self.features['model'].append(app)
    #             self.features['feature'].append(feature)
    #             self.features['count'].append(min_count)
    # self.features = pd.DataFrame(self.features)

    feature_choices = self.feature_selection(train_data, test_data, models)
    models = {str(model()): model() for model in self._models}
    errors = dict(application=[], model=[], error=[], feature_count=[])
    for model_name, feature_selection in feature_choices.items():
        model = models[model_name]
        for feature_count, features in feature_selection.items():
            for app, group in train_data.groupby('application'):
                test = test_data[test_data.application == app]
                data = {'train': {'X': util.get_predictors(group)[features],
                                  'y': group.time},
                        'test': {'X': util.get_predictors(test)[features],
                                 'y': test.time}}
                model.fit(data['train']['X'], data['train']['y'])
                for err_type, d in data.items():
                    pred = model.predict(d['X'])
                    actual = d['y']
                    error = util.relative_error(actual, pred)
                    for err in error.values:
                        errors['application'].append(app)
                        errors['model'].append(str(model))
                        errors['error'].append(err)
                        errors['feature_count'].append(len(features))
    self.errors = pd.DataFrame(errors)
    return self
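
# Hedged post-processing sketch (not in the original analyzer): with self.errors holding
# one row per prediction (application, model, error, feature_count), the feature-set size
# with the lowest median relative error per model falls out of a groupby. The helper
# below is illustrative only; errors_df is the pandas DataFrame stored in self.errors.
def best_feature_count(errors_df):
    """Return {model: feature_count with the lowest median error}."""
    medians = errors_df.groupby(['model', 'feature_count'])['error'].median()
    return {model: grp.idxmin()[1] for model, grp in medians.groupby(level='model')}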