Example no. 1
    def _feature_selection(self, model, train_data, test_data, target_count):
        # Work on a plain list; a pandas Index has no pop().
        features = list(util.get_predictors(train_data).columns)
        selected_features = dict()
        while len(features) > target_count:
            logging.info('Checking %d features', len(features)) # DEBUG
            
            error = dict(application=[], err_type=[], model=[], cand=[], pred=[], actual=[])
            # Remove a single feature at a time
            for i in range(0, len(features)):
                candidates = list(features)
                candidates.pop(i)
#               logging.info('Candidates: %s', str(candidates))
                inserted = 0 # DEBUG


                # Build the models based on the data
                for app, data in train_data.groupby('application'):
                    test = test_data[test_data.application == app]
                    samples = {'train': {'X': util.get_predictors(data)[candidates], 'y': data.time},
                               'test': {'X': util.get_predictors(test)[candidates], 'y': test.time}}
                    model.fit(samples['train']['X'], samples['train']['y'])
                     
                    pred = model.predict(samples['test']['X'])
                    actual = samples['test']['y']
#                   logging.info('Adding %d for candidate %d with type %s', len(actual), i, stype)
                    for j in range(0, len(pred)):
                        error['application'].append(app)
                        error['model'].append(str(model))
                        error['err_type'].append('test')
                        error['pred'].append(pred[j])
                        error['actual'].append(actual.values[j])
                        error['cand'].append(i)
                        inserted = inserted + 1 # DEBUG
                logging.info('Inserted %d for candidate %d', inserted, i)

            # Now post-process to find the actual best candidate for removal
            data = pd.DataFrame(error)
            errs = data[data.err_type == 'test']
            candidate_errors = []
            for candidate in range(0, len(features)):
                logging.info('Checking candidate %d', candidate)
                d = errs[errs.cand == candidate]
                if len(d.actual) == 0:
                    logging.error('Candidate %d for model %s has no values', candidate, str(model))
                    # Keep candidate_errors index-aligned with feature positions.
                    candidate_errors.append(np.nan)
                    continue
                logging.info('In _feature_selection: len(d.actual) = %d, len(d.pred) = %d', len(d.actual), len(d.pred))
                candidate_error = self._cost_function(d.actual.values, d.pred.values)
                candidate_errors.append(candidate_error)

            print "Errors: %s" % (str(candidate_errors))
            candidate = np.argmax(candidate_errors)
            args = (features[candidate], candidate_errors[candidate])
            print "Selected candidate %s with error %d" % args
            features.pop(candidate)
        
            selected_features[len(features)] = list(features)  # snapshot, not the still-mutating list
        return selected_features
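`self._cost_function` is not shown in this example. A minimal sketch of a plausible implementation, written as a method on the same class, assuming candidates are scored by mean absolute relative error (the formula is an assumption, not the author's code):

import numpy as np

def _cost_function(self, actual, pred):
    # Hypothetical scoring: mean absolute relative error of the
    # candidate model's held-out predictions.
    actual = np.asarray(actual, dtype=float)
    pred = np.asarray(pred, dtype=float)
    return np.mean(np.abs(actual - pred) / np.abs(actual))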
Example no. 2
    def use_models(self, train_data, test_data):
        y_train = train_data['time']
        X_train = util.get_predictors(train_data)
        y_test = test_data['time']
        X_test = util.get_predictors(test_data)
        errors = {}
        for model in self.models:
            model = model()
            model.fit(X_train, y_train)
            errors[str(model)] = {}
            errors[str(model)]['test'] = abs(util.relative_error(y_test, model.predict(X_test)))
            errors[str(model)]['train'] = abs(util.relative_error(y_train, model.predict(X_train)))
        return errors
    def analyze(self, train_data, test_data, models):

        keys = ['application']
        errors = dict(application=[],
                      model=[],
                      error=[],
                      reps=[],
                      error_type=[])

        indexes = range(0, len(self.models))
        model_idx = {
            str(model()): idx
            for (model, idx) in zip(self.models, indexes)
        }

        self.test_error = np.zeros((len(self.models), len(self.nreps)))
        self.train_error = np.zeros((len(self.models), len(self.nreps)))

        for reps in self.nreps:
            for app, group in train_data.groupby(keys):
                app_data = group[group['rep'] <= reps]
                test = test_data[test_data.application == app]

                data = {
                    'train': (util.get_predictors(app_data), app_data['time']),
                    'test': (util.get_predictors(test), test['time'])
                }
                for model in self.models:
                    model = model()
                    model.fit(data['train'][0], data['train'][1])
                    # Find predictions over the test set
                    for t, (X, y) in data.items():
                        pred = model.predict(X)
                        error = abs(util.relative_error(y, pred))
                        for err in error.values:
                            errors['application'].append(app)
                            errors['model'].append(str(model))
                            errors['error'].append(err)
                            errors['reps'].append(reps)
                            errors['error_type'].append(t)
        self.errors = pd.DataFrame(errors)
        return self
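This snippet assumes `self.models` holds estimator classes (they are instantiated with `model()`) and `self.nreps` a list of repetition counts. A hypothetical setup, with the class name and attribute values invented for illustration:

from sklearn import ensemble, linear_model

class ModelAnalysis(object):
    def __init__(self):
        # Hypothetical configuration: store estimator *classes*, not
        # instances, so analyze() fits a fresh model per combination.
        self.models = [linear_model.LinearRegression,
                       ensemble.GradientBoostingRegressor]
        # Hypothetical repetition counts to sweep over.
        self.nreps = [1, 5, 10, 20]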
def cross_validation(k,
                     X,
                     y,
                     random_forest=False,
                     use_confidence=False,
                     num_of_trees=1):
    accuracies = []
    y_pred = []
    y_true = []
    predictors = get_predictors()
    emotion_values = get_emotion_values()

    X_splits = np.array_split(X, k)
    y_splits = np.array_split(y, k)

    for i in range(k):
        X_train, X_test, y_train, y_test = get_train_test_split(
            X_splits, y_splits, i)

        emotion_predictor = EmotionPredictor(predictors, random_forest,
                                             use_confidence, num_of_trees)
        emotion_predictor.fit(emotion_values, X_train, y_train)

        predictions = emotion_predictor.predict(X_test)
        y_pred.extend(predictions)
        y_true.extend(y_test)
        correct = sum(1 for prediction, actual in zip(predictions, y_test)
                      if prediction == actual)

        accuracy = float(correct * 100) / len(y_test)
        accuracies.append(accuracy)
        print("Accuracy for round {0} is {1:.2f}".format(i + 1, accuracy))

    print(
        "Cross Validation accuracy has a mean of {0:.2f} and a std of {1:.2f}".
        format(np.mean(accuracies), np.std(accuracies)))

    print("          prec, rec, f1")
    for emotion_number in emotion_values:
        print("Emotion {0}: {1:.2f}, {2:.2f}, {3:.2f}".format(
            emotion_number, get_precision(y_true, y_pred, emotion_number),
            get_recall(y_true, y_pred, emotion_number),
            get_f1_score(y_true, y_pred, emotion_number)))

    plt.figure()
    cfm = confusion_matrix(y_true, y_pred) / k
    plot_confusion_matrix(cfm, classes=["1", "2", "3", "4", "5", "6"])
    plt.show()
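`get_train_test_split` is not defined in this file; a plausible sketch, assuming fold `i` of the `np.array_split` pieces is held out and the rest are concatenated for training:

import numpy as np

def get_train_test_split(X_splits, y_splits, i):
    # Hypothetical k-fold assembly: split i is the test fold,
    # all remaining splits are stacked back into the training set.
    X_test, y_test = X_splits[i], y_splits[i]
    X_train = np.concatenate([s for j, s in enumerate(X_splits) if j != i])
    y_train = np.concatenate([s for j, s in enumerate(y_splits) if j != i])
    return X_train, X_test, y_train, y_test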
Example no. 5
    def analyze(self, train_data, test_data, models):
        keys = ['application']
        error = dict(application=[], model=[], model_nice_name=[], error=[])
        grouped = test_data.groupby(keys)
        for app, group in grouped:
            y = group['time']
            X = util.get_predictors(group).values
            for model_name in models[app]:
                model = models[app][model_name]
                pred = model.predict(X)

                res = util.relative_error(y, pred)
                for err in res.values:
                    error['error'].append(err)
                    error['model'].append(model_name)
                    error['model_nice_name'].append(str(model))
                    error['application'].append(app)
        self.error = pd.DataFrame(error)
        return self
Example no. 6
    def analyze(self, train_data, test_data, models):
        errors = dict(application=[], error=[], model=[])
        grouped = test_data.groupby('application')

        for app, group in grouped:
            for model_name in models[app]:
                model = models[app][model_name]

                # Only want the predictors, drop everything else 
                y = group['time']
                X = util.get_predictors(group).values
                pred = model.predict(X)
                
                # Parse and combine data
                res = abs(util.relative_error(y, pred))
                for err in res.values:
                    errors['error'].append(err)
                    errors['application'].append(app)
                    errors['model'].append(str(model))
        self.errors = pd.DataFrame(errors)
        return self
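Both `analyze` variants rely on two `util` helpers that are not shown. A rough sketch of what they presumably do; the dropped column names are placeholders, not the project's real schema:

def get_predictors(df):
    # Hypothetical: keep only predictor columns by dropping the target
    # and bookkeeping fields (column names are assumptions).
    return df.drop(columns=['time', 'application'], errors='ignore')

def relative_error(actual, pred):
    # Signed per-sample relative error; callers wrap it in abs().
    return (pred - actual) / actual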
Example no. 7
    def analyze(self, train_data, test_data, models):
        keys = ['application', 'interference', 'coloc', 'nice']
        error = dict(application=[],
                     interference=[],
                     model=[],
                     coloc=[],
                     nice=[],
                     pred_rmse=[],
                     base_rmse=[])

        self.colors = {
            str(model): model._color
            for model in list(models.values())[0].values()
        }

        for (app, thread, coloc, nice), group in test_data.groupby(keys):
            y = group['time']
            X = util.get_predictors(group).values
            mean = np.mean(y)
            base_rmse = util.rmse_error(y, mean)
            for model_name in models[app]:
                if model_name == 'mean':
                    continue
                model = models[app][model_name]
                pred = model.predict(X)
                pred_rmse = util.rmse_error(y, pred)
                error['model'].append(str(model))
                error['pred_rmse'].append(pred_rmse)
                error['base_rmse'].append(base_rmse)
                error['application'].append(app)
                error['interference'].append(thread)
                error['coloc'].append(coloc)
                error['nice'].append(nice)
        self.error = pd.DataFrame(error)
        return self
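`util.rmse_error` is called here both with a prediction vector and with the scalar mean baseline; a one-line sketch consistent with both uses (an assumption about the helper, not its source):

import numpy as np

def rmse_error(y, pred):
    # Root mean squared error; pred may be a vector or a scalar
    # baseline such as np.mean(y), thanks to broadcasting.
    return np.sqrt(np.mean((np.asarray(y) - pred) ** 2))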
Example no. 8
import pickle
from emotion_predictor import EmotionPredictor

from util import get_clean_data, get_predictors, get_emotion_values

X, y = get_clean_data()
predictors = get_predictors()
emotion_values = get_emotion_values()

emotion_predictor = EmotionPredictor(predictors, random_forest=True, use_confidence=True, num_of_trees=200)
emotion_predictor.fit(emotion_values, X, y)

with open('emotion_predictor.pickle', 'wb') as f:
    pickle.dump(emotion_predictor, f, pickle.HIGHEST_PROTOCOL)
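To reuse the serialized model, it can be unpickled the same way (a usage sketch; `EmotionPredictor` must be importable when loading):

with open('emotion_predictor.pickle', 'rb') as f:
    emotion_predictor = pickle.load(f)

predictions = emotion_predictor.predict(X)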

Example no. 9
    def analyze(self, train_data, test_data, models):
#        models = [{'model': linear_model.LinearRegression(),
#                   'grid': {},
#                   'name': 'Linear',
#                   'color': 'blue'}, 
#                  {'model': linear_model.Ridge(),
#                   'grid': [{'regressor__alpha': util.frange(0, 10, 0.2)}],
#                   'name': 'Ridge',
#                   'color': 'red'},
#                  {'model': ensemble.GradientBoostingRegressor(),
#                   'grid': [{'regressor__learning_rate': util.frange(0.05, 1, 0.05),
#                             'regressor__n_estimators': range(20, 300, 20),
#                             'regressor__max_depth': range(2, 7)
#                            }],
#                   'name': 'GBM',
#                   'color': 'yellow'
#                  },
##                  {'model': svm.SVR(kernel='poly'),
##                   'grid': [{
##                            'regressor__degree': range(1, 4),
##                            'regressor__C': [10**i for i in range(-5, 6)]
##                       }],
##                   'name': 'SVMPoly'
##                  },
#                  {'model': svm.SVR(kernel='linear'),
#                   'grid': [{
#                            'regressor__C': [10**i for i in range(-5, 6)]
#                       }],
#                   'name': 'SVMLinear',
#                   'color': 'green'
#                  }
#                ]
#
#        errors = dict(application=[], model=[], feature_count=[], error=[], error_type=[])
#        features = dict(application=[], model=[], feature=[], count=[]) 
#        
#        max_feature_count = len(util.get_predictors(train_data).columns)
#        for feature_count in range(4, (max_feature_count / 2) * 2, 2):
#            for app, group in train_data.groupby('application'):
#                for model_params in models:
#                    model = model_params['model']
#                    grid = model_params['grid']
#                    name = model_params['name']
#
#                    pipeline = build_pipeline(model)
#                    rfe = RFE(pipeline, feature_count, step=1)
#                    cv = GridSearchCV(rfe, grid, cv=10)
#                    test = test_data[test_data['application'] == app]
#                    
#                    X_train = util.get_predictors(group)
#                    y_train = group['time']
#                    X_test = util.get_predictors(test)
#                    y_test = test['time']
#                    
#                    cv.fit(X_train, y_train)
#                
#                    # Build feature heatmap
#                    for feature in self._extract_features(rfe, X_train):
#                        features['application'].append(app)
#                        features['model'].append(name)
#                        features['feature'].append(feature)
#                        features['count'].append(feature)
#
#                    types = {'train': (X_train, y_train), 'test': (X_test, y_test)}
#                    for err_type, (X, y) in types:
#                        pred = rfe.predict(X)
#                        for error in util.relative_error(y, pred):
#                            errors['application'].append(app)
#                            errors['model'].append(str(model))
#                            errors['feature_count'].append(feature_count)
#                            errors['error'].append(error)
#                            errors['error_type'].append('train')
#        self.errors = pd.DataFrame(errors)
#        
#        # Fetch minimum count for each feature, application, and model
#        features = pd.DataFrame(features)
#        self.features = dict(application=[], model=[], feature=[], count=[])
#        for model, model_group in features.groupby('model'):
#            for app, app_group in model_group.groupby('application'):
#                for feature, feature_group in app_group.groupby('feature'):
#                    min_count = feature_group.feature_count.min()
#                    self.features['application'].append(app)
#                    self.features['model'].append(app)
#                    self.features['feature'].append(feature)
#                    self.features['count'].append(min_count)
#        self.features = pd.DataFrame(self.features)
        feature_choices = self.feature_selection(train_data, test_data, models)

        models = {str(model()): model() for model in self._models}

        errors = dict(application=[], model=[], error=[], feature_count=[])
        for model_name, feature_selection in feature_choices.items():
            model = models[model_name]
            for feature_count, features in feature_selection.items():
                for app, group in train_data.groupby('application'):
                    test = test_data[test_data.application == app]
                    data = {'train': {'X': util.get_predictors(group)[features], 'y': group.time},
                            'test': {'X': util.get_predictors(test)[features], 'y': test.time}}
                    model.fit(data['train']['X'], data['train']['y'])
                    for err_type, d in data.items():
                        pred = model.predict(d['X'])
                        actual = d['y']
                        error = util.relative_error(actual, pred)
                        for i in range(0, len(error)):
                            errors['application'].append(app)
                            errors['model'].append(str(model))
                            errors['error'].append(error.values[i])
                            errors['feature_count'].append(len(features))
        self.errors = pd.DataFrame(errors)
        return self
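Once `self.errors` is populated, a grouped summary is a natural way to inspect the error/feature-count trade-off (illustrative usage; `analyzer` is a hypothetical instance of the class above):

# Median relative error per model and feature count (hypothetical usage).
summary = analyzer.errors.groupby(['model', 'feature_count'])['error'].median()
print(summary.unstack('feature_count'))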