def evalOne(parameters):
    all_obs = []
    all_pred = []
    for location in locations:
        trainX, testX, trainY, testY = splitDataForXValidation(
            location, "location", data, all_features, "target")
        if "depth" in parameters:
            model = RandomForestRegressor(
                max_depth=parameters["depth"],
                random_state=42,
                n_estimators=parameters["n_estimators"],
                n_jobs=-1)
        elif "leaf" in parameters:
            model = RandomForestRegressor(
                min_samples_leaf=parameters["leaf"],
                random_state=42,
                n_estimators=parameters["n_estimators"],
                n_jobs=-1)
        elif "max_leaf" in parameters:
            model = RandomForestRegressor(
                max_leaf_nodes=parameters["max_leaf"],
                random_state=42,
                n_estimators=parameters["n_estimators"],
                n_jobs=-1)
        else:
            # Guard: otherwise `model` is unbound when fit() runs below.
            raise ValueError("parameters must contain 'depth', 'leaf', or 'max_leaf'")

        model.fit(trainX, trainY)
        prediction = model.predict(testX)
        all_obs.extend(testY)
        all_pred.extend(prediction)
    return rmseEval(all_obs, all_pred)[1]
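# Usage sketch (not from the original source): each call cross-validates one
# hyperparameter setting against the snippet's data/locations/all_features
# globals and returns the pooled RMSE. Values below are hypothetical.
# rmse_depth = evalOne({"depth": 10, "n_estimators": 300})
# rmse_leaf = evalOne({"leaf": 2, "n_estimators": 300})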
    def test_boston_housing_load_save_valid(self):
        (x_train, y_train), (x_test, y_test) = TestUtil.get_boston_housing()
        explained_model = RandomForestRegressor(n_estimators=64, max_depth=5, random_state=1)
        explained_model.fit(x_train, y_train)

        model_builder = MLPModelBuilder(num_layers=2, num_units=32, activation="relu", p_dropout=0.2, verbose=0,
                                        batch_size=32, learning_rate=0.001, num_epochs=2, early_stopping_patience=128)
        masking_operation = ZeroMasking()
        loss = mean_squared_error

        num_models_settings = [1, 2]
        for num_models in num_models_settings:
            explainer = CXPlain(explained_model, model_builder, masking_operation, loss,
                                num_models=num_models)

            explainer.fit(x_train, y_train)
            median_1 = explainer.predict(x_test)

            tmp_dir_name = tempfile.mkdtemp()
            explainer.save(tmp_dir_name)

            with self.assertRaises(ValueError):
                explainer.save(tmp_dir_name, overwrite=False)

            explainer.save(tmp_dir_name, overwrite=True)
            explainer.load(tmp_dir_name)
            median_2 = explainer.predict(x_test)

            self.assertTrue(np.array_equal(median_1, median_2))

            shutil.rmtree(tmp_dir_name)  # Cleanup.
    def test_boston_housing_valid(self):
        (x_train, y_train), (x_test, y_test) = TestUtil.get_boston_housing()
        explained_model = RandomForestRegressor(n_estimators=64,
                                                max_depth=5,
                                                random_state=1)
        explained_model.fit(x_train, y_train)

        model_builder = MLPModelBuilder(num_layers=2,
                                        num_units=32,
                                        activation="relu",
                                        p_dropout=0.2,
                                        verbose=0,
                                        batch_size=32,
                                        learning_rate=0.001,
                                        num_epochs=2,
                                        early_stopping_patience=128)
        masking_operation = ZeroMasking()
        loss = mean_squared_error
        explainer = CXPlain(explained_model, model_builder, masking_operation,
                            loss)

        explainer.fit(x_train, y_train)
        self.assertEqual(explainer.prediction_model.output_shape,
                         (None, np.prod(x_test.shape[1:])))

        eval_score = explainer.score(x_test, y_test)
        train_score = explainer.get_last_fit_score()
        median = explainer.predict(x_test)
        self.assertTrue(median.shape == x_test.shape)
Example #4
def eval_one(step):
    
    if step in cached_results:
        return cached_results[step]
    
    eval_features = []
    for i in range(0, len(all_features)):
        if step[i]:
            eval_features.append(all_features[i])
    
    all_predictions = []
    all_observations = []
    
    for location in locations:
        trainX, testX, trainY, testY = splitDataForXValidation(location, "location", data, eval_features, "target")
        model = RandomForestRegressor(min_samples_leaf = 2, random_state=42, n_estimators=650, n_jobs=-1)
        model.fit(trainX, trainY)
        predictions = model.predict(testX)
        all_observations.extend(testY)
        all_predictions.extend(predictions)
    
    rmse = rmseEval(all_observations, all_predictions)[1]
    
    cached_results[step] = rmse
    
    # save down the cached result
    
    cache_output = open(CACHE_FILE, "a")
    step_list = [str(s) for s in step]
    step_str = ",".join(step_list)  
    cache_output.write(str(rmse) + ";" + step_str + "\n")
    cache_output.close()
    
    return rmse
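# Sketch (an assumption, not from the source): the cache file written above can
# be replayed at startup to repopulate cached_results. This assumes `step` is a
# tuple of booleans serialized via str(), matching the write format used above.
# with open(CACHE_FILE) as cache_input:
#     for line in cache_input:
#         rmse_str, step_str = line.strip().split(";")
#         step = tuple(s == "True" for s in step_str.split(","))
#         cached_results[step] = float(rmse_str)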
Example #5
    def post(self):

        # upload audio file in server
        voice = self.request.files["audio"][0]
        extn = os.path.splitext(voice['filename'])[1]
        fnm = os.path.splitext(voice['filename'])[0]
        cname = str(uuid.uuid4()) + extn
        fh = open(__UPLOADS__ + cname, 'wb')  # binary mode: the uploaded body is raw bytes
        fh.write(voice['body'])
        fh.close()

        # get features from the audio file
        attr = getAttributes(cname)
        fdf = mongoTolist(False)

        train = fdf[:,:-1]
        target = fdf[:,-1]

        #RandomForest Regression
        rf = RandomForestRegressor(n_estimators = 506, n_jobs = -1)
        rf.fit(train, target)

        updrs_val = rf.predict([attr])
        attr.append(updrs_val[0])

        # get the theta from database
        theta = list(db.theta.find({}))
        theta1 = theta[0]["theta1"]
        theta2 = theta[1]["theta2"]

        # check if the person has Parkinson's Disease
        isParkinson = octave.classify(theta1, theta2, np.array(attr))

        self.render("output.html", ipk = isParkinson, updrs = updrs_val[0])
Example #6
class RandomForestRegressorImpl():

    def __init__(self, n_estimators=10, criterion='mse', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False):
        self._hyperparams = {
            'n_estimators': n_estimators,
            'criterion': criterion,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'min_weight_fraction_leaf': min_weight_fraction_leaf,
            'max_features': max_features,
            'max_leaf_nodes': max_leaf_nodes,
            'min_impurity_decrease': min_impurity_decrease,
            'min_impurity_split': min_impurity_split,
            'bootstrap': bootstrap,
            'oob_score': oob_score,
            'n_jobs': n_jobs,
            'random_state': random_state,
            'verbose': verbose,
            'warm_start': warm_start}

    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        if (y is not None):
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def predict(self, X):
        return self._sklearn_model.predict(X)
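# Usage sketch for the wrapper above (assumes SKLModel is bound to
# sklearn.ensemble.RandomForestRegressor, as the hyperparameter names imply):
import numpy as np

X_demo = np.random.rand(100, 4)
y_demo = np.random.rand(100)
impl = RandomForestRegressorImpl(n_estimators=50, random_state=0)
preds = impl.fit(X_demo, y_demo).predict(X_demo)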
Example #7
def eval_one(min_samples_leaf, n_estimators):
    log("min_samples_leaf: " + str(min_samples_leaf) + ", n_estimators: " +
        str(n_estimators))

    all_observations = []
    all_pred_ALL = []

    for group in range(0, len(groups)):
        trainStations = []
        for i in range(0, len(groups)):
            if i != group:
                trainStations.extend(groups[i])
        testStations = groups[group]

        train_station_set = set([float(s) for s in trainStations])
        test_station_set = set([float(s) for s in testStations])

        trainX, testX, trainY, testY = splitDataForXValidation(
            train_station_set, test_station_set, "location", data,
            all_features, "target")
        model = RandomForestRegressor(min_samples_leaf=min_samples_leaf,
                                      n_estimators=n_estimators,
                                      n_jobs=-1,
                                      random_state=42)
        model.fit(trainX, trainY)
        prediction_ALL = model.predict(testX)
        rmse = rmseEval(testY, prediction_ALL)[1]
        log("\tALL rmse: " + str(rmse))
        all_observations.extend(testY)
        all_pred_ALL.extend(prediction_ALL)

    rmse = rmseEval(all_observations, all_pred_ALL)[1]
    log("\tALL rmse:" + str(rmse))
    return rmse
    def test_boston_housing_no_fit_invalid(self):
        (x_train, y_train), (x_test, y_test) = TestUtil.get_boston_housing()
        explained_model = RandomForestRegressor(n_estimators=64,
                                                max_depth=5,
                                                random_state=1)
        explained_model.fit(x_train, y_train)

        model_builder = MLPModelBuilder(num_layers=2,
                                        num_units=32,
                                        activation="relu",
                                        p_dropout=0.2,
                                        verbose=0,
                                        batch_size=32,
                                        learning_rate=0.001,
                                        num_epochs=2,
                                        early_stopping_patience=128)
        masking_operation = ZeroMasking()
        loss = mean_squared_error
        explainer = CXPlain(explained_model, model_builder, masking_operation,
                            loss)

        with self.assertRaises(AssertionError):
            explainer.predict(x_test, y_test)

        with self.assertRaises(AssertionError):
            explainer.score(x_test, y_test)
def trainRandomForest(data, columns, targetColumn, parameters):

    modelColumns = []
    for column in columns:
        if column != targetColumn:
            modelColumns.append(column)

    modelData = []

    for i in range(0, len(data[targetColumn])):
        record = []
        for column in modelColumns:
            record.append(data[column][i])

        modelData.append(record)
    if "depth" in parameters:
        model = RandomForestRegressor(max_depth=parameters["depth"],
                                      n_estimators=parameters["estimators"],
                                      n_jobs=-1,
                                      random_state=42)
    elif "leaf" in parameters:
        model = RandomForestRegressor(min_samples_leaf=parameters["leaf"],
                                      n_estimators=parameters["estimators"],
                                      n_jobs=-1,
                                      random_state=42)
    else:
        # Guard: otherwise `model` is unbound when fit() runs below.
        raise ValueError("parameters must contain 'depth' or 'leaf'")

    model.fit(modelData, data[targetColumn])

    return RandomForestModel(model, modelColumns)
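# Usage sketch (hypothetical column names, not from the source; `data` is the
# column-oriented dict this helper expects, i.e. data[column][i]):
# model = trainRandomForest(data, ["temp", "wind", "target"], "target",
#                           {"leaf": 2, "estimators": 650})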
def RandomForest(x_train,y_train,x_test,degree):    
     params = {'n_estimators': 1000, 'max_depth': degree, 'min_samples_split': 2, 'warm_start': True}  # min_samples_split must be >= 2
     clf = RandomForestRegressor(**params)
     clf.fit(x_train, y_train)          
     y_predict = clf.predict(x_test)
     #plt.plot(x_test,y_predict,color='red')
     return y_predict
Example #11
    def randomforestregressor(self, testlen, ntrain, ntrees, nodes):
        hsmadata = self.hsmadata
        dates = pd.Series(hsmadata['date'].unique()).sort_values()
        dates.index = range(0, len(dates))
        ntest = len(dates) // testlen

        hsma = pd.DataFrame()
        for i in range(ntrain, ntest):
            traindata = hsmadata[
                (hsmadata['date'] >= dates[(i - ntrain) * testlen])
                & (hsmadata['date'] < dates[i * testlen - self.day])].copy()
            testdata = hsmadata[(hsmadata['date'] >= dates[i * testlen]) & (
                hsmadata['date'] < dates[(i + 1) * testlen])].copy()

            traindata = traindata.iloc[:, 2:]
            traindatax = traindata.drop(['closeratio'], 1)
            traindatay = traindata['closeratio']
            testdatax = testdata[traindatax.columns]

            treemodel = RandomForestRegressor(
                n_estimators=ntrees,
                min_samples_split=nodes * 2,
                min_samples_leaf=nodes)
            treemodel.fit(traindatax, traindatay)
            testdata['predratio'] = treemodel.predict(testdatax)

            hsma = pd.concat([hsma, testdata], ignore_index=True)

        return (hsma)
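# Usage sketch (hypothetical arguments; assumes self.hsmadata carries a 'date'
# column, a 'closeratio' target, and feature columns from index 2 onward):
# hsma = strategy.randomforestregressor(testlen=20, ntrain=12, ntrees=200, nodes=5)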
 def test_shap_summary(self):
     data = self.iris.copy()
     widget = self.widget
     rf = SKL_RF(n_estimators=10)
     model = RandomForestRegressor(rf)
     rf.fit(data.X, data.Y)
     #self.send_signals([(widget.Inputs.data, data), (widget.Inputs.model, model)])
Example #13
    def test_boston_housing_confidence_level_invalid(self):
        (x_train, y_train), (x_test, y_test) = TestUtil.get_boston_housing()
        explained_model = RandomForestRegressor(n_estimators=64,
                                                max_depth=5,
                                                random_state=1)
        explained_model.fit(x_train, y_train)

        model_builder = MLPModelBuilder(num_layers=2,
                                        num_units=32,
                                        activation="relu",
                                        p_dropout=0.2,
                                        verbose=0,
                                        batch_size=32,
                                        learning_rate=0.001,
                                        num_epochs=3,
                                        early_stopping_patience=128)
        masking_operation = ZeroMasking()
        loss = mean_squared_error

        num_models = 2
        explainer = CXPlain(explained_model,
                            model_builder,
                            masking_operation,
                            loss,
                            num_models=num_models)

        explainer.fit(x_train, y_train)

        invalid_confidence_levels = [1.01, -0.5, -0.01]

        for confidence_level in invalid_confidence_levels:
            with self.assertRaises(ValueError):
                explainer.predict(x_test, confidence_level=confidence_level)
Example #14
def randomForest(trainFeatures, trainResponses, testFeatures, maxFeatures = 'log2', nTree=100):
    ## Settings of random forests regressor
    regModel = RandomForestRegressor(n_estimators=nTree, max_features=maxFeatures)    
    ## Train the random forests regressor
    regModel.fit(trainFeatures, trainResponses)
    ## Prediction
    testResponsesPred = regModel.predict(testFeatures)
    return testResponsesPred
Example #15
def evalTrainStationTestStation(trainStation, testStation, features):
    trainX, _, trainY, _ = splitDataForXValidation(set([trainStation]), set(), "location", dataByStation[trainStation], features, "target")
    _, testX2, _, testY2 = splitDataForXValidation(set(), set([testStation]), "location", dataByStation[testStation], features, "target")
    model = RandomForestRegressor(max_depth=10, n_estimators = 60, n_jobs = -1, random_state=42)
    model.fit(trainX, trainY)
    prediction = model.predict(testX2)
    rmse = rmseEval(testY2, prediction)[1]
    print("Training on station " + str(trainStation) + ", applying on station " + str(testStation) + ": rmse: " + str(rmse))
    return rmse
def RF_Model(Scaled_Input_Data, Output_Data):
    T0 = time.time()
    n = len(Scaled_Input_Data)
    RFModel = RandomForestRegressor()
    RFModel.fit(Scaled_Input_Data, Output_Data)
    RF_Time = time.time() - T0
    print('The computational time of Random Forest Regression for ', n, ' examples is: ', RF_Time)
    MSEs_RF = cross_validation.cross_val_score(RFModel, Scaled_Input_Data, Output_Data, cv=cross_validation.LeaveOneOut(n), scoring="mean_absolute_error")
    MeanMSE_RF = np.mean(list(MSEs_RF))
    print('The average MAE of Random Forest Regression for ', n, ' examples is: ', (-1*MeanMSE_RF))
    return(MeanMSE_RF, RFModel)
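# Note (added): sklearn.cross_validation was removed in scikit-learn 0.20. A
# rough modern equivalent of the leave-one-out scoring above is sketched here:
# from sklearn.model_selection import cross_val_score, LeaveOneOut
# scores = cross_val_score(RFModel, Scaled_Input_Data, Output_Data,
#                          cv=LeaveOneOut(), scoring="neg_mean_absolute_error")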
 def run(self):
     print "Reading device separations..."
     indexes = np.load("indexesTrain.npy")
     self.train = self.train.values
     print "Getting attributes..."
     trainFeatures = [self.getMainFeatures(self.train, indexes, i) for i in range(len(indexes))]
     for i in range(len(indexes)):
         (trainVect, targetVect) = self.getAttributes(trainFeatures, indexes, i)
         classifier = RandomForestRegressor(n_estimators=500, verbose=2, n_jobs=4, random_state=1)
         classifier.fit(trainVect, targetVect)
         pickle.dump(classifier, open("models/models" + str(i) + ".mod", "w"))
Example #18
    def rf_lc(self, trainX, trainY):
        trainY_t          = self.target_transform( trainY)

        clf = RandomForestRegressor( **self.rf_hyper)
        clf.fit(  trainX, trainY_t)

        (mean_train, mean_test, max_train, max_test) = self.compute_error(clf, trainX, trainY)
        print ("mean_train err, mean_test err, followed by max: ", ( mean_train, mean_test, max_train, max_test))
        self.log( ( mean_train, mean_test, max_train, max_test))

        return (clf, mean_train, mean_test, max_train, max_test)
Example #19
def train_model(X_train, y_train):
    model = RandomForestRegressor(n_estimators=50,
                                  criterion='mse',
                                  max_features='auto',
                                  max_depth=25,
                                  min_samples_split=1e-4,
                                  min_samples_leaf=1e-5,
                                  n_jobs=-1,
                                  verbose=10)
    model.fit(X_train, y_train)
    return model
Example #20
def randomForestFeatures(df, X_train, y_train):
    """
    INPUT: A dataframe, X_train, y_train
    OUTPUT: A list of tuples ranking the feature importances generated from the random forest
    PURPOSE: To identify, in a robust and easy-to-use way, which features are most relevant
    """
    names = df.iloc[:,1:-1].columns

    rf = RandomForestRegressor()
    rf.fit(X_train, y_train)
    tups = (sorted(zip(map(lambda x: round(x, 4), rf.feature_importances_), names),
                 reverse=True))
    return tups
Example #21
def train_model(X_train, y_train):
    print("training the model ...")
    rf = RandomForestRegressor(n_estimators=500,
                               max_depth=5,
                               n_jobs=-1,
                               verbose=2)
    rf.fit(X_train, y_train)
    y_pred_train = rf.predict(X_train)
    print(".. training RMSE : {:0.3f} %".format(
        mean_squared_error(y_train, y_pred_train) * 100))
    #print(".. training R2   : {:0.3f} %".format(r2_score(y_train,y_pred_train)*100))
    print(".. training MAE  : {:0.3f} %".format(
        mean_absolute_error(y_train, y_pred_train) * 100))
    return rf
Example #22
def RandomForest(weiboid, x_train, y_train, x_test, y_test, d):
    params = {
        'n_estimators': 1000,
        'max_depth': d,
        'min_samples_split': 2,  # must be >= 2 in scikit-learn
        'warm_start': True,
        'oob_score': True
    }
    clf = RandomForestRegressor(**params)
    clf.fit(x_train, y_train)
    y_predict = clf.predict(x_test)
    r = rmse(y_test, y_predict)
    #fig(weiboid,y_test,y_predict)
    return y_predict, r
def RandomForest(x_train,y_train,x_test,y_test):
     degree = [1,2,3,4,7]
     result = {}
     rmse_list = []
     for d in degree:
          params = {'n_estimators': 1000, 'max_depth': d, 'min_samples_split': 2, 'warm_start': True}  # min_samples_split must be >= 2
          clf = RandomForestRegressor(**params)
          clf.fit(x_train[:, np.newaxis], y_train)
          y_predict = clf.predict(x_test[:, np.newaxis])
          rmsevalue = rmse(y_test,y_predict)
          result[rmsevalue] = [y_predict,d]
          rmse_list.append(rmsevalue)
     rmseMin = min(rmse_list)
     return rmseMin, result[rmseMin]  # return the best RMSE, not the last one computed
    def test_overwrite_ensemble_model_invalid(self):
        (x_train, y_train), (x_test, y_test) = TestUtil.get_boston_housing()

        model_builder = MLPModelBuilder()
        explained_model = RandomForestRegressor(n_estimators=64,
                                                max_depth=5,
                                                random_state=1)
        explained_model.fit(x_train, y_train)
        masking_operation = ZeroMasking()
        loss = binary_crossentropy
        num_models = 5
        explainer = CXPlain(explained_model,
                            model_builder,
                            masking_operation,
                            loss,
                            num_models=num_models)

        file_names = [
            CXPlain.get_config_file_name(),
            CXPlain.get_explained_model_file_name(".pkl"),
            CXPlain.get_loss_pkl_file_name(),
            CXPlain.get_model_builder_pkl_file_name(),
            CXPlain.get_masking_operation_pkl_file_name()
        ]

        # Test with untrained explanation model.
        for file_name in file_names:
            tmp_dir = TestExplanationModel.make_at_tmp(file_name)
            with self.assertRaises(ValueError):
                explainer.save(tmp_dir, overwrite=False)

        # Test with trained explanation model.
        explainer.fit(x_train, y_train)

        file_names = [
            CXPlain.get_config_file_name(),
            CXPlain.get_explained_model_file_name(".pkl"),
            CXPlain.get_loss_pkl_file_name(),
            CXPlain.get_model_builder_pkl_file_name(),
            CXPlain.get_masking_operation_pkl_file_name()
        ] + [
            CXPlain.get_prediction_model_h5_file_name(i)
            for i in range(num_models)
        ]

        for file_name in file_names:
            tmp_dir = TestExplanationModel.make_at_tmp(file_name)
            with self.assertRaises(ValueError):
                explainer.save(tmp_dir, overwrite=False)
Example #25
def doPrediction(locations, data, columns, features, columns2, outputFileName):
    predictionData = {}
    for c in columns2:
        predictionData[c] = []

    # modelling
    for location in locations:
        trainX, testX, trainY, testY, dataY = splitDataForXValidation(
            location, "location", data, features, columns, "target")
        print("\tT+W #train: " + str(len(trainY)) + ", #test:" +
              str(len(testY)))
        model = RandomForestRegressor(min_samples_leaf=2,
                                      n_estimators=650,
                                      n_jobs=-1,
                                      random_state=42)
        model.fit(trainX, trainY)
        prediction = model.predict(testX)
        rmse = rmseEval(testY, prediction)[1]
        print("\trmse: " + str(rmse))

        for c in columns2:
            if c == 'prediction':
                predictionData[c].extend(prediction)
            else:
                predictionData[c].extend(dataY[c])

    for c in predictionData:
        print("\t" + c + " -> #" + str(len(predictionData[c])))

    rmse = rmseEval(predictionData['target'], predictionData['prediction'])[1]
    print("overall RMSE: " + str(rmse))

    print("Writing out results...")

    output = open(outputFileName, 'w')
    output.write(','.join([str(x) for x in columns2]))
    output.write("\n")

    for i in range(0, len(predictionData['target'])):
        output.write(str(predictionData[columns2[0]][i]))
        for j in range(1, len(columns2)):
            output.write(",")
            output.write(str(predictionData[columns2[j]][i]))
        output.write("\n")

    output.close()

    print("Done...")
Example #26
    def test_boston_housing_valid(self):
        (x_train, y_train), (x_test, y_test) = TestUtil.get_boston_housing()
        explained_model = RandomForestRegressor(n_estimators=64,
                                                max_depth=5,
                                                random_state=1)
        explained_model.fit(x_train, y_train)

        model_builder = MLPModelBuilder(num_layers=2,
                                        num_units=32,
                                        activation="relu",
                                        p_dropout=0.2,
                                        verbose=0,
                                        batch_size=32,
                                        learning_rate=0.001,
                                        num_epochs=3,
                                        early_stopping_patience=128)
        masking_operation = ZeroMasking()
        loss = mean_squared_error

        for num_models in [2, 5, 10]:
            explainer = CXPlain(explained_model,
                                model_builder,
                                masking_operation,
                                loss,
                                num_models=num_models)

            explainer.fit(x_train, y_train)
            eval_score = explainer.score(x_test, y_test)
            train_score = explainer.get_last_fit_score()
            median, confidence = explainer.predict(x_test,
                                                   confidence_level=0.95)

            self.assertTrue(median.shape == x_test.shape)
            self.assertTrue(confidence.shape == x_test.shape + (2, ))

            # Flatten predictions for iteration below.
            median = median.reshape((len(x_test), -1))
            confidence = confidence.reshape((len(x_test), -1, 2))

            for sample_idx in range(len(x_test)):
                for feature_idx in range(len(x_test[sample_idx])):
                    self.assertTrue(confidence[sample_idx][feature_idx][0] <=
                                    median[sample_idx][feature_idx] <=
                                    confidence[sample_idx][feature_idx][1])
                    self.assertTrue(
                        confidence[sample_idx][feature_idx][0] >= 0)
                    self.assertTrue(
                        confidence[sample_idx][feature_idx][1] >= 0)
Example #27
def perform_random_forest_regressor(train_set, train_target, test_set, predictors, estimators=10, depth=None, splits=2):
    alg = RandomForestRegressor(random_state=1)
    alg.fit(train_set[predictors], train_target)
    
    #importances = alg.feature_importances_
    #print("Original ",numpy.argsort(importances))
    #indices = numpy.argsort(importances)[::-1]
    #print (" importances ",importances)
    #print (" indices ",indices)
    
    #for f in range(train_set.shape[1]-2):
    #    print("%2d) %-*s %f" % (f+1,30,predictors[indices[f]],
    #                                    importances[indices[f]]))

    predictions = alg.predict(test_set[predictors])
    return predictions
Example #28
class QuantileForestRegression(absmodel.Module):
    def __init__(self, n_estimator=500):
        super(QuantileForestRegression, self).__init__()
        self.model = RandomForestRegressor(n_estimators=n_estimator)

        self.fitted = False

    def _fit(self, x, y, verbose=False, load=False):

        return self.model.fit(x, y)

    def predict(self, x, y, label=None):

        d, up = self.pred_ints(model=self.model, x=x)
        return d, up

    def pred_ints(self, model, x, percentile=95):
        err_down = []
        err_up = []
        for i in range(len(x)):
            preds = []
            for pred in model.estimators_:
                preds.append(pred.predict([x[i]])[0])  # predict expects a 2-D array of samples
            err_down.append(np.percentile(preds, (100 - percentile) / 2.))
            err_up.append(np.percentile(preds, 100 - (100 - percentile) / 2.))
        return err_down, err_up
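# Usage sketch for the per-tree interval estimator above (synthetic data, not
# from the original source):
# import numpy as np
# qf = QuantileForestRegression(n_estimator=100)
# X = np.random.rand(200, 3)
# y = X.sum(axis=1)
# qf._fit(X, y)
# lower, upper = qf.pred_ints(qf.model, X[:5], percentile=90)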
Example #29
def calcRandomForest(channels_training, channels_testing, target_training,
                     target_testing):
    clf = RandomForestRegressor(n_estimators=500,
                                max_features=len(channels_training[0]))
    clf = clf.fit(channels_training, target_training)
    predictions = clf.predict(channels_testing)
    comp = [predictions, target_testing]
    return clf, comp
def main(train_file='train.csv', test_file='test.csv', output_file='predict.csv'):
    print "Loading data..."
    
    train_data = pd.read_csv(train_file)
    test_data = pd.read_csv(test_file)
    y = np.array(train_data[["ACTION"]])
    #X = np.array(train_data.ix[:,1:-1])     # Ignores ACTION, ROLE_CODE
    X = np.array(train_data[["RESOURCE","MGR_ID", "ROLE_ROLLUP_1", "ROLE_ROLLUP_2", "ROLE_DEPTNAME", "ROLE_FAMILY_DESC", "ROLE_FAMILY", "ROLE_DEPTNAME", "ROLE_CODE"]])
    X_test = np.array(test_data[["RESOURCE","MGR_ID", "ROLE_ROLLUP_1", "ROLE_ROLLUP_2", "ROLE_DEPTNAME", "ROLE_FAMILY_DESC", "ROLE_FAMILY","ROLE_DEPTNAME", "ROLE_CODE"]]) # Ignores ID, ROLE_CODE
 
    SEED = 4
    #clf = DecisionTreeClassifier(criterion="entropy").fit(X,y)
    
    
    
    clf = RandomForestRegressor(n_estimators=300, min_samples_split=15, min_density=0.1,compute_importances=True).fit(X,y)

    print clf.feature_importances_
    #Try feature selection
    
    mean_auc = 0.0
    n = 10
    for i in range(n):
        X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(X, y, test_size=.10, random_state=i*SEED)

        # if you want to perform feature selection / hyperparameter
        # optimization, this is where you want to do it
        
        # train model and make predictions
        clf.fit(X_train, y_train) 
        preds = clf.predict(X_cv)

        # compute AUC metric for this CV fold
        fpr, tpr, thresholds = metrics.roc_curve(y_cv, preds, pos_label=1)
        roc_auc = metrics.auc(fpr, tpr)
        print "AUC (fold %d/%d): %f" % (i + 1, n, roc_auc)
        mean_auc += roc_auc
    
    print "Mean AUC: %f" % (mean_auc/n)
    predictions = clf.predict(X_test)
    #print predictions
    
    #print 'Writing predictions to %s...' % (output_file)
    create_test_submission(output_file, predictions)

    return 0
Example #31
def rf(week, timestampWeekCategory, stationNames, ospmData2013, ospmData2014,
       data2013, data2014):

    columns = []
    for c in data2013:
        columns.append(c)

    columns.remove("location")
    columns.remove("timestamp")
    columns.remove("target")

    X = []
    y = []

    for i in range(0, len(data2013["target"])):
        timestamp = str(int(data2013["timestamp"][i]))
        weekC = timestampWeekCategory[timestamp]
        if int(weekC) >= week:
            y.append(data2013["target"][i])
            x = []
            for c in columns:
                x.append(data2013[c][i])
            X.append(x)

    model = RandomForestRegressor(min_samples_leaf=9,
                                  n_estimators=59,
                                  n_jobs=-1,
                                  random_state=42)
    model.fit(X, y)

    #     print(str(len(X)))

    X = []
    y = []

    for i in range(0, len(data2014["target"])):
        y.append(data2014["target"][i])
        x = []
        for c in columns:
            x.append(data2014[c][i])
        X.append(x)

    prediction = model.predict(X)
    rmse = rmseEval(y, prediction)[1]  # index [1] extracts the RMSE value, as elsewhere
    return rmse
Example #32
    def run(self):
        # extract data from the batch
        df_train = pd.read_csv(self.input().path, header=[0, 1])

        X, y = preprocess2(df_train, snr=10.)
        # train regressor
        reg = RandomForestRegressor(10, min_samples_leaf=10, max_depth=9,
                                    n_jobs=-1)
        # reg = KNeighborsRegressor(algorithm="auto")
        # reg = LinearRegression()
        # reg = sklearn.svm.SVR(kernel="rbf", degree=3, C=100., gamma=10.)
        # reg = LinearSaO2Unmixing()
        reg.fit(X, y.values)
        # reg = LinearSaO2Unmixing()
        # save regressor
        regressor_file = self.output().open('w')
        pickle.dump(reg, regressor_file)
        regressor_file.close()
Example #34
    def make_models(self, missing_columns):

        available_table = self.full_table.copy()
        #clear out the table
        for column in missing_columns:
            del available_table[column]
        available_features = available_table.as_matrix()

        clfs = {}
        #build a model for each missing column
        for column in missing_columns:
            labels = self.full_table.as_matrix(columns = [column])
            labels = np.reshape(labels, (len(labels))) #unnest the arrays
            clf = RandomForestRegressor(n_estimators = 100)
            clf.fit(available_features, labels, available_table['WGTP'])
            clfs[column] = clf

        return clfs
Example #35
def eval_one(features):

    all_predictions = []
    all_observations = []

    for location in locations:
        trainX, testX, trainY, testY = splitDataForXValidation(
            location, "location", data, features, "target")
        model = RandomForestRegressor(min_samples_leaf=2,
                                      random_state=42,
                                      n_estimators=650,
                                      n_jobs=-1)
        model.fit(trainX, trainY)
        predictions = model.predict(testX)
        all_observations.extend(testY)
        all_predictions.extend(predictions)

    rmse = rmseEval(all_observations, all_predictions)[1]
    log("\tRMSE: " + str(rmse))
    return rmse
Example #36
 def modeltrain(X_train, y_train, X_test, y_test):
     from sklearn.ensemble.forest import RandomForestRegressor
     # Build the model
     RF_Model = RandomForestRegressor(n_estimators=100,max_features=1)
     # Fit the model on X_train and y_train
     rgr = RF_Model.fit(X_train, y_train)
     y_train_predict = (rgr.predict(X_train)).astype(int)
     y_test_predict = (rgr.predict(X_test)).astype(int)
     
     return y_train_predict ,  y_test_predict , rgr
Example #37
    def test_fmt_sklearn_preds_regression(self):
        """test fmt_sklearn_preds on regression case"""

        modelobj_regr = RandomForestRegressor()

        model_df =self.df.loc[:, self.df.columns != 'target']

        modelobj_regr.fit(model_df,
                          self.df.loc[:, 'target'])

        fmtd_outputs = fmt_model_outputs.fmt_sklearn_preds(getattr(modelobj_regr, 'predict'),
                                            modelobj_regr,
                                            model_df,
                                            self.df,
                                            'target',
                                            'regression')

        self.assertIn('predictedYSmooth',
                      fmtd_outputs.columns.values,
                      """fmt_sklearn_preds on regression case does not return predictions""")
Example #38
def predict_per_cpu_full():
    data, target = load_data()
    data, target, labels = normalize_data(data, target)

    data = data[['C0', 'cpuFull']]
    data['target'] = target
    split_by_types = dict()

    cpu_groups = data.groupby('cpuFull')
    for name, group in cpu_groups:
        X_train, X_test, y_train, y_test = train_test_split(group['C0'].reshape(-1, 1), group['target'])
        split_by_types[str(name)] = {
            'train': {
                'data': X_train,
                'target': y_train
            },
            'test': {
                'data': X_test,
                'target': y_test
            }
        }

    # print split_by_types
    summ = 0.0
    for cpu, data_set in split_by_types.iteritems():
        plt.figure()
        # reg = SGDRegressor(loss='huber', n_iter=100, alpha=0.0)
        reg = RandomForestRegressor(n_estimators=5)
        reg.fit(data_set['train']['data'], data_set['train']['target'])
        test_data = data_set['test']['data']
        y_pred = reg.predict(test_data)
        print mape(data_set['test']['target'], y_pred), cpu
        plt.scatter(test_data, data_set['test']['target'], s=3, color='g', label='actual')
        plt.scatter(test_data, y_pred, s=3, color='r', label='predicted')
        plt.legend(loc='upper left')
        plt.ylabel('mul time')
        plt.title('Category: {}'.format(cpu))
        plt.savefig('imgs/{}.png'.format(cpu))
def train(data,val_ind,indices):
    
    max_numb = val_ind.shape[1]
    
    regs = []
    for i in range(max_numb):
        regs.append(0)
        
    for i in indices:
#        print i
#        reg = sklearn.linear_model.Lasso(max_iter=3000)
        reg = RandomForestRegressor()
#        reg=skl.tree.DecisionTreeRegressor()
#        reg = skl.linear_model.LinearRegression()
#        reg = AdaBoostRegressor()
#        print val_ind.shape
#        print val_ind[:,i]
#        print data.shape
#        print data[0]
#        print len(val_ind[:,i])
        reg.fit(data,val_ind[:,i])
        regs[i]=reg
        
    return regs
Example #40
import pandas as pd 
from sklearn.ensemble.forest import RandomForestRegressor
import time

dset = pd.read_csv("./data/concrete_data.csv")
X = dset.iloc[:, 0:7]
y = dset.iloc[:, 8]


estimator = RandomForestRegressor(max_features = 3, n_estimators = 50, n_jobs = 1, oob_score = True)

t0 = time.time()
estimator.fit(X, y)
print(time.time() - t0)
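# With oob_score=True the fitted forest also exposes its out-of-bag R^2 via
# estimator.oob_score_, a cheap generalization estimate (sketch, not in the
# original snippet):
print(estimator.oob_score_)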

# Imports assumed by this snippet (not shown in the original excerpt):
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

# fit a linear model with no bells and whistles
model = linear_model.LinearRegression()
model.fit(train_X, train_Y)

# look at the r squared on the training data and the test data
model.score(train_X, train_Y)
model.score(test_X, test_Y)

# See if I can get the r squared on the test data lower by using more complex models
# random forest
forest = RandomForestRegressor()

# fit the data without using cross val to select parameters
# note that train score is much higher than test score
forest.fit(train_X, train_Y)
forest.score(train_X, train_Y)
forest.score(test_X, test_Y)


# fit a random forest regressor using grid search to 
# select the number of trees and max depth
new_forest = RandomForestRegressor()
params_grid = [{'max_depth': [3, 5,10, None], 'n_estimators': [5,10,15,20, 50, 80]} ]
grid_search = GridSearchCV(new_forest, params_grid, cv=10)
grid_search.fit(train_X, train_Y)
grid_search.score(test_X, test_Y)
grid_search.best_estimator_

# fit a boosted regression
boost = GradientBoostingRegressor()
# Look at scatter plot of OTU abundance vs. age to visualize the correlation 
fig, ax= plt.subplots()
ax.scatter(x, y)
ax.set_xlabel('OTU #' + otu)
ax.set_ylabel('Age')
ax.text(0.01,0.95, r'$\rho$ = {:.2f}'.format(r), transform=ax.transAxes)

#%% 3. Build a Random Forest Regressor

## 3.1 Build the regressor
rfreg = RandomForestRegressor(n_estimators=1000, oob_score=True)

# We aren't classifying samples here, so we can just use the whole OTU table to build our regression
X = data.abun_df.values
Y = [float(data.meta_df.loc[smpl, 'BMI']) for smpl in data.abun_df.index]
rfreg = rfreg.fit(X,Y)

## 3.1.1 Look at true vs. predicted values from out of bag estimations
fig, ax = plt.subplots()
ax.scatter(Y, rfreg.oob_prediction_)
ax.set_xlabel('True')
ax.set_ylabel('Predicted')
ax.set_title('RF regression on BMI')

## 3.2 Look at the important features in the regression by inspecting their importance scores
feats = pd.DataFrame(index=data.abun_df.columns, columns=['importance'], data=rfreg.feature_importances_)
feats = feats.sort_values(by='importance', ascending=False)  # DataFrame.sort was removed in pandas 0.20
feats['normalized_importance'] = feats['importance']/max(feats['importance'])
# Look at the top 5 features and their importance. Each row name is the OTU ID
feats.head(5)
Example #43
# Initiate the monthly trade object
monthData = trade_model.monthlyModel(1, 2009, 6, 2013, 6, 2012, 6, 2013)
# Download data from Yahoo finance
monthData.monthlyDataDownload()
# Pre-processing of training and testing data
monthData.trainFeaturePre()
# Read pre-processed data from hard drive
# monthData.trainFeaturePreHd()
# Number of training months
trainSpan = len(monthData.xTrain[:,0,0]) - monthData.testSpan
# Initiate a random forest regressor
clf = RandomForestRegressor(n_estimators=10)
#
totalReturn = 1
predictedReturn = np.zeros(monthData.stockNum)
monthlyReturn = np.zeros(monthData.testSpan)
aggReturn = np.zeros(monthData.testSpan+1)
aggReturn[0] = 1
# rolling training and testing
for j in range(0, monthData.testSpan):
    for i in range(0, monthData.stockNum):
        clf.fit(monthData.xTrain[j:trainSpan+j, :, i], monthData.yTrain[j:trainSpan+j, 0, i])
        predictedReturn[i] = clf.predict(monthData.xTest[j, :, i])
    monthlyReturn[j] = monthData.por10Returns(j, predictedReturn)
    totalReturn = totalReturn * (monthlyReturn[j]+1)  # compound the running return (was assigned to an unused yearReturn)
    aggReturn[j+1] = aggReturn[j]*(1+monthlyReturn[j])

print monthlyReturn
print 'overall:', totalReturn
sp.portfolioVSspy(6, 2012, 6, 2013, aggReturn[1:])
Example #44
 nucleus = 'N' # make command line option; supported nuclei are ['H','N','CA','HA','CB','C']
 
 # Generate training and test set
 X_train,y_train = OrganizeData(nucleus, 'train')
 X_test, y_test = OrganizeData(nucleus, 'test')
 
 # Feature scaling
 X_train_scaled = preprocessing.scale(X_train)
 X_test_scaled = preprocessing.scale(X_test)
         
 # Set the parameters for the random forest estimator    
 estimator = RandomForestRegressor(n_estimators=50, max_features=16, max_depth=25,
                                   min_samples_split=5, min_samples_leaf=5, random_state=0)
 
 # Build the random forest of regression trees from the training set
 estimator = estimator.fit(X_train_scaled,y_train)
 
 print estimator.score(X_train_scaled,y_train)
 print estimator.score(X_test_scaled,y_test)
     
 # Predict regression target for the test set
 predicted = estimator.predict(X_train_scaled)
 cc = np.corrcoef(y_train,predicted)
 print cc
 print estimator
 #my_plotting.simple_plot_overlay(y_train,predicted)
 
 predicted = estimator.predict(X_test_scaled)
 cc = np.corrcoef(y_test,predicted)
 print cc
 print estimator
 def fit(self, X, y, sample_weight=None):
     sample_weight = normalize_weight(y, sample_weight, sig_weight=self.sig_weight, pow_sig=self.pow_sig,
                                      pow_bg=self.pow_bg)
     target = sample_weight + self.gap
     target[y == 0] *= -1
     RandomForestRegressor.fit(self, X, target, sample_weight=sample_weight)  # fit the signed target built above, which is otherwise unused
Example #46
    newtrain = pd.DataFrame(newtrain, columns = cols)
    newtest = pd.DataFrame(newtest, columns = cols)
    
    #test = test.join(pd.DataFrame(test.Date.apply(splitTime).tolist(), columns = ['year','mon','day']))
    #newtest = test.drop('StateHoliday',1).join(pd.get_dummies(test['StateHoliday']).rename(columns=lambda x: 'StateHoliday' +"_"+str(x)))  
    #newtest = pd.merge(newtest,store, on="Store")
    #newtest.drop(['Date'],axis = 1,inplace=True) 
    
    #assert(np.sum(newtrain.var()==0)==0)
    #
    #toDrop = list(set(newtrain.columns.values)-set(newtest.columns.values) )
    features = [col for col in newtrain.columns if col not in ['Customers', 'Sales', 'Date','LogSale','datetimes']]
    #
    rf = RandomForestRegressor(n_estimators=100)
    print('Starting training...')
    rf.fit(newtrain[features].fillna(-1),newtrain.LogSale)
    print('Predicting train values...')
    newtrain['mypred'] = rf.predict(newtrain[features].fillna(-1))
    newtrain['mypred'] = np.exp(newtrain['mypred'])-1
    train_error = rmspe(newtrain[newtrain.Sales>0].Sales,newtrain[newtrain.Sales>0].mypred)
    print('train set error',train_error)
    newtest['mypred'] = rf.predict(newtest[features].fillna(-1))
    newtest['mypred'] = np.exp(newtest['mypred'])-1
    test_error = rmspe(newtest[newtest.Sales>0].Sales,newtest[newtest.Sales>0].mypred)
    print('test set error',test_error)
    train_results.append(train_error)
    test_results.append(test_error)

print('mean train error', np.mean(train_results))
print('mean test error',np.mean(test_results))
    all_data_valid, all_targets_valid = generate_array(hdulist_valid,
                                                       feature_index,
                                                       target_index)

    clf_adaboost = AdaBoostRegressor(DecisionTreeRegressor(max_depth=8), n_estimators=50,
                                                           loss='linear', random_state=0)
    clf_extra_trees = ExtraTreesRegressor(n_estimators=8, random_state=0, max_depth=30)
    clf_random_forest = RandomForestRegressor(n_estimators=8, random_state=0, max_depth=30)

    clf_adaboost.fit(all_data_test.T, all_targets_test[0])
    predicted = clf_adaboost.predict(all_data_valid.T)

    clf_extra_trees.fit(all_data_test.T, all_targets_test[0])
    predicted_extra = clf_extra_trees.predict(all_data_valid.T)

    clf_random_forest.fit(all_data_test.T, all_targets_test[0])
    predicted_forest = clf_random_forest.predict(all_data_valid.T)


    delta_ada = all_targets_valid[0] - predicted
    delta_extra = all_targets_valid[0] - predicted_extra
    delta_forest = all_targets_valid[0] - predicted_forest
    std_ada = get_standart_deviation(delta_ada)
    std_extra = get_standart_deviation(delta_extra)
    std_forest = get_standart_deviation(delta_forest)

    plt.hist(delta_ada, bins=150, color='g', label='Adaboost '+str(np.round(std_ada,4)))
    plt.hist(delta_extra, bins=150, color='b', label='Extra_Trees '+str(np.round(std_extra,4)))
    plt.hist(delta_forest, bins=150, color='r', label='Random_Forest '+str(np.round(std_forest,4)))
    title = "Compare adaboost, extra_tree and Random_Forests"
    plt.title(title)
predictions = []
predictions2 = []
predictions3 = []
predictions4 = []
offset = int(0.7 * len(X))

for i in range(10):
    X, y = shuffle(boston.data, boston.target)
    X_train, y_train = X[:offset], y[:offset]
    X_test, y_test = X[offset:], y[offset:]
    regressor = GradientBoostingRegressor(max_depth=20, n_estimators=140)
    regressor2 = DecisionTreeRegressor(max_depth=6)
    regressor3 = LinearRegression()
    regressor4 = RandomForestRegressor()
    regressor.fit(X_train, y_train)
    regressor2.fit(X_train, y_train)
    regressor3.fit(X_train, y_train)
    regressor4.fit(X_train, y_train)
    y_pred = regressor.predict(x)
    y_pred2 = regressor2.predict(x)
    y_pred3 = regressor3.predict(x)
    y_pred4 = regressor4.predict(x)
    predictions.append(y_pred)
    predictions2.append(y_pred2)
    predictions3.append(y_pred3)
    predictions4.append(y_pred4)
    print "\nPrediction = " + str(y_pred)
    print "Prediction = " + str(y_pred2)
    print "Prediction = " + str(y_pred3)
    print "Prediction = " + str(y_pred4)

print '\n'
print 'Boosting max', np.max(predictions), 'min', np.min(predictions), 'variance', np.max(predictions) - np.min(predictions)
class ItemSetModel(object):
    """docstring for ItemSetModel"""

    clf = None

    MODEL_PATH = os.path.join(settings.BASE_DIR, 'set_analyzer', 'analysis', 'models')
    CACHE_FILE = os.path.join(MODEL_PATH, 'model_cache.cache')

    def __init__(self):
        super(ItemSetModel, self).__init__()
        #self.clf = DecisionTreeRegressor()
        #self.clf = Lasso(0.1)
        #self.clf = SVR(kernel='rbf')
        #self.clf = ElasticNetCV()
        self.clf = RandomForestRegressor(max_depth=7, n_estimators=10)


    def get_data_sets(self, num_matches, cache=False, **kwargs):
        """
        Data Schema:
            Input:
                1    My champion ID
                6    My Champion's class info
                6    [Other team's cumulative class info]
                7    [7 Final Items]
                5    [first 5 items purchased]
                ________________________________________
                25 features

            Output:
                Score = A(Gold/time) + B(xp/time) + C(win)
                ________________________________________
                1 Output

        """

        #Presize data
        features = 25
        num_participants = num_matches*10
        input_data = np.zeros((num_participants, features))
        output_data = np.zeros(num_participants)

        row_num = 0

        get_champ_id = lambda x : x.champion.champion_id
        diff_team = lambda x , y : x.team_id != y.team_id
        item_purchased = lambda x: x.event_type == "ITEM_PURCHASED"

        #Iterate over every match in the database
        for match in Match.objects(**kwargs)[:num_matches]:

            #Prepare users and teams
            team_map = {}
            team_data = np.zeros((2,6))     #Store the sum of each team's tags
            count = 0
            for tag in match.teams:
                team_map[int(tag)] = count
                count+=1

            #Prepare champion class data
            for p in match.participants.values():
                for tag in p.champion.tags:
                    team_data[team_map[p.team_id], :] += np.array(p.champion.class_data)

            #Iterate over every user in the match
            for pid, participant in match.participants.items():
                col_num = 0

                #My Champion's info
                input_data[row_num][col_num] = get_champ_id(participant)
                col_num+=1
                input_data[row_num][col_num:col_num+6] = np.array(participant.champion.class_data)
                col_num+=6

                #Other Team's champion attributes
                if(team_map[participant.team_id] == 0):
                    input_data[row_num][col_num:col_num+6] = team_data[1,:]
                else:
                    input_data[row_num][col_num:col_num+6] = team_data[0,:]
                col_num+=6

                #My items
                for item_id in participant.final_build:
                    input_data[row_num][col_num] = item_id
                    col_num+=1

                #My Item purchases
                count = 0
                for item_purchase in (x for x in participant.item_events if item_purchased(x)):
                    if(count==5):
                        break
                    input_data[row_num][col_num] = item_purchase.payload['itemId']
                    col_num += 1
                    count += 1

                #Score
                #   Assume that average gold/sec is ~8
                #   Assume that average kda is ~2.6
                #   Have a game win worth some bonus
                score = participant.kda()*3 + participant.gold_earned/match.duration +  (4 if match.teams[str(participant.team_id)].won else 0)
                output_data[row_num] = score

                row_num+=1

        if(cache):
            print('Caching data...')
            self.cache_data((input_data, output_data))

        return (input_data, output_data)

    def cache_data(self, data):
        with open(self.CACHE_FILE, 'wb') as f:
            pickle.dump(data, f)

    def get_cached_data(self, num_rows):
        with open(self.CACHE_FILE, 'rb') as f:
            return pickle.load(f)[:num_rows]

    def train(self, X, Y, train_ratio=1, **kwargs):

        print("Training model...")
        if(train_ratio==1):
            print("Using {} rows".format(len(X)))
            self.clf.fit(X,Y)
        else:
            n = len(X)
            tn = int(n*train_ratio)
            print("Using {} rows".format(tn))
            self.clf.fit(X[:tn,:],Y[:tn])
            print("Evaluating model...")
            evaluate_fit(self.clf, X[tn:,:],Y[tn:])

    def predict(self, X):
        return self.clf.predict(X)

    #MODEL EVAUATION
    def k_fold(self, folds, **kwargs):
        X, Y = self.get_data_sets(**kwargs)
        k_fold_evaluate(self.clf, X, Y, folds)

    #LOAD AND SAVE
    def save(self, filename):
        dirname = os.path.join(self.MODEL_PATH, filename)
        if(not os.path.exists(dirname)):
            os.makedirs(dirname)
        else:       #Empty folder
            for file in os.listdir(dirname):
                file_path = os.path.join(dirname, file)
                if os.path.isfile(file_path):
                    os.unlink(file_path)

        path = os.path.join(dirname, "{}.pkl".format(filename))
        joblib.dump(self.clf, path)

    def load(self, filename):
        path = os.path.join(self.MODEL_PATH, filename, "{}.pkl".format(filename))
        self.clf = joblib.load(path)
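# Usage sketch (hypothetical call; assumes a populated Match collection):
# model = ItemSetModel()
# X, Y = model.get_data_sets(num_matches=1000, cache=True)
# model.train(X, Y, train_ratio=0.8)
# model.save("rf_itemsets")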
store = store.drop("Assortment", 1).join(
    pd.get_dummies(store["Assortment"]).rename(columns=lambda x: "Assortment" + "_" + str(x))
)

train["StateHoliday"] = [mychange(x) for x in train.StateHoliday]
test["StateHoliday"] = [mychange(x) for x in test.StateHoliday]

train = train.drop("StateHoliday", 1).join(
    pd.get_dummies(train["StateHoliday"]).rename(columns=lambda x: "StateHoliday" + "_" + str(x))
)
test = test.drop("StateHoliday", 1).join(
    pd.get_dummies(test["StateHoliday"]).rename(columns=lambda x: "StateHoliday" + "_" + str(x))
)

train = pd.merge(train, store, on="Store")
test = pd.merge(test, store, on="Store")

repeat = 1
print("Splitting data...")
for i in range(repeat):
    features = [col for col in test.columns if col not in ["Customers", "Sales", "Date", "LogSale", "datetimes", "Id"]]
    rf = RandomForestRegressor(n_estimators=100)
    print("Starting training...")
    rf.fit(train[features].fillna(-1), train.LogSale)

    test["mypred"] = rf.predict(test[features].fillna(-1))
    test["mypred"] = np.exp(test["mypred"]) - 1

test["Sales"] = test.mypred
test[["Id", "Sales"]].to_csv("rand_for_kag_v4-9.csv", index=False)
Example #51
def ts_rf(n,fea,step,ntrees,njobs):
    #Random Forest Model for time series prediction
    #from sklearn import svm
    import math
    from sklearn import metrics
    import matplotlib.pyplot as plt
    from scipy.linalg import hankel        
    import numpy as np
    from sklearn.ensemble.forest import RandomForestRegressor
    #input data from csv file
    #use n datapoints
    #n=1100
    #    # of features of training set
    ##        fre=50
    #    # how many steps to predict
    #step=29
    #fea=50
    path='/Users/royyang/Desktop/time_series_forecasting/csv_files/coffee_ls.txt'
    path1 = '/Users/royyang/Desktop/time_series_forecasting/csv_files/coffee_ls_nor.txt'
    result_tem=[]
    date = []
    with open(path) as f:
        next(f)
        for line in f:
            item=line.replace('\n','').split(' ')
            result_tem.append(float(item[1]))
            date.append(item[2])
    mean = np.mean(result_tem)
    sd = np.std(result_tem)
    result=(result_tem-mean)/sd
    #form hankel matrix
    X=hankel(result[0:-fea-step+1], result[-1-fea:-1])
    y=result[fea+step-1:]
    #split data into training and testing
    Xtrain=X[:n]
    ytrain=y[:n]
    Xtest=X[n:]
    ytest=y[n:]
    # random forest
    rf = RandomForestRegressor(n_estimators = ntrees, n_jobs=njobs)
    rf_pred = rf.fit(Xtrain, ytrain).predict(Xtest)
    #a = rf.transform(Xtrain,'median')
    
    #plot results
    LABELS = [x[-6:] for x in date[n+fea+step-1:n+fea+step-1+len(ytest)]]    
    t=range(n,n+len(ytest))
    #    plt.show()
    #    plt.plot(t,y_lin1,'r--',t,ytest,'b^-')
    #    plt.plot(t,y_lin2,'g--',t,ytest,'b^-')
    ypred = rf_pred*sd+mean
    ytest = ytest*sd+mean
    line1, = plt.plot(t,ypred,'r*-')
    plt.xticks(t, LABELS)
    line2, = plt.plot(t,ytest,'b*-')
#            plt.xlim([500,510])
    plt.legend([line1, line2], ["Predicted", "Actual"], loc=2)

    #plt.show()
    #plt.plot(xrange(n),result[0:n],'r--',t,y_lin3,'b--',t,ytest,'r--')

    y_true = ytest
    y_pred = ypred
    metrics_result = {'rf_MAE':metrics.mean_absolute_error(y_true, y_pred),'rf_MSE':metrics.mean_squared_error(y_true, y_pred),
                      'rf_MAPE':np.mean(np.abs((y_true - y_pred) / y_true)) * 100}	
    print metrics_result
Example #52
def train(training, k):
    model = RandomForestRegressor(n_estimators=k, n_jobs=-1)
    model.fit(training[:,:-1], training[:,-1])
    return model
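# Usage sketch (synthetic data, not from the source): `training` is a 2-D array
# whose last column is the regression target.
# import numpy as np
# training = np.hstack([np.random.rand(50, 3), np.random.rand(50, 1)])
# model = train(training, k=100)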
Example #53
import json

household = pd.read_csv("../household_complete_one_hot.csv")
if 'KWH' in household.columns:
    del household['KWH']

X_columns = [column for column in household.columns if column != "ELEP"]
X = household.as_matrix(columns = X_columns)
y = [label[0] for label in household.as_matrix(columns = ["ELEP"])]

#print(y)

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size = 0.25)

clf = RandomForestRegressor(n_estimators = 100, n_jobs = 8)
clf.fit(X_train, y_train)


print(y_test[:100])
print(metrics.mean_squared_error(clf.predict(X_test), y_test))
print(metrics.r2_score(y_test, clf.predict(X_test)))

features = sorted(zip(X_columns, clf.feature_importances_), key = lambda x : x[1], reverse = True)
print("Features", features)

#fill spaces in ELEP
normalized_pums = pd.read_csv("../joined_weather.csv", delimiter = ',')
print('pums shape', normalized_pums.shape)

with open("../vectorized_puma_regions/puma_list.json") as f:
    puma_mapping = json.load(f)
Example #54
auto = auto_mapper.fit_transform(auto_df)

store_pkl(auto_mapper, "Auto.pkl")

auto_X = auto[:, 0:7]
auto_y = auto[:, 7]

print(auto_X.dtype, auto_y.dtype)

def predict_auto(regressor):
    mpg = DataFrame(regressor.predict(auto_X), columns = ["mpg"])
    return mpg

auto_tree = DecisionTreeRegressor(random_state = 13, min_samples_leaf = 5)
auto_tree.fit(auto_X, auto_y)

store_pkl(auto_tree, "DecisionTreeAuto.pkl")
store_csv(predict_auto(auto_tree), "DecisionTreeAuto.csv")

auto_forest = RandomForestRegressor(random_state = 13, min_samples_leaf = 5)
auto_forest.fit(auto_X, auto_y)

store_pkl(auto_forest, "RandomForestAuto.pkl")
store_csv(predict_auto(auto_forest), "RandomForestAuto.csv")

auto_regression = LinearRegression()
auto_regression.fit(auto_X, auto_y)

store_pkl(auto_regression, "RegressionAuto.pkl")
store_csv(predict_auto(auto_regression), "RegressionAuto.csv")
train['LogSale'] = np.log(train.Sales+1)

train=pd.merge(train, store, on="Store")  
test = pd.merge(test, store, on="Store")

processdata(train)
processdata(test)


repeat = 1
#print('Splitting data...')
for i in range(repeat):
    features = [col for col in test.columns if col not in ['Customers', 'Sales', 'Date','LogSale','datetimes','Id']] ##!!!for submission should be test.columns!!!
#    features = ['Store', 'CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo', 'Promo2',\
# 'Promo2SinceWeek', 'Promo2SinceYear', 'SchoolHoliday', 'DayOfWeek', 'mon', 'day', 'year', 'StoreType', 'Assortment']
 # ^^ features taken from xgb model on Kaggle
    rf = RandomForestRegressor(n_estimators=100)
    print('Starting training...')
    rf.fit(train[features],train.LogSale)
#    train['mypred'] = rf.predict(train[features])
#    train['mypred'] = np.expm1(train.mypred)
#    train_error = rmspe(train[train.Sales>0].Sales,train[train.Sales>0].mypred)
#    print(train_error)
 
    test['mypred'] = rf.predict(test[features])
    test['mypred'] = np.exp(test['mypred'])-1

test['Sales'] = test.mypred
test[[ 'Id', 'Sales' ]].to_csv('rand_for_kag_v4-8.csv', index = False )
def train_random_forest(X, Y):
    rf = RandomForestRegressor(n_estimators=20)
    rf.fit(X, Y)
    return rf
# In[11]:

print 'Train Random Forests!'

from sklearn.ensemble.forest import RandomForestRegressor
RF = RandomForestRegressor(n_estimators = 500, random_state = 0)


# In[12]:

Rows = np.random.choice(Train.index.values, 400000)
Sampled_Train = Train.ix[Rows]
Sample_Train_Target = Train_Target.ix[Rows]

# RF.fit(Sampled_Train, Sample_Train_Target)
RF.fit(Train, Train_Target)


# In[ ]:

print 'Predict!'

Test_Predict = RF.predict(Test.as_matrix())


# In[ ]:

print Test_Predict.shape


# In[ ]:
Example #58
clf.fit(X, y)   
clf.predict(z)
#########################
from sklearn.ensemble.forest import RandomForestRegressor
regressor = RandomForestRegressor()
parameters = [{"n_estimators": [250, 500, 1000,2000]}]

# Returns the best configuration for a model using crosvalidation
# and grid search

import time


regressor = RandomForestRegressor(n_estimators=300, min_samples_split=2, max_features=67)  # min_samples_split must be >= 2

regressor.fit(train_np,energy)
pred=regressor.predict(test_np)

print explained_variance_score(energy_test,pred)
print mean_squared_error(energy_test,pred)
r2_score(energy_test,pred)




##prediction comparison
comp = pd.read_csv("H:/bee-efficiency/cisco presentation/pred.csv")



def _2011x2011_ (data_path):

    ##### LOADING #####
    sys.stdout.write("Loading data... ")

    # Load data from .csv file
    with open(data_path+'_X.csv') as data_file:
        reader = csv.reader(data_file)

        # Initialize lists for data and class labels
        data =[]
        # skip header
        next(reader, None)
        # For each row of the csv file
        for row in reader:
            data.append([float(x) for x in row])

    with open(data_path+'_y.csv') as labels_file:
        reader = csv.reader(labels_file)

        # Initialize lists for data and class labels
        val_ind =[]
        # skip header
        next(reader, None)
        # For each row of the csv file
        for row in reader:
            val_ind.append(row)

    sys.stdout.write("done\n")


    ##### TRAINING #####
    # splitting
    data_train, data_test, val_ind_train, val_ind_test \
        = skl.cross_validation.train_test_split(data, val_ind, test_size=0.4, random_state=42)

    # Cutting date/ ASS/ number value from labels
    date_train = [x[0] for x in val_ind_train]
#    ASS_train = [x[1] for x in val_ind_train]
    val_train = [float(x[1]) for x in  val_ind_train]
    date_test = [x[0] for x in val_ind_test]
#    ASS_test = [x[1] for x in val_ind_test]
    val_test = [float(x[1]) for x in val_ind_test]

    sys.stdout.write("Training regressor... ")
    reg = RandomForestRegressor()
#    reg = skl.tree.DecisionTreeRegressor()
#    reg = skl.linear_model.LinearRegression()
    reg.fit(data_train, val_train)
    sys.stdout.write("done\n")


    ##### PREDICTION #####
    sys.stdout.write("Predicting... ")
    val_predicted = reg.predict(data_test)
    sys.stdout.write("done\n")

    ##### ERROR #####
    df = pd.DataFrame()
    df['date'] = pd.to_datetime(date_test)
#    df['ASS'] = ASS_test
    df['original'] = val_test
    df['predicted'] = val_predicted.tolist()
    df = df.set_index('date')

#    df = df.loc[df['ASS'] == 'CAT'] # one example
    
    df.info()
    
    df.plot()
    plt.show()
    
    print "MSE : " + str(mean_squared_error(val_test,val_predicted.tolist()))