def RandomForest(x_train, y_train, x_test, degree):
    # min_samples_split must be >= 2 in current scikit-learn releases
    params = {'n_estimators': 1000, 'max_depth': degree, 'min_samples_split': 2, 'warm_start': True}
    clf = RandomForestRegressor(**params)
    clf.fit(x_train, y_train)
    y_predict = clf.predict(x_test)
    #plt.plot(x_test,y_predict,color='red')
    return y_predict
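A quick way to exercise this helper on synthetic data (a sketch; the imports and data shapes are assumptions, not part of the original example):

import numpy as np
from sklearn.ensemble import RandomForestRegressor

rng = np.random.RandomState(0)
x_train = rng.rand(200, 3)
y_train = x_train.sum(axis=1) + 0.1 * rng.randn(200)
x_test = rng.rand(20, 3)

preds = RandomForest(x_train, y_train, x_test, degree=4)  # degree caps the tree depth
print(preds[:5])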
Code example #2
def evalOne(parameters):
    all_obs = []
    all_pred = []
    for location in locations:
        trainX, testX, trainY, testY = splitDataForXValidation(
            location, "location", data, all_features, "target")
        if "depth" in parameters:
            model = RandomForestRegressor(
                max_depth=parameters["depth"],
                random_state=42,
                n_estimators=parameters["n_estimators"],
                n_jobs=-1)
        elif "leaf" in parameters:
            model = RandomForestRegressor(
                min_samples_leaf=parameters["leaf"],
                random_state=42,
                n_estimators=parameters["n_estimators"],
                n_jobs=-1)
        elif "max_leaf" in parameters:
            model = RandomForestRegressor(
                max_leaf_nodes=parameters["max_leaf"],
                random_state=42,
                n_estimators=parameters["n_estimators"],
                n_jobs=-1)
        else:
            raise ValueError(
                "parameters must contain 'depth', 'leaf' or 'max_leaf'")

        model.fit(trainX, trainY)
        prediction = model.predict(testX)
        all_obs.extend(testY)
        all_pred.extend(prediction)
    return rmseEval(all_obs, all_pred)[1]
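Several examples on this page call rmseEval(observations, predictions)[1] without showing the helper. A minimal sketch consistent with that call pattern (the tuple layout is an assumption inferred from usage):

import math

def rmseEval(observations, predictions):
    # assumed contract: element [1] carries the numeric RMSE
    se = sum((o - p) ** 2 for o, p in zip(observations, predictions))
    return ('rmse', math.sqrt(se / len(observations)))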
Code example #3
def eval_one(min_samples_leaf, n_estimators):
    log("min_samples_leaf: " + str(min_samples_leaf) + ", n_estimators: " +
        str(n_estimators))

    all_observations = []
    all_pred_ALL = []

    for group in range(0, len(groups)):
        trainStations = []
        for i in range(0, len(groups)):
            if i != group:
                trainStations.extend(groups[i])
        testStations = groups[group]

        train_station_set = set([float(s) for s in trainStations])
        test_station_set = set([float(s) for s in testStations])

        trainX, testX, trainY, testY = splitDataForXValidation(
            train_station_set, test_station_set, "location", data,
            all_features, "target")
        model = RandomForestRegressor(min_samples_leaf=min_samples_leaf,
                                      n_estimators=n_estimators,
                                      n_jobs=-1,
                                      random_state=42)
        model.fit(trainX, trainY)
        prediction_ALL = model.predict(testX)
        rmse = rmseEval(testY, prediction_ALL)[1]
        log("\tALL rmse: " + str(rmse))
        all_observations.extend(testY)
        all_pred_ALL.extend(prediction_ALL)

    rmse = rmseEval(all_observations, all_pred_ALL)[1]
    log("\tALL rmse:" + str(rmse))
    return rmse
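splitDataForXValidation is likewise project-specific and not shown on this page. The call sites suggest data is a dict of column lists, with a five-argument form (hold out one location) and a six-argument form (explicit train/test station sets). A sketch of the five-argument form under those assumptions:

def splitDataForXValidation(location, key, data, features, target):
    # rows whose `key` column equals `location` become the test fold
    trainX, testX, trainY, testY = [], [], [], []
    for i in range(len(data[target])):
        row = [data[f][i] for f in features]
        if data[key][i] == location:
            testX.append(row)
            testY.append(data[target][i])
        else:
            trainX.append(row)
            trainY.append(data[target][i])
    return trainX, testX, trainY, testY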
Code example #4
def eval_one(step):
    
    if step in cached_results:
        return cached_results[step]
    
    eval_features = []
    for i in range(0, len(all_features)):
        if step[i]:
            eval_features.append(all_features[i])
    
    all_predictions = []
    all_observations = []
    
    for location in locations:
        trainX, testX, trainY, testY = splitDataForXValidation(location, "location", data, eval_features, "target")
        model = RandomForestRegressor(min_samples_leaf = 2, random_state=42, n_estimators=650, n_jobs=-1)
        model.fit(trainX, trainY)
        predictions = model.predict(testX)
        all_observations.extend(testY)
        all_predictions.extend(predictions)
    
    rmse = rmseEval(all_observations, all_predictions)[1]
    
    cached_results[step] = rmse
    
    # append the result to the cache file
    with open(CACHE_FILE, "a") as cache_output:
        step_str = ",".join(str(s) for s in step)
        cache_output.write(str(rmse) + ";" + step_str + "\n")
    
    return rmse
Code example #6
File: RF.py, Project: emigmo/TC_CAINIAO
def RF_ST(trainFileName, testFilename):
    trainData = ld.LoadData_DATA_ST(trainFileName)
    testData = ld.LoadData_DATA_ST(testFilename)

    store = ['1', '2', '3', '4', '5']
    res = []

    for i in store:
        train_X = []
        train_y = []
        context = trainData[i]
        for array in context:
            array = [float(x) for x in array[2:]]
            train_X.append(array[2:-1])
            train_y.append(array[-1])

        test_X = []
        items = []
        context = testData[i]
        for array in context:
            items.append((array[0], array[1]))
            array = [float(x) for x in array[2:]]
            test_X.append(array[2:])

        # 'mse' and 'auto' were removed from recent scikit-learn releases;
        # 'squared_error' and 1.0 are the modern equivalents
        clf = RandomForestRegressor(n_estimators=100, criterion='squared_error',
                                    max_depth=None, max_features=1.0).fit(train_X, train_y)
        pred_y = clf.predict(test_X)

        for j in range(len(pred_y)):  # j, to avoid shadowing the store index i
            res.append([items[j][0], items[j][1], '%.4f' % max(pred_y[j], 0)])
    return res
Code example #7
File: DynamicHedge.py, Project: fengyihen/Quant
    def randomforestregressor(self, testlen, ntrain, ntrees, nodes):
        hsmadata = self.hsmadata
        dates = pd.Series(hsmadata['date'].unique()).sort_values()
        dates.index = range(0, len(dates))
        ntest = len(dates) // testlen

        hsma = pd.DataFrame()
        for i in range(ntrain, ntest):
            traindata = hsmadata[
                (hsmadata['date'] >= dates[(i - ntrain) * testlen])
                & (hsmadata['date'] < dates[i * testlen - self.day])].copy()
            testdata = hsmadata[(hsmadata['date'] >= dates[i * testlen]) & (
                hsmadata['date'] < dates[(i + 1) * testlen])].copy()

            traindata = traindata.iloc[:, 2:]
            traindatax = traindata.drop(columns=['closeratio'])  # positional axis argument was removed from pandas 2.x
            traindatay = traindata['closeratio']
            testdatax = testdata[traindatax.columns]

            treemodel = RandomForestRegressor(
                n_estimators=ntrees,
                min_samples_split=nodes * 2,
                min_samples_leaf=nodes)
            treemodel.fit(traindatax, traindatay)
            testdata['predratio'] = treemodel.predict(testdatax)

            hsma = pd.concat([hsma, testdata], ignore_index=True)

        return (hsma)
Code example #8
    def post(self):

        # upload audio file in server
        voice = self.request.files["audio"][0]
        extn = os.path.splitext(voice['filename'])[1]
        fnm = os.path.splitext(voice['filename'])[0]
        cname = str(uuid.uuid4()) + extn
        with open(__UPLOADS__ + cname, 'wb') as fh:  # binary mode for the raw audio bytes
            fh.write(voice['body'])

        # get features from the audio file
        attr = getAttributes(cname)
        fdf = mongoTolist(False)

        train = fdf[:,:-1]
        target = fdf[:,-1]

        #RandomForest Regression
        rf = RandomForestRegressor(n_estimators = 506, n_jobs = -1)
        rf.fit(train, target)

        updrs_val = rf.predict([attr])
        attr.append(updrs_val[0])

        # get the theta from database
        theta = list(db.theta.find({}))
        theta1 = theta[0]["theta1"]
        theta2 = theta[1]["theta2"]

        # check if the person has Parkinson's Disease
        isParkinson = octave.classify(theta1, theta2, np.array(attr))

        self.render("output.html", ipk = isParkinson, updrs = updrs_val[0])
Code example #9
class RandomForestRegressorImpl():

    def __init__(self, n_estimators=10, criterion='mse', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False):
        self._hyperparams = {
            'n_estimators': n_estimators,
            'criterion': criterion,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'min_weight_fraction_leaf': min_weight_fraction_leaf,
            'max_features': max_features,
            'max_leaf_nodes': max_leaf_nodes,
            'min_impurity_decrease': min_impurity_decrease,
            'min_impurity_split': min_impurity_split,
            'bootstrap': bootstrap,
            'oob_score': oob_score,
            'n_jobs': n_jobs,
            'random_state': random_state,
            'verbose': verbose,
            'warm_start': warm_start}

    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        if (y is not None):
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def predict(self, X):
        return self._sklearn_model.predict(X)
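A usage sketch for this wrapper, assuming SKLModel aliases sklearn.ensemble.RandomForestRegressor as the source project suggests (note the wrapper's 'mse'/'auto' defaults match older scikit-learn releases):

import numpy as np

X = np.random.rand(100, 4)
y = np.random.rand(100)

impl = RandomForestRegressorImpl(n_estimators=50, random_state=0)
impl.fit(X, y)
print(impl.predict(X[:5]))  # five predictions, shape (5,)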
Code example #10
File: sandkasten2.py, Project: jdroenner/fls2
def calcRandomForest(channels_training, channels_testing, target_training,
                     target_testing):
    clf = RandomForestRegressor(n_estimators=500,
                                max_features=len(channels_training[0]))
    clf = clf.fit(channels_training, target_training)
    predictions = clf.predict(channels_testing)
    comp = [predictions, target_testing]
    return clf, comp
Code example #11
def randomForest(trainFeatures, trainResponses, testFeatures, maxFeatures = 'log2', nTree=100):
    ## Settings of random forests regressor
    regModel = RandomForestRegressor(n_estimators=nTree, max_features=maxFeatures)    
    ## Train the random forests regressor
    regModel.fit(trainFeatures, trainResponses)
    ## Prediction
    testResponsesPred = regModel.predict(testFeatures)
    return testResponsesPred
Code example #12
def evalTrainStationTestStation(trainStation, testStation, features):
    trainX, _, trainY, _ = splitDataForXValidation(set([trainStation]), set(), "location", dataByStation[trainStation], features, "target")
    _, testX2, _, testY2 = splitDataForXValidation(set(), set([testStation]), "location", dataByStation[testStation], features, "target")
    model = RandomForestRegressor(max_depth=10, n_estimators = 60, n_jobs = -1, random_state=42)
    model.fit(trainX, trainY)
    prediction = model.predict(testX2)
    rmse = rmseEval(testY2, prediction)[1]
    print("Training on station " + str(trainStation) + ", applying on station " + str(testStation) + ": rmse: " + str(rmse))
    return rmse
Code example #13
File: RF.py, Project: emigmo/TC_CAINIAO
def RF_ALL(trainFileName,testFileName):
    train_X, train_y, _ = ld.LoadData_DATA_LABEL_ITEM(trainFileName)
    Eval_X, items = ld.LoadData_DATA_ITEM(testFileName)
    # 'mse' and 'auto' were removed from recent scikit-learn; use the modern equivalents
    clf = RandomForestRegressor(n_estimators=100, criterion='squared_error', max_depth=None,
                                max_features=1.0, bootstrap=True).fit(train_X, train_y)
    pred_y = clf.predict(Eval_X)
    res = []
    for i in range(len(Eval_X)):
        res.append([items[i],'all','%.4f'%max(pred_y[i],0)])
    return res
Code example #15
def ML(features, targets, fig_num):
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        targets,
                                                        train_size=0.8,
                                                        random_state=42)

    #Preprocessing
    scaler = StandardScaler().fit(X_train)
    #Scale and construct new dataframes from sklearn numpy array output
    X_train_scaled = pd.DataFrame(scaler.transform(X_train),
                                  index=X_train.index.values,
                                  columns=X_train.columns.values)
    X_test_scaled = pd.DataFrame(scaler.transform(X_test),
                                 index=X_test.index.values,
                                 columns=X_test.columns.values)

    rf = RandomForestRegressor(n_estimators=500,
                               oob_score=True,
                               random_state=0)
    rf.fit(X_train, y_train)
    predicted_test = rf.predict(X_test)

    rf.fit(X_train_scaled, y_train)
    predicted_test_scaled = rf.predict(X_test_scaled)

    test_score = r2_score(y_test, predicted_test)
    test_score_scaled = r2_score(y_test, predicted_test_scaled)
    spearman = spearmanr(y_test, predicted_test)
    spearman_scaled = spearmanr(y_test, predicted_test_scaled)
    pearson = pearsonr(y_test, predicted_test)
    pearson_scaled = pearsonr(y_test, predicted_test_scaled)
    print(
        "R-squared: %1.4f, Scaled R-squared: %1.4f, \n Spearman: %1.4f, Scaled Spearman: %1.4f \n Pearson: %1.4f, Scaled Pearson: %1.4f"
        % (test_score, test_score_scaled, spearman[0], spearman_scaled[0],
           pearson[0], pearson_scaled[0]))

    plt.figure(fig_num)
    plt.scatter(y_test, predicted_test, label='unscaled')
    plt.scatter(y_test, predicted_test_scaled, label='scaled')
    plt.legend()
Code example #16
def RandomForest(x_train, y_train, x_test, y_test):
    degree = [1, 2, 3, 4, 7]
    result = {}
    rmse_list = []
    for d in degree:
        # min_samples_split must be >= 2 in current scikit-learn releases
        params = {'n_estimators': 1000, 'max_depth': d, 'min_samples_split': 2, 'warm_start': True}
        clf = RandomForestRegressor(**params)
        clf.fit(x_train[:, np.newaxis], y_train)
        y_predict = clf.predict(x_test[:, np.newaxis])
        rmsevalue = rmse(y_test, y_predict)
        result[rmsevalue] = [y_predict, d]
        rmse_list.append(rmsevalue)
    rmseMin = min(rmse_list)
    # return the lowest RMSE with its matching prediction and depth
    return rmseMin, result[rmseMin]
Code example #17
def RandomForest(weiboid, x_train, y_train, x_test, y_test, d):
    params = {
        'n_estimators': 1000,
        'max_depth': d,
        'min_samples_split': 2,  # must be >= 2 in current scikit-learn
        'warm_start': True,
        'oob_score': True
    }
    clf = RandomForestRegressor(**params)
    clf.fit(x_train, y_train)
    y_predict = clf.predict(x_test)
    r = rmse(y_test, y_predict)
    #fig(weiboid,y_test,y_predict)
    return y_predict, r
Code example #18
def train_model(X_train, y_train):
    print("training the model ...")
    rf = RandomForestRegressor(n_estimators=500,
                               max_depth=5,
                               n_jobs=-1,
                               verbose=2)
    rf.fit(X_train, y_train)
    y_pred_train = rf.predict(X_train)
    print(".. training RMSE : {:0.3f} %".format(
        mean_squared_error(y_train, y_pred_train) * 100))
    #print(".. training R2   : {:0.3f} %".format(r2_score(y_train,y_pred_train)*100))
    print(".. training MAE  : {:0.3f} %".format(
        mean_absolute_error(y_train, y_pred_train) * 100))
    return rf
Code example #19
def doPrediction(locations, data, columns, features, columns2, outputFileName):
    predictionData = {}
    for c in columns2:
        predictionData[c] = []

    # modelling
    for location in locations:
        trainX, testX, trainY, testY, dataY = splitDataForXValidation(
            location, "location", data, features, columns, "target")
        print("\tT+W #train: " + str(len(trainY)) + ", #test:" +
              str(len(testY)))
        model = RandomForestRegressor(min_samples_leaf=2,
                                      n_estimators=650,
                                      n_jobs=-1,
                                      random_state=42)
        model.fit(trainX, trainY)
        prediction = model.predict(testX)
        rmse = rmseEval(testY, prediction)[1]
        print("\trmse: " + str(rmse))

        for c in columns2:
            if c == 'prediction':
                predictionData[c].extend(prediction)
            else:
                predictionData[c].extend(dataY[c])

    for c in predictionData:
        print("\t" + c + " -> #" + str(len(predictionData[c])))

    rmse = rmseEval(predictionData['target'], predictionData['prediction'])[1]
    print("overall RMSE: " + str(rmse))

    print("Writing out results...")

    output = open(outputFileName, 'w')
    output.write(','.join([str(x) for x in columns2]))
    output.write("\n")

    for i in range(0, len(predictionData['target'])):
        output.write(str(predictionData[columns2[0]][i]))
        for j in range(1, len(columns2)):
            output.write(",")
            output.write(str(predictionData[columns2[j]][i]))
        output.write("\n")

    output.close()

    print("Done...")
Code example #20
def perform_random_forest_regressor(train_set, train_target, test_set, predictors, estimators=10, depth=None, splits=2):
    # pass the tuning arguments through to the model; they were previously accepted but unused
    alg = RandomForestRegressor(random_state=1, n_estimators=estimators,
                                max_depth=depth, min_samples_split=splits)
    alg.fit(train_set[predictors], train_target)
    
    #importances = alg.feature_importances_
    #print("Original ",numpy.argsort(importances))
    #indices = numpy.argsort(importances)[::-1]
    #print (" importances ",importances)
    #print (" indices ",indices)
    
    #for f in range(train_set.shape[1]-2):
    #    print("%2d) %-*s %f" % (f+1,30,predictors[indices[f]],
    #                                    importances[indices[f]]))

    predictions = alg.predict(test_set[predictors])
    return predictions
Code example #21
def main(train_file='train.csv', test_file='test.csv', output_file='predict.csv'):
    print("Loading data...")
    
    train_data = pd.read_csv(train_file)
    test_data = pd.read_csv(test_file)
    y = np.array(train_data[["ACTION"]])
    #X = np.array(train_data.ix[:,1:-1])     # Ignores ACTION, ROLE_CODE
    X = np.array(train_data[["RESOURCE","MGR_ID", "ROLE_ROLLUP_1", "ROLE_ROLLUP_2", "ROLE_DEPTNAME", "ROLE_FAMILY_DESC", "ROLE_FAMILY", "ROLE_DEPTNAME", "ROLE_CODE"]])
    X_test = np.array(test_data[["RESOURCE","MGR_ID", "ROLE_ROLLUP_1", "ROLE_ROLLUP_2", "ROLE_DEPTNAME", "ROLE_FAMILY_DESC", "ROLE_FAMILY","ROLE_DEPTNAME", "ROLE_CODE"]]) # Ignores ID, ROLE_CODE
 
    SEED = 4
    #clf = DecisionTreeClassifier(criterion="entropy").fit(X,y)
    
    
    
    # min_density and compute_importances were removed from scikit-learn long ago
    clf = RandomForestRegressor(n_estimators=300, min_samples_split=15).fit(X, y)

    print(clf.feature_importances_)
    #Try feature selection
    
    mean_auc = 0.0
    n = 10
    for i in range(n):
        # sklearn.cross_validation was removed; modern code imports train_test_split from sklearn.model_selection
        X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(X, y, test_size=.10, random_state=i*SEED)

        # if you want to perform feature selection / hyperparameter
        # optimization, this is where you want to do it
        
        # train model and make predictions
        clf.fit(X_train, y_train) 
        preds = clf.predict(X_cv)

        # compute AUC metric for this CV fold
        fpr, tpr, thresholds = metrics.roc_curve(y_cv, preds, pos_label=1)
        roc_auc = metrics.auc(fpr, tpr)
        print "AUC (fold %d/%d): %f" % (i + 1, n, roc_auc)
        mean_auc += roc_auc
    
    print "Mean AUC: %f" % (mean_auc/n)
    predictions = clf.predict_(X_test)
    #print predictions
    
    #print 'Writing predictions to %s...' % (output_file)
    create_test_submission(output_file, predictions)

    return 0
Code example #22
def rf(week, timestampWeekCategory, stationNames, ospmData2013, ospmData2014,
       data2013, data2014):

    columns = []
    for c in data2013:
        columns.append(c)

    columns.remove("location")
    columns.remove("timestamp")
    columns.remove("target")

    X = []
    y = []

    for i in range(0, len(data2013["target"])):
        timestamp = str(int(data2013["timestamp"][i]))
        weekC = timestampWeekCategory[timestamp]
        if int(weekC) >= week:
            y.append(data2013["target"][i])
            x = []
            for c in columns:
                x.append(data2013[c][i])
            X.append(x)

    model = RandomForestRegressor(min_samples_leaf=9,
                                  n_estimators=59,
                                  n_jobs=-1,
                                  random_state=42)
    model.fit(X, y)

    #     print(str(len(X)))

    X = []
    y = []

    for i in range(0, len(data2014["target"])):
        y.append(data2014["target"][i])
        x = []
        for c in columns:
            x.append(data2014[c][i])
        X.append(x)

    prediction = model.predict(X)
    rmse = rmseEval(y, prediction)[1]  # [1] extracts the numeric RMSE, as in the other examples
    return rmse
Code example #23
def eval_one(features):

    all_predictions = []
    all_observations = []

    for location in locations:
        trainX, testX, trainY, testY = splitDataForXValidation(
            location, "location", data, features, "target")
        model = RandomForestRegressor(min_samples_leaf=2,
                                      random_state=42,
                                      n_estimators=650,
                                      n_jobs=-1)
        model.fit(trainX, trainY)
        predictions = model.predict(testX)
        all_observations.extend(testY)
        all_predictions.extend(predictions)

    rmse = rmseEval(all_observations, all_predictions)[1]
    log("\tRMSE: " + str(rmse))
    return rmse
Code example #24
File: trainer.py, Project: blvp/ml_tasks
def predict_per_cpu_full():
    data, target = load_data()
    data, target, labels = normalize_data(data, target)

    data = data[['C0', 'cpuFull']]
    data['target'] = target
    split_by_types = dict()

    cpu_groups = data.groupby('cpuFull')
    for name, group in cpu_groups:
        # Series.reshape() was removed from pandas; reshape the underlying array instead
        X_train, X_test, y_train, y_test = train_test_split(group['C0'].values.reshape(-1, 1), group['target'])
        split_by_types[str(name)] = {
            'train': {
                'data': X_train,
                'target': y_train
            },
            'test': {
                'data': X_test,
                'target': y_test
            }
        }

    # print(split_by_types)
    summ = 0.0
    for cpu, data_set in split_by_types.items():  # Python 3: dict.iteritems() is gone
        plt.figure()
        # reg = SGDRegressor(loss='huber', n_iter=100, alpha=0.0)
        reg = RandomForestRegressor(n_estimators=5)
        reg.fit(data_set['train']['data'], data_set['train']['target'])
        test_data = data_set['test']['data']
        y_pred = reg.predict(test_data)
        print(mape(data_set['test']['target'], y_pred), cpu)
        plt.scatter(test_data, data_set['test']['target'], s=3, color='g', label='actual')
        plt.scatter(test_data, y_pred, s=3, color='r', label='predicted')
        plt.legend(loc='upper left')
        plt.ylabel('mul time')
        plt.title('Category: {}'.format(cpu))
        plt.savefig('imgs/{}.png'.format(cpu))
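mape above is not defined on this page; it presumably computes the mean absolute percentage error. A short sketch under that assumption:

import numpy as np

def mape(y_true, y_pred):
    # mean absolute percentage error, in percent
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100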
Code example #25
def evaluateFeatures(vector, features, data):
    featureToUse = []
    for i in range(len(vector)):
        if vector[i] == 1:
            featureToUse.append(features[i])

    combinedRmse = []

    # modelling
    for location in locationValues:

        trainX, testX, trainY, testY = splitDataForXValidation2(
            location, "location", data, featureToUse, "target")

        model = RandomForestRegressor(min_samples_leaf=9,
                                      n_estimators=59,
                                      n_jobs=-1,
                                      random_state=42)

        model.fit(trainX, trainY)

        prediction = model.predict(testX)

        rmse = rmseEval(testY, prediction)

        combinedRmse.append(rmse[1])

    # average RMSE across locations
    avgRmse = sum(combinedRmse) / len(combinedRmse)

    return avgRmse
Code example #26
File: rfr2.py, Project: myeongkil/infosec_kmk
def learn_rfr(str_json):
    param = json.loads(str_json)
    m_n_estimators = param["n_estimators"]
    m_criterion = param["criterion"]
    m_random_state = param["random_state"]
    predict_col = param["predict_col"]
    features = param["features"]
    print(m_n_estimators)
    print(m_criterion)
    print(m_random_state)
    for feature in features:
        print(feature)

    df = pd.read_csv('/home/kaka/Data/building/building.csv', header=0)

    df_ratio = int(len(df) * 0.7)
    df_train = df.iloc[0:df_ratio, 0::]
    df_test = df.iloc[df_ratio:, 0::]

    actual_data = np.array(df.iloc[df_ratio:, predict_col]).astype(float)
    sample_data = np.array(df_train.iloc[0::, predict_col]).astype(str)
    set_train = df_train.loc[0::, features[0::]].astype(str)
    set_test = df_test.loc[0::, features[0::]].astype(str)

    # use the parsed hyperparameters (previously ignored in favor of hard-coded values)
    model_rf = RandomForestRegressor(n_estimators=m_n_estimators,
                                     criterion=m_criterion,
                                     random_state=m_random_state)
    model_rf = model_rf.fit(set_train, sample_data)
    predicted_data = model_rf.predict(set_test).astype(float)

    np.savetxt("/home/kaka/Data/building/building_predicted.csv",
               predicted_data,
               delimiter=",")
    np.savetxt("/home/kaka/Data/building/building_actual.csv",
               actual_data,
               delimiter=",")
Code example #27
File: main.py, Project: lsahlstr/Rebuild_LarmorCA
    X_test, y_test = OrganizeData(nucleus, 'test')
    
    # Feature scaling
    X_train_scaled = preprocessing.scale(X_train)
    X_test_scaled = preprocessing.scale(X_test)
            
    # Set the parameters for the random forest estimator    
    estimator = RandomForestRegressor(n_estimators=50, max_features=16, max_depth=25,
    				min_samples_split=5, min_samples_leaf=5, random_state=0)
    
    # Build the random forest of regression trees from the training set
    estimator = estimator.fit(X_train_scaled,y_train)
    
    print(estimator.score(X_train_scaled, y_train))
    print(estimator.score(X_test_scaled, y_test))

    # Predict regression target for the training set
    predicted = estimator.predict(X_train_scaled)
    cc = np.corrcoef(y_train, predicted)
    print(cc)
    print(estimator)
    #my_plotting.simple_plot_overlay(y_train,predicted)

    # Predict regression target for the test set
    predicted = estimator.predict(X_test_scaled)
    cc = np.corrcoef(y_test, predicted)
    print(cc)
    print(estimator)
    #my_plotting.simple_plot_overlay(y_test,predicted)

    # score = cross_val_score(estimator, X_train, y_train)
    # print(score)
Code example #28
train['LogSale'] = np.log(train.Sales+1)

train=pd.merge(train, store, on="Store")  
test = pd.merge(test, store, on="Store")

processdata(train)
processdata(test)


repeat = 1
#print('Splitting data...')
for i in range(repeat):
    features = [col for col in test.columns if col not in ['Customers', 'Sales', 'Date','LogSale','datetimes','Id']] ##!!!for submission should be test.columns!!!
#    features = ['Store', 'CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo', 'Promo2',\
# 'Promo2SinceWeek', 'Promo2SinceYear', 'SchoolHoliday', 'DayOfWeek', 'mon', 'day', 'year', 'StoreType', 'Assortment']
 # ^^ features taken from xgb model on Kaggle
    rf = RandomForestRegressor(n_estimators=100)
    print('Starting training...')
    rf.fit(train[features],train.LogSale)
#    train['mypred'] = rf.predict(train[features])
#    train['mypred'] = np.expm1(train.mypred)
#    train_error = rmspe(train[train.Sales>0].Sales,train[train.Sales>0].mypred)
#    print(train_error)
 
    test['mypred'] = rf.predict(test[features])
    test['mypred'] = np.exp(test['mypred'])-1

test['Sales'] = test.mypred
test[[ 'Id', 'Sales' ]].to_csv('rand_for_kag_v4-8.csv', index = False )
Code example #29
def compute_metrics_with_RandomForest(latents,
                                      factors,
                                      err_fn=nrmse,
                                      params={
                                          "n_estimators": 10,
                                          "max_depth": 8
                                      },
                                      cont_mask=None):
    """
    :param latents: (N, z_dim). They use E_q(z|x)[z]
    :param factors: (N, K)
    :param err_fn: Error function
    :param params: Parameters of the Random Forest regressor
    :return:
    """

    assert len(latents.shape) == len(factors.shape) == 2, \
        "'latents' and 'factors' must be 2D arrays!"
    assert len(latents) == len(
        factors), "'latents' and 'factors' must have the same length!"

    num_factors = factors.shape[1]

    R = []
    train_errors = []

    if not cont_mask:
        cont_mask = [True] * num_factors
    else:
        assert len(cont_mask) == num_factors, "len(cont_mask)={}".format(
            len(cont_mask))

    print(
        "Training Random Forest regressor for {} factors!".format(num_factors))
    for k in tqdm(range(num_factors)):
        if cont_mask[k]:  # check the mask entry for this factor, not the whole list
            print("Factor {} is continuous. Process it!".format(k))

            # (N, )
            factors_k = factors[:, k]
            model = RandomForestRegressor(**params)
            model.fit(latents, factors_k)

            # (N, )
            factors_k_pred = model.predict(latents)

            # Scalar
            train_errors.append(err_fn(factors_k_pred, factors_k))

            # Feature importances of the forest, shape (num_latents, 1)
            R.append(np.abs(model.feature_importances_[:, None]))
        else:
            print("Factor {} is not continuous. Do not process it!".format(k))

    # (num_latents, num_factors)
    R = np.concatenate(R, axis=1)
    assert R.shape[1] == np.sum(np.asarray(cont_mask, dtype=np.int32)), \
        "R.shape={} while #cont={}".format(
            R.shape[1], np.sum(np.asarray(cont_mask, dtype=np.int32)))

    # Disentanglement: (num_latents,)
    disentanglement_scores = entropic_scores(R.T)
    c_rel_importance = np.sum(R, axis=1) / np.sum(
        R)  # relative importance of each code variable
    assert 1 - 1e-4 < np.sum(c_rel_importance) < 1 + 1e-4, \
        "c_rel_importance: {}".format(c_rel_importance)
    disentanglement = np.sum(disentanglement_scores * c_rel_importance)

    # Completeness
    completeness_scores = entropic_scores(R)
    completeness = np.mean(completeness_scores)

    # Informativeness
    train_avg_error = np.mean(train_errors)

    results = {
        'importance_matrix': R,
        'disentanglement_scores': disentanglement_scores,
        'disentanglement': disentanglement,
        'completeness_scores': completeness_scores,
        'completeness': completeness,
        'train_errors': train_errors,
        'train_avg_error': train_avg_error,
    }

    return results
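A synthetic smoke test for the metric above (a sketch; nrmse and entropic_scores come from the same project and are assumed importable):

import numpy as np

latents = np.random.randn(1000, 6)   # (N, z_dim)
factors = np.random.randn(1000, 3)   # (N, K), all treated as continuous

results = compute_metrics_with_RandomForest(latents, factors)
print(results['disentanglement'], results['completeness'])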
Code example #30
File: python-svm.py, Project: shamol84/python-code
n_samples, n_features = 100, 5
y = np.random.randn(n_samples)
X = np.random.randn(n_samples, n_features)
z = np.random.randn(20, 5)
z1 = np.random.randn(20)
clf.fit(X, y)
clf.predict(z)
#########################
from sklearn.ensemble import RandomForestRegressor  # the private sklearn.ensemble.forest path no longer exists
regressor = RandomForestRegressor()
parameters = [{"n_estimators": [250, 500, 1000, 2000]}]

# Returns the best configuration for a model using crosvalidation
# and grid search

import time

regressor = RandomForestRegressor(n_estimators=300,
                                  min_samples_split=2,  # must be >= 2 in current scikit-learn
                                  max_features=67)

regressor.fit(train_np, energy)
pred = regressor.predict(test_np)

print(explained_variance_score(energy_test, pred))
print(mean_squared_error(energy_test, pred))
print(r2_score(energy_test, pred))

##prediction comparison
comp = pd.read_csv("H:/bee-efficiency/cisco presentation/pred.csv")
Code example #32
#del household['ST']
#del household['DIVISION']
#del household['ELEP']

#if 'CDD' in household.columns:
#    del household['CDD']
#    del household['HDD']
X = household.to_numpy()  # as_matrix() was removed from pandas
X = np.nan_to_num(X)

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size = 0.25)

clf = RandomForestRegressor(n_estimators = 10, n_jobs = 8)
clf.fit(X_train, y_train)

print(metrics.mean_squared_error(y_test, clf.predict(X_test)))
print(metrics.r2_score(y_test, clf.predict(X_test)))

predictions = clf.predict(X_test)[:50]
'''
features = sorted(zip(household.columns, clf.feature_importances_), key = lambda x : x[1], reverse = True)
print("Features", features)
'''
pums = pd.read_csv("../joined_weather.csv")
pums = pums.sample(1000)
pums_puma_vector = pums[['PUMA']].to_numpy()  # as_matrix() was removed from pandas
left_matrix = pums[['PUMA', 'WGTP', 'SERIALNO']]
del pums['PUMA']
del pums['WGTP']
del pums['SERIALNO']
Code example #33
def _2011x2011_ (data_path):

    ##### LOADING #####
    sys.stdout.write("Loading data... ")

    # Load data from .csv file
    with open(data_path+'_X.csv') as data_file:
        reader = csv.reader(data_file)

        # Initialize lists for data and class labels
        data =[]
        # skip header
        next(reader, None)
        # For each row of the csv file
        for row in reader:
            data.append([float(x) for x in row])

    with open(data_path+'_y.csv') as labels_file:
        reader = csv.reader(labels_file)

        # Initialize lists for data and class labels
        val_ind =[]
        # skip header
        next(reader, None)
        # For each row of the csv file
        for row in reader:
            val_ind.append(row)

    sys.stdout.write("done\n")


    ##### TRAINING #####
    # splitting
    data_train, data_test, val_ind_train, val_ind_test \
        = skl.cross_validation.train_test_split(data, val_ind, test_size=0.4, random_state=42)

    # Cutting date/ ASS/ number value from labels
    date_train = [x[0] for x in val_ind_train]
#    ASS_train = [x[1] for x in val_ind_train]
    val_train = [float(x[1]) for x in  val_ind_train]
    date_test = [x[0] for x in val_ind_test]
#    ASS_test = [x[1] for x in val_ind_test]
    val_test = [float(x[1]) for x in val_ind_test]

    sys.stdout.write("Training regressor... ")
    reg = RandomForestRegressor()
#    reg = skl.tree.DecisionTreeRegressor()
#    reg = skl.linear_model.LinearRegression()
    reg.fit(data_train, val_train)
    sys.stdout.write("done\n")


    ##### PREDICTION #####
    sys.stdout.write("Predicting... ")
    val_predicted = reg.predict(data_test)
    sys.stdout.write("done\n")

    ##### ERROR #####
    df = pd.DataFrame()
    df['date'] = pd.to_datetime(date_test)
#    df['ASS'] = ASS_test
    df['original'] = val_test
    df['predicted'] = val_predicted.tolist()
    df = df.set_index('date')

#    df = df.loc[df['ASS'] == 'CAT'] # one example
    
    df.info()
    
    df.plot()
    plt.show()
    
    print "MSE : " + str(mean_squared_error(val_test,val_predicted.tolist()))
Code example #34
File: losses.py, Project: particleist/lhcb_trigger_ml
    def predict(self, X):
        return RandomForestRegressor.predict(self, X)[:, numpy.newaxis]
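The [:, numpy.newaxis] indexing only reshapes the flat prediction vector into a column, which the caller presumably expects; a standalone illustration:

import numpy as np

a = np.array([1.0, 2.0, 3.0])
print(a.shape)                 # (3,)
print(a[:, np.newaxis].shape)  # (3, 1)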
Code example #35
# In[12]:

Rows = np.random.choice(Train.index.values, 400000)
Sampled_Train = Train.loc[Rows]          # .ix was removed from pandas; use .loc
Sample_Train_Target = Train_Target.loc[Rows]

# RF.fit(Sampled_Train, Sample_Train_Target)
RF.fit(Train, Train_Target)


# In[ ]:

print('Predict!')

Test_Predict = RF.predict(Test.to_numpy())  # as_matrix() was removed from pandas


# In[ ]:

print(Test_Predict.shape)


# In[ ]:

from collections import OrderedDict

Submission = pd.DataFrame(data = OrderedDict([('Id', Test_ID), ('Sales', Test_Predict)]))
Submission.to_csv('Submission_RF.csv', index = False)

Code example #36
store = store.drop("Assortment", 1).join(
    pd.get_dummies(store["Assortment"]).rename(columns=lambda x: "Assortment" + "_" + str(x))
)

train["StateHoliday"] = [mychange(x) for x in train.StateHoliday]
test["StateHoliday"] = [mychange(x) for x in test.StateHoliday]

train = train.drop("StateHoliday", 1).join(
    pd.get_dummies(train["StateHoliday"]).rename(columns=lambda x: "StateHoliday" + "_" + str(x))
)
test = test.drop("StateHoliday", 1).join(
    pd.get_dummies(test["StateHoliday"]).rename(columns=lambda x: "StateHoliday" + "_" + str(x))
)

train = pd.merge(train, store, on="Store")
test = pd.merge(test, store, on="Store")

repeat = 1
print("Splitting data...")
for i in range(repeat):
    features = [col for col in test.columns if col not in ["Customers", "Sales", "Date", "LogSale", "datetimes", "Id"]]
    rf = RandomForestRegressor(n_estimators=100)
    print("Starting training...")
    rf.fit(train[features].fillna(-1), train.LogSale)

    test["mypred"] = rf.predict(test[features].fillna(-1))
    test["mypred"] = np.exp(test["mypred"]) - 1

test["Sales"] = test.mypred
test[["Id", "Sales"]].to_csv("rand_for_kag_v4-9.csv", index=False)
Code example #37
    'hour', 'day_of_week', 'month', 'bank_holiday', 'race_day',
    'winddirection', 'windspeed', 'temperature', 'rain', 'pressure'
]

for location in locations:
    print("location: " + str(location))
    # save down trainX, trainY, testX, testY
    trainX, testX, trainY, testY = splitDataForXValidation(
        location, "location", data, columns, "target")
    print("\t#train: " + str(len(trainY)) + ", #test:" + str(len(testY)))
    model = RandomForestRegressor(min_samples_leaf=9,
                                  n_estimators=59,
                                  n_jobs=-1,
                                  random_state=42)
    model.fit(trainX, trainY)
    testPrediction = model.predict(testX)
    testRmse = str(rmseEval(testY, testPrediction)[1])
    print("\tRFR+All rmse: " + str(testRmse))

    trainX, testX, trainY, testY = splitDataForXValidation(
        location, "location", data, columnsTW, "target")
    print("\t#train: " + str(len(trainY)) + ", #test:" + str(len(testY)))
    model = RandomForestRegressor(min_samples_leaf=9,
                                  n_estimators=59,
                                  n_jobs=-1,
                                  random_state=42)
    model.fit(trainX, trainY)
    testPrediction = model.predict(testX)
    testRmse = str(rmseEval(testY, testPrediction)[1])
    print("\tRFR+TW rmse: " + str(testRmse))
Code example #38
File: model.py, Project: mjstevens777/energy-portal
        return False
    return True
    #return column in ['BDSP', 'RMSP', 'HFL', 'BLD', 'AGEP', 'NP', 'YBL', 'HINCP', 'HDD', 'CDD']

household = household[[column for column in household.columns if select_column(column)]]
X = household.to_numpy()  # as_matrix() was removed from pandas
print(household.columns)

with open("kwh_model_features.json", "w") as f:
    json.dump(list(household.columns), f, indent = True)

print(y)

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size = 0.25)

clf = RandomForestRegressor(n_estimators = 50, n_jobs = 8)
clf.fit(X_train, y_train)

print(y_test[:100])
print(np.sqrt(metrics.mean_squared_error(y_test, clf.predict(X_test))))
print(metrics.r2_score(y_test, clf.predict(X_test)))


features = sorted(zip(household.columns, clf.feature_importances_), key = lambda x : x[1], reverse = True)
print("Features", features)


with open("kwh_model.pkl", 'wb') as f:
    pickle.dump(clf, f)
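A matching load step for the model pickled above (a sketch; the file name follows the example):

import pickle

with open('kwh_model.pkl', 'rb') as f:
    clf_loaded = pickle.load(f)
# clf_loaded.predict(...) now behaves exactly like clf above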
Code example #39
    testStations = groups[group]
    log("\ttrainStations: " + str(trainStations))
    log("\ttestStations: " + str(testStations))

    train_station_set = set([float(s) for s in trainStations])
    test_station_set = set([float(s) for s in testStations])

    trainX, testX, trainY, testY = splitDataForXValidation(
        train_station_set, test_station_set, "location", data, tw_features,
        "target")
    model = RandomForestRegressor(min_samples_leaf=29,
                                  n_estimators=64,
                                  n_jobs=-1,
                                  random_state=42)
    model.fit(trainX, trainY)
    prediction_TW = model.predict(testX)
    rmse = rmseEval(testY, prediction_TW)[1]
    log("\tTW rmse: " + str(rmse))
    all_observations.extend(testY)
    all_pred_TW.extend(prediction_TW)

    trainX, testX, trainY, testY = splitDataForXValidation(
        train_station_set, test_station_set, "location", data, twa_features,
        "target")
    model = RandomForestRegressor(min_samples_leaf=29,
                                  n_estimators=64,
                                  n_jobs=-1,
                                  random_state=42)
    model.fit(trainX, trainY)
    prediction_TWA = model.predict(testX)
    rmse = rmseEval(testY, prediction_TWA)[1]
Code example #40
File: model.py, Project: mjstevens777/energy-portal
    del household['KWH']

X_columns = [column for column in household.columns if column != "ELEP"]
X = household[X_columns].to_numpy()  # as_matrix() was removed from pandas
y = household["ELEP"].tolist()

#print(y)

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size = 0.25)

clf = RandomForestRegressor(n_estimators = 100, n_jobs = 8)
clf.fit(X_train, y_train)


print(y_test[:100])
print(metrics.mean_squared_error(clf.predict(X_test), y_test))
print(metrics.r2_score(y_test, clf.predict(X_test)))

features = sorted(zip(X_columns, clf.feature_importances_), key = lambda x : x[1], reverse = True)
print("Features", features)

#fill spaces in ELEP
normalized_pums = pd.read_csv("../joined_weather.csv", delimiter = ',')
print('pums shape', normalized_pums.shape)

with open("../vectorized_puma_regions/puma_list.json") as f:
    puma_mapping = json.load(f)

reverse_puma_map = {}
for key, value in puma_mapping.items():
    reverse_puma_map[int(value)] = int(key)
Code example #41
    ###

    RigeLinearCV = linear_model.RidgeCV(cv=10)
    rcv = RigeLinearCV.fit(x_train, y_train)
    y_pre_rcv = rcv.predict(x_val)
    ###
    params_rf = {
        'n_estimators': 500,
        'max_depth': 10,
        'min_samples_split': 2,
        'n_jobs': 4
    }

    rf = RandomForestRegressor(**params_rf)
    rf.fit(x_train, y_train)
    y_pre_rf = rf.predict(x_val)
    ###
    y_pre_diff = mean_normal_weekend_diff(Y[-14:-7], xday[-28:-14],
                                          xweekend[-28:-14], -7, 0)
    ###

    #Y_test.append(y_test)
    #y_pre_diff = mean_normal_weekend_diff(Y,xday,xweekend,-21,-14)

    ###
    loss_rcv = Evaluation([y_pre_rcv], [y_val])
    loss_rf = Evaluation([y_pre_rf], [y_val])
    loss_diffmean = Evaluation([y_pre_diff], [y_val])

    union = {loss_rcv: 1, loss_rf: 2, loss_diffmean: 3}
    minloss = min(union.keys())
Code example #42
def evalColumns(columns):

    overallY = []
    overallPred = []

    for location in locations:
        location2s = [l for l in locations if l != location]
        
        print("Location: " + str(location) + ", location2: " + str(location2s))
        
        # generating testPreds
        testPreds = {}
        for datagroup in topDatagroups:
            tag, features = getTagAndFeatures(datagroup)
            trainX, testX, trainY, testY = splitDataForXValidation(location, "location", data, features, "target")
                
            model = RandomForestRegressor(min_samples_leaf = 9, n_estimators = 59, n_jobs = -1, random_state=42)                    
            model.fit(trainX, trainY)
            prediction = model.predict(testX)
            testPreds[tag] = prediction
          
        trainPreds = defaultdict(list)
          
        for datagroup in topDatagroups:
            tag, features = getTagAndFeatures(datagroup)
            print("\ttag: " + str(tag) + ", features: " + str(features))
            for location2 in location2s:
                trainX1, trainX2, trainY1, trainY2, testX, testY = splitDataForXValidationSampled2(location, location2, "location", data, features, "target")
                model = RandomForestRegressor(min_samples_leaf = 9, n_estimators = 59, n_jobs = -1, random_state=42)                    
                model.fit(trainX1, trainY1)
                train1Prediction = model.predict(trainX1)
                train2Prediction = model.predict(trainX2)
                testPrediction = model.predict(testX)
                train1Rmse = str(rmseEval(trainY1, train1Prediction)[1])
                train2Rmse = str(rmseEval(trainY2, train2Prediction)[1])
                testRmse = str(rmseEval(testY, testPrediction)[1])
                print("\t\ttrain1 rmse: " + train1Rmse)
                print("\t\ttrain2 rmse: " + train2Rmse)
                print("\t\ttest rmse: " + testRmse)
                for x in train2Prediction:
                    trainPreds[tag].append(x)

        # get combined train2y                
        combinedTrain2Y = []        
        for location2 in location2s:
            trainX1, trainX2, trainY1, trainY2, testX, testY = splitDataForXValidationSampled2(location, location2, "location", data, all_features, "target")
            combinedTrain2Y = combinedTrain2Y + trainY2
          
        # calculate labels 
        labelTrain2Y = []
        for i in range(0, len(combinedTrain2Y)):
            bestModel = 0
            bestAbs = abs(combinedTrain2Y[i] - trainPreds[topTags[0]][i])
            for j in range(0, len(topTags)):
                tag = topTags[j]
                modelAbs = abs(combinedTrain2Y[i] - trainPreds[tag][i])
                if modelAbs < bestAbs:
                    bestAbs = modelAbs
                    bestModel = j
            labelTrain2Y.append(bestModel)
            
        # generating testX
        _, testX, _, _ = splitDataForXValidation(location, "location", data, all_features, "target")

        # trainX2             
        tX2 = []
        for location2 in location2s:
            _, trainX2, _, _, _, _ = splitDataForXValidationSampled2(location, location2, "location", data, all_features, "target")
            for row in trainX2:
                tX2.append(row)
        
        for tag in topTags:
            for i in range(0, len(trainPreds[tag])):
                tX2[i].append(trainPreds[tag][i]) 
        
        reducedTrainX2 = []
        for d in tX2:
            reducedD = []
            for i in range(0, len(all_columns)):
                if columns[i]:
                    reducedD.append(d[i])
            reducedTrainX2.append(reducedD)
              
        model = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=15)
        model.fit(reducedTrainX2, labelTrain2Y)
        
        for tag in topTags:
            for i in range(0, len(testPreds[tag])):
                testX[i].append(testPreds[tag][i]) 
        
        reducedTestX = []
        for d in testX:
            reducedD = []
            for i in range(0, len(all_columns)):
                if columns[i]:
                    reducedD.append(d[i])
            reducedTestX.append(reducedD)
         
        pred = model.predict(reducedTestX)
         
        finalPrediction = []
        for i in range(0, len(testY)):
            p = testPreds[topTags[pred[i]]][i]
            finalPrediction.append(p)      
        rmse = str(rmseEval(testY, finalPrediction)[1])
        print("\tRMSE: " + str(rmse))
        
        for x in testY:
            overallY.append(x)
        for x in finalPrediction:
            overallPred.append(x)
    
    rmse = rmseEval(overallPred, overallY)[1]
    return rmse
Code example #43
def train_many_test_many(train_data_paths, test_data_paths, test_data_spec):

    test_run = 0

    if test_data_spec == TEST_ON_FAIL_ONLY:
        test_start_cycle = 0
        test_end_cycle = 0
        test_sample_freq = 0  # don't use any "good" examples
    elif test_data_spec == TEST_ON_FAIL_PLUS_CYCLE_ZERO:
        test_start_cycle = 0
        test_end_cycle = 1
        test_sample_freq = 1000  # use only "good" example from first cycle
    elif test_data_spec == TEST_ON_FAIL_PLUS_CYCLE_MAX:
        test_start_cycle = 9999999
        test_end_cycle = 9999999
        test_sample_freq = 1000  # use only "good" example from "max" cycle
    elif test_data_spec == TEST_ON_TRAIN_SPEC:
        test_start_cycle = start_cycle
        test_end_cycle = end_cycle
        test_sample_freq = sample_freq
    else:
        sys.stderr.write(
            "Invalid test_data_spec '%d'. Must be one of: TEST_ON_FAIL_ONLY, TEST_ON_FAIL_PLUS_CYCLE_ZERO, TEST_ON_FAIL_PLUS_CYCLE_MAX\n"
            % test_data_spec)

    # print output headers
    print("PERFORMANCE\ttest_data_spec\ttest_path\ttest_run\tpiston_param\tdensity_param\trmse\tfp\tfn\tnum_instances\truntime_secs")

    # load pickled model
    if enable_load_pickled_model:
        print('Using pre-trained model from: randomforest.pkl. This may take a couple of minutes...')
        with open('randomforest.pkl', 'rb') as f:
            rand_forest = cPickle.load(f)  # on Python 3, import pickle as cPickle
    else:
        train = None
        start = time.time()
        train_list = []
        for train_data_path in train_data_paths:
            print(train_data_path)
            train_next = get_learning_data(train_data_path, start_cycle,
                                           end_cycle, sample_freq,
                                           decay_window)
            train_list.append(train_next)
        train = np.concatenate(train_list, axis=0)
        print("training data: ", train.shape)

        end = time.time()
        print("TIME load training data: ", end - start)

        # Train the random forest
        train_X = train[:, 0:-1]
        train_Y = np.ravel(train[:, [-1]])
        start = time.time()
        rand_forest = RandomForestRegressor(n_estimators=NumTrees,
                                            n_jobs=parallelism,
                                            random_state=rand_seed)
        rand_forest.fit(train_X, train_Y)
        end = time.time()
        print("TIME train: ", end - start)

    # output feature importance
    if enable_feature_importance:
        output_feature_importance(rand_forest, train_data_paths[0])

    # pickle model for future use
    if enable_save_pickled_model:
        print "Writing random forest model to file: randomforest.pkl. This may take a couple of minutes..."
        with open('randomforest.pkl', 'wb') as f:
            cPickle.dump(rand_forest, f)
        print "Wrote random forest model to file: randomforest.pkl"

    for test_path in test_data_paths:

        start = time.time()

        piston_param = 0
        density_param = 0
        try:
            (index, test) = get_learning_data_for_run(test_path,
                                                      test_start_cycle,
                                                      test_end_cycle,
                                                      test_sample_freq,
                                                      decay_window, test_run)
            print "test data: ", test.shape

            # Check results on cv set
            test_X = test[:, 0:-1]
            test_Y = np.ravel(test[:, [-1]])
            cv_predict = rand_forest.predict(test_X)
            #decision_boundary = min(cv_predict)
            decision_boundary = 4e-6
            RMSE = np.sqrt(sum(pow(test_Y - cv_predict, 2)) / test_Y.size)
            #err = sum(cv_predict - test_Y) / test_Y.size
            #pos_indices = [i for i, x in enumerate(test_Y) if x > 0]
            #neg_indices = [i for i, x in enumerate(test_Y) if x == 0]
            #err_on_pos = sum(np.array([cv_predict[i] for i in pos_indices]) - np.array([test_Y[i] for i in pos_indices])) / len(pos_indices)
            #err_on_neg = sum(np.array([cv_predict[i] for i in neg_indices]) - np.array([test_Y[i] for i in neg_indices])) / len(neg_indices)

            # calculate false positives and false negatives
            fp = fn = 0
            for i in range(len(test_Y)):
                if test_Y[i] == 0 and cv_predict[i] > decision_boundary:
                    fp += 1
                elif test_Y[i] > 0 and cv_predict[i] <= decision_boundary:
                    fn += 1

            end = time.time()

            if enable_print_predictions:
                for i in range(len(test_Y)):
                    print(test_Y[i], cv_predict[i])

            if "piston" in test_path:
                piston_offset = test_path.find("piston") + len("piston")
                piston_param = int(test_path[piston_offset:piston_offset + 3])
                density_offset = test_path.find("density") + len("density")
                density_param = float(test_path[density_offset:density_offset +
                                                4])

            print "PERFORMANCE\t%d\t%s\t%d\t%d\t%.2f\t%.15f\t%d\t%d\t%d\t%d" % (
                test_data_spec, test_path, test_run, piston_param,
                density_param, RMSE, fp, fn, len(test_Y), round(end - start))
            sys.stdout.flush()
        except Exception:  # avoid a bare except; emit a zeroed performance row on failure
            end = time.time()
            print("PERFORMANCE\t%d\t%s\t%d\t%d\t%.2f\t%.15f\t%d\t%d\t%d\t%d" % (
                test_data_spec, test_path, test_run, piston_param,
                density_param, 0, 0, 0, 0, round(end - start)))
Code example #44
class MLCms:
    """

    """
    def __init__(self, config_file=''):
        # Parse config file (SafeConfigParser is deprecated; plain ConfigParser is the modern name)
        self.parser = SafeConfigParser()
        self.parser.read(config_file)

        # machine learning specific variables
        self.classify = constants.DO_CLASSIFICATION  # Regress or classify?
        self.vars_features = constants.fixed_vars
        self.vars_target = constants.ML_TARGETS

        if self.classify:
            self.var_target = constants.ML_TARGETS
            self.task = 'classification'
            self.model = RandomForestClassifier(n_estimators=2500, n_jobs=constants.ncpu, random_state=0)
        else:
            self.var_target = constants.ML_TARGETS
            self.task = 'regression'
            self.model = RandomForestRegressor(n_estimators=2500, n_jobs=constants.ncpu, random_state=0)  # SVR()

        # Get path to input
        self.path_inp = constants.base_dir + os.sep + constants.name_inp_fl

        # Output directory is <dir>_<classification>_<2014>
        self.path_out_dir = constants.out_dir
        utils.make_dir_if_missing(self.path_out_dir)

        # Model pickle
        self.path_pickle_model = self.path_out_dir + os.sep + constants.model_pickle
        self.path_pickle_features = self.path_out_dir + os.sep + 'pickled_features'

    def output_model_importance(self, gs, name_gs, num_cols):
        """

        :param gs:
        :param name_gs:
        :param num_cols:
        :return:
        """
        rows_list = []
        name_vars = []

        feature_importance = gs.best_estimator_.named_steps[name_gs].feature_importances_
        importances = 100.0 * (feature_importance / feature_importance.max())

        std = np.std([tree.feature_importances_ for tree in self.model.estimators_], axis=0)
        indices = np.argsort(importances)[::-1]

        # Store feature ranking in a dataframe
        for f in range(num_cols):
            dict_results = {'Variable': self.vars_features[indices[f]], 'Importance': importances[indices[f]]}
            name_vars.append(self.vars_features[indices[f]])
            rows_list.append(dict_results)

        df_results = pd.DataFrame(rows_list)
        num_cols = 10 if len(indices) > 10 else len(indices)  # Plot upto a maximum of 10 features
        plot.plot_model_importance(num_bars=num_cols, xvals=importances[indices][:num_cols],
                                   std=std[indices][:num_cols], fname=self.task + '_importance_' + self.crop,
                                   title='Importance of variable (' + self.country + ' ' + self.crop_lname + ')',
                                   xlabel=name_vars[:num_cols], out_path=self.path_out_dir)

        df_results.to_csv(self.path_out_dir + os.sep + self.task + '_importance_' + self.crop + '.csv')

    def get_data(self):
        """

        :return:
        """
        df = pd.read_csv(self.path_inp)
        cols = [col for col in df.columns if col not in self.vars_features]
        # cols.extend(['DI', 'PI'])

        # Add information on PI and DI of soils
        # iterate over each row, get lat and lon
        # Find corresponding DI and PI

        lat_lons = zip(df['Long_round'], df['Lat_round'])
        vals_di = []
        vals_pi = []
        # for idx, (lon, lat) in enumerate(lat_lons):
        #     print idx, len(lat_lons)
        #     vals_pi.append(rgeo.get_value_at_point('C:\\Users\\ritvik\\Documents\\PhD\\Projects\\CMS\\Input\\Soils\\PI.tif',
        #                                            lon, lat, replace_ras=False))
        #     vals_di.append(rgeo.get_value_at_point('C:\\Users\\ritvik\\Documents\\PhD\\Projects\\CMS\\Input\\Soils\\DI.tif',
        #                                      lon, lat, replace_ras=False))
        #
        # df['DI'] = vals_di
        # df['PI'] = vals_pi
        df = df[cols]

        data = df.as_matrix(columns=cols[1:])
        target = df.as_matrix(columns=[self.var_target]).ravel()
        # Get training and testing splits
        splits = train_test_split(data, target, test_size=0.2)

        return cols, splits

    def train_ml_model(self):
        """

        :return:
        """
        logger.info('#########################################################################')
        logger.info('train_ml_model')
        logger.info('#########################################################################')

        ######################################################
        # Load dataset
        ######################################################
        cols, splits = self.get_data()
        data_train, data_test, target_train, target_test = splits

        # clf =  ExtraTreesRegressor(500, n_jobs=constants.ncpu)
        # #clf = SVR(kernel='rbf', C=1e3, gamma=0.1)
        # #clf = skflow.TensorFlowDNNClassifier(hidden_units=[10, 20, 10], n_classes=3)
        # data = df_train.as_matrix(columns=cols[1:])  # convert dataframe column to matrix
        # #data = preprocessing.scale(data)
        # target = df_train.as_matrix(columns=[self.var_target]).ravel()  # convert dataframe column to matrix
        # clf.fit(data, target)
        #
        # predict_val = clf.predict(after.as_matrix(columns=cols[1:]))
        # results = compute_stats.ols(predict_val.tolist(), after_target.tolist())
        # print results.rsquared
        # import matplotlib.pyplot as plt
        # plt.scatter(after_target, predict_val)
        # plt.show()
        # pdb.set_trace()
        if not os.path.isfile(self.path_pickle_model):
            # For details in scikit workflow: See http://stackoverflow.com/questions/
            # 35256876/ensuring-right-order-of-operations-in-random-forest-classification-in-scikit-lea
            # TODO Separate out a dataset so that even the grid search cv can be tested
            ############################
            # Select features from model
            ############################
            logger.info('Selecting important features from model')
            if self.classify:
                rf_feature_imp = ExtraTreesClassifier(150, n_jobs=constants.ncpu)  # classifier variant for class targets
            else:
                rf_feature_imp = ExtraTreesRegressor(150, n_jobs=constants.ncpu)
            feat_selection = SelectFromModel(rf_feature_imp)

            pipeline = Pipeline([
                      ('fs', feat_selection),
                      ('clf', self.model),
                    ])

            #################################
            # Grid search for best parameters
            #################################
            # C_range = np.logspace(-2, 10, 13) and gamma_range = np.logspace(-9, 3, 13)
            # are only needed for the commented-out SVM entries in param_grid below
            logger.info('Tuning hyperparameters')
            param_grid = {
                'fs__threshold': ['mean', 'median'],
                'fs__estimator__max_features': ['auto', 'log2'],
                'clf__max_features': ['auto', 'log2'],
                'clf__n_estimators': [1000, 2000]
                #'clf__gamma': np.logspace(-9, 3, 13),
                #'clf__C': np.logspace(-2, 10, 13)
            }

            gs = GridSearchCV(pipeline, param_grid=param_grid, verbose=2, n_jobs=constants.ncpu, error_score=np.nan)
            # Fit the data before getting the best parameter combination. Different data sets
            # have different optimal parameter combinations, i.e. without data there is no
            # single optimal parameter combination.
            gs.fit(data_train, target_train)
            logger.info(gs.best_params_)

            data_test = pd.DataFrame(data_test, columns=cols[1:])

            # Update features that should be used in model
            selected_features = gs.best_estimator_.named_steps['fs'].transform([cols[1:]])
            cols = selected_features[0]
            data_test = data_test[cols]

            # Update model with the best parameters learnt in the previous step
            self.model = gs.best_estimator_.named_steps['clf']

            predict_val = self.model.predict(data_test)
            results = compute_stats.ols(predict_val.tolist(), target_test.tolist())
            print results.rsquared
            print cols
            plt.scatter(target_test, predict_val)
            plt.show()
            pdb.set_trace()
            ###################################################################
            # Output and plot importance of model features, and learning curves
            ###################################################################
            self.output_model_importance(gs, 'clf', num_cols=len(cols[1:]))

            if constants.plot_model_importance:
                train_sizes, train_scores, test_scores = learning_curve(self.model, data_train, target_train,
                                                                        cv=5, n_jobs=constants.ncpu)  # cv=5 assumed; k_fold is not defined in this snippet
                plot.plot_learning_curve(train_scores, test_scores, train_sizes=train_sizes, fname='learning_curve',
                                         ylim=(0.0, 1.01), title='Learning curves', out_path=self.path_out_dir)

            # Save the model to disk
            logger.info('Saving model and features as pickle on disk')
            with open(self.path_pickle_model, 'wb') as f:
                cPickle.dump(self.model, f)
            with open(self.path_pickle_features, 'wb') as f:
                cPickle.dump(self.vars_features, f)
        else:
            # Read model from pickle on disk
            with open(self.path_pickle_model, 'rb') as f:
                logger.info('Reading model from pickle on disk')
                self.model = cPickle.load(f)

            logger.info('Reading features from pickle on disk')
            self.vars_features = pd.read_pickle(self.path_pickle_features)

        return df_cc

    def do_forecasting(self, df_forecast, mon_names, available_target=False, name_target='yield'):
        """
        1. Does classification/regression based on already built model.
        2. Plots confusion matrix for classification tasks, scatter plot for regression
        3. Plots accuracy statistics for classification/regression
        :param df_forecast:
        :param mon_names:
        :param available_target: Is target array available?
        :param name_target: Name of target array (defaults to yield)
        :return:
        """
        data = df_forecast.as_matrix(columns=self.vars_features)  # convert dataframe column to matrix
        predicted = self.model.predict(data)

        if available_target:
            expected = df_forecast.as_matrix(columns=[name_target]).ravel()
            if not self.classify:  # REGRESSION
                # Compute stats
                results = compute_stats.ols(predicted.tolist(), expected.tolist())
                bias = compute_stats.bias(predicted, expected)
                rmse = compute_stats.rmse(predicted, expected)
                mae = compute_stats.mae(predicted, expected)

                # Plot!
                plot.plot_regression_scatter(expected, np.asarray(predicted),
                                             annotate=r'$r^{2}$ ' + '{:0.2f}'.format(results.rsquared) + '\n' +
                                             'peak NDVI date: ' + self.time_peak_ndvi.strftime('%b %d'),
                                             xlabel='Expected yield',
                                             ylabel='Predicted yield',
                                             title=mon_names + ' ' + str(int(df_forecast[self.season].unique()[0])),
                                             fname=self.task + '_' + '_'.join([mon_names]) + '_' + self.crop,
                                             out_path=self.path_out_dir)

                # global expected vs predicted
                if self.debug:
                    # any non-existing index will add row
                    self.df_global.loc[len(self.df_global)] = [np.nanmean(expected), np.nanmean(predicted), mon_names,
                                                               self.forecast_yr]

                return predicted, {'RMSE': rmse, 'MAE': mae, r'$r^{2}$': results.rsquared, 'Bias': bias}
            else:  # CLASSIFICATION
                # Convert from crop condition class (e.g. 4) to string (e.g. exceptional)
                expected, predicted = compute_stats.remove_nans(expected, predicted)
                cm = confusion_matrix(expected, predicted, labels=self.dict_cc.keys()).T

                # Compute and plot class probabilities
                proba_cc = self.model.predict_proba(data)
                df_proba = pd.DataFrame(proba_cc, columns=self.dict_cc.values())
                plot.plot_class_probabilities(df_proba, fname='proba_' + '_'.join([mon_names]) + '_' + self.crop,
                                              out_path=self.path_out_dir)

                # Plot confusion matrix
                plot.plot_confusion_matrix(cm, normalized=False, fname='cm_' + '_'.join([mon_names]) + '_' + self.crop,
                                           xlabel='True class', ylabel='Predicted class', ticks=self.dict_cc.values(),
                                           out_path=self.path_out_dir)

                # Normalize and plot confusion matrix
                cm_normalized = normalize(cm.astype(float), axis=1, norm='l1')
                plot.plot_confusion_matrix(cm_normalized, fname='norm_cm_' + '_'.join([mon_names]) + '_' + self.crop,
                                           xlabel='True class', ylabel='Predicted class', normalized=True,
                                           ticks=self.dict_cc.values(), out_path=self.path_out_dir)

                score_accuracy = accuracy_score(expected, predicted) * 100.0
                score_precision = precision_score(expected, predicted, average='weighted') * 100.0
                return predicted, {'Accuracy': score_accuracy, 'Precision': score_precision}
        else:
            return predicted, {'RMSE': np.nan, 'MAE': np.nan, r'$r^{2}$': np.nan, 'Bias': np.nan,
                               'Nash-Sutcliff': np.nan}
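
As a self-contained reference for the pattern used in train_ml_model above (feature selection wrapped in a pipeline so the grid search tunes selector and estimator together), a minimal sketch on synthetic data; names and parameter values below are illustrative, not the project's:

from sklearn.datasets import make_regression
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline

# Synthetic stand-in for the CSV input used above
X, y = make_regression(n_samples=400, n_features=20, noise=0.5, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

pipeline = Pipeline([
    ('fs', SelectFromModel(ExtraTreesRegressor(n_estimators=50, random_state=0))),
    ('clf', RandomForestRegressor(random_state=0)),
])
param_grid = {
    'fs__threshold': ['mean', 'median'],  # how aggressively to prune features
    'clf__n_estimators': [100, 200],      # illustrative, not the project's grid
}
gs = GridSearchCV(pipeline, param_grid=param_grid, cv=3, n_jobs=-1)
gs.fit(X_train, y_train)
print(gs.best_params_)
print(gs.score(X_test, y_test))           # R^2 on the held-out split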
コード例 #45
0
    y_pre_adb = adb.predict(x_test)

    #print(Evaluation([y_pre],[y_test]))

    RidgeLinearCV = linear_model.RidgeCV(cv=3)
    clf1 = RidgeLinearCV.fit(x_train, y_train)
    y_pre_cv = clf1.predict(x_test)
    #uni_pret.append(0.8*np.array(y_pre_adb)+0.2*np.array(y_pre_cv))
    #uni_pre.append(y_pre_adb)

    ###
    params_rf = {
        'n_estimators': 800,
        'min_samples_split': 2,  # scikit-learn requires an int value >= 2
        'warm_start': True,
        'n_jobs': 2
    }
    rf = RandomForestRegressor(**params_rf)
    rf.fit(x_train, y_train)
    y_pre_rf = rf.predict(x_test)
    ###
    uni_pre = 0.4 * np.array(y_pre_adb) + 0.1 * np.array(
        y_pre_cv) + 0.5 * np.array(y_pre_rf)
    output(fw, i + 1, uni_pre)
    print(i)
    i += 1

fr1.close()
fr2.close()
fw.close()
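
The final blend above is a plain weighted average of the three models' predictions, with weights (0.4, 0.1, 0.5) summing to 1. A small, hedged helper making that explicit (in practice the weights would be tuned on a validation split):

import numpy as np

def blend(predictions, weights):
    """Weighted average of prediction arrays; weights must sum to 1."""
    weights = np.asarray(weights, dtype=float)
    assert np.isclose(weights.sum(), 1.0), "blend weights should sum to 1"
    return sum(w * np.asarray(p) for w, p in zip(weights, predictions))

# e.g. uni_pre = blend([y_pre_adb, y_pre_cv, y_pre_rf], [0.4, 0.1, 0.5])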
コード例 #46
0
 def predict_proba(self, X):
     pred = RandomForestRegressor.predict(self, X)
     result = numpy.zeros([len(X), 2])
     result[:, 1] = special.expit(pred / 1000.)
     result[:, 0] = 1. - result[:, 1]
     return result
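
The predict_proba above turns a RandomForestRegressor into a pseudo-probabilistic binary classifier by squashing the regression output through the logistic sigmoid; the /1000. scale is that project's own constant. A self-contained illustration of the same mapping (dataset and label scaling are assumptions):

import numpy as np
from scipy import special
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestRegressor

X, y = make_classification(n_samples=300, random_state=0)
reg = RandomForestRegressor(n_estimators=50, random_state=0)
reg.fit(X, y * 1000.)                       # regress on scaled 0/1000 labels

pred = reg.predict(X)
proba = np.zeros((len(X), 2))
proba[:, 1] = special.expit(pred / 1000.)   # sigmoid maps raw scores into (0, 1)
proba[:, 0] = 1. - proba[:, 1]
print(proba[:3])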
コード例 #47
0
class ItemSetModel(object):
    """docstring for ItemSetModel"""

    clf = None

    MODEL_PATH = os.path.join(settings.BASE_DIR, 'set_analyzer', 'analysis', 'models')
    CACHE_FILE = os.path.join(MODEL_PATH, 'model_cache.cache')

    def __init__(self):
        super(ItemSetModel, self).__init__()
        #self.clf = DecisionTreeRegressor()
        #self.clf = Lasso(0.1)
        #self.clf = SVR(kernel='rbf')
        #self.clf = ElasticNetCV()
        self.clf = RandomForestRegressor(max_depth=7, n_estimators=10)


    def get_data_sets(self, num_matches, cache=False, **kwargs):
        """
        Data Schema:
            Input:
                1    My champion ID
                6    My Champion's class info
                6    [Other team's cumulative class info]
                7    [7 Final Items]
                5    [first 5 items purchased]
                ________________________________________
                25 features

            Output:
                Score = A(Gold/time) + B(xp/time) + C(win)
                ________________________________________
                1 Output

        """

        #Presize data
        features = 25
        num_participants = num_matches*10
        input_data = np.zeros((num_participants, features))
        output_data = np.zeros(num_participants)

        row_num = 0

        get_champ_id = lambda x : x.champion.champion_id
        diff_team = lambda x , y : x.team_id != y.team_id
        item_purchased = lambda x: x.event_type == "ITEM_PURCHASED"

        #Iterate over every match in the database
        for match in Match.objects(**kwargs)[:num_matches]:

            #Prepare users and teams
            team_map = {}
            team_data = np.zeros((2,6))     #Store the sum of each team's tags
            count = 0
            for tag in match.teams:
                team_map[int(tag)] = count
                count+=1

            #Prepare champion class data
            for p in match.participants.values():
                for tag in p.champion.tags:
                    team_data[team_map[p.team_id], :] += np.array(p.champion.class_data)

            #Iterate over every user in the match
            for pid, participant in match.participants.items():
                col_num = 0

                #My Champion's info
                input_data[row_num][col_num] = get_champ_id(participant)
                col_num+=1
                input_data[row_num][col_num:col_num+6] = np.array(participant.champion.class_data)
                col_num+=6

                #Other Team's champion attributes
                if(team_map[participant.team_id] == 0):
                    input_data[row_num][col_num:col_num+6] = team_data[1,:]
                else:
                    input_data[row_num][col_num:col_num+6] = team_data[0,:]
                col_num+=6

                #My items
                for item_id in participant.final_build:
                    input_data[row_num][col_num] = item_id
                    col_num+=1

                #My Item purchases
                count = 0
                for item_purchase in (x for x in participant.item_events if item_purchased(x)):
                    if(count==5):
                        break
                    input_data[row_num][col_num] = item_purchase.payload['itemId']
                    col_num += 1
                    count += 1

                #Score
                #   Assume that average gold/sec is ~8
                #   Assume that average kda is ~2.6
                #   Have a game win worth some bonus
                score = participant.kda()*3 + participant.gold_earned/match.duration +  (4 if match.teams[str(participant.team_id)].won else 0)
                output_data[row_num] = score

                row_num+=1

        if(cache):
            print('Caching data...')
            self.cache_data((input_data, output_data))

        return (input_data, output_data)

    def cache_data(self, data):
        with open(self.CACHE_FILE, 'wb') as f:
            pickle.dump(data, f)

    def get_cached_data(self, num_rows):
        with open(self.CACHE_FILE, 'rb') as f:
            X, Y = pickle.load(f)
        return X[:num_rows], Y[:num_rows]  # slice rows, not the (X, Y) tuple itself

    def train(self, X, Y, train_ratio=1, **kwargs):

        print("Training model...")
        if(train_ratio==1):
            print("Using {} rows".format(len(X)))
            self.clf.fit(X,Y)
        else:
            n = len(X)
            tn = int(n*train_ratio)
            print("Using {} rows".format(tn))
            self.clf.fit(X[:tn,:],Y[:tn])
            print("Evaluating model...")
            evaluate_fit(self.clf, X[tn:,:],Y[tn:])

    def predict(self, X):
        return self.clf.predict(X)

    #MODEL EVALUATION
    def k_fold(self, folds, **kwargs):
        X, Y = self.get_data_sets(**kwargs)
        k_fold_evaluate(self.clf, X, Y, folds)

    #LOAD AND SAVE
    def save(self, filename):
        dirname = os.path.join(self.MODEL_PATH, filename)
        if(not os.path.exists(dirname)):
            os.makedirs(dirname)
        else:       #Empty folder
            for file in os.listdir(dirname):
                file_path = os.path.join(dirname, file)
                if os.path.isfile(file_path):
                    os.unlink(file_path)

        path = os.path.join(dirname, "{}.pkl".format(filename))
        joblib.dump(self.clf, path)

    def load(self, filename):
        path = os.path.join(self.MODEL_PATH, filename, "{}.pkl".format(filename))
        self.clf = joblib.load(path)
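
A hypothetical end-to-end use of ItemSetModel, assuming the surrounding Django project (settings, Match documents) is importable; the argument values are illustrative:

model = ItemSetModel()
X, Y = model.get_data_sets(num_matches=1000, cache=True)  # build feature matrix
model.train(X, Y, train_ratio=0.8)                        # holds out 20% for evaluate_fit
model.save('rf_itemsets')                                 # persist with joblib

scorer = ItemSetModel()
scorer.load('rf_itemsets')
print(scorer.predict(X[:5]))                              # predicted item-set scores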
コード例 #48
0
# Initiate the monthly trade object
monthData = trade_model.monthlyModel(1, 2009, 6, 2013, 6, 2012, 6, 2013)
# Download data from Yahoo finance
monthData.monthlyDataDownload()
# Pre-processing of training and testing data
monthData.trainFeaturePre()
# Read pre-processed data from hard drive
# monthData.trainFeaturePreHd()
# Number of training months
trainSpan = len(monthData.xTrain[:,0,0]) - monthData.testSpan
# Initiate a random forest regressor
clf = RandomForestRegressor(n_estimators=10)
#
totalReturn = 1
predictedReturn = np.zeros(monthData.stockNum)
monthlyReturn = np.zeros(monthData.testSpan)
aggReturn = np.zeros(monthData.testSpan+1)
aggReturn[0] = 1
# rolling training and testing
for j in range(0, monthData.testSpan):
    for i in range(0, monthData.stockNum):
        clf.fit(monthData.xTrain[j:trainSpan+j, :, i], monthData.yTrain[j:trainSpan+j, 0, i])
        predictedReturn[i] = clf.predict(monthData.xTest[j, :, i].reshape(1, -1))[0]  # predict expects a 2D array
    monthlyReturn[j] = monthData.por10Returns(j, predictedReturn)
    totalReturn = totalReturn * (monthlyReturn[j]+1)  # compound the overall return across months
    aggReturn[j+1] = aggReturn[j]*(1+monthlyReturn[j])

print monthlyReturn
print 'overall:', totalReturn
sp.portfolioVSspy(6, 2012, 6, 2013, aggReturn[1:])
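
The loop above is a walk-forward (rolling-window) scheme: each test month j is predicted by a forest refit on the trainSpan months ending just before it. A generic sketch of the same scheme on made-up data (shapes and spans are assumptions):

import numpy as np
from sklearn.ensemble import RandomForestRegressor

X = np.random.rand(60, 5)   # 60 periods x 5 features (illustrative)
y = np.random.rand(60)
train_span, test_span = 48, 12

clf = RandomForestRegressor(n_estimators=10, random_state=0)
preds = np.zeros(test_span)
for j in range(test_span):
    clf.fit(X[j:train_span + j], y[j:train_span + j])            # slide the window
    preds[j] = clf.predict(X[train_span + j].reshape(1, -1))[0]  # one-step-ahead forecast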
コード例 #49
0
        for j in range(0, 15):
            s = s + '' + str(valoresVol[i + j]) + ','
            d.append(valoresVol[i + j])
        #maxValores = max(valoresCopiar[longValores:i+j]) # This changes because we must not have those values yet
        #maxValores = max(valoresCopiar[i+j:i+j+5])
        maxValores = max(valoresCopiar[i + 14 + longValoresTest:i +
                                       longValoresTest + 14 + 5])
        #s = s + str(150)
        #d.append(150)
        X_test.append(d)
        y_test.append(maxValores)

    # Train the models
    regr = RandomForestRegressor()
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)

    regr2 = SVR(kernel='rbf', C=1e3, gamma=0.1)
    regr2.fit(X_train, y_train)
    y_pred2 = regr2.predict(X_test)

    regr3 = linear_model.LinearRegression()
    regr3.fit(X_train, y_train)
    y_pred3 = regr3.predict(X_test)

    # Voting
    # If every model agrees => invest
    votacion = 0
    max_pred_array = [max(y_pred), max(y_pred2), max(y_pred3)]
    for i_voto in max_pred_array:
        ganancia_pred = (i_voto - cierreEvaluar) / cierreEvaluar
コード例 #50
0
for i in range(10):
    X, y = shuffle(boston.data, boston.target)
    X_train, y_train = X[:offset], y[:offset]
    X_test, y_test = X[offset:], y[offset:]
    regressor = GradientBoostingRegressor(max_depth=20, n_estimators=140)
    regressor2 = DecisionTreeRegressor(max_depth=6)
    regressor3 = LinearRegression()
    regressor4 = RandomForestRegressor()
    regressor.fit(X_train, y_train)
    regressor2.fit(X_train, y_train)
    regressor3.fit(X_train, y_train)
    regressor4.fit(X_train, y_train)
    y_pred = regressor.predict(x)
    y_pred2 = regressor2.predict(x)
    y_pred3 = regressor3.predict(x)
    y_pred4 = regressor4.predict(x)
    predictions.append(y_pred)
    predictions2.append(y_pred2)
    predictions3.append(y_pred3)
    predictions4.append(y_pred4)
    print "\nPrediction = " + str(y_pred)
    print "Prediction = " + str(y_pred2)
    print "Prediction = " + str(y_pred3)
    print "Prediction = " + str(y_pred4)

print '\n'
print 'Boosting max', np.max(predictions), 'min', np.min(predictions), 'variance', np.max(predictions) - np.min(predictions)
print 'Decision tree max', np.max(predictions2), 'min', np.min(predictions2), 'variance', np.max(predictions2) - np.min(predictions2)
print 'Random forest max', np.max(predictions4), 'min', np.min(predictions4), 'variance', np.max(predictions4) - np.min(predictions4)
print 'Linear regression max', np.max(predictions3), 'min', np.min(predictions3), 'variance', np.max(predictions3) - np.min(predictions3)
コード例 #51
0
ファイル: ex15.py プロジェクト: gabormakrai/landuseregression
for v in timestampDoubleData:
    timestampData2.append(str(int(v)))

# modelling
for location in locations:

    trainX, testX, trainY, testY, trainTimestamp, testTimestamp = splitDataForXValidation(
        location, "location", data, featureTW, "target", timestampData)
    print("\tT+W (on data without ATC) #train: " + str(len(trainY)) +
          ", #test:" + str(len(testY)))
    model = RandomForestRegressor(min_samples_leaf=9,
                                  n_estimators=59,
                                  n_jobs=-1,
                                  random_state=42)
    model.fit(trainX, trainY)
    prediction = model.predict(testX)
    rmse = rmseEval(testY, prediction)[1]
    print("\trmse: " + str(rmse))
    for i in range(0, len(testY)):
        timestamp = testTimestamp[i]
        value = prediction[i]
        TWpredictionData[str(location)][timestamp] = value

    trainX, testX, trainY, testY, trainTimestamp, testTimestamp = splitDataForXValidation(
        location, "location", data2, featureTWAtc, "target", timestampData2)
    print("\tT+W+Atc #train: " + str(len(trainY)) + ", #test:" +
          str(len(testY)))
    model = RandomForestRegressor(min_samples_leaf=9,
                                  n_estimators=59,
                                  n_jobs=-1,
                                  random_state=42)
コード例 #52
0
for i in range(10):
    X, y = shuffle(boston.data, boston.target)
    X_train, y_train = X[:offset], y[:offset]
    X_test, y_test = X[offset:], y[offset:]
    regressor = GradientBoostingRegressor(max_depth=20, n_estimators=140)
    regressor2 = DecisionTreeRegressor(max_depth=6)
    regressor3 = LinearRegression()
    regressor4 = RandomForestRegressor()
    regressor.fit(X_train, y_train)
    regressor2.fit(X_train, y_train)
    regressor3.fit(X_train, y_train)
    regressor4.fit(X_train, y_train)
    y_pred = regressor.predict(x)
    y_pred2 = regressor2.predict(x)
    y_pred3 = regressor3.predict(x)
    y_pred4 = regressor4.predict(x)
    predictions.append(y_pred)
    predictions2.append(y_pred2)
    predictions3.append(y_pred3)
    predictions4.append(y_pred4)
    print "\nPrediction = " + str(y_pred)
    print "Prediction = " + str(y_pred2)
    print "Prediction = " + str(y_pred3)
    print "Prediction = " + str(y_pred4)

print '\n'
print 'Boosting max', np.max(predictions), 'min', np.min(
    predictions), 'variance', np.max(predictions) - np.min(predictions)
print 'Decision tree max', np.max(predictions2), 'min', np.min(
    predictions2), 'variance', np.max(predictions2) - np.min(predictions2)
print 'Random forest max', np.max(predictions4), 'min', np.min(
    predictions4), 'variance', np.max(predictions4) - np.min(predictions4)
print 'Linear regression max', np.max(predictions3), 'min', np.min(
    predictions3), 'variance', np.max(predictions3) - np.min(predictions3)
コード例 #53
0
                                                       feature_index,
                                                       target_index)

    clf_adaboost = AdaBoostRegressor(DecisionTreeRegressor(max_depth=8), n_estimators=50,
                                                           loss='linear', random_state=0)
    clf_extra_trees = ExtraTreesRegressor(n_estimators=8, random_state=0, max_depth=30)
    clf_random_forest = RandomForestRegressor(n_estimators=8, random_state=0, max_depth=30)

    clf_adaboost.fit(all_data_test.T, all_targets_test[0])
    predicted = clf_adaboost.predict(all_data_valid.T)

    clf_extra_trees.fit(all_data_test.T, all_targets_test[0])
    predicted_extra = clf_extra_trees.predict(all_data_valid.T)

    clf_random_forest.fit(all_data_test.T, all_targets_test[0])
    predicted_forest = clf_random_forest.predict(all_data_valid.T)


    delta_ada = all_targets_valid[0] - predicted
    delta_extra = all_targets_valid[0] - predicted_extra
    delta_forest = all_targets_valid[0] - predicted_forest
    std_ada = get_standart_deviation(delta_ada)
    std_extra = get_standart_deviation(delta_extra)
    std_forest = get_standart_deviation(delta_forest)

    plt.hist(delta_ada, bins=150, color='g', label='Adaboost '+str(np.round(std_ada,4)))
    plt.hist(delta_extra, bins=150, color='b', label='Extra_Trees '+str(np.round(std_extra,4)))
    plt.hist(delta_forest, bins=150, color='r', label='Random_Forest '+str(np.round(std_forest,4)))
    title = "Compare adaboost, extra_tree and Random_Forests"
    plt.title(title)
    plt.legend(loc='upper left')
コード例 #54
0
    
    #test = test.join(pd.DataFrame(test.Date.apply(splitTime).tolist(), columns = ['year','mon','day']))
    #newtest = test.drop('StateHoliday',1).join(pd.get_dummies(test['StateHoliday']).rename(columns=lambda x: 'StateHoliday' +"_"+str(x)))  
    #newtest = pd.merge(newtest,store, on="Store")
    #newtest.drop(['Date'],axis = 1,inplace=True) 
    
    #assert(np.sum(newtrain.var()==0)==0)
    #
    #toDrop = list(set(newtrain.columns.values)-set(newtest.columns.values) )
    features = [col for col in newtrain.columns if col not in ['Customers', 'Sales', 'Date','LogSale','datetimes']]
    #
    rf = RandomForestRegressor(n_estimators=100)
    print('Starting training...')
    rf.fit(newtrain[features].fillna(-1),newtrain.LogSale)
    print('Predicting train values...')
    newtrain['mypred'] = rf.predict(newtrain[features].fillna(-1))
    newtrain['mypred'] = np.exp(newtrain['mypred'])-1
    train_error = rmspe(newtrain[newtrain.Sales>0].Sales,newtrain[newtrain.Sales>0].mypred)
    print('train set error',train_error)
    newtest['mypred'] = rf.predict(newtest[features].fillna(-1))
    newtest['mypred'] = np.exp(newtest['mypred'])-1
    test_error = rmspe(newtest[newtest.Sales>0].Sales,newtest[newtest.Sales>0].mypred)
    print('test set error',test_error)
    train_results.append(train_error)
    test_results.append(test_error)

print('mean train error', np.mean(train_results))
print('mean test error',np.mean(test_results))
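
The Rossmann-style snippet above fits on a log target (LogSale, presumably log1p(Sales)) and inverts predictions with exp(pred) - 1 before scoring. A hedged sketch of that round trip using a common RMSPE definition (the project's own rmspe may differ):

import numpy as np

def rmspe(y_true, y_pred):
    # Root mean square percentage error; only meaningful where y_true > 0
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.sqrt(np.mean(((y_true - y_pred) / y_true) ** 2))

sales = np.array([100., 250., 80.])
log_sale = np.log1p(sales)        # target used for fitting: log(1 + Sales)
pred_log = log_sale + 0.05        # stand-in for rf.predict(...)
pred_sales = np.expm1(pred_log)   # invert: exp(pred) - 1
print(rmspe(sales, pred_sales))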