Ejemplo n.º 1
0
def _selectColumns(rows, columns):
    """Return new rows that keep only the positions flagged truthy in ``columns``.

    Positions ``0 .. len(all_columns) - 1`` are considered (``all_columns`` is
    a module-level name); the input rows are not modified.
    """
    keep = [i for i in range(len(all_columns)) if columns[i]]
    return [[row[i] for i in keep] for row in rows]


def evalColumns(columns):
    """Evaluate a column mask for the model-selection classifier.

    For every location in the module-level ``locations``:

    1. One ``RandomForestRegressor`` per datagroup in ``topDatagroups`` is
       trained via ``splitDataForXValidation`` and its predictions on the
       held-out location are stored in ``testPreds`` (keyed by tag).
    2. For every other location (``location2``) the regressors are re-trained
       on the ``trainX1`` part returned by ``splitDataForXValidationSampled2``
       and their predictions on ``trainX2`` are collected in ``trainPreds``.
    3. Each ``trainX2`` row is labelled with the index (into ``topTags``) of
       the regressor with the smallest absolute error on that row.
    4. A ``RandomForestClassifier`` is trained on the ``trainX2`` rows
       (``all_features`` plus one appended prediction per tag), restricted to
       the positions enabled in ``columns``, to predict that label.
    5. On the held-out location the classifier picks, row by row, which
       regressor's prediction to use; the RMSE of that choice is printed.

    Args:
        columns: indexable of truthy/falsy flags, read at positions
            ``0 .. len(all_columns) - 1``; a truthy flag keeps that position
            of the augmented feature row.
            NOTE(review): presumably ``all_columns`` is ``all_features``
            followed by one prediction column per tag -- confirm.

    Returns:
        Element ``[1]`` of ``rmseEval`` computed over the held-out targets and
        final predictions of all locations.
    """
    overallY = []
    overallPred = []

    for location in locations:
        location2s = [l for l in locations if l != location]

        print("Location: " + str(location) + ", location2: " + str(location2s))

        # First stage: per-datagroup predictions on the held-out location.
        testPreds = {}
        for datagroup in topDatagroups:
            tag, features = getTagAndFeatures(datagroup)
            trainX, testX, trainY, _ = splitDataForXValidation(location, "location", data, features, "target")

            model = RandomForestRegressor(min_samples_leaf=9, n_estimators=59, n_jobs=-1, random_state=42)
            model.fit(trainX, trainY)
            testPreds[tag] = model.predict(testX)

        # First stage again, per second held-out location: predictions on the
        # trainX2 part become the training signal for the classifier.
        trainPreds = defaultdict(list)
        for datagroup in topDatagroups:
            tag, features = getTagAndFeatures(datagroup)
            print("\ttag: " + str(tag) + ", features: " + str(features))
            for location2 in location2s:
                trainX1, trainX2, trainY1, trainY2, testX, testY = splitDataForXValidationSampled2(location, location2, "location", data, features, "target")
                model = RandomForestRegressor(min_samples_leaf=9, n_estimators=59, n_jobs=-1, random_state=42)
                model.fit(trainX1, trainY1)
                train1Prediction = model.predict(trainX1)
                train2Prediction = model.predict(trainX2)
                testPrediction = model.predict(testX)
                train1Rmse = str(rmseEval(trainY1, train1Prediction)[1])
                train2Rmse = str(rmseEval(trainY2, train2Prediction)[1])
                testRmse = str(rmseEval(testY, testPrediction)[1])
                print("\t\ttrain1 rmse: " + train1Rmse)
                print("\t\ttrain2 rmse: " + train2Rmse)
                print("\t\ttest rmse: " + testRmse)
                trainPreds[tag].extend(train2Prediction)

        # Targets of all trainX2 parts, concatenated in the same location2
        # order used when filling trainPreds.
        combinedTrain2Y = []
        for location2 in location2s:
            _, _, _, trainY2, _, _ = splitDataForXValidationSampled2(location, location2, "location", data, all_features, "target")
            combinedTrain2Y.extend(trainY2)

        # Label = index of the regressor with the smallest absolute error;
        # ties resolve to the earliest tag in topTags.
        labelTrain2Y = []
        for i, actual in enumerate(combinedTrain2Y):
            errors = [abs(actual - trainPreds[tag][i]) for tag in topTags]
            labelTrain2Y.append(errors.index(min(errors)))

        # Held-out rows AND targets come from the same kind of split that
        # produced testPreds, so finalPrediction and testY stay aligned
        # (previously testY leaked out of the sampled-split loop above).
        _, testX, _, testY = splitDataForXValidation(location, "location", data, all_features, "target")

        # Second-stage training rows: all trainX2 parts, in location2 order.
        tX2 = []
        for location2 in location2s:
            _, trainX2, _, _, _, _ = splitDataForXValidationSampled2(location, location2, "location", data, all_features, "target")
            tX2.extend(trainX2)

        # Append every regressor's prediction as extra feature columns
        # (rows are extended in place).
        for tag in topTags:
            for i, p in enumerate(trainPreds[tag]):
                tX2[i].append(p)

        model = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=15)
        model.fit(_selectColumns(tX2, columns), labelTrain2Y)

        for tag in topTags:
            for i, p in enumerate(testPreds[tag]):
                testX[i].append(p)

        pred = model.predict(_selectColumns(testX, columns))

        # For each held-out row use the prediction of the regressor the
        # classifier selected.
        finalPrediction = [testPreds[topTags[pred[i]]][i] for i in range(len(testY))]
        print("\tRMSE: " + str(rmseEval(testY, finalPrediction)[1]))

        overallY.extend(testY)
        overallPred.extend(finalPrediction)

    # Same (truth, prediction) argument order as every other rmseEval call.
    return rmseEval(overallY, overallPred)[1]
Ejemplo n.º 2
0
            break
 
# For every location, train one regressor per top-10 datagroup and collect its
# predictions on the second training part and on the test part, keyed by tag.
# NOTE(review): the collected dicts and t2Y / tY are not read in the visible
# part of this loop; presumably they are consumed by code that follows.
for location in locations:
    # Only the first location different from the current one is used.
    location2 = [l for l in locations if l != location][0]
    
    print("Location: " + str(location) + ", location2: " + str(location2))
     
    trainPreds = {}  # tag -> predictions on trainX2
    testPreds = {}  # tag -> predictions on testX
    t2Y = None  # trainY2 of the most recently processed datagroup
    tY = None  # testY of the most recently processed datagroup
     
    for datagroup in top10datagroups:
        tag, features = getTagAndFeatures(datagroup)
        print("\ttag: " + str(tag) + ", features: " + str(features))
        trainX1, trainX2, trainY1, trainY2, testX, testY = splitDataForXValidationSampled2(location, location2, "location", data, features, "target")
        # Overwritten on every iteration; only the last datagroup's targets remain.
        t2Y = trainY2
        tY = testY
        # The model is fitted on the first training part only; the second
        # training part and the test part are only predicted on.
        model = RandomForestRegressor(min_samples_leaf = 9, n_estimators = 59, n_jobs = -1, random_state=42)                    
        model.fit(trainX1, trainY1)
        train1Prediction = model.predict(trainX1)
        train2Prediction = model.predict(trainX2)
        testPrediction = model.predict(testX)
        # rmseEval's result is indexed with [1] to obtain the value that is printed.
        train1Rmse = str(rmseEval(trainY1, train1Prediction)[1])
        train2Rmse = str(rmseEval(trainY2, train2Prediction)[1])
        testRmse = str(rmseEval(testY, testPrediction)[1])
        print("\t\ttrain1 rmse: " + train1Rmse)
        print("\t\ttrain2 rmse: " + train2Rmse)
        print("\t\ttest rmse: " + testRmse)
        trainPreds[tag] = train2Prediction
        testPreds[tag] = testPrediction