def main():
    """Load the persisted model, score the test set, and write the
    submission CSV to <path>/out/predict_output.csv.

    Relies on project helpers:
      - dataut.getVars() -> (path, target)
      - dataut.getData(path) -> merged DataFrame containing Recipe_code
    """
    path, target = dataut.getVars()

    # Load the held-out test set.
    X_test = pd.read_csv(path + '/data/test.csv')
    # Load the original merged set (used to build the submission frame).
    data = dataut.getData(path)

    # Restore the model persisted by the training pipeline.
    loaded_model = joblib.load(path + '/meta/finalized_model.sav')

    # Engineered interaction feature the model expects.
    X_test['priceXing_count'] = X_test['price'] * X_test['ingredients_count']

    # Predict score on the test set.
    predictions = loaded_model.predict(X_test)

    # .copy() avoids pandas' SettingWithCopyWarning (and potentially lost
    # writes) when adding a column to a row-filtered view below.
    result_df = data.loc[data.Recipe_code.isin(X_test.Recipe_code)].copy()
    result_df['predicted_score'] = predictions

    # Keep only the columns needed for the submission file.
    submission_df = result_df[['Recipe_code', 'predicted_score']]
    submission_df.to_csv(path + '/out/predict_output.csv')
def getPipeline():
    """Build the scoring Pipeline: feature engineering then a
    gradient-boosted regressor.

    Returns:
        sklearn Pipeline with steps ('fadd', FeatureAdd()) and
        ('gbr', GradientBoostingRegressor()).
    """
    path, target = dataut.getVars()
    # NOTE(review): the original bound the result of preProcessData to an
    # unused local X. The call is kept in case it has side effects (e.g.
    # writing processed data to disk) — TODO confirm and drop if it does not.
    dataut.preProcessData(path)

    return Pipeline([('fadd', FeatureAdd()),
                     ('gbr', GradientBoostingRegressor())])
def main():
    """Grid-search the pipeline, then persist the best parameter set both as
    a human-readable text report and as a pickle for later model rebuilds.
    """
    # Get Pipeline components.
    pipeline = modelut.getPipeline()

    # Get parameter options for Pipeline components.
    parameters = modelut.getParameters()

    # Get best set of parameters and evaluate validation set accuracy.
    bestParameters = getBestParameters(pipeline, parameters)

    path, dependentVar = dataut.getVars()

    # Save the best parameter set as a readable report; 'with' guarantees
    # the file is closed even if a parameter key lookup raises.
    with open(path + "/meta/best_params_model.txt", 'w') as res:
        res.write('best parameters set:\n')
        for paramName in sorted(parameters.keys()):
            res.write('\t %s: %r\n' % (paramName, bestParameters[paramName]))

    # Persist the parameters so getPipeline() can reload them later.
    # The original passed open(...) inline and leaked the file handle.
    with open(path + "/meta/bestParams.pkl", "wb") as fh:
        joblib.dump(bestParameters, fh)
def getPipeline():
    """Rebuild the scoring Pipeline from the persisted best parameters.

    Returns:
        (pipe, parameters): the configured Pipeline and an empty parameter
        dict (GridSearchCV's signature requires a dict even when nothing
        is searched).
    """
    path, dependentVar = dataut.getVars()

    # Load best set of parameters; 'with' closes the handle the original
    # inline open(...) leaked.
    with open(path + "/meta/bestParams.pkl", "rb") as fh:
        bestParameters = joblib.load(fh)

    # BUG FIX: the original passed a dict *positionally*, which bound it to
    # GradientBoostingRegressor's first parameter (loss) instead of setting
    # the hyper-parameters. Pass them as keyword arguments.
    pipe = Pipeline([
        ('fadd', modelut.FeatureAdd()),
        ('gbr', GradientBoostingRegressor(
            n_estimators=bestParameters['gbr__n_estimators'],
            max_features=bestParameters['gbr__max_features'],
            max_depth=bestParameters['gbr__max_depth'],
            learning_rate=bestParameters['gbr__learning_rate'],
        )),
    ])

    # Empty dict: required by the syntax of GridSearchCV.
    parameters = {}
    return pipe, parameters
def getBestParameters(pipeline, parameters):
    """Grid-search `pipeline` over `parameters`, persist the fitted search,
    evaluate on the validation set, and run the feedback/retraining loop.

    Args:
        pipeline: sklearn Pipeline to tune.
        parameters: parameter grid for GridSearchCV.

    Returns:
        The best parameter set after the feedback retraining pass.
    """
    path, dependentVar = dataut.getVars()

    X = dataut.preProcessData(path)

    y = X.loc[:, dependentVar]

    # Create and fit a GBR model via grid search.
    # NOTE(review): GridSearchCV is reached through dataut here — confirm
    # it is actually re-exported by that module.
    grid = dataut.GridSearchCV(pipeline, parameters)
    grid.fit(X, y)

    # Summarize the results of the grid search.
    print(grid.best_score_)
    print(grid.best_estimator_.get_params())
    bestParameters = grid.best_estimator_.get_params()

    # Persist the fitted search so predictions can be made without refitting.
    filename = path + '/meta/finalized_model.sav'
    joblib.dump(grid, filename)

    # Evaluate performance of the grid search on the validation set.
    X_valid = pd.read_csv(path + '/data/validation.csv')
    y_valid = X_valid.loc[:, dependentVar]

    constant = 'Recipe_code'
    dependentVar = 'score'
    # Engineered interaction feature the model expects.
    X_valid[
        'priceXing_count'] = X_valid['price'] * X_valid['ingredients_count']
    # axis=1 (columns): the positional axis argument is deprecated in pandas.
    X_valid = X_valid.drop([dependentVar, constant], axis=1)

    # Make predictions on validation set and calculate best set of parameters.
    bestParameters, predictions = feedut.validate(parameters, grid, X_valid,
                                                  y_valid)

    # Build the feedback DataFrame: features, true target, prediction, and a
    # per-row correctness flag. Column names are kept exactly as before since
    # downstream consumers read this CSV — although "cuisine" looks like a
    # leftover from another project (the target here is 'score'); TODO confirm.
    valdf = pd.DataFrame(index=X_valid.index.values)
    valdf = valdf.join(X_valid)
    valdf["cuisine"] = y_valid
    valdf["pred_cuisine"] = predictions
    valdf["check"] = valdf.pred_cuisine == valdf.cuisine
    valdf.to_csv(path + "/out/feedback.csv")

    # Joint DataFrame to incorporate feedback data. As of now this only has
    # the training features and target.
    ultimateTraindf = pd.DataFrame(index=X.index.values)
    ultimateTraindf = ultimateTraindf.join(X)
    ultimateTraindf["cuisine"] = y
    # Retrain with feedback data and recompute the best set of parameters.
    bestParameters, predictions = feedut.feedback(pipeline, parameters,
                                                  ultimateTraindf)

    return bestParameters