Exemple #1
0
    # Cross-validate the `reg_factor` passed to rr.SparkRidgeRegression over
    # the five folds: for every fold, fit on the union of the other four and
    # score on the held-out one. `features_column` and the fold_* objects
    # come from the surrounding scope.
    scores_dict = {}  # alpha -> mean r2 score across the folds
    folds = [fold_1, fold_2, fold_3, fold_4, fold_5]

    # Build each train/validation split once, so the same persisted training
    # data is reused for every alpha tried below.
    merged_folds = []
    for held_out, val_fold in enumerate(folds):
        # Leave the held-out fold out by position instead of comparing the
        # fold objects with `!=`, so the split does not depend on how the
        # fold type implements comparison.
        folds_to_merge = [f for pos, f in enumerate(folds)
                          if pos != held_out]
        mf = unionAll(*folds_to_merge)
        mf = mf.coalesce(200)
        mf = mf.persist(StorageLevel.MEMORY_AND_DISK)
        merged_folds.append({'train': mf, 'val': val_fold})

    for alpha in [0.01, 0.1, 1]:
        utils.printNowToFile('trying alpha = ' + str(alpha))
        partial_scores = np.array([])  # one r2 score per fold
        srrcv = rr.SparkRidgeRegression(reg_factor=alpha)
        for mf in merged_folds:
            # The same estimator object is re-fitted on each training split.
            srrcv.fit(mf['train'], features_column)
            result = srrcv.predict_many(mf['val'], features_column,
                                        'target_predictions')
            partial_scores = np.append(
                partial_scores,
                srrcv.r2(result.select('PINCP', 'target_predictions')))
        final_score = np.mean(partial_scores)
        scores_dict[alpha] = final_score

    # Report the mean score obtained for every alpha that was tried.
    for k in scores_dict:
        utils.printNowToFile('alpha ' + str(k) + ' - r2 score ' +
                             str(scores_dict[k]))

    # The alpha with the highest mean score wins.
    best_alpha = max(scores_dict, key=scores_dict.get)
# Columns kept for modelling: the label (`target`) and the 'features_std'
# column (presumably the standardised feature vector - confirm upstream).
final_columns = [target, 'features_std']

# Drop every column that is not needed for training or evaluation.
utils.printNowToFile("dropping useless columns:")
train_set = train_set.select(final_columns)
test_set = test_set.select(final_columns)

###############################################################
# Run the models once per feature column, i.e. for every entry of
# `final_columns` other than the label.
for features_column in [name for name in final_columns if name != target]:

    utils.printNowToFile("starting SparkRidgeRegression:")

    # Re-assigned on every pass; the storage level requested is DISK_ONLY.
    train_set = train_set.persist(StorageLevel.DISK_ONLY)

    utils.printNowToFile("pre srr fit:")
    srr = rr.SparkRidgeRegression(reg_factor=1)
    srr.fit(train_set, features_column)
    utils.printNowToFile("post srr fit:")

    result = srr.predict_many(test_set, features_column, 'target_predictions')
    # Both frames were narrowed to `final_columns` above, so the label is
    # addressed through `target` rather than a hard-coded column name.
    utils.printToFile('result: {0}'.format(
        srr.r2(result.select(target, 'target_predictions'))))

    utils.printNowToFile("starting linear transform:")
    # Same features and label as the ridge model above; the constructor
    # arguments look like pyspark.ml's LinearRegression - confirm against the
    # file's imports.
    lin_reg = LinearRegression(standardization=False,
                               featuresCol=features_column,
                               labelCol=target,
                               maxIter=10,
                               regParam=1,
                               elasticNetParam=0.0,
                               fitIntercept=True)