# Hyperparameter search: 5-fold cross-validation over the ridge
# regularization factor alpha, scored with mean R^2 on the held-out fold.
scores_dict = {}
folds = [fold_1, fold_2, fold_3, fold_4, fold_5]

# Pre-build the train/validation splits once: each fold takes a turn as the
# validation set while the remaining four are unioned into the train set.
merged_folds = []
for val_idx, val_fold in enumerate(folds):
    # Exclude the validation fold by position, not by object comparison,
    # which is more robust than the `f != folds[i]` identity test.
    train_df = unionAll(*[f for i, f in enumerate(folds) if i != val_idx])
    # coalesce + persist: the merged train set is reused once per alpha,
    # so materialize it instead of recomputing the union every iteration.
    train_df = train_df.coalesce(200).persist(StorageLevel.MEMORY_AND_DISK)
    merged_folds.append({'train': train_df, 'val': val_fold})

for alpha in [0.01, 0.1, 1]:
    utils.printNowToFile('trying alpha = ' + str(alpha))
    # NOTE(review): features_column is not defined above this point in the
    # visible chunk -- presumably assigned earlier in the file; confirm.
    fold_scores = []
    srrcv = rr.SparkRidgeRegression(reg_factor=alpha)
    for mf in merged_folds:
        srrcv.fit(mf['train'], features_column)
        result = srrcv.predict_many(mf['val'], features_column, 'target_predictions')
        fold_scores.append(srrcv.r2(result.select('PINCP', 'target_predictions')))
    # Accumulate in a plain list and average once -- avoids the quadratic
    # np.append-in-a-loop pattern of the original.
    scores_dict[alpha] = np.mean(fold_scores)

for k in scores_dict:
    utils.printNowToFile('alpha ' + str(k) + ' - r2 score ' + str(scores_dict[k]))

# Release the cached train splits now that the search is over; the original
# persisted them and never unpersisted (executor-memory leak).
for mf in merged_folds:
    mf['train'].unpersist()

# Keep the alpha with the highest mean validation R^2.
best_alpha = max(scores_dict, key=scores_dict.get)
# Keep only the label column and the standardized feature vector.
final_columns = [target, 'features_std']

# Drop useless features
utils.printNowToFile("dropping useless columns:")
train_set = train_set.select(final_columns)
test_set = test_set.select(final_columns)

###############################################################

for features_column in [col for col in final_columns if col != target]:
    utils.printNowToFile("starting SparkRidgeRegression:")
    train_set = train_set.persist(StorageLevel.DISK_ONLY)

    utils.printNowToFile("pre srr fit:")
    # Use the regularization factor selected by the cross-validation above;
    # the original hard-coded reg_factor=1, leaving best_alpha unused.
    srr = rr.SparkRidgeRegression(reg_factor=best_alpha)
    srr.fit(train_set, features_column)
    utils.printNowToFile("post srr fit:")

    # Score the custom ridge model on the test set with R^2.
    result = srr.predict_many(test_set, features_column, 'target_predictions')
    utils.printToFile('result: {0}'.format(srr.r2(result.select('PINCP', 'target_predictions'))))

    # Reference model: Spark MLlib linear regression with pure L2 penalty
    # (elasticNetParam=0.0 -> ridge), configured with the same regularization
    # strength so the comparison against srr is like-for-like.
    utils.printNowToFile("starting linear transform:")
    lin_reg = LinearRegression(
        standardization=False,
        featuresCol=features_column,
        labelCol='PINCP',
        maxIter=10,
        regParam=best_alpha,
        elasticNetParam=0.0,
        fitIntercept=True,
    )