def compare_change(data):
    """
    Run multiple models: fit on training data, predict for tournament data.
    Then change the data, rerun and compare performance with and without the
    change.
    """

    print('\nStandard dataset:\n')

    # we'll look at 5 models
    prediction = nx.production(nx.logistic(), data, verbosity=1)
    prediction += nx.production(nx.extratrees(), data, verbosity=1)
    prediction += nx.production(nx.randomforest(), data, verbosity=1)
    prediction += nx.production(nx.mlpc(), data, verbosity=1)
    prediction += nx.production(nx.logisticPCA(), data, verbosity=1)

    # let's now make a change, could be anything; as an example let's add
    # the square of each feature to the dataset
    x = np.hstack((data.x, data.x * data.x))
    data2 = data.xnew(x)

    print('\nDataset expanded with squared features:\n')

    # rerun all models with the new expanded data
    prediction2 = nx.production(nx.logistic(), data2, verbosity=1)
    prediction2 += nx.production(nx.extratrees(), data2, verbosity=1)
    prediction2 += nx.production(nx.randomforest(), data2, verbosity=1)
    prediction2 += nx.production(nx.mlpc(), data2, verbosity=1)
    prediction2 += nx.production(nx.logisticPCA(), data2, verbosity=1)

    # compare performance
    print('\nCompare (1 is regular dataset; 2 expanded dataset):\n')
    print(prediction.compare(data['validation'], prediction2))

def improve_model(data, tournament='kazutsugi'):
    """
    Run multiple models: fit on training data, predict for tournament data.
    Then change the data, rerun and compare performance with and without the
    change.
    """

    # we'll look at 5 models
    models = [nx.linear(), nx.extratrees(), nx.randomforest(), nx.mlpc(),
              nx.linearPCA()]

    print('\nStandard dataset:\n')

    # first run the base case
    prediction = nx.production(models, data, tournament, verbosity=1)

    # let's now make a change, could be anything; as an example let's add
    # the square of each feature to the dataset
    x = np.hstack((data.x, data.x * data.x))
    data2 = data.xnew(x)

    print('\nDataset expanded with squared features:\n')

    # rerun all models with the new expanded data
    prediction2 = nx.production(models, data2, tournament, verbosity=1)

    # compare performance
    print('\nCompare (1 is regular dataset; 2 expanded dataset):\n')
    print(prediction.compare(data['validation'], prediction2, tournament))

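# A minimal usage sketch, not part of the original examples: it assumes the
# standard numerox download/load helpers (nx.download, nx.load_zip) and a
# local zip file name chosen here for illustration.
def improve_model_example():
    nx.download('numerai_dataset.zip')         # fetch the current dataset
    data = nx.load_zip('numerai_dataset.zip')  # load it into a nx.Data object
    improve_model(data, tournament='kazutsugi')
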
def concordance(data, tournament='bernie'):
    """
    Example showing how to calculate concordance.

    Concordance must be less than 0.12 to pass numerai's check. For an
    accurate concordance calculation `data` must be the full dataset.
    """
    models = [nx.logistic(), nx.extratrees(), nx.mlpc()]
    p = nx.production(models, data, tournament)
    print("\nA concordance less than 0.12 is passing")
    print(p.concordance(data))

def concordance_example(data):
    """
    Example showing how to calculate concordance.

    Concordance must be less than 0.12 to pass numerai's check. For an
    accurate concordance calculation `data` must be the full dataset.
    """
    prediction = nx.production(nx.logistic(), data)
    prediction += nx.production(nx.extratrees(), data)
    prediction += nx.production(nx.mlpc(), data)
    print("\nA concordance less than 0.12 is passing")
    print(prediction.concordance(data))

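# Quick smoke test of the concordance examples above (an assumption, not part
# of the original file): nx.play_data() returns a small sample dataset, so the
# resulting concordance is only a rough check; as the docstrings note, a
# passing value should be confirmed on the full dataset.
def concordance_smoke_test():
    data = nx.play_data()
    concordance_example(data)
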
def get_models():
    models = [nx.logistic(),
              nx.extratrees(),
              nx.randomforest(),
              nx.mlpc(),
              nx.logisticPCA(),
              nx.example_predictions(),
              fifty()]
    return models

def get_models():
    models = [nx.linear(),
              nx.ridge_mean(),
              nx.extratrees(),
              nx.randomforest(),
              nx.mlpc(),
              nx.linearPCA(),
              nx.example_predictions(),
              nx.fifty()]
    return models

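# Sketch (an assumption, not from the original file) showing how a model list
# returned by get_models() can be run in one call, mirroring the way
# improve_model passes a list of models to nx.production.
def run_all_models(data, tournament='kazutsugi'):
    models = get_models()
    prediction = nx.production(models, data, tournament, verbosity=1)
    return prediction
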
def runner_example():

    data = nx.play_data()
    splitter = nx.CVSplitter(data)

    # let's run 3 models
    m1 = {'model': nx.logistic(), 'prediction_file': None, 'csv_file': None}
    m2 = {'model': nx.logistic(1e-4)}
    m3 = {'model': nx.extratrees()}
    run_list = [m1, m2, m3]

    # we won't save anything, just display the results
    runner = nx.Runner(run_list, splitter, verbosity=1)
    runner.run()

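# Sketch of the same runner setup but with results saved to disk; the
# 'prediction_file' and 'csv_file' keys are the ones shown in m1 above, here
# pointing at hypothetical paths instead of None.
def runner_save_example():
    data = nx.play_data()
    splitter = nx.CVSplitter(data)
    m1 = {'model': nx.logistic(),
          'prediction_file': 'logistic.pred',  # hypothetical output path
          'csv_file': 'logistic.csv'}          # hypothetical output path
    runner = nx.Runner([m1], splitter, verbosity=1)
    runner.run()
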
def compare_models(data):
    """
    Run multiple models: fit on training data, predict for tournament data.
    Then compare the performance of the models.
    """

    # we'll look at 5 models
    prediction = nx.production(nx.logistic(), data, verbosity=1)
    prediction += nx.production(nx.extratrees(), data, verbosity=1)
    prediction += nx.production(nx.randomforest(), data, verbosity=1)
    prediction += nx.production(nx.mlpc(), data, verbosity=1)
    prediction += nx.production(nx.logisticPCA(), data, verbosity=1)

    # correlation of models with logistic regression
    print('\nCorrelation:\n')
    prediction.correlation('logistic')

    # compare performance of models
    print('\nPerformance comparison:\n')
    print(prediction.performance(data['validation'], sort_by='logloss'))

    # dominance of models
    print('\nModel dominance:\n')
    print(prediction.dominance(data['validation'], sort_by='logloss'))

    # dominance between two models
    print('\nModel dominance between two models:\n')
    df = prediction[['logistic', 'logisticPCA']].dominance(data['validation'])
    print(df)

    # originality given that logistic model has already been submitted
    print('\nModel originality (versus logistic):\n')
    print(prediction.originality(['logistic']))

    # concordance
    print('\nConcordance:\n')
    print(prediction.concordance(data))

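# HAS_XGBOOST is used by the get_models variant below but is not defined in
# this section; a typical guard (an assumption, if it is not already set
# elsewhere in the module) checks whether the optional xgboost dependency is
# importable:
try:
    import xgboost  # noqa: F401
    HAS_XGBOOST = True
except ImportError:
    HAS_XGBOOST = False
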
def get_models():
    models = [nx.logistic(), nx.extratrees(), nx.randomforest()]
    if HAS_XGBOOST:
        models.append(nx.xgboost())
    return models