# Train a logistic-regression classifier on the cleaned cooking data,
# report 5-fold cross-validated accuracy, then refit on the full training
# set and write predictions for the test set to sub_log_mult.csv.
#
# The script is written to run unchanged on Python 2 and Python 3: every
# print is a call with a single argument, which produces the same output
# under both interpreters without needing a __future__ import.
import numpy as np

import cooking_util
from sklearn.linear_model import LogisticRegression

# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed
# in 0.20; sklearn.model_selection provides the same cross_val_score.
# Prefer the new module and fall back to the old one so that legacy
# installations keep working. The local name stays `cross_validation`.
try:
    from sklearn import model_selection as cross_validation
except ImportError:
    from sklearn import cross_validation

# Load the raw data and clean it with the project helpers.
# NOTE(review): the exact cleaning performed by clean_data_bow_mult is
# defined in cooking_util and is not visible here.
train_set, test_set = cooking_util.load_data()
train_set_c, test_set_c = cooking_util.clean_data_bow_mult(train_set, test_set)

# The same mapper instance is reused at submission time below.
dm = cooking_util.Data_mapper()
X_train, y_train = dm.make_train_arrays(train_set_c)

clf = LogisticRegression()

# implements 5-fold cross validation
fold_scores = cross_validation.cross_val_score(clf, X_train, y_train, cv=5)
# average accuracy over the folds
mean_fold_score = fold_scores.mean()
# standard deviation of accuracy over the folds
mean_fold_std = fold_scores.std()
# The reported spread is two standard deviations.
print("Cross-val accuracy: %0.5f (+/- %0.5f)" % (mean_fold_score, mean_fold_std * 2))

# fit to the entire training dataset
clf.fit(X_train, y_train)

print("writing submission file to sub_log_mult.csv")
cooking_util.submit(clf.predict, test_set_c, dm, filename='sub_log_mult.csv')
print("finished")
# Train a 1000-tree random forest on the cleaned cooking data, report
# 5-fold cross-validated accuracy, then refit on the full training set and
# write predictions for the test set to sub_rf_bow_mult_moar.csv.
#
# The script is written to run unchanged on Python 2 and Python 3: every
# print is a call with a single argument, which produces the same output
# under both interpreters without needing a __future__ import.
import numpy as np

import cooking_util
from sklearn.ensemble import RandomForestClassifier

# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed
# in 0.20; sklearn.model_selection provides the same cross_val_score.
# Prefer the new module and fall back to the old one so that legacy
# installations keep working. The local name stays `cross_validation`.
try:
    from sklearn import model_selection as cross_validation
except ImportError:
    from sklearn import cross_validation

# Load the raw data and clean it with the project helpers.
# NOTE(review): the meaning of min_recipes=50 is defined in cooking_util
# and is not visible here -- presumably a frequency cut-off; confirm.
train_set, test_set = cooking_util.load_data()
train_set_c, test_set_c = cooking_util.clean_data_bow_mult(
    train_set, test_set, min_recipes=50)

# The same mapper instance is reused at submission time below.
dm = cooking_util.Data_mapper()
X_train, y_train = dm.make_train_arrays(train_set_c)

# NOTE(review): no random_state is set, so scores and predictions can vary
# from run to run.
forest = RandomForestClassifier(n_estimators=1000)

# 5-fold cross validation; summarise the per-fold scores by mean and
# standard deviation.
fold_scores = cross_validation.cross_val_score(forest, X_train, y_train, cv=5)
mean_fold_score = fold_scores.mean()
mean_fold_std = fold_scores.std()
# The reported spread is two standard deviations.
print("Cross-val accuracy: %0.5f (+/- %0.5f)" % (mean_fold_score, mean_fold_std * 2))

# Refit on the entire training set before predicting the test set.
forest.fit(X_train, y_train)

print("writing submission file to sub_rf_bow_mult_moar.csv")
cooking_util.submit(forest.predict, test_set_c, dm, filename='sub_rf_bow_mult_moar.csv')
print("finished")