def main():
    # df = data.read_visited_key_points('Fri', grouped=True, extra=['category'])
    # categories = ['Thrill Rides', 'Kiddie Rides', 'Rides for Everyone', 'Shows & Entertainment', 'Shopping']
    # df = df[df['category'].isin(categories)].sort_values('Timestamp')
    #
    # prev = df[df['Timestamp'] <= '2014-06-06 12'].groupby('group_id').last()
    # next = df[df['Timestamp'] > '2014-06-06 12'].groupby('group_id').first()

    categories = ['Thrill Rides', 'Kiddie Rides', 'Rides for Everyone', 'Shows & Entertainment', 'Shopping']
    x, y, prev, ids = pp.get_bag_data(['Fri'], 11, categories, return_prev=True, return_ids=True)

    # Discard the day column because we only have one day of data.
    ids = ids['group_id'].values
    # Clamp x values to 0 or 1 (presence instead of visit counts).
    x = (x > 0).astype('int64')

    x_train, x_test, y_train, y_test, prev_train, prev_test, ids_train, ids_test = (
        cross_validation.train_test_split(x, y, prev, ids, train_size=0.25, random_state=2294967295)
    )

    print('Predicting')
    predictor = ensemble.RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
    predictor.fit(x_train, y_train)
    y_pred = predictor.predict(x_test)

    print('Plotting')
    fig1, ax1 = plt.subplots()
    fig2, ax2 = plt.subplots()
    axs = [ax1, ax2]

    sizes = [get_max_move_size(prev_test, y, ids_test) for y in [y_test, y_pred]]
    max_size = max(sizes)

    axs[0].set_title('Actual Data')
    plot_next_place(prev_test, y_test, ids_test, ax=axs[0])
    axs[1].set_title('Predicted')
    plot_next_place(prev_test, y_pred, ids_test, ax=axs[1])

    fig1.savefig('actual.png', bbox_inches='tight')
    fig2.savefig('predicted.png', bbox_inches='tight')
    plt.show()
import matplotlib
matplotlib.use('Qt4Agg')

import data
import pandas as pd
import matplotlib.pyplot as plt

import script.predict.preprocess as pp
import script.predict.predictors as pdt

from sklearn.cross_validation import train_test_split

training_size = 0.25

x, y = pp.get_bag_data(['Sat'], 12, pp.common_categories)
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=training_size, random_state=2294967295)

im = data.read_image(size=1000)

for predictor_name in pdt.all_predictors:
    predictor = pdt.all_predictors[predictor_name]
    predictor.fit(x_train, y_train)
    y_pred = predictor.predict(x_test)
    y_pred_probs = predictor.predict_proba(x_test)

    kp = data.read_key_points().set_index('place_id')
    # Scale the training counts so they are comparable to the (larger) test split.
    kp['Training Counts'] = ((1 - training_size) / training_size) * pd.Series(y_train).value_counts()
    kp['Test Counts'] = pd.Series(y_test).value_counts()
    kp['Prediction Counts'] = pd.Series(y_pred).value_counts()
    kp['Prediction Probability Sum'] = pd.DataFrame(y_pred_probs, columns=predictor.classes_).sum()
    kp.fillna(0, inplace=True)

    # fig, axs = plt.subplots(2, 2)
    fig, axs = plt.subplots(1, 3)
    fig.suptitle(predictor_name)
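    # Hypothetical continuation (not part of the original file, which is truncated
    # here): one plausible way to fill the three axes created above is to compare the
    # count columns already computed on `kp`. The bar-chart choice is an assumption.
    for ax, column in zip(axs, ['Test Counts', 'Prediction Counts', 'Prediction Probability Sum']):
        kp[column].plot(kind='bar', ax=ax)  # one bar per key point for this column
        ax.set_title(column)

# Also an assumption: show all per-predictor figures once the loop has finished.
plt.show()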
def main():
    # df = data.read_visited_key_points('Fri', grouped=True, extra=['category'])
    # categories = ['Thrill Rides', 'Kiddie Rides', 'Rides for Everyone', 'Shows & Entertainment', 'Shopping']
    # df = df[df['category'].isin(categories)].sort_values('Timestamp')
    #
    # prev = df[df['Timestamp'] <= '2014-06-06 12'].groupby('group_id').last()
    # next = df[df['Timestamp'] > '2014-06-06 12'].groupby('group_id').first()

    categories = ['Thrill Rides', 'Kiddie Rides', 'Rides for Everyone', 'Shows & Entertainment', 'Shopping']
    x, y, prev, ids = pp.get_bag_data(['Fri'], 14, categories, return_prev=True, return_ids=True)

    # Discard the day column because we only have one day of data.
    ids = ids['group_id'].values
    # Clamp x values to 0 or 1.
    x = (x > 0).astype('int64')

    x_train, x_test, y_train, y_test, prev_train, prev_test, ids_train, ids_test = (
        cross_validation.train_test_split(x, y, prev, ids, train_size=0.25, random_state=2294967295)
    )

    print('Predicting')
    # Candidate models; only one is fitted below.
    all_predictors = {
        # 'Decision Tree':
        #     tree.DecisionTreeClassifier(),
        # 'Gradient Boosting':
        #     ensemble.GradientBoostingClassifier(n_estimators=33, learning_rate=1.0, random_state=0),
        'Random Forest':
            ensemble.RandomForestClassifier(max_depth=2),
        'Adaboost':
            ensemble.AdaBoostClassifier(random_state=0),
        'MultinomialNB':
            naive_bayes.MultinomialNB(),
        # 'GaussianNB': gnb_predict,
        'BernoulliNB':
            naive_bayes.BernoulliNB(),
        # 'KNN':
        #     neighbors.KNeighborsClassifier(n_neighbors=10),
        # 'Random':
        #     dummy.DummyClassifier(strategy='stratified'),
        'Most Frequent':
            dummy.DummyClassifier(strategy='most_frequent'),
        'Uniform':
            dummy.DummyClassifier(strategy='uniform'),
    }

    # predictor = ensemble.RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
    # predictor = ensemble.AdaBoostClassifier(random_state=0)
    # predictor = naive_bayes.MultinomialNB()
    # predictor = naive_bayes.BernoulliNB()
    # predictor = dummy.DummyClassifier(strategy='most_frequent')
    predictor = neighbors.KNeighborsClassifier(n_neighbors=10)
    predictor.fit(x_train, y_train)
    y_pred = predictor.predict(x_test)

    print('Plotting')
    fig1, ax1 = plt.subplots()
    fig2, ax2 = plt.subplots()
    axs = [ax1, ax2]

    sizes = [get_max_move_size(prev_test, y, ids_test) for y in [y_test, y_pred]]
    max_size = max(sizes)

    axs[0].set_title('Actual Data')
    plot_next_place(prev_test, y_test, ids_test, ax=axs[0])
    axs[1].set_title('KNN Predicted')  # the active predictor above is KNN
    plot_next_place(prev_test, y_pred, ids_test, ax=axs[1])

    # fig1.savefig('actual.png', bbox_inches='tight')
    # fig2.savefig('KNN predicted.png', bbox_inches='tight')
    plt.show()
        dummy.DummyClassifier(strategy='stratified'),
    'Most Frequent':
        dummy.DummyClassifier(strategy='most_frequent'),
    'Uniform':
        dummy.DummyClassifier(strategy='uniform'),
    'RNN':
        RNNclassifier,
    # 'DNN':
    #     DNNclassifier
}

categories = [
    'Thrill Rides', 'Kiddie Rides', 'Rides for Everyone',
    'Shows & Entertainment', 'Shopping'
]

x, y = pp.get_bag_data(['Sat'], 16, categories=categories)
x = (x > 0).astype('int64')

kp = data.read_key_points().set_index('place_id')
kp = kp[kp['category'].isin(categories)]
kp['category'] = kp['category'].astype('category')
# Predict the category of the next place instead of the place itself.
y = kp.loc[y, 'category'].cat.codes.values

x_train, x_validate, y_train, y_validate = cross_validation.train_test_split(
    x, y, train_size=0.90, random_state=294967295)
# y_train_cats = kp.loc[y_train, 'category'].cat.codes.values

scorings = ['accuracy', 'log_loss']
names = []
scores = {}
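# Hypothetical continuation (not in the original fragment, which is truncated here):
# one way the empty `names` / `scores` containers above could be filled. It assumes
# the truncated dict at the top of this file is named `all_predictors` (as in the
# other scripts) and that every entry follows the sklearn estimator API; the 'RNN'
# entry may need special handling, and 'log_loss' scoring requires predict_proba.
for name, predictor in all_predictors.items():
    names.append(name)
    for scoring in scorings:
        cv_scores = cross_validation.cross_val_score(
            predictor, x_train, y_train, scoring=scoring, cv=5)
        scores.setdefault(scoring, []).append(cv_scores.mean())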
def main():
    # df = data.read_visited_key_points('Fri', grouped=True, extra=['category'])
    # categories = ['Thrill Rides', 'Kiddie Rides', 'Rides for Everyone', 'Shows & Entertainment', 'Shopping']
    # df = df[df['category'].isin(categories)].sort_values('Timestamp')
    #
    # prev = df[df['Timestamp'] <= '2014-06-06 12'].groupby('group_id').last()
    # next = df[df['Timestamp'] > '2014-06-06 12'].groupby('group_id').first()

    categories = ['Thrill Rides', 'Kiddie Rides', 'Rides for Everyone', 'Shows & Entertainment', 'Shopping']
    x, y, prev, ids = pp.get_bag_data(['Fri'], 14, categories, return_prev=True, return_ids=True)

    # Discard the day column because we only have one day of data.
    ids = ids['group_id'].values
    # Clamp x values to 0 or 1.
    x = (x > 0).astype('int64')

    x_train, x_test, y_train, y_test, prev_train, prev_test, ids_train, ids_test = (
        cross_validation.train_test_split(x, y, prev, ids, train_size=0.25, random_state=2294967295))

    print('Predicting')

    # ----------------------------- Random forest -----------------------------
    predictor = ensemble.RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
    predictor.fit(x_train, y_train)
    y_pred1 = predictor.predict(x_test)

    # --------------------------------- RNN -----------------------------------
    # Build the model: a single-direction GRU with a single layer.
    classifier = learn.TensorFlowRNNClassifier(rnn_size=EMBEDDING_SIZE,
                                               n_classes=82,
                                               cell_type='gru',
                                               input_op_fn=input_op_fn,
                                               num_layers=1,
                                               bidirectional=False,
                                               sequence_length=None,
                                               steps=1000,
                                               optimizer='Adam',
                                               learning_rate=0.01,
                                               continue_training=True)
    # print(x_train)
    # print(y_train)

    # Train and predict.
    classifier.fit(x_train, y_train, steps=1000)
    y_pred2 = classifier.predict(x_test)

    print('Plotting')
    fig1, ax1 = plt.subplots()
    fig2, ax2 = plt.subplots()
    fig3, ax3 = plt.subplots()
    axs = [ax1, ax2, ax3]

    # sizes = [get_max_move_size(prev_test, y, ids_test) for y in [y_test, y_pred]]
    # max_size = max(sizes)

    axs[0].set_title('Actual Data')
    plot_next_place(prev_test, y_test, ids_test, ax=axs[0])
    axs[1].set_title('RNN Predicted')
    plot_next_place(prev_test, y_pred2, ids_test, ax=axs[1])
    axs[2].set_title('RF Predicted')
    plot_next_place(prev_test, y_pred1, ids_test, ax=axs[2])

    fig1.savefig('actual.png', bbox_inches='tight')
    fig2.savefig('RNN predicted.png', bbox_inches='tight')
    plt.show()
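# A small follow-up sketch (not in the original script): the two predicted maps above
# can be read alongside one summary number each. Only sklearn.metrics is assumed, and
# the helper name is mine; y_pred_rf / y_pred_rnn correspond to y_pred1 / y_pred2 in main().
from sklearn import metrics


def report_accuracy(y_test, y_pred_rf, y_pred_rnn):
    # Plain accuracy for the random forest and the RNN predictions.
    print('RF accuracy:  %.3f' % metrics.accuracy_score(y_test, y_pred_rf))
    print('RNN accuracy: %.3f' % metrics.accuracy_score(y_test, y_pred_rnn))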