def main():
    """Print visit-popularity summaries for every day in the data set.

    First prints visit counts per category for each day, then the
    per-attraction counts joined with key-point metadata (category, name).
    Restrooms and entry/exit points are excluded from both rankings.
    """
    for day in data.days:
        print('Most popular category ({}):'.format(day))
        print(
            get_most_common(day, 'category',
                            ignore_categories=['Restrooms', 'Entry/Exit']).value_counts())
        print()

    # The key-point table is day-independent; read it once instead of once
    # per loop iteration.
    kp = data.read_key_points()
    for day in data.days:
        print('Most popular attraction ({}):'.format(day))
        most_common = get_most_common(
            day, 'place_id',
            ignore_categories=['Restrooms', 'Entry/Exit']).value_counts()
        most_common = pd.DataFrame(most_common)
        most_common = most_common.reset_index()
        most_common.columns = ['place_id', 'count']
        # merge with place id to get category and name
        most_common = pd.merge(most_common,
                               kp.loc[:, ['place_id', 'category', 'name']],
                               on='place_id', sort=False)
        print(most_common)
        print()
def plot_key_points(ax):
    """Scatter all key points on *ax*, one colour per category, with a legend."""
    key_points = data.read_key_points()
    palette = data.palette20
    grouped = key_points.groupby(by='category', sort=False)
    for idx, (category, points) in enumerate(grouped):
        # the colour input has some issues here when the size is three
        ax.scatter(points.X, points.Y, s=100, c=palette[idx],
                   label=category, lw=0)
    ax.legend(loc='lower left', scatterpoints=1, ncol=2, fontsize=8)
def plot_next_place(prev, next, ids, ax=None, max_size=None):
    """Draw arrows on the park map from each group's previous place to its next.

    Arrow size, opacity and colour all scale with the total number of people
    making that transition (summed group sizes).

    NOTE(review): another definition of ``plot_next_place`` with the same
    behaviour appears later in this file — confirm whether one copy is
    redundant.

    :param prev: previous place id per group (aligned with *ids*).
    :param next: next place id per group (aligned with *ids*).
    :param ids: group ids used as the index for alignment with group_info.
    :param ax: axes to draw on; a new figure is created when None.
    :param max_size: normalisation constant for arrow sizing; defaults to the
        largest transition total.
    """
    if ax is None:
        fig, ax = plt.subplots()
    kp = data.read_key_points().set_index('place_id')
    # NOTE(review): group sizes are always taken from 'Fri' regardless of which
    # day prev/next came from — confirm this is intentional.
    group_info = data.read_group_info('Fri').set_index('group_id')
    # places = pd.DataFrame(data={'prev': prev, 'next': next}).dropna().astype('int64')
    places = pd.DataFrame(data={'prev': prev, 'next': next}, index=ids)
    # drop any rows with 0 for the place id, as we can't plot that.
    places = places.loc[(places != 0).all(axis=1)]
    # Index-aligned assignment: each group's size joins on the group id index.
    places['size'] = group_info['size']
    # Total people per (next, prev) transition, smallest first so the largest
    # arrows are drawn last (on top).
    p2 = places.groupby(['next', 'prev']).sum().reset_index().sort_values('size')
    # remove the small slices
    # p2 = p2[p2['size'] >= 8]
    if max_size is None:
        max_size = p2['size'].max()
    # print(max_size)
    im = data.read_image('Grey')
    ax.imshow(im, extent=[0, 100, 0, 100])
    cmap = plt.get_cmap('plasma')
    for i, row in enumerate(p2.itertuples()):
        # index_amt = i / (len(p2) - 1)
        # Fraction of the largest transition; drives all visual scaling below.
        size_amt = row.size / max_size
        prev_xy = kp.loc[row.prev, ['X', 'Y']].values
        next_xy = kp.loc[row.next, ['X', 'Y']].values
        arrowprops = {
            'arrowstyle': 'simple',
            'mutation_scale': 50 * size_amt,
            'alpha': 0.2 + 0.8 * size_amt,
            'lw': 0,
            'color': cmap(0.5 * size_amt),
            'connectionstyle': "arc3,rad=-0.1"
        }
        ax.annotate('', xy=next_xy, xytext=prev_xy, arrowprops=arrowprops)
    ax.xaxis.set_ticks([])
    ax.yaxis.set_ticks([])
def plot_next_place(prev, next, ids, ax=None, max_size=None):
    """Draw arrows on the park map from each group's previous place to its next.

    Arrow size, opacity and colour all scale with the total number of people
    making each transition.  When *ax* is None a new figure is created;
    *max_size* overrides the normalisation used for arrow sizing.
    """
    if ax is None:
        _, ax = plt.subplots()
    key_points = data.read_key_points().set_index('place_id')
    group_info = data.read_group_info('Fri').set_index('group_id')
    # places = pd.DataFrame(data={'prev': prev, 'next': next}).dropna().astype('int64')
    transitions = pd.DataFrame(data={'prev': prev, 'next': next}, index=ids)
    # drop any rows with 0 for the place id, as we can't plot that.
    transitions = transitions.loc[(transitions != 0).all(axis=1)]
    transitions['size'] = group_info['size']
    # Total people per (next, prev) pair, ascending so big arrows draw last.
    totals = (transitions.groupby(['next', 'prev'])
              .sum()
              .reset_index()
              .sort_values('size'))
    # remove the small slices
    # totals = totals[totals['size'] >= 8]
    if max_size is None:
        max_size = totals['size'].max()
    ax.imshow(data.read_image('Grey'), extent=[0, 100, 0, 100])
    colour_map = plt.get_cmap('plasma')
    for row in totals.itertuples():
        fraction = row.size / max_size
        start_xy = kp_xy = key_points.loc[row.prev, ['X', 'Y']].values
        end_xy = key_points.loc[row.next, ['X', 'Y']].values
        ax.annotate('', xy=end_xy, xytext=start_xy, arrowprops={
            'arrowstyle': 'simple',
            'mutation_scale': 50 * fraction,
            'alpha': 0.2 + 0.8 * fraction,
            'lw': 0,
            'color': colour_map(0.5 * fraction),
            'connectionstyle': "arc3,rad=-0.1",
        })
    ax.xaxis.set_ticks([])
    ax.yaxis.set_ticks([])
import data
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np

kp = data.read_key_points()
pos = data.read_position_totals()

join_column = 'place_id'
# join_column = 'category'

# One timestamp per minute over the whole weekend.
# ('min' replaces the deprecated 'T' offset alias.)
ts = pd.date_range('2014-06-06 08:00:00', '2014-06-08 23:55:00', freq='1min')

# change x and y coordinates to place ids
# (drop(columns=...) replaces the positional axis argument removed in
# pandas 2.0)
rides = (pd.merge(pos, kp.loc[:, ['X', 'Y', join_column]],
                  on=['X', 'Y'], how='left', sort=False)
         .drop(columns=['X', 'Y']))

# total amount of minutes spends at a destination across the whole day (approximately)
ride_totals = (rides.dropna().groupby(join_column).sum().sort_values(
    'total', ascending=False))

cutoff = ride_totals.iloc[17]['total']  # the 18th most ride is at index 17
cutoff_ids = ride_totals[ride_totals['total'] >= cutoff].index
# 0 and -1 are reserved below for "other" and "missing" respectively.
cutoff_ids = np.append(cutoff_ids, [0, -1])

rides = rides.fillna(-1)
# join places below cut-off into one group
rides.loc[~rides[join_column].isin(cutoff_ids), join_column] = 0
import data
import pandas as pd

# Accumulate visit counts per attraction and per category across all days.
attraction_totals = None
category_totals = None

for day in data.days:
    print(day)
    df = data.read_visited_key_points(day, ['category'])

    day_attraction_totals = pd.DataFrame(columns=['total'],
                                         data=df.groupby('place_id').size())
    if attraction_totals is None:
        attraction_totals = day_attraction_totals
    else:
        # BUG FIX: DataFrame.add returns a new frame — the result must be
        # reassigned, otherwise only the first day's counts survive.
        attraction_totals = attraction_totals.add(day_attraction_totals,
                                                  fill_value=0)

    day_category_totals = pd.DataFrame(columns=['total'],
                                       data=df.groupby('category').size())
    if category_totals is None:
        category_totals = day_category_totals
    else:
        # Same fix as above: accumulate by reassignment.
        category_totals = category_totals.add(day_category_totals,
                                              fill_value=0)

kp = data.read_key_points().set_index('place_id')
# Index-aligned join: attraction_totals is indexed by place_id.
attraction_totals['name'] = kp['name']
attraction_totals.sort_values(by='total', ascending=False, inplace=True)
category_totals.sort_values(by='total', ascending=False, inplace=True)
print(attraction_totals)
print(category_totals)
def main():
    """Benchmark several predictors at a range of day-time cutoffs and plot accuracy.

    For each cutoff time, builds bag-of-visits features from Saturday data,
    binarises them, splits train/test, fits every predictor in
    ``all_predictors`` (RNN/DNN models are rebuilt fresh each time), and
    records accuracy.  Finally plots accuracy over time-of-day per predictor.

    NOTE(review): a near-identical ``main`` (with steps=1000 instead of 100)
    appears later in this file — confirm which copy is current.
    """
    kp = data.read_key_points()
    categories = kp['category']
    # categories = categories[~categories.isin(['Restrooms', 'Entry/Exit'])].unique()
    categories = ['Thrill Rides', 'Kiddie Rides', 'Rides for Everyone',
                  'Shows & Entertainment', 'Shopping']
    # predictor_names = ['Decision Tree',
    #                    # 'Gradient Boosting',
    #                    'Random Forest',
    #                    'MultinomialNB',
    #                    'BernoulliNB',
    #                    # 'KNN',
    #                    'Random',
    #                    'Most Frequent',
    #                    'Uniform']
    predictor_names = all_predictors.keys()
    # times = np.arange(9, 22)
    # 20 cutoff points spread between 09:00 and 22:00, as timedeltas.
    times = timedelta(hours=1) * np.linspace(9, 22, 20)
    accuracies = {}
    for i, cutoff in enumerate(times):
        print('cutoff {}'.format(cutoff))
        # Preprocessing
        print('Preprocessing')
        # x, y = get_sequence_data(['Sat'], cutoff, categories=categories)
        # x = x[:,:2]
        x, y = get_bag_data(['Sat'], cutoff, categories=categories)
        # change the input stuff to be only 1's and 0's
        x = (x > 0).astype('int64')
        x_train, x_test, y_train, y_test = (
            cross_validation.train_test_split(x, y, train_size=0.9,
                                              random_state=2294967295)
        )
        # x_train, y_train = get_bag_data(['Fri', 'Sat'], cutoff, categories=categories)
        # x_test, y_test = get_bag_data(['Sun'], cutoff, categories=categories)
        # Predicting
        print('Predicting')
        for name in predictor_names:
            predictor = all_predictors[name]
            if name not in accuracies:
                # One slot per cutoff time, filled in as we go.
                accuracies[name] = {
                    'accuracy': np.zeros(len(times)),
                    'log_loss': np.zeros(len(times))
                }
            print(' {}'.format(name))
            if(name == 'RNN'):
                # reset the RNN model
                # NOTE(review): n_classes=82 — presumably the number of
                # distinct target labels; confirm against the data.
                predictor = learn.TensorFlowRNNClassifier(
                    rnn_size=EMBEDDING_SIZE, n_classes=82, cell_type='gru',
                    input_op_fn=input_op_fn, num_layers=1, bidirectional=False,
                    sequence_length=None, steps=1000, optimizer='Adam',
                    learning_rate=0.01, continue_training=True)
                # NOTE(review): fit(steps=100) overrides the constructor's
                # steps=1000 — confirm which step count is intended.
                predictor.fit(x_train, y_train, steps=100)
                print("get you RNN")
            elif(name == 'DNN'):
                # reset the DNN model
                predictor = learn.DNNClassifier(hidden_units=[10, 20, 10],
                                                n_classes=82)
                predictor.fit(x_train, y_train, steps=100)
                print("get you DNN")
            else:
                predictor.fit(x_train, y_train)
            accuracies[name]['accuracy'][i] = get_accuracy_score(predictor, x_test, y_test)
            # accuracies[name]['log_loss'][i] = np.exp(-get_log_loss_score(predictor, x_test, y_test))
    fig, axs = plt.subplots(1, 2)
    fig.suptitle('With data of all types')
    colours = data.palette10
    # NOTE(review): the 'log_loss' arrays are never filled in (the line above
    # is commented out), so the second subplot plots zeros — confirm intended.
    for score, ax in zip(['accuracy', 'log_loss'], axs):
        for name, color in zip(predictor_names, colours):
            # acc = untrained_accuracies[name] - untrained_accuracies['Random']
            acc = accuracies[name][score]
            ax.plot(times / timedelta(hours=1), acc, c=color)
            # , marker='o', markeredgewidth=0, markersize=4)
        ax.set_title('Accuracy using {} scoring metric'.format(score))
        ax.set_ylabel(score)
        ax.set_xlabel('Time of day')
        # ax.set_ylim([-0.5, 0.5])
        # ax.set_ylim([0, 1])
        ax.legend([mpatches.Patch(color=colours[i])
                   for i in range(len(predictor_names))],
                  predictor_names, prop={'size': 8}, loc="best")
    # fig, axs = plt.subplots(2, 3)
    # for cutoff, ax in zip([10, 12, 14, 16, 18, 20], axs.flat):
    #     total_accuracies, untrained_accuracies = test_accuracy(
    #         get_bag_data, predictor_names, predictors, ['Fri', 'Sat', 'Sun'], cutoff, categories)
    #
    #     for name, color in zip(predictor_names, data.palette10):
    #         # acc = untrained_accuracies[name] - untrained_accuracies['Random']
    #         acc = untrained_accuracies[name]
    #         ixs = np.arange(len(acc))
    #         ax.plot(ixs+1, acc, c=color, marker='o', markeredgewidth=0)
    #     ax.set_title('Prediction accuracy at {}:00'.format(cutoff))
    #     ax.set_ylabel('Chance at least one prediction is correct')
    #     ax.set_xlabel('Number of predictions')
    #     # ax.set_ylim([-0.5, 0.5])
    #     ax.set_ylim([0, 1])
    #     ax.legend([mpatches.Patch(color=data.palette10[i]) for i in range(len(predictor_names))],
    #               predictor_names, prop={'size': 8}, loc="best")
    plt.show()
def get_key_points(day):
    """Return records for *day* with timespan >= 30, joined to the key-point table.

    The inner merge on X/Y keeps only records located at a key point and
    attaches that key point's columns to each row.
    """
    records = data.read_data_with_timespans(day)
    long_stays = records[records.timespan >= 30]
    key_points = data.read_key_points()
    return pd.merge(long_stays, key_points, how='inner',
                    on=['X', 'Y'], sort=False)
def main():
    """Benchmark several predictors at a range of day-time cutoffs and plot accuracy.

    For each cutoff time, builds bag-of-visits features from Saturday data,
    binarises them, splits train/test, fits every predictor in
    ``all_predictors`` (RNN/DNN models are rebuilt fresh each time), and
    records accuracy.  Finally plots accuracy over time-of-day per predictor.

    NOTE(review): a near-identical ``main`` (with steps=100 instead of 1000)
    appears earlier in this file — confirm which copy is current.
    """
    kp = data.read_key_points()
    categories = kp['category']
    # categories = categories[~categories.isin(['Restrooms', 'Entry/Exit'])].unique()
    categories = [
        'Thrill Rides', 'Kiddie Rides', 'Rides for Everyone',
        'Shows & Entertainment', 'Shopping'
    ]
    # predictor_names = ['Decision Tree',
    #                    # 'Gradient Boosting',
    #                    'Random Forest',
    #                    'MultinomialNB',
    #                    'BernoulliNB',
    #                    # 'KNN',
    #                    'Random',
    #                    'Most Frequent',
    #                    'Uniform']
    predictor_names = all_predictors.keys()
    # times = np.arange(9, 22)
    # 20 cutoff points spread between 09:00 and 22:00, as timedeltas.
    times = timedelta(hours=1) * np.linspace(9, 22, 20)
    accuracies = {}
    for i, cutoff in enumerate(times):
        print('cutoff {}'.format(cutoff))
        # Preprocessing
        print('Preprocessing')
        # x, y = get_sequence_data(['Sat'], cutoff, categories=categories)
        # x = x[:,:2]
        x, y = get_bag_data(['Sat'], cutoff, categories=categories)
        # change the input stuff to be only 1's and 0's
        x = (x > 0).astype('int64')
        x_train, x_test, y_train, y_test = (cross_validation.train_test_split(
            x, y, train_size=0.9, random_state=2294967295))
        # x_train, y_train = get_bag_data(['Fri', 'Sat'], cutoff, categories=categories)
        # x_test, y_test = get_bag_data(['Sun'], cutoff, categories=categories)
        # Predicting
        print('Predicting')
        for name in predictor_names:
            predictor = all_predictors[name]
            if name not in accuracies:
                # One slot per cutoff time, filled in as we go.
                accuracies[name] = {
                    'accuracy': np.zeros(len(times)),
                    'log_loss': np.zeros(len(times))
                }
            print(' {}'.format(name))
            if (name == 'RNN'):
                # reset the RNN model
                # NOTE(review): n_classes=82 — presumably the number of
                # distinct target labels; confirm against the data.
                predictor = learn.TensorFlowRNNClassifier(
                    rnn_size=EMBEDDING_SIZE, n_classes=82, cell_type='gru',
                    input_op_fn=input_op_fn, num_layers=1, bidirectional=False,
                    sequence_length=None, steps=1000, optimizer='Adam',
                    learning_rate=0.01, continue_training=True)
                predictor.fit(x_train, y_train, steps=1000)
                print("get you RNN")
            elif (name == 'DNN'):
                # reset the DNN model
                predictor = learn.DNNClassifier(hidden_units=[10, 20, 10],
                                                n_classes=82)
                predictor.fit(x_train, y_train, steps=1000)
                print("get you DNN")
            else:
                predictor.fit(x_train, y_train)
            accuracies[name]['accuracy'][i] = get_accuracy_score(
                predictor, x_test, y_test)
            # accuracies[name]['log_loss'][i] = np.exp(-get_log_loss_score(predictor, x_test, y_test))
    fig, axs = plt.subplots(1, 2)
    fig.suptitle('With data of all types')
    colours = data.palette10
    # NOTE(review): the 'log_loss' arrays are never filled in (the line above
    # is commented out), so the second subplot plots zeros — confirm intended.
    for score, ax in zip(['accuracy', 'log_loss'], axs):
        for name, color in zip(predictor_names, colours):
            # acc = untrained_accuracies[name] - untrained_accuracies['Random']
            acc = accuracies[name][score]
            ax.plot(times / timedelta(hours=1), acc, c=color)
            # , marker='o', markeredgewidth=0, markersize=4)
        ax.set_title('Accuracy using {} scoring metric'.format(score))
        ax.set_ylabel(score)
        ax.set_xlabel('Time of day')
        # ax.set_ylim([-0.5, 0.5])
        # ax.set_ylim([0, 1])
        ax.legend([
            mpatches.Patch(color=colours[i])
            for i in range(len(predictor_names))
        ], predictor_names, prop={'size': 8}, loc="best")
    # fig, axs = plt.subplots(2, 3)
    # for cutoff, ax in zip([10, 12, 14, 16, 18, 20], axs.flat):
    #     total_accuracies, untrained_accuracies = test_accuracy(
    #         get_bag_data, predictor_names, predictors, ['Fri', 'Sat', 'Sun'], cutoff, categories)
    #
    #     for name, color in zip(predictor_names, data.palette10):
    #         # acc = untrained_accuracies[name] - untrained_accuracies['Random']
    #         acc = untrained_accuracies[name]
    #         ixs = np.arange(len(acc))
    #         ax.plot(ixs+1, acc, c=color, marker='o', markeredgewidth=0)
    #     ax.set_title('Prediction accuracy at {}:00'.format(cutoff))
    #     ax.set_ylabel('Chance at least one prediction is correct')
    #     ax.set_xlabel('Number of predictions')
    #     # ax.set_ylim([-0.5, 0.5])
    #     ax.set_ylim([0, 1])
    #     ax.legend([mpatches.Patch(color=data.palette10[i]) for i in range(len(predictor_names))],
    #               predictor_names, prop={'size': 8}, loc="best")
    plt.show()
import data import numpy as np from datetime import datetime, timedelta import matplotlib.pyplot as plt import pandas as pd kp = data.read_key_points() common_categories = kp['category'] common_categories = common_categories[~common_categories.isin(['Restrooms', 'Entry/Exit'])].unique() def get_sequence_data(days=None, cutoff_time=12, categories=None, return_prev=False, return_ids=False): xs = [] ys = [] prevs = [] ids = [] print('Getting data') for day in days: print(day) df = data.read_visited_key_points(day, extra=['category'], grouped=True) if categories is not None: df = df[df['category'].isin(categories)] first_time = df['Timestamp'].min() if isinstance(cutoff_time, timedelta): cutoff = datetime(first_time.year, first_time.month, first_time.day) + cutoff_time else: cutoff = datetime(first_time.year, first_time.month, first_time.day, cutoff_time) df_pre = df[df['Timestamp'] <= cutoff].sort_values('Timestamp').copy()