Esempio n. 1
0
def main():
    """Print the most popular category and the most popular attraction
    for each day in ``data.days``.

    Restrooms and entry/exit points are excluded since they are not
    genuine attractions.
    """
    ignored = ['Restrooms', 'Entry/Exit']

    for day in data.days:
        print('Most popular category ({}):'.format(day))
        print(
            get_most_common(day,
                            'category',
                            ignore_categories=ignored).value_counts())
        print()

    # Loop-invariant hoisted: the key-point table does not change between
    # days, so read it once instead of re-reading it on every iteration.
    kp = data.read_key_points()
    for day in data.days:
        print('Most popular attraction ({}):'.format(day))
        most_common = get_most_common(
            day, 'place_id', ignore_categories=ignored).value_counts()
        most_common = pd.DataFrame(most_common)
        most_common = most_common.reset_index()
        most_common.columns = ['place_id', 'count']
        # Merge with the key points to attach category and name to each id.
        most_common = pd.merge(most_common,
                               kp.loc[:, ['place_id', 'category', 'name']],
                               on='place_id',
                               sort=False)
        print(most_common)
        print()
def plot_key_points(ax):
    """Scatter every key point on *ax*, one colour per category, and add
    a two-column legend in the lower-left corner."""
    colours = data.palette20
    key_points = data.read_key_points()
    grouped = key_points.groupby(by='category', sort=False)
    for idx, (category, pts) in enumerate(grouped):
        # NOTE(review): the colour input has issues when the group size is
        # three (a length-3 `c` can be read as a single RGB triple) — confirm.
        ax.scatter(pts.X, pts.Y, s=100, c=colours[idx], label=category, lw=0)
    ax.legend(loc='lower left', scatterpoints=1, ncol=2, fontsize=8)
def plot_next_place(prev, next, ids, ax=None, max_size=None):
    """Draw arrows on the park map from each group's previous place to its
    next place, with arrow size/alpha/colour scaled by total group size.

    prev, next -- place ids per group (aligned with *ids*); rows where
        either is 0 are dropped since id 0 has no plottable coordinates.
    ids -- group ids used as the index, to align with the group-info table.
    ax -- axes to draw on; a fresh figure is created when None.
    max_size -- normalisation constant for arrow scaling; defaults to the
        largest aggregated flow in this call.

    NOTE(review): group sizes are always read from the 'Fri' group-info
    table regardless of which day *prev*/*next* came from — confirm.
    """
    if ax is None:
        fig, ax = plt.subplots()

    kp = data.read_key_points().set_index('place_id')
    group_info = data.read_group_info('Fri').set_index('group_id')

    # places = pd.DataFrame(data={'prev': prev, 'next': next}).dropna().astype('int64')
    places = pd.DataFrame(data={'prev': prev, 'next': next}, index=ids)
    # drop any rows with 0 for the place id, as we can't plot that.
    places = places.loc[(places != 0).all(axis=1)]
    # Index-aligned assignment: pulls each group's size via the group id.
    places['size'] = group_info['size']
    # Aggregate identical transitions; sort ascending so big arrows draw last
    # (on top of the small ones).
    p2 = places.groupby(['next',
                         'prev']).sum().reset_index().sort_values('size')
    # remove the small slices
    # p2 = p2[p2['size'] >= 8]
    if max_size is None:
        max_size = p2['size'].max()
        # print(max_size)

    # Greyscale park map as the background, stretched to a 0-100 grid.
    im = data.read_image('Grey')
    ax.imshow(im, extent=[0, 100, 0, 100])

    cmap = plt.get_cmap('plasma')
    for i, row in enumerate(p2.itertuples()):
        # index_amt = i / (len(p2) - 1)
        # Relative weight of this transition in [0, 1].
        size_amt = row.size / max_size
        prev_xy = kp.loc[row.prev, ['X', 'Y']].values
        next_xy = kp.loc[row.next, ['X', 'Y']].values
        arrowprops = {
            'arrowstyle': 'simple',
            'mutation_scale': 50 * size_amt,
            'alpha': 0.2 + 0.8 * size_amt,
            'lw': 0,
            'color': cmap(0.5 * size_amt),
            # Slight curve so opposite-direction arrows don't overlap.
            'connectionstyle': "arc3,rad=-0.1"
        }
        ax.annotate('', xy=next_xy, xytext=prev_xy, arrowprops=arrowprops)
    ax.xaxis.set_ticks([])
    ax.yaxis.set_ticks([])
def plot_next_place(prev, next, ids, ax=None, max_size=None):
    """Draw arrows on the park map from each group's previous place to its
    next place; arrow size, opacity and colour all scale with the total
    size of the groups making that transition."""
    if ax is None:
        fig, ax = plt.subplots()

    key_points = data.read_key_points().set_index('place_id')
    groups = data.read_group_info('Fri').set_index('group_id')

    # places = pd.DataFrame(data={'prev': prev, 'next': next}).dropna().astype('int64')
    transitions = pd.DataFrame(data={'prev': prev, 'next': next}, index=ids)
    # Place id 0 has no coordinates, so any row containing it is dropped.
    keep = (transitions != 0).all(axis=1)
    transitions = transitions.loc[keep]
    transitions['size'] = groups['size']
    flows = (transitions
             .groupby(['next', 'prev'])
             .sum()
             .reset_index()
             .sort_values('size'))
    # remove the small slices
    # flows = flows[flows['size'] >= 8]
    if max_size is None:
        max_size = flows['size'].max()
        # print(max_size)

    background = data.read_image('Grey')
    ax.imshow(background, extent=[0, 100, 0, 100])

    colour_map = plt.get_cmap('plasma')
    for flow in flows.itertuples():
        weight = flow.size / max_size
        start = key_points.loc[flow.prev, ['X', 'Y']].values
        end = key_points.loc[flow.next, ['X', 'Y']].values
        style = {'arrowstyle': 'simple',
                 'mutation_scale': 50 * weight,
                 'alpha': 0.2 + 0.8 * weight,
                 'lw': 0,
                 'color': colour_map(0.5 * weight),
                 'connectionstyle': "arc3,rad=-0.1"}
        ax.annotate('', xy=end, xytext=start, arrowprops=style)
    ax.xaxis.set_ticks([])
    ax.yaxis.set_ticks([])
Esempio n. 5
0
import data
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np

kp = data.read_key_points()

pos = data.read_position_totals()

join_column = 'place_id'
# join_column = 'category'
# BUG FIX: the 'T' minute alias is deprecated in modern pandas; '1min'
# works on both old and new versions.
ts = pd.date_range('2014-06-06 08:00:00', '2014-06-08 23:55:00', freq='1min')
# change x and y coordinates to place ids
# BUG FIX: pandas 2.0 removed the positional `axis` argument of
# DataFrame.drop (`.drop(['X', 'Y'], 1)`); use the explicit `columns=`
# keyword, which is equivalent and supported on old versions too.
rides = (pd.merge(pos,
                  kp.loc[:, ['X', 'Y', join_column]],
                  on=['X', 'Y'],
                  how='left',
                  sort=False).drop(columns=['X', 'Y']))

# total amount of minutes spent at a destination across the whole day (approximately)
ride_totals = (rides.dropna().groupby(join_column).sum().sort_values(
    'total', ascending=False))

cutoff = ride_totals.iloc[17]['total']  # the 18th most ride is at index 17
cutoff_ids = ride_totals[ride_totals['total'] >= cutoff].index
# 0 is the catch-all group below, -1 marks coordinates with no key point.
cutoff_ids = np.append(cutoff_ids, [0, -1])

rides = rides.fillna(-1)
# join places below cut-off into one group
rides.loc[~rides[join_column].isin(cutoff_ids), join_column] = 0
Esempio n. 6
0
import data
import pandas as pd

# Accumulate, across all days, how many visit records each attraction and
# each category received.
attraction_totals = None
category_totals = None
for day in data.days:
    print(day)
    df = data.read_visited_key_points(day, ['category'])

    day_attraction_totals = pd.DataFrame(columns=['total'],
                                         data=df.groupby('place_id').size())
    if attraction_totals is None:
        attraction_totals = day_attraction_totals
    else:
        # BUG FIX: DataFrame.add returns a NEW frame rather than mutating in
        # place, so the result must be assigned back — previously every day
        # after the first was silently discarded.
        attraction_totals = attraction_totals.add(day_attraction_totals,
                                                  fill_value=0)

    day_category_totals = pd.DataFrame(columns=['total'],
                                       data=df.groupby('category').size())
    if category_totals is None:
        category_totals = day_category_totals
    else:
        # BUG FIX: same as above — assign the result of .add back.
        category_totals = category_totals.add(day_category_totals,
                                              fill_value=0)

kp = data.read_key_points().set_index('place_id')
# Index-aligned assignment: attach the attraction name via the place id.
attraction_totals['name'] = kp['name']
attraction_totals.sort_values(by='total', ascending=False, inplace=True)
category_totals.sort_values(by='total', ascending=False, inplace=True)

print(attraction_totals)
print(category_totals)

def main():
    """Train every configured predictor on Saturday visit data at a range
    of cutoff times, then plot accuracy versus time of day.

    NOTE(review): relies on module-level names not visible in this chunk
    (`all_predictors`, `get_bag_data`, `cross_validation`, `learn`,
    `EMBEDDING_SIZE`, `input_op_fn`, `get_accuracy_score`) — confirm they
    are defined elsewhere in the file.
    """
    kp = data.read_key_points()
    categories = kp['category']
    # categories = categories[~categories.isin(['Restrooms', 'Entry/Exit'])].unique()
    # Restrict prediction to these five category labels.
    categories = ['Thrill Rides', 'Kiddie Rides', 'Rides for Everyone', 'Shows & Entertainment', 'Shopping']
    # predictor_names = ['Decision Tree',
    #                    # 'Gradient Boosting',
    #                    'Random Forest',
    #                    'MultinomialNB',
    #                    'BernoulliNB',
    #                    # 'KNN',
    #                    'Random',
    #                    'Most Frequent',
    #                    'Uniform']
    predictor_names = all_predictors.keys()

    # times = np.arange(9, 22)
    # 20 evenly spaced cutoff times between 09:00 and 22:00, as timedeltas.
    times = timedelta(hours=1) * np.linspace(9, 22, 20)
    # Per-predictor score arrays, one slot per cutoff time.
    accuracies = {}

    for i, cutoff in enumerate(times):
        print('cutoff {}'.format(cutoff))

        # Preprocessing
        print('Preprocessing')
        # x, y = get_sequence_data(['Sat'], cutoff, categories=categories)
        # x = x[:,:2]

        x, y = get_bag_data(['Sat'], cutoff, categories=categories)
        # change the input stuff to be only 1's and 0's
        x = (x > 0).astype('int64')


        # Fixed seed so every cutoff/predictor sees the same split.
        x_train, x_test, y_train, y_test = (
            cross_validation.train_test_split(x, y, train_size=0.9, random_state=2294967295)
        )

        # x_train, y_train = get_bag_data(['Fri', 'Sat'], cutoff, categories=categories)
        # x_test, y_test = get_bag_data(['Sun'], cutoff, categories=categories)

        # Predicting
        print('Predicting')
        for name in predictor_names:
            predictor = all_predictors[name]
            if name not in accuracies:
                accuracies[name] = {
                    'accuracy': np.zeros(len(times)),
                    'log_loss': np.zeros(len(times))
                }

            print('  {}'.format(name))

            if(name == 'RNN'):
                #reset the RNN model (rebuilt fresh for each cutoff so state
                #from the previous iteration does not leak in)
                predictor = learn.TensorFlowRNNClassifier(
                    rnn_size=EMBEDDING_SIZE, n_classes=82, cell_type='gru',
                    input_op_fn=input_op_fn, num_layers=1, bidirectional=False,
                    sequence_length=None, steps=1000, optimizer='Adam',
                    learning_rate=0.01, continue_training=True)

                # NOTE(review): constructed with steps=1000 but fit with
                # steps=100 — the later duplicate of this function uses 1000;
                # confirm which was intended.
                predictor.fit(x_train, y_train, steps=100)
                print("get you RNN")

            elif(name == 'DNN'):
                #reset the DNN model
                predictor = learn.DNNClassifier(hidden_units=[10, 20, 10], n_classes=82)
                predictor.fit(x_train, y_train, steps=100)
                print("get you DNN")

            else:
                predictor.fit(x_train, y_train)

            accuracies[name]['accuracy'][i] = get_accuracy_score(predictor, x_test, y_test)
            # accuracies[name]['log_loss'][i] = np.exp(-get_log_loss_score(predictor, x_test, y_test))

    # One subplot per scoring metric, one line per predictor.
    fig, axs = plt.subplots(1, 2)
    fig.suptitle('With data of all types')
    colours = data.palette10
    for score, ax in zip(['accuracy', 'log_loss'], axs):
        for name, color in zip(predictor_names, colours):
            # acc = untrained_accuracies[name] - untrained_accuracies['Random']
            acc = accuracies[name][score]

            # x-axis in hours: divide the timedelta cutoffs by one hour.
            ax.plot(times / timedelta(hours=1), acc, c=color)  # , marker='o', markeredgewidth=0, markersize=4)
            ax.set_title('Accuracy using {} scoring metric'.format(score))
            ax.set_ylabel(score)
            ax.set_xlabel('Time of day')
            # ax.set_ylim([-0.5, 0.5])
            # ax.set_ylim([0, 1])
        ax.legend([mpatches.Patch(color=colours[i]) for i in range(len(predictor_names))],
                  predictor_names, prop={'size': 8}, loc="best")

    # fig, axs = plt.subplots(2, 3)
    # for cutoff, ax in zip([10, 12, 14, 16, 18, 20], axs.flat):
    #     total_accuracies, untrained_accuracies = test_accuracy(
    #             get_bag_data, predictor_names, predictors, ['Fri', 'Sat', 'Sun'], cutoff, categories)
    #
    #     for name, color in zip(predictor_names, data.palette10):
    #         # acc = untrained_accuracies[name] - untrained_accuracies['Random']
    #         acc = untrained_accuracies[name]
    #         ixs = np.arange(len(acc))
    #         ax.plot(ixs+1, acc, c=color, marker='o', markeredgewidth=0)
    #         ax.set_title('Prediction accuracy at {}:00'.format(cutoff))
    #         ax.set_ylabel('Chance at least one prediction is correct')
    #         ax.set_xlabel('Number of predictions')
    #         # ax.set_ylim([-0.5, 0.5])
    #         ax.set_ylim([0, 1])
    #     ax.legend([mpatches.Patch(color=data.palette10[i]) for i in range(len(predictor_names))],
    #               predictor_names, prop={'size': 8}, loc="best")
    plt.show()
Esempio n. 8
0
def get_key_points(day):
    """Return the visits of *day* lasting at least 30 minutes, inner-joined
    with the key-point metadata on their (X, Y) coordinates."""
    visits = data.read_data_with_timespans(day)
    long_visits = visits[visits.timespan >= 30]
    key_points = data.read_key_points()
    return pd.merge(long_visits, key_points,
                    how='inner', on=['X', 'Y'], sort=False)
Esempio n. 9
0
def main():
    """Train every configured predictor on Saturday visit data at a range
    of cutoff times, then plot accuracy versus time of day.

    NOTE(review): relies on module-level names not visible in this chunk
    (`all_predictors`, `get_bag_data`, `cross_validation`, `learn`,
    `EMBEDDING_SIZE`, `input_op_fn`, `get_accuracy_score`) — confirm they
    are defined elsewhere in the file.
    """
    kp = data.read_key_points()
    categories = kp['category']
    # categories = categories[~categories.isin(['Restrooms', 'Entry/Exit'])].unique()
    # Restrict prediction to these five category labels.
    categories = [
        'Thrill Rides', 'Kiddie Rides', 'Rides for Everyone',
        'Shows & Entertainment', 'Shopping'
    ]
    # predictor_names = ['Decision Tree',
    #                    # 'Gradient Boosting',
    #                    'Random Forest',
    #                    'MultinomialNB',
    #                    'BernoulliNB',
    #                    # 'KNN',
    #                    'Random',
    #                    'Most Frequent',
    #                    'Uniform']
    predictor_names = all_predictors.keys()

    # times = np.arange(9, 22)
    # 20 evenly spaced cutoff times between 09:00 and 22:00, as timedeltas.
    times = timedelta(hours=1) * np.linspace(9, 22, 20)
    # Per-predictor score arrays, one slot per cutoff time.
    accuracies = {}

    for i, cutoff in enumerate(times):
        print('cutoff {}'.format(cutoff))

        # Preprocessing
        print('Preprocessing')
        # x, y = get_sequence_data(['Sat'], cutoff, categories=categories)
        # x = x[:,:2]

        x, y = get_bag_data(['Sat'], cutoff, categories=categories)
        # change the input stuff to be only 1's and 0's
        x = (x > 0).astype('int64')

        # Fixed seed so every cutoff/predictor sees the same split.
        x_train, x_test, y_train, y_test = (cross_validation.train_test_split(
            x, y, train_size=0.9, random_state=2294967295))

        # x_train, y_train = get_bag_data(['Fri', 'Sat'], cutoff, categories=categories)
        # x_test, y_test = get_bag_data(['Sun'], cutoff, categories=categories)

        # Predicting
        print('Predicting')
        for name in predictor_names:
            predictor = all_predictors[name]
            if name not in accuracies:
                accuracies[name] = {
                    'accuracy': np.zeros(len(times)),
                    'log_loss': np.zeros(len(times))
                }

            print('  {}'.format(name))

            if (name == 'RNN'):
                #reset the RNN model (rebuilt fresh for each cutoff so state
                #from the previous iteration does not leak in)
                predictor = learn.TensorFlowRNNClassifier(
                    rnn_size=EMBEDDING_SIZE,
                    n_classes=82,
                    cell_type='gru',
                    input_op_fn=input_op_fn,
                    num_layers=1,
                    bidirectional=False,
                    sequence_length=None,
                    steps=1000,
                    optimizer='Adam',
                    learning_rate=0.01,
                    continue_training=True)

                predictor.fit(x_train, y_train, steps=1000)
                print("get you RNN")

            elif (name == 'DNN'):
                #reset the DNN model
                predictor = learn.DNNClassifier(hidden_units=[10, 20, 10],
                                                n_classes=82)
                predictor.fit(x_train, y_train, steps=1000)
                print("get you DNN")

            else:
                predictor.fit(x_train, y_train)

            accuracies[name]['accuracy'][i] = get_accuracy_score(
                predictor, x_test, y_test)
            # accuracies[name]['log_loss'][i] = np.exp(-get_log_loss_score(predictor, x_test, y_test))

    # One subplot per scoring metric, one line per predictor.
    fig, axs = plt.subplots(1, 2)
    fig.suptitle('With data of all types')
    colours = data.palette10
    for score, ax in zip(['accuracy', 'log_loss'], axs):
        for name, color in zip(predictor_names, colours):
            # acc = untrained_accuracies[name] - untrained_accuracies['Random']
            acc = accuracies[name][score]

            # x-axis in hours: divide the timedelta cutoffs by one hour.
            ax.plot(times / timedelta(hours=1), acc,
                    c=color)  # , marker='o', markeredgewidth=0, markersize=4)
            ax.set_title('Accuracy using {} scoring metric'.format(score))
            ax.set_ylabel(score)
            ax.set_xlabel('Time of day')
            # ax.set_ylim([-0.5, 0.5])
            # ax.set_ylim([0, 1])
        ax.legend([
            mpatches.Patch(color=colours[i])
            for i in range(len(predictor_names))
        ],
                  predictor_names,
                  prop={'size': 8},
                  loc="best")

    # fig, axs = plt.subplots(2, 3)
    # for cutoff, ax in zip([10, 12, 14, 16, 18, 20], axs.flat):
    #     total_accuracies, untrained_accuracies = test_accuracy(
    #             get_bag_data, predictor_names, predictors, ['Fri', 'Sat', 'Sun'], cutoff, categories)
    #
    #     for name, color in zip(predictor_names, data.palette10):
    #         # acc = untrained_accuracies[name] - untrained_accuracies['Random']
    #         acc = untrained_accuracies[name]
    #         ixs = np.arange(len(acc))
    #         ax.plot(ixs+1, acc, c=color, marker='o', markeredgewidth=0)
    #         ax.set_title('Prediction accuracy at {}:00'.format(cutoff))
    #         ax.set_ylabel('Chance at least one prediction is correct')
    #         ax.set_xlabel('Number of predictions')
    #         # ax.set_ylim([-0.5, 0.5])
    #         ax.set_ylim([0, 1])
    #     ax.legend([mpatches.Patch(color=data.palette10[i]) for i in range(len(predictor_names))],
    #               predictor_names, prop={'size': 8}, loc="best")
    plt.show()
import data
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import pandas as pd


# Key-point table plus the category labels worth modelling — everything
# except restrooms and park entrances/exits.
kp = data.read_key_points()
common_categories = (
    kp['category']
    .loc[lambda categories: ~categories.isin(['Restrooms', 'Entry/Exit'])]
    .unique()
)


def get_sequence_data(days=None, cutoff_time=12, categories=None, return_prev=False, return_ids=False):
    xs = []
    ys = []
    prevs = []
    ids = []

    print('Getting data')
    for day in days:
        print(day)
        df = data.read_visited_key_points(day, extra=['category'], grouped=True)
        if categories is not None:
            df = df[df['category'].isin(categories)]

        first_time = df['Timestamp'].min()
        if isinstance(cutoff_time, timedelta):
            cutoff = datetime(first_time.year, first_time.month, first_time.day) + cutoff_time
        else:
            cutoff = datetime(first_time.year, first_time.month, first_time.day, cutoff_time)
        df_pre = df[df['Timestamp'] <= cutoff].sort_values('Timestamp').copy()