Example #1
def preprocessing(pd_dataframe):
    """Preprocessing the data
    :param
        pd_dataframe: the raw data dataframe
    :return:
        the preprocessed dataframe
    """
    # shuffle the rows
    pd_dataframe = pd_dataframe.sample(frac=1)

    # remove outliers
    no_outliers_pd_dataframe = remove_outliers(pd_dataframe)

    # balance dataframe
    pd_balanced_dataframe = balance_data(no_outliers_pd_dataframe)

    return pd_balanced_dataframe
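
The remove_outliers and balance_data helpers are not shown in this example. A minimal sketch of what they might look like, assuming IQR-based filtering of numeric columns and downsampling every class to the minority-class size (the 'label' column name and the logic are assumptions, not the original implementation):

import pandas as pd

def remove_outliers(df, factor=1.5):
    """Drop rows outside the IQR fences of each numeric column (assumed logic)."""
    for col in df.select_dtypes(include='number').columns:
        q1, q3 = df[col].quantile([0.25, 0.75])
        iqr = q3 - q1
        df = df[df[col].between(q1 - factor * iqr, q3 + factor * iqr)]
    return df

def balance_data(df, label_col='label'):
    """Downsample every class to the size of the smallest one (assumed logic)."""
    smallest = df[label_col].value_counts().min()
    return (df.groupby(label_col, group_keys=False)
              .apply(lambda g: g.sample(n=smallest)))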
Example #2
def build_features(name="features.pkl"):
    # Load data
    items, shops, cats, train, test = load_data()
    train = remove_outliers(train)
    train = remove_negative_prices(train)
    train = remove_duplicates(train)
    test = remove_duplicates(test)
    shops = split_city_name(shops)
    cats = split_type_subtype(cats)

    # Name is not used
    items.drop(['item_name'], axis=1, inplace=True)
    # Compute revenue
    train['revenue'] = train['item_price'] * train['item_cnt_day']
    
    matrix = create_matrix(train)
    matrix = fill_and_clip_matrix(matrix, train)
    matrix = append_test_to_matrix(matrix, test)
    matrix = merge_in_shops_items_cats(matrix, shops, items, cats)

    matrix = compute_item_cnt_month(matrix)
    matrix = compute_date_avg_item_cnt(matrix)
    matrix = compute_date_item_avg_item_cnt(matrix)
    matrix = compute_date_shop_avg_item_cnt(matrix)
    matrix = compute_date_cat_avg_item_cnt(matrix)
    matrix = compute_date_shop_cat_avg_item_cnt(matrix)
    matrix = compute_date_shop_type_avg_item_cnt(matrix)
    matrix = compute_date_shop_subtype_avg_item_cnt(matrix)
    matrix = compute_date_city_avg_item_cnt(matrix)
    matrix = compute_date_item_city_avg_item_cnt(matrix)
    matrix = compute_date_type_avg_item_cnt(matrix)
    matrix = compute_date_subtype_avg_item_cnt(matrix)
    matrix = compute_price_trends(matrix, train)
    matrix = compute_shop_trends(matrix, train)
    matrix = add_month_and_days(matrix)
    matrix = compute_item_shop_last_sale(matrix)
    matrix = compute_item_last_sale(matrix)
    matrix = compute_item_shop_first_sale(matrix)
    matrix = compute_item_first_sale(matrix)

    matrix = discard_first_year(matrix)
    matrix = fill_na(matrix)
    matrix.to_pickle(name)
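
The helpers above (load_data, create_matrix, the compute_* encoders, and so on) live elsewhere in the project. To illustrate the general pattern, here is a minimal sketch of what create_matrix might do, assuming the usual monthly cartesian product of shop and item ids; the column names date_block_num, shop_id and item_id are assumptions, not taken from the code above:

from itertools import product

import numpy as np
import pandas as pd

def create_matrix(train):
    """For every month, build all (shop, item) pairs seen that month (assumed logic)."""
    blocks = []
    for block in sorted(train['date_block_num'].unique()):
        sales = train[train['date_block_num'] == block]
        blocks.append(np.array(list(product([block],
                                            sales['shop_id'].unique(),
                                            sales['item_id'].unique()))))
    return pd.DataFrame(np.vstack(blocks),
                        columns=['date_block_num', 'shop_id', 'item_id'])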
Example #3
import pandas as pd
from sklearn.model_selection import train_test_split

test_data = pd.read_csv(test_input, header=0)

# Rename Income column to match test data
train_data = train_data.rename(columns={'Income in EUR': 'Income'})

# Perform local testing using only train data
if test:
    train_data, test_data = train_test_split(train_data, test_size=0.2)

# Drop unnecessary columns
train_data = train_data.drop(unnecessary, axis=1)
test_data = test_data.drop(unnecessary, axis=1)

# Remove outliers from training data
if outliers:
    train_data = remove_outliers(train_data)

# Quantify the datasets
train_length = train_data.shape[0]
data = pd.concat([train_data, test_data])  # DataFrame.append was removed in pandas 2.0
data = quantify_data(data)
train_data = data[:train_length]
test_data = data[train_length:]

# Split the data into training/testing sets
train_x = train_data.drop(["Income"], axis=1)
test_x = test_data.drop(["Income"], axis=1)

# Split the targets into training/testing sets
train_y = train_data["Income"]
test_y = test_data["Income"]
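
quantify_data is not shown. The concat-before/split-after pattern around it suggests it converts categorical columns into numbers consistently across both frames; a sketch under that assumption (not the original implementation):

import pandas as pd

def quantify_data(data):
    """Encode every non-numeric column as integer codes (assumed behaviour)."""
    data = data.copy()
    for col in data.select_dtypes(exclude='number').columns:
        data[col] = data[col].astype('category').cat.codes
    return data

Encoding the concatenated frame is what guarantees that train and test share a single category-to-code mapping, which is why the split back into the two sets happens only afterwards.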
Example #4
import math
import statistics

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# preprocessing, plots, label_fixes, sequence and find_mode are assumed to be
# this project's own modules and helpers, importable from the surrounding package.


def main():

    #### STEP 0: Load Data and Set Constants ####
    #
    # input file (the varjo alternative is left commented out)
    #
    # file = '/Users/ischoning/PycharmProjects/GitHub/data/varjo_events_12_0_0.txt'
    file = '/Users/ischoning/PycharmProjects/GitHub/data/hdf5/events_1_eyelink/data_collection_events_eyetracker_MonocularEyeSampleEvent.csv'
    #
    # create dataframe
    #
    df = preprocessing.format(file)
    #
    # set constants depending on file type (varjo or eyelink)
    #
    if 'eyelink' in file:
        eye = 'monocular'
        #
        # set thresholds and window sizes for testing
        #
        window_sizes = (100, 125, 150)  # number of samples (= ms at 1000 Hz)
        # 40.4 pixels in 1 deg (overleaf doc sacVelocity.py); the 1 deg value
        # follows Pieter Blignaut's paper "Fixation identification: The optimum
        # threshold for a dispersion algorithm"
        threshes = (1, 1.5)
        #
        # set best dispersion threshold and window size
        #
        best_window_size = 150
        best_thresh = 1
        # this means that in order to be classified as a fixation, dispersion
        # should not go above 1 degree for at least 150 ms (which corresponds
        # to 150 samples for eyelink data at the 1000 Hz sample rate); see the
        # I-DT sketch after this example
        #
    else:
        #
        # choose eye data to analyze {'left eye' or 'right eye'}
        #
        eye = 'left eye'
        #
        # set thresholds and window sizes for testing
        #
        window_sizes = (15, 20, 25)  # number of samples
        threshes = (0.5, 1.0)  # 40.4 pixels in 1 deg (overleaf doc sacVelocity.py)
        #
        # set best dispersion threshold and window size
        #
        best_window_size = 20
        best_thresh = 0.5
        # this means that in order to be classified as a fixation, dispersion
        # should not go above 0.5 degrees for at least 200 ms (which corresponds
        # to 20 samples for varjo data at 100Hz sample rate)
        #

    #### STEP 1: Clean Outliers ####
    #
    df = preprocessing.remove_outliers(df)
    #
    # instantiate data according to eye, selected above
    #
    x, y, v, a = preprocessing.get_feats(df, eye)
    df['x'] = x
    df['y'] = y
    df['v'] = v
    df['a'] = a
    #
    # plot results after removing outliers
    #
    # scatter plot of movement (x vs y) in degrees
    plots.plot_path(df)
    # behavior (x, y, velocity over time)
    plots.plot_vs_time(df, label='Velocity', eye=eye)

    #### STEP 2: Filter Fixations Using Dispersion (I-DT) Algorithm ####
    #
    # plot variations of window_size and threshold
    # best window size and threshold set in step 0
    #
    plots.plot_IDT_thresh_results(df.copy(), window_sizes, threshes)
    #
    # classify fixations using I-DT
    #
    df = label_fixes(df.copy(),
                     eye=eye,
                     ws=best_window_size,
                     thresh=best_thresh,
                     method='IDT')
    #
    # show sequence of events
    #
    seq = pd.DataFrame(sequence(df))
    fix = seq[seq.State == 'fix']
    print("================= I-DT RESULTS =====================")
    print("Fix Duration_ms < window (150ms):",
          np.sum(np.where(fix.Duration_ms < 150, 1, 0)))
    print("Fix Amplitude > thresh (1 deg):",
          np.sum(np.where(fix.Amplitude > 1, 1, 0)))
    print("Fix Sequence:")
    print(fix)
    print("=============================================================")

    #### STEP 3A: Filter Saccades Using Velocity Threshold ####
    #
    # find modal intersample velocity for fixations (if more than one mode, use mean)
    # (round to 1 decimal first)
    #
    try:
        fix_mode_v = statistics.mode(np.round(df[df.event == 'fix'].v, 1))
        kw = 'modal'
    except statistics.StatisticsError:
        fix_mode_v = np.average(df[df.event == 'fix'].v)
        kw = 'mean'
    print(kw, "intersample velocity for fixations:", fix_mode_v)
    #
    # take margin of error above the mode as velocity based threshold
    #
    error = abs(df[df.v > fix_mode_v].v - fix_mode_v) / fix_mode_v
    print("total margin of error above fix_mode_v:", np.average(error))
    thresh = fix_mode_v * np.average(error) * 2
    #
    # compare to margin of error within fixations
    #
    error_fix = abs(df[df.event == 'fix'].v - fix_mode_v) / fix_mode_v
    print("fix margin of error:", np.average(error_fix))
    #
    # classify as saccade if intersample velocity > fix_mode_v * 2 * error
    #
    df_copy = df.copy()
    df_copy['event'] = np.where(df_copy.v > thresh, 'sac', df_copy.event)
    #
    # relabel the remaining 'other' as smooth pursuit
    #
    df_copy['event'] = np.where(df_copy.event == 'other', 'smp', df_copy.event)
    #
    # plot resulting classification
    #
    plots.plot_vs_time(df_copy,
                       label='Velocity',
                       eye=eye,
                       classify=True,
                       method='IVT')
    #
    # # saccade if intersample velocity > 22 deg/s (Houpt)
    # df_copy = df.copy()
    # df_copy['event'] = np.where(df_copy.v > 22, 'sac', df_copy.event)
    # # relabel the remaining 'other' as smooth pursuit
    # df_copy['event'] = np.where(df_copy.event == 'other', 'smp', df_copy.event)
    # # plots.plot_events(df_copy,eye,'IVT')
    # plots.plot_vs_time(df_copy, label='Velocity', eye=eye, classify=True, method='IVT')
    #
    # print event sequence results
    #
    seq = pd.DataFrame(sequence(df_copy))
    fix = seq[seq.State == 'fix']
    smp = seq[seq.State == 'smp']
    sac = seq[seq.State == 'sac']
    print("================= I-VT RESULTS =====================")
    print("Num Fix Events:", len(fix))
    print("Num SmP Events:", len(smp))
    print("Num Sac Events:", len(sac))
    print("=============================================================")
    print("Fix Sequence (Carpenter):")
    print(fix)
    print("SmP Sequence (Carpenter):")
    print(smp)
    print("Sac Sequence (Carpenter):")
    print(sac)
    print("=============================================================")

    #### STEP 3B: Filter Saccades Using Carpenter's Theorem ####
    #
    # create sequence of events
    seq = pd.DataFrame(sequence(df))
    #
    # plot amplitude and velocity of non-fixes along with ideal from Carpenter's Thm: D = 21 + 2.2A, [D~ms, A~deg]
    #
    other = seq[seq.State == 'other']
    plt.scatter(other.Amplitude, other.Duration_ms, label="other")
    x = np.linspace(min(other.Amplitude), max(other.Amplitude))
    y = 21 + 2.2 * x  # Carpenter's Theorem
    plt.plot(x, y, color='green', label='D = 21 + 2.2A')
    head = '[' + eye + '] Other: Amplitude vs Duration'
    plt.title(head)
    plt.xlabel('amplitude (deg)')
    plt.ylabel('duration (ms)')
    plt.legend()
    plt.show()
    #
    # calculate error rate
    #
    seq['error'] = abs(seq.Duration_ms -
                       (21 + 2.2 * seq.Amplitude)) / (21 + 2.2 * seq.Amplitude)
    #
    # classify 'other' into saccade or smooth pursuit depending on the error
    # rate relative to Carpenter's Theorem: if the error exceeds 20% of the
    # predicted duration, classify as smp, otherwise sac.
    #
    seq['State'] = np.where(seq.State == 'other',
                            np.where(seq.error < 0.2, 'sac', seq.State),
                            seq.State)
    #
    # relabel the remaining 'other' as smooth pursuit
    #
    seq['State'] = np.where(seq.State == 'other', 'smp', seq.State)
    #
    # remap seq State to the dataframe df
    #
    for i in range(len(seq)):
        df.loc[seq.start[i]:seq.end[i], 'event'] = seq.State[i]
    #
    # plot result
    #
    plots.plot_vs_time(df,
                       label='Velocity',
                       eye=eye,
                       classify=True,
                       method='Carpenter')
    #
    # print event sequence results
    #
    fix = seq[seq.State == 'fix']
    smp = seq[seq.State == 'smp']
    sac = seq[seq.State == 'sac']
    print("================= CARPENTER RESULTS =====================")
    print("Num Fix Events:", len(fix))
    print("Num SmP Events:", len(smp))
    print("Num Sac Events:", len(sac))
    print("=============================================================")
    print("Fix Sequence (Carpenter):")
    print(fix)
    print("SmP Sequence (Carpenter):")
    print(smp)
    print("Sac Sequence (Carpenter):")
    print(sac)
    print("=============================================================")

    #### STEP 4: Plot resulting histograms and statistics ####
    #
    # output final stats
    #
    print("==== CLASSIFICATION RESULTS (I-DT followed by Carpenter) ====")
    print("average intersample velocity:")
    print("    raw:", np.round(np.average(df.v), 3))
    print("    fix:", np.round(np.average(df[df.event == 'fix'].v), 3))
    print("    smp:", np.round(np.average(df[df.event == 'smp'].v), 3))
    print("    sac:", np.round(np.average(df[df.event == 'sac'].v), 3))
    #
    print("modal intersample velocity:")
    print("    raw:", find_mode(df.v))
    print("    fix:", find_mode(df[df.event == 'fix'].v))
    print("    smp:", find_mode(df[df.event == 'smp'].v))
    print("    sac:", find_mode(df[df.event == 'sac'].v))
    #
    print("velocity standard deviation:")
    print("    raw:", np.round(df.v.std(), 3))
    print("    fix:", np.round(df[df.event == 'fix'].v.std(), 3))
    print("    smp:", np.round(df[df.event == 'smp'].v.std(), 3))
    print("    sac:", np.round(df[df.event == 'sac'].v.std(), 3))
    #
    # plot velocity histograms by classification
    #
    plots.plot_vel_hist(df.copy(),
                        eye=eye,
                        title='Velocity Histogram: All',
                        density=True,
                        classify=True)
    #
    # plot carpenter error histogram by classification
    #
    seq.loc[:, 'error'] = np.round(seq.error, 1)
    num_bins = range(int(math.floor(np.min(seq.error))),
                     int(math.ceil(np.max(seq.error))), 1)
    common_params = dict(bins=num_bins,
                         color=('green', 'orange', 'blue'),
                         label=('fixation', 'smooth pursuit', 'saccade'),
                         alpha=0.6,
                         density=True)
    plt.hist((fix.error, smp.error, sac.error), **common_params)
    plt.title('Carpenter Error')
    plt.legend()
    plt.ylabel('Frequency')
    plt.xlabel('Error')
    plt.show()
    #
    print("============ ERROR STATS ============")
    print("mean error:")
    print("    raw:", np.round(np.average(seq.error), 3))
    print("    fix:", np.round(np.average(fix.error), 3))
    print("    smp:", np.round(np.average(smp.error), 3))
    print("    sac:", np.round(np.average(sac.error), 3))
    #
    print("mode error:")
    print("    raw:", find_mode(seq.error))
    print("    fix:", find_mode(fix.error))
    print("    smp:", find_mode(smp.error))
    print("    sac:", find_mode(sac.error))
    #
    print("std error:")
    print("    raw:", np.round(seq.error.std(), 3))
    print("    fix:", np.round(fix.error.std(), 3))
    print("    smp:", np.round(smp.error.std(), 3))
    print("    sac:", np.round(sac.error.std(), 3))
Example #5
import utils
import preprocessing
import feature_engineering
import modelling
import pandas as pd

# Load data
items, cats, shops, train, test = utils.load_data()

###### Preprocessing
train = preprocessing.remove_outliers(train)
train = preprocessing.negative_prices_to_itemmeanprice(train)
train, test = preprocessing.merge_shop_duplicate_references(shops, train, test)

###### Shop name correction
shops.loc[shops.shop_name == 'Сергиев Посад ТЦ "7Я"',
          'shop_name'] = 'СергиевПосад ТЦ "7Я"'

###### Feature engineering
# Add new 'city' feature
shops = feature_engineering.add_shop_city_attr(shops)

# Add type and subtype features
cats = feature_engineering.add_category_type_subtype_attr(cats)

# Replace city, type, subtype with importance weights
shops, items, cats = feature_engineering.add_city_type_subtype_imp_attr(
    train, shops, items, cats)

# Generate a matrix with all possible combinations of month, shop & item ids
matrix = utils.get_comb_matrix(train)
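
The shop-name correction above only makes sense if add_shop_city_attr derives the city from the first word of shop_name: rewriting 'Сергиев Посад' as 'СергиевПосад' keeps the two-word city name from being cut in half by the split. A minimal sketch under that assumption (not the project's actual implementation):

def add_shop_city_attr(shops):
    """Derive a 'city' feature from the first word of the shop name (assumed logic)."""
    shops = shops.copy()
    # first whitespace-separated token, minus any leading punctuation,
    # so variants of the same city compare equal
    shops['city'] = shops['shop_name'].str.split().str[0].str.lstrip('!')
    return shops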