Example #1
def random_kfold_validation(model, n_splits=3):
    """
    Prints training and validation F1-scores (mean and standard deviation
    across folds) for a shuffled, stratified k-fold cross-validation scheme.
    Args:
        model(sklearn.BaseEstimator): The model to fit and make predictions.
        n_splits(int): The number of folds for the cross-validation.
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2018)
    X_train_val, X_test, y_train_val, y_test, \
    encoder = sd.get_success_data(drop_time=True)

    # Train and validate for each fold
    f1_train = list()
    f1_val = list()
    i = 0
    for train_index, test_index in skf.split(X_train_val, y_train_val):
        i += 1
        print('Fold - {}'.format(i))
        X_train, X_val = X_train_val.iloc[train_index], X_train_val.iloc[
            test_index]
        y_train, y_val = y_train_val.iloc[train_index], y_train_val.iloc[
            test_index]
        model, y_train_pred, y_val_pred = evaluate_model(
            model, X_train, X_val, y_train, y_val)
        f1_train.append(f1_score(y_train, y_train_pred))
        f1_val.append(f1_score(y_val, y_val_pred))

    # Show results
    print('Training F1-score: {} +- {}'.format(np.mean(f1_train),
                                               np.std(f1_train)))
    print()
    print('Validation F1-score: {} +- {}'.format(np.mean(f1_val),
                                                 np.std(f1_val)))
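A minimal usage sketch, assuming the function above and its module-level helpers are in scope (they are not shown in this listing); any scikit-learn estimator with fit/predict can be passed as the model:
from sklearn.ensemble import RandomForestClassifier

# Usage sketch (not part of the original module): validate an off-the-shelf
# classifier and print the per-fold progress plus mean/std F1-scores.
clf = RandomForestClassifier(n_estimators=200, random_state=2018)
random_kfold_validation(clf, n_splits=5)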
Example #2
def get_time_split_val(val_time=370, **kwargs):
    """
    Returns all the datasets necessary to perform a time-split validation.
    Args:
        val_time(int): The time to make the validation split.
        kwargs(dict): Arguments to be passed to inner functions.

    Returns:
        X_train(pd.DataFrame): Training features.
        X_val(pd.DataFrame): Validation features.
        X_test(pd.DataFrame): Test features.
        X_train_val(pd.DataFrame): Training + Validation features, to use when
            testing.
        y_train(pd.Series): Training target values.
        y_val(pd.Series): Validation target values.
        y_test(pd.Series): Test target values.
        y_train_val(pd.Series): Training + Validation target values, to use
            when testing.
    """

    fun_kwargs = utils.filter_args(sd.get_success_data, kwargs)
    (X_train_val, X_test, y_train_val, y_test,
     encoder) = sd.get_success_data(drop_time=False, **fun_kwargs)
    X_test = pp.drop_time_dependent(X_test)
    X_train, X_val, y_train, y_val = sd.time_split(X_train_val, y_train_val,
                                                   val_time)
    return X_train, X_val, X_test, X_train_val, y_train, y_val, y_test, \
           y_train_val
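A sketch of unpacking the eight returned datasets, assuming the function above is in scope; val_time=370 is simply the default from the signature:
# Usage sketch: unpack the eight datasets produced for a time-split validation.
(X_train, X_val, X_test, X_train_val,
 y_train, y_val, y_test, y_train_val) = get_time_split_val(val_time=370)

# Tune on (X_train, y_train) vs (X_val, y_val), then refit on the combined
# (X_train_val, y_train_val) before scoring on (X_test, y_test).
print(X_train.shape, X_val.shape, X_test.shape)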
Example #3
def random_1fold_cust_validation(model, **kwargs):
    """
    Prints training and validation F1-scores for a random train-val-test
    validation scheme. The split is done at the customer level, so each
    customer falls entirely in either the training or the validation set.
    Args:
        model(sklearn.BaseEstimator): The model to fit and make predictions.
        kwargs(dict): Additional arguments passed to sd.get_success_data.
    """
    X_train_val, X_test, y_train_val, y_test, encoder = sd.get_success_data(
        drop_time=True, anon=False, **kwargs)

    # Get random customer splits
    val_size = 0.3
    customers = X_train_val.person.unique()
    n_train = int(np.floor(customers.shape[0] * (1.0 - val_size)))
    np.random.shuffle(customers)
    X_train = X_train_val[X_train_val.person.isin(customers[:n_train])]
    X_val = X_train_val[X_train_val.person.isin(customers[n_train:])]
    y_train = y_train_val[X_train_val.person.isin(customers[:n_train])]
    y_val = y_train_val[X_train_val.person.isin(customers[n_train:])]

    # Anonymize
    X_train = pp.anonimize_data(X_train)
    X_val = pp.anonimize_data(X_val)

    # Evaluate and show results
    model, y_train_pred, y_val_pred = evaluate_model(model, X_train, X_val,
                                                     y_train, y_val)
    print('Training F1-score: {}'.format(f1_score(y_train, y_train_pred)))
    print()
    print('Validation F1-score: {}'.format(f1_score(y_val, y_val_pred)))
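The manual shuffle above keeps every customer entirely inside one split; scikit-learn's GroupShuffleSplit expresses the same idea, shown here as an alternative sketch (not the original implementation) that assumes X_train_val and y_train_val as defined in the function body:
from sklearn.model_selection import GroupShuffleSplit

# Alternative sketch: a group-aware random split by customer, equivalent in
# spirit to the manual shuffle above.
gss = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=2018)
train_idx, val_idx = next(gss.split(X_train_val, y_train_val,
                                    groups=X_train_val.person))
X_train, X_val = X_train_val.iloc[train_idx], X_train_val.iloc[val_idx]
y_train, y_val = y_train_val.iloc[train_idx], y_train_val.iloc[val_idx]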
Example #4
def offer_success_test(model, **kwargs):
    """
    Prints training and test F1-scores for a time-split validation scheme.
    Args:
        model(sklearn.BaseEstimator): The model to fit and make predictions.
        kwargs(dict): Additional arguments passed to sd.get_success_data.
    """
    X_train, X_test, y_train, y_test, encoder = sd.get_success_data(**kwargs)
    model, y_train_pred, y_test_pred = evaluate_model(model, X_train, X_test,
                                                      y_train, y_test)
    print('Training F1-score: {}'.format(f1_score(y_train, y_train_pred)))
    print()
    print('Test F1-score: {}'.format(f1_score(y_test, y_test_pred)))
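Usage sketch, assuming the function is in scope; extra keyword arguments are forwarded to sd.get_success_data:
from sklearn.linear_model import LogisticRegression

# Usage sketch: score a simple baseline on the held-out test set.
offer_success_test(LogisticRegression(max_iter=1000))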
Example #5
def random_1fold_validation(model, **kwargs):
    """
    Prints training and validation F1-scores for a random train-val-test
    validation scheme.
    Args:
        model(sklearn.BaseEstimator): The model to fit and make predictions.
        kwargs(dict): Additional arguments passed to sd.get_success_data.
    """
    X_train_val, X_test, y_train_val, y_test, encoder = sd.get_success_data(
        drop_time=True, **kwargs)
    X_train, X_val, y_train, y_val = train_test_split(X_train_val,
                                                      y_train_val,
                                                      test_size=0.3,
                                                      random_state=2018)
    model, y_train_pred, y_val_pred = evaluate_model(model, X_train, X_val,
                                                     y_train, y_val)
    print('Training F1-score: {}'.format(f1_score(y_train, y_train_pred)))
    print()
    print('Validation F1-score: {}'.format(f1_score(y_val, y_val_pred)))
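Because the split uses a fixed random_state, the same helper can compare several candidate models on an identical validation split; a sketch:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Sketch: compare a few candidate models under the same fixed random split.
for candidate in (LogisticRegression(max_iter=1000),
                  RandomForestClassifier(random_state=2018),
                  GradientBoostingClassifier(random_state=2018)):
    print(type(candidate).__name__)
    random_1fold_validation(candidate)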
Example #6
def create_cluster_feats_4d(static_dataset_path=os.path.join(DATA_INTERIM, 'static_data.pkl'),
                            output_path=os.path.join(DATA_PROCESSED, 'static_cluster1.pkl'),
                            save=True):
    """
    Adds the features created by clustering for the selected 4D cases (age, income, gender, member_since_epoch).
    The features to add are: kmeans_8, ward_12 and dbscan_10.
    Args:
        static_dataset_path(str): The path to the static dataset to be taken as the initial data.
        output_path(str): The path to save the new dataset.
        save(boolean): Whether to save the new static dataset.
    Returns:
        static_cluster1_dataset(dataframe): The same as the static dataset but with the features added into new
            columns.
        X_train_r(dataframe): X_train (as obtained from time-split with the input static data) with the new features.
        X_test_r(dataframe): X_test (as obtained from time-split with the input static data) with the new features.
        y_train(pd.Series): y_train as obtained from time-split with the input static data.
        y_test(pd.Series): y_test as obtained from time-split with the input static data.
    """
    # Get the data
    X_train, X_test, y_train, y_test, encoder = sd.get_success_data(basic_dataset_path=static_dataset_path,
                                                                    drop_time=False,
                                                                    anon=False)

    # Encode and filter relevant features
    customer_feats = ['age', 'gender', 'income', 'missing_demographics',
                      'member_epoch_days']

    X_train_t = encoder.fit_transform(X_train)
    X_train_t = X_train_t[customer_feats]
    X_test_t = encoder.transform(X_test)
    X_test_t = X_test_t[customer_feats]

    # Drop duplicates and missing data
    X_train_t = X_train_t.dropna().drop_duplicates()
    X_test_t = X_test_t.dropna().drop_duplicates()

    # Keep a copy with the original demographics
    X_train_o = pp.gender_decode(X_train_t.copy())
    X_test_o = pp.gender_decode(X_test_t.copy())

    # Drop the irrelevant column
    X_train_t = X_train_t.drop('missing_demographics', axis=1)
    X_test_t = X_test_t.drop('missing_demographics', axis=1)

    # Normalize
    scaler = StandardScaler()
    scaler.fit(X_train_t)

    X_train_t = pd.DataFrame(scaler.transform(X_train_t),
                             index=X_train_t.index,
                             columns=X_train_t.columns)
    X_test_t = pd.DataFrame(scaler.transform(X_test_t),
                            index=X_test_t.index,
                            columns=X_test_t.columns)

    # Add the clustering labels
    # K-Means (k = 8)
    n_clusters = 8
    kmeans = KMeans(n_clusters=n_clusters, random_state=2018)
    kmeans.fit(X_train_t)
    X_train_o['kmeans_8'] = kmeans.predict(X_train_t)
    X_test_o['kmeans_8'] = kmeans.predict(X_test_t)

    # Ward 12 clusters
    linkage_matrix = ward(X_train_t)
    dist_12 = DIST_12
    X_train_o['ward_12'] = fcluster(linkage_matrix, dist_12, criterion='distance')
    # Use KNN to determine the test clusters
    knn_ward = KNeighborsClassifier(n_neighbors=5)
    knn_ward.fit(X_train_t, X_train_o['ward_12'])
    X_test_o['ward_12'] = knn_ward.predict(X_test_t)

    # DBSCAN eps=0.3, min_samples=20, 10 clusters
    eps = 0.3
    min_samples = 20
    dbs = DBSCAN(eps=eps, min_samples=min_samples)
    dbs.fit(X_train_t)
    X_train_o['dbscan_10'] = dbs.labels_
    # Use KNN to determine the test clusters
    knn_dbscan = KNeighborsClassifier(n_neighbors=5)
    knn_dbscan.fit(X_train_t, X_train_o['dbscan_10'])
    X_test_o['dbscan_10'] = knn_dbscan.predict(X_test_t)

    # Merge with the original datasets
    X_train_r = X_train.merge(X_train_o, on=customer_feats, how='left')
    X_test_r = X_test.merge(X_test_o, on=customer_feats, how='left')

    # Join the new features with the old static dataset
    static_cluster1 = pd.concat([X_train_r.sort_values(by='time'), X_test_r.sort_values(by='time')])
    old_static = pd.read_pickle(static_dataset_path)
    id_feats = ['person', 'time', 'offer_id']
    cluster_feats = ['kmeans_8', 'ward_12', 'dbscan_10']
    cluster_info = static_cluster1[id_feats + cluster_feats]
    static_cluster1_dataset = old_static.merge(cluster_info, on=id_feats)

    # Save the new static dataset
    if save:
        static_cluster1_dataset.to_pickle(output_path)

    return static_cluster1_dataset, X_train_r, X_test_r, y_train, y_test
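A usage sketch; the default paths come from the signature, and save=False keeps the pickled dataset untouched:
# Usage sketch: build the 4D clustering features without writing to disk.
static_df, X_train_r, X_test_r, y_train, y_test = create_cluster_feats_4d(
    save=False)
print(static_df[['kmeans_8', 'ward_12', 'dbscan_10']].nunique())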
Example #7
def create_cluster_feats_3d(static_dataset_path=os.path.join(DATA_PROCESSED, 'static_cluster1.pkl'),
                            output_path=os.path.join(DATA_PROCESSED, 'static_cluster3d.pkl'),
                            save=True):
    """
    Adds the features created by clustering for the selected 3D cases (age, income, member_since_epoch).
    The features to add are: 3d_kmeans_3, 3d_ward_3, 3d_ward_9, 3d_ward_19, 3d_gmm_3, 3d_gmm_16, 3d_dbscan_02_20
    and 3d_dbscan_05_100.
    Args:
        static_dataset_path(str): The path to the static dataset to be taken as the initial data.
        output_path(str): The path to save the new dataset.
        save(boolean): Whether to save the new static dataset.
    Returns:
        static_cluster3d_dataset(dataframe): The same as the static dataset but with the features added into new
            columns.
        X_train_r(dataframe): X_train (as obtained from time-split with the input static data) with the new features.
        X_test_r(dataframe): X_test (as obtained from time-split with the input static data) with the new features.
        y_train(pd.Series): y_train as obtained from time-split with the input static data.
        y_test(pd.Series): y_test as obtained from time-split with the input static data.
    """
    # Get the data
    X_train, X_test, y_train, y_test, encoder = sd.get_success_data(
        basic_dataset_path=static_dataset_path,
        drop_time=False,
        anon=False)

    # Encode and filter relevant features
    customer_feats = ['age', 'income', 'missing_demographics',
                      'member_epoch_days']

    X_train_t = encoder.fit_transform(X_train)
    X_train_t = X_train_t[customer_feats]
    X_test_t = encoder.transform(X_test)
    X_test_t = X_test_t[customer_feats]

    # Drop duplicates and missing data
    X_train_t = X_train_t.dropna().drop_duplicates()
    X_test_t = X_test_t.dropna().drop_duplicates()

    # Keep a copy with the original demographics
    X_train_o = X_train_t.copy()
    X_test_o = X_test_t.copy()

    # Drop the irrelevant column
    X_train_t = X_train_t.drop('missing_demographics', axis=1)
    X_test_t = X_test_t.drop('missing_demographics', axis=1)

    # Normalize
    scaler = StandardScaler()
    scaler.fit(X_train_t)

    X_train_t = pd.DataFrame(scaler.transform(X_train_t),
                             index=X_train_t.index,
                             columns=X_train_t.columns)
    X_test_t = pd.DataFrame(scaler.transform(X_test_t),
                            index=X_test_t.index,
                            columns=X_test_t.columns)

    # Add the clustering labels
    # K-Means (k = 3)
    n_clusters = 3
    kmeans = KMeans(n_clusters=n_clusters, random_state=2018)
    kmeans.fit(X_train_t)
    X_train_o['3d_kmeans_3'] = kmeans.predict(X_train_t)
    X_test_o['3d_kmeans_3'] = kmeans.predict(X_test_t)

    # Ward
    linkage_matrix = ward(X_train_t)

    # Ward 3 clusters
    feat_name = '3d_ward_3'
    dist = DIST_3D_3
    X_train_o[feat_name] = fcluster(linkage_matrix, dist, criterion='distance')
    # Use KNN to determine the test clusters
    knn_ward = KNeighborsClassifier(n_neighbors=5)
    knn_ward.fit(X_train_t, X_train_o[feat_name])
    X_test_o[feat_name] = knn_ward.predict(X_test_t)

    # Ward 9 clusters
    feat_name = '3d_ward_9'
    dist = DIST_3D_9
    X_train_o[feat_name] = fcluster(linkage_matrix, dist, criterion='distance')
    # Use KNN to determine the test clusters
    knn_ward = KNeighborsClassifier(n_neighbors=5)
    knn_ward.fit(X_train_t, X_train_o[feat_name])
    X_test_o[feat_name] = knn_ward.predict(X_test_t)

    # Ward 19 clusters
    feat_name = '3d_ward_19'
    dist = DIST_3D_19
    X_train_o[feat_name] = fcluster(linkage_matrix, dist, criterion='distance')
    # Use KNN to determine the test clusters
    knn_ward = KNeighborsClassifier(n_neighbors=5)
    knn_ward.fit(X_train_t, X_train_o[feat_name])
    X_test_o[feat_name] = knn_ward.predict(X_test_t)

    # GMM 3 clusters
    gmm = GaussianMixture(n_components=3)
    gmm.fit(X_train_t)
    X_train_o['3d_gmm_3'] = gmm.predict(X_train_t)
    X_test_o['3d_gmm_3'] = gmm.predict(X_test_t)

    # GMM 16 clusters
    gmm = GaussianMixture(n_components=16)
    gmm.fit(X_train_t)
    X_train_o['3d_gmm_16'] = gmm.predict(X_train_t)
    X_test_o['3d_gmm_16'] = gmm.predict(X_test_t)

    # DBSCAN eps=0.2, min_samples=20
    eps = 0.2
    min_samples = 20
    feat_name = '3d_dbscan_02_20'
    dbs = DBSCAN(eps=eps, min_samples=min_samples)
    dbs.fit(X_train_t)
    X_train_o[feat_name] = dbs.labels_
    # Use KNN to determine the test clusters
    knn_dbscan = KNeighborsClassifier(n_neighbors=5)
    knn_dbscan.fit(X_train_t, X_train_o[feat_name])
    X_test_o[feat_name] = knn_dbscan.predict(X_test_t)

    # DBSCAN eps=0.5, min_samples=100
    eps = 0.5
    min_samples = 100
    feat_name = '3d_dbscan_05_100'
    dbs = DBSCAN(eps=eps, min_samples=min_samples)
    dbs.fit(X_train_t)
    X_train_o[feat_name] = dbs.labels_
    # Use KNN to determine the test clusters
    knn_dbscan = KNeighborsClassifier(n_neighbors=5)
    knn_dbscan.fit(X_train_t, X_train_o[feat_name])
    X_test_o[feat_name] = knn_dbscan.predict(X_test_t)

    # Merge with the original datasets
    X_train_r = X_train.merge(X_train_o, on=customer_feats, how='left')
    X_test_r = X_test.merge(X_test_o, on=customer_feats, how='left')

    # Join the new features with the old static dataset
    cluster_feats = ['3d_kmeans_3', '3d_ward_3', '3d_ward_9', '3d_ward_19',
                     '3d_gmm_3', '3d_gmm_16', '3d_dbscan_02_20', '3d_dbscan_05_100']
    static_cluster3d = pd.concat([X_train_r.sort_values(by='time'), X_test_r.sort_values(by='time')])
    old_static = pd.read_pickle(static_dataset_path)
    id_feats = ['person', 'time', 'offer_id']
    cluster_info = static_cluster3d[id_feats + cluster_feats]
    static_cluster3d_dataset = old_static.merge(cluster_info, on=id_feats)

    # Save the new static dataset
    if save:
        static_cluster3d_dataset.to_pickle(output_path)

    return static_cluster3d_dataset, X_train_r, X_test_r, y_train, y_test
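By default the 3D step reads the pickle written by the 4D step, so the two functions are meant to be chained; a sketch:
# Usage sketch: chain the two clustering steps; the 3D step reads the
# static_cluster1.pkl written by the 4D step by default.
create_cluster_feats_4d(save=True)
static_3d, X_train_r, X_test_r, y_train, y_test = create_cluster_feats_3d(
    save=True)
print(static_3d.filter(like='3d_').columns.tolist())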
Example #8
def main():
    """ Runs data processing scripts to turn raw data from (../raw) into
        cleaned data ready to be analyzed (saved in ../processed).
    """
    static_dataset_path = os.path.join(DATA_INTERIM, 'static_data.pkl')
    static_cluster1_path = os.path.join(DATA_PROCESSED, 'static_cluster1.pkl')
    static_cluster3d_path = os.path.join(DATA_PROCESSED,
                                         'static_cluster3d.pkl')
    static_lagged_path = os.path.join(DATA_PROCESSED,
                                      'static_cluster_lagged.pkl')
    static_spent_10_days = os.path.join(DATA_PROCESSED,
                                        'static_spent_10_days.pkl')

    logger = logging.getLogger(__name__)
    logger.info(
        'Making the final datasets from raw data (the whole process can '
        'take about an hour, depending on the available computational '
        'resources)')

    # Load the raw data
    logger.info('Loading the raw data from {}'.format(DATA_RAW))
    portfolio = pd.read_json(os.path.join(DATA_RAW, 'portfolio.json'),
                             orient='records',
                             lines=True)
    profile = pd.read_json(os.path.join(DATA_RAW, 'profile.json'),
                           orient='records',
                           lines=True)
    transcript = pd.read_json(os.path.join(DATA_RAW, 'transcript.json'),
                              orient='records',
                              lines=True)

    # Initial preprocessing
    logger.info('Preprocessing...')
    data, portfolio = pp.basic_preprocessing(portfolio, profile, transcript)

    # Generate the static dataset, and save it
    logger.info('Generating the static dataset. ' +
                'This may take several minutes...')
    static_data = pp.generate_static_dataset(data)
    static_data.to_pickle(static_dataset_path)

    # Add the 4D clustering features
    logger.info('Generating the 4D clustering features')
    clust.create_cluster_feats_4d(static_dataset_path=static_dataset_path,
                                  output_path=static_cluster1_path,
                                  save=True)

    # Add the 3D clustering features
    logger.info('Generating the 3D clustering features')
    clust.create_cluster_feats_3d(static_dataset_path=static_cluster1_path,
                                  output_path=static_cluster3d_path,
                                  save=True)

    # Add the lagged features
    logger.info('Generating the lagged features')
    # Reload the raw portfolio, since basic_preprocessing returned a
    # preprocessed version earlier
    portfolio = pd.read_json(os.path.join(DATA_RAW, 'portfolio.json'),
                             orient='records',
                             lines=True)
    static_data = pd.read_pickle(static_cluster3d_path)
    data_lag = lag.fill_lagged_success(static_data, portfolio)
    data_lag.to_pickle(static_lagged_path)

    # Create the offer-success datasets and save them
    logger.info('Creating the offer-success datasets...')
    (X_train_sd, X_test_sd, y_train_sd, y_test_sd,
     encoder_sd) = sd.get_success_data(basic_dataset_path=static_lagged_path)
    X_train_sd.to_pickle(os.path.join(DATA_PROCESSED, 'X_train_success.pkl'))
    X_test_sd.to_pickle(os.path.join(DATA_PROCESSED, 'X_test_success.pkl'))
    y_train_sd.to_pickle(os.path.join(DATA_PROCESSED, 'y_train_success.pkl'))
    y_test_sd.to_pickle(os.path.join(DATA_PROCESSED, 'y_test_success.pkl'))
    with open(os.path.join(DATA_PROCESSED, 'encoder_success.pkl'),
              'wb') as file:
        pickle.dump(encoder_sd, file)

    # Create spent-10-days static dataset
    logger.info('Creating the spent-10-days static dataset')
    static_data = pd.read_pickle(static_lagged_path)
    filled = p10.get_spent_days_static(static_data, data)
    filled.to_pickle(static_spent_10_days)

    # Create the profit-10-days datasets and save them
    logger.info('Creating the profit-10-days datasets...')
    (X_train_p10, X_test_p10, y_train_p10, y_test_p10, encoder_p10,
     view_cols_p10, profit_cols_p10) = p10.get_profit_10_days_data(
        basic_dataset_path=static_spent_10_days,
        fill_null=True,
        target=['viewed', 'profit_10_days'],
        drop_offer_id=False)
    X_train_p10.to_pickle(os.path.join(DATA_PROCESSED, 'X_train_profits.pkl'))
    X_test_p10.to_pickle(os.path.join(DATA_PROCESSED, 'X_test_profits.pkl'))
    y_train_p10.to_pickle(os.path.join(DATA_PROCESSED, 'y_train_profits.pkl'))
    y_test_p10.to_pickle(os.path.join(DATA_PROCESSED, 'y_test_profits.pkl'))
    with open(os.path.join(DATA_PROCESSED, 'encoder_profits.pkl'),
              'wb') as file:
        pickle.dump(encoder_p10, file)
    with open(os.path.join(DATA_PROCESSED, 'view_cols_profits.pkl'),
              'wb') as file:
        pickle.dump(view_cols_p10, file)
    with open(os.path.join(DATA_PROCESSED, 'profit_cols_profits.pkl'),
              'wb') as file:
        pickle.dump(profit_cols_p10, file)

    logger.info('All the datasets were created successfully!')
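The entry-point guard is not shown in this listing; a typical way to run the script directly would look like this (the logging format is an assumption):
if __name__ == '__main__':
    # Sketch of a typical entry point (not part of the listing above).
    log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_fmt)
    main()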