def random_kfold_validation(model, n_splits=3):
    """ Shows training and validation results for a random k-fold
    validation scheme.
    Args:
        model(sklearn.BaseEstimator): The model to fit and make predictions.
        n_splits(int): The number of folds for the cross-validation.
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2018)
    X_train_val, X_test, y_train_val, y_test, \
        encoder = sd.get_success_data(drop_time=True)

    # Train and validate for each fold
    f1_train = list()
    f1_val = list()
    i = 0
    for train_index, test_index in skf.split(X_train_val, y_train_val):
        i += 1
        print('Fold - {}'.format(i))
        X_train = X_train_val.iloc[train_index]
        X_val = X_train_val.iloc[test_index]
        y_train = y_train_val.iloc[train_index]
        y_val = y_train_val.iloc[test_index]
        model, y_train_pred, y_val_pred = evaluate_model(
            model, X_train, X_val, y_train, y_val)
        f1_train.append(f1_score(y_train, y_train_pred))
        f1_val.append(f1_score(y_val, y_val_pred))

    # Show results
    print('Training F1-score: {} +- {}'.format(np.mean(f1_train),
                                               np.std(f1_train)))
    print()
    print('Validation F1-score: {} +- {}'.format(np.mean(f1_val),
                                                 2 * np.std(f1_val)))
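# Usage sketch: any sklearn-compatible classifier can be passed; the
# LogisticRegression settings below are only an illustrative choice.
#
#     from sklearn.linear_model import LogisticRegression
#
#     clf = LogisticRegression(max_iter=1000, random_state=2018)
#     random_kfold_validation(clf, n_splits=5)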
def get_time_split_val(val_time=370, **kwargs):
    """ Returns all the datasets necessary to perform a time-split validation.
    Args:
        val_time(int): The time to make the validation split.
        kwargs(dict): Arguments to be passed to inner functions.
    Returns:
        X_train(pd.DataFrame): Training features.
        X_val(pd.DataFrame): Validation features.
        X_test(pd.DataFrame): Test features.
        X_train_val(pd.DataFrame): Training + Validation features, to use
            when testing.
        y_train(pd.Series): Training target values.
        y_val(pd.Series): Validation target values.
        y_test(pd.Series): Test target values.
        y_train_val(pd.Series): Training + Validation target values, to use
            when testing.
    """
    fun_kwargs = utils.filter_args(sd.get_success_data, kwargs)
    X_train_val, \
        X_test, \
        y_train_val, \
        y_test, \
        encoder = sd.get_success_data(drop_time=False, **fun_kwargs)
    X_test = pp.drop_time_dependent(X_test)
    X_train, X_val, y_train, y_val = sd.time_split(X_train_val,
                                                   y_train_val,
                                                   val_time)

    return X_train, X_val, X_test, X_train_val, y_train, y_val, y_test, \
        y_train_val
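# Usage sketch, assuming the static dataset has already been generated (see
# main() below); the returned frames can be fed directly to evaluate_model:
#
#     X_train, X_val, X_test, X_train_val, \
#         y_train, y_val, y_test, y_train_val = get_time_split_val(val_time=370)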
def random_1fold_cust_validation(model, **kwargs):
    """ Shows training and validation results for a random train-val-test
    validation scheme. The dataset is split by customers.
    Args:
        model(sklearn.BaseEstimator): The model to fit and make predictions.
    """
    X_train_val, X_test, y_train_val, y_test, encoder = sd.get_success_data(
        drop_time=True, anon=False, **kwargs)

    # Get random customer splits
    val_size = 0.3
    customers = X_train_val.person.unique()
    n_train = int(np.floor(customers.shape[0] * (1.0 - val_size)))
    np.random.shuffle(customers)
    X_train = X_train_val[X_train_val.person.isin(customers[:n_train])]
    X_val = X_train_val[X_train_val.person.isin(customers[n_train:])]
    y_train = y_train_val[X_train_val.person.isin(customers[:n_train])]
    y_val = y_train_val[X_train_val.person.isin(customers[n_train:])]

    # Anonymize
    X_train = pp.anonimize_data(X_train)
    X_val = pp.anonimize_data(X_val)

    # Evaluate and show results
    model, y_train_pred, y_val_pred = evaluate_model(model, X_train, X_val,
                                                     y_train, y_val)
    print('Training F1-score: {}'.format(f1_score(y_train, y_train_pred)))
    print()
    print('Validation F1-score: {}'.format(f1_score(y_val, y_val_pred)))
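# The manual split above partitions customers (not rows) between training and
# validation, so no customer appears on both sides. An equivalent group-wise
# split could also be obtained with sklearn's GroupShuffleSplit (shown only as
# an illustrative alternative, not used in this module):
#
#     from sklearn.model_selection import GroupShuffleSplit
#
#     gss = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=2018)
#     train_idx, val_idx = next(gss.split(X_train_val, y_train_val,
#                                         groups=X_train_val.person))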
def offer_success_test(model, **kwargs):
    """ Shows training and test results for a time-split validation scheme.
    Args:
        model(sklearn.BaseEstimator): The model to fit and make predictions.
    """
    X_train, X_test, y_train, y_test, encoder = sd.get_success_data(**kwargs)
    model, y_train_pred, y_test_pred = evaluate_model(model, X_train, X_test,
                                                      y_train, y_test)
    print('Training F1-score: {}'.format(f1_score(y_train, y_train_pred)))
    print()
    print('Test F1-score: {}'.format(f1_score(y_test, y_test_pred)))
def random_1fold_validation(model, **kwargs):
    """ Shows training and validation results for a random train-val-test
    validation scheme.
    Args:
        model(sklearn.BaseEstimator): The model to fit and make predictions.
    """
    X_train_val, X_test, y_train_val, y_test, encoder = sd.get_success_data(
        drop_time=True, **kwargs)
    X_train, X_val, y_train, y_val = train_test_split(X_train_val,
                                                      y_train_val,
                                                      test_size=0.3,
                                                      random_state=2018)
    model, y_train_pred, y_val_pred = evaluate_model(model, X_train, X_val,
                                                     y_train, y_val)
    print('Training F1-score: {}'.format(f1_score(y_train, y_train_pred)))
    print()
    print('Validation F1-score: {}'.format(f1_score(y_val, y_val_pred)))
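# Usage sketch: random_1fold_validation, random_1fold_cust_validation and
# offer_success_test all share the same (model, **kwargs) interface, so the
# same (illustrative) classifier can be passed to each of them:
#
#     from sklearn.ensemble import RandomForestClassifier
#
#     clf = RandomForestClassifier(n_estimators=100, random_state=2018)
#     random_1fold_validation(clf)
#     random_1fold_cust_validation(clf)
#     offer_success_test(clf)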
def create_cluster_feats_4d(
        static_dataset_path=os.path.join(DATA_INTERIM, 'static_data.pkl'),
        output_path=os.path.join(DATA_PROCESSED, 'static_cluster1.pkl'),
        save=True):
    """ Adds the features created by clustering for the selected 4D cases
    (age, income, gender, member_since_epoch). The features to add are:
    kmeans_8, ward_12 and dbscan_10.
    Args:
        static_dataset_path(str): The path to the static dataset to be taken
            as the initial data.
        output_path(str): The path to save the new dataset.
        save(boolean): Whether to save the new static dataset.
    Returns:
        static_cluster1_dataset(dataframe): The same as the static dataset,
            but with the new features added as new columns.
        X_train_r(dataframe): X_train (as obtained from time-split with the
            input static data) with the new features.
        X_test_r(dataframe): X_test (as obtained from time-split with the
            input static data) with the new features.
        y_train(pd.Series): y_train as obtained from time-split with the
            input static data.
        y_test(pd.Series): y_test as obtained from time-split with the input
            static data.
    """
    # Get the data
    X_train, X_test, y_train, y_test, encoder = sd.get_success_data(
        basic_dataset_path=static_dataset_path, drop_time=False, anon=False)

    # Encode and filter relevant features
    customer_feats = ['age', 'gender', 'income', 'missing_demographics',
                      'member_epoch_days']
    X_train_t = encoder.fit_transform(X_train)
    X_train_t = X_train_t[customer_feats]
    X_test_t = encoder.transform(X_test)
    X_test_t = X_test_t[customer_feats]

    # Drop duplicates and missing data
    X_train_t = X_train_t.dropna().drop_duplicates()
    X_test_t = X_test_t.dropna().drop_duplicates()

    # Keep a copy with the original demographics
    X_train_o = pp.gender_decode(X_train_t.copy())
    X_test_o = pp.gender_decode(X_test_t.copy())

    # Drop the irrelevant column
    X_train_t = X_train_t.drop('missing_demographics', axis=1)
    X_test_t = X_test_t.drop('missing_demographics', axis=1)

    # Normalize
    scaler = StandardScaler()
    scaler.fit(X_train_t)
    X_train_t = pd.DataFrame(scaler.transform(X_train_t),
                             index=X_train_t.index,
                             columns=X_train_t.columns)
    X_test_t = pd.DataFrame(scaler.transform(X_test_t),
                            index=X_test_t.index,
                            columns=X_test_t.columns)

    # Add the clustering labels
    # K-Means (k = 8)
    n_clusters = 8
    kmeans = KMeans(n_clusters=n_clusters, random_state=2018)
    kmeans.fit(X_train_t)
    X_train_o['kmeans_8'] = kmeans.predict(X_train_t)
    X_test_o['kmeans_8'] = kmeans.predict(X_test_t)

    # Ward, 12 clusters
    linkage_matrix = ward(X_train_t)
    dist_12 = DIST_12
    X_train_o['ward_12'] = fcluster(linkage_matrix, dist_12,
                                    criterion='distance')
    # Use KNN to determine the test clusters
    knn_ward = KNeighborsClassifier(n_neighbors=5)
    knn_ward.fit(X_train_t, X_train_o['ward_12'])
    X_test_o['ward_12'] = knn_ward.predict(X_test_t)

    # DBSCAN (eps=0.3, min_samples=20), 10 clusters
    eps = 0.3
    min_samples = 20
    dbs = DBSCAN(eps=eps, min_samples=min_samples)
    dbs.fit(X_train_t)
    X_train_o['dbscan_10'] = dbs.labels_
    # Use KNN to determine the test clusters
    knn_dbscan = KNeighborsClassifier(n_neighbors=5)
    knn_dbscan.fit(X_train_t, X_train_o['dbscan_10'])
    X_test_o['dbscan_10'] = knn_dbscan.predict(X_test_t)

    # Merge with the original datasets
    X_train_r = X_train.merge(X_train_o, on=customer_feats, how='left')
    X_test_r = X_test.merge(X_test_o, on=customer_feats, how='left')

    # Join the new features with the old static dataset
    static_cluster1 = pd.concat([X_train_r.sort_values(by='time'),
                                 X_test_r.sort_values(by='time')])
    old_static = pd.read_pickle(static_dataset_path)
    id_feats = ['person', 'time', 'offer_id']
    cluster_feats = ['kmeans_8', 'ward_12', 'dbscan_10']
    cluster_info = static_cluster1[id_feats + cluster_feats]
    static_cluster1_dataset = old_static.merge(cluster_info, on=id_feats)

    # Save the new static dataset
    if save:
        static_cluster1_dataset.to_pickle(output_path)

    return static_cluster1_dataset, X_train_r, X_test_r, y_train, y_test
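# Usage sketch, assuming the static dataset already exists at its default
# path (it is produced by pp.generate_static_dataset; see main() below):
#
#     static_cluster1, X_train_c, X_test_c, y_train_c, y_test_c = \
#         create_cluster_feats_4d(save=False)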
def create_cluster_feats_3d(
        static_dataset_path=os.path.join(DATA_PROCESSED,
                                         'static_cluster1.pkl'),
        output_path=os.path.join(DATA_PROCESSED, 'static_cluster3d.pkl'),
        save=True):
    """ Adds the features created by clustering for the selected 3D cases
    (age, income, member_since_epoch). The features to add are: 3d_kmeans_3,
    3d_ward_3, 3d_ward_9, 3d_ward_19, 3d_gmm_3, 3d_gmm_16, 3d_dbscan_02_20,
    3d_dbscan_05_100.
    Args:
        static_dataset_path(str): The path to the static dataset to be taken
            as the initial data.
        output_path(str): The path to save the new dataset.
        save(boolean): Whether to save the new static dataset.
    Returns:
        static_cluster3d_dataset(dataframe): The same as the static dataset,
            but with the new features added as new columns.
        X_train_r(dataframe): X_train (as obtained from time-split with the
            input static data) with the new features.
        X_test_r(dataframe): X_test (as obtained from time-split with the
            input static data) with the new features.
        y_train(pd.Series): y_train as obtained from time-split with the
            input static data.
        y_test(pd.Series): y_test as obtained from time-split with the input
            static data.
    """
    # Get the data
    X_train, X_test, y_train, y_test, encoder = sd.get_success_data(
        basic_dataset_path=static_dataset_path, drop_time=False, anon=False)

    # Encode and filter relevant features
    customer_feats = ['age', 'income', 'missing_demographics',
                      'member_epoch_days']
    X_train_t = encoder.fit_transform(X_train)
    X_train_t = X_train_t[customer_feats]
    X_test_t = encoder.transform(X_test)
    X_test_t = X_test_t[customer_feats]

    # Drop duplicates and missing data
    X_train_t = X_train_t.dropna().drop_duplicates()
    X_test_t = X_test_t.dropna().drop_duplicates()

    # Keep a copy with the original demographics
    X_train_o = X_train_t.copy()
    X_test_o = X_test_t.copy()

    # Drop the irrelevant column
    X_train_t = X_train_t.drop('missing_demographics', axis=1)
    X_test_t = X_test_t.drop('missing_demographics', axis=1)

    # Normalize
    scaler = StandardScaler()
    scaler.fit(X_train_t)
    X_train_t = pd.DataFrame(scaler.transform(X_train_t),
                             index=X_train_t.index,
                             columns=X_train_t.columns)
    X_test_t = pd.DataFrame(scaler.transform(X_test_t),
                            index=X_test_t.index,
                            columns=X_test_t.columns)

    # Add the clustering labels
    # K-Means (k = 3)
    n_clusters = 3
    kmeans = KMeans(n_clusters=n_clusters, random_state=2018)
    kmeans.fit(X_train_t)
    X_train_o['3d_kmeans_3'] = kmeans.predict(X_train_t)
    X_test_o['3d_kmeans_3'] = kmeans.predict(X_test_t)

    # Ward
    linkage_matrix = ward(X_train_t)

    # Ward, 3 clusters
    n_clusters = 3
    feat_name = '3d_ward_3'
    dist = DIST_3D_3
    X_train_o[feat_name] = fcluster(linkage_matrix, dist,
                                    criterion='distance')
    # Use KNN to determine the test clusters
    knn_ward = KNeighborsClassifier(n_neighbors=5)
    knn_ward.fit(X_train_t, X_train_o[feat_name])
    X_test_o[feat_name] = knn_ward.predict(X_test_t)

    # Ward, 9 clusters
    n_clusters = 9
    feat_name = '3d_ward_9'
    dist = DIST_3D_9
    X_train_o[feat_name] = fcluster(linkage_matrix, dist,
                                    criterion='distance')
    # Use KNN to determine the test clusters
    knn_ward = KNeighborsClassifier(n_neighbors=5)
    knn_ward.fit(X_train_t, X_train_o[feat_name])
    X_test_o[feat_name] = knn_ward.predict(X_test_t)

    # Ward, 19 clusters
    n_clusters = 19
    feat_name = '3d_ward_19'
    dist = DIST_3D_19
    X_train_o[feat_name] = fcluster(linkage_matrix, dist,
                                    criterion='distance')
    # Use KNN to determine the test clusters
    knn_ward = KNeighborsClassifier(n_neighbors=5)
    knn_ward.fit(X_train_t, X_train_o[feat_name])
    X_test_o[feat_name] = knn_ward.predict(X_test_t)

    # GMM, 3 clusters
    gmm = GaussianMixture(n_components=3)
    gmm.fit(X_train_t)
    X_train_o['3d_gmm_3'] = gmm.predict(X_train_t)
    X_test_o['3d_gmm_3'] = gmm.predict(X_test_t)

    # GMM, 16 clusters
    gmm = GaussianMixture(n_components=16)
    gmm.fit(X_train_t)
    X_train_o['3d_gmm_16'] = gmm.predict(X_train_t)
    X_test_o['3d_gmm_16'] = gmm.predict(X_test_t)

    # DBSCAN (eps=0.2, min_samples=20)
    eps = 0.2
    min_samples = 20
    feat_name = '3d_dbscan_02_20'
    dbs = DBSCAN(eps=eps, min_samples=min_samples)
    dbs.fit(X_train_t)
    X_train_o[feat_name] = dbs.labels_
    # Use KNN to determine the test clusters
    knn_dbscan = KNeighborsClassifier(n_neighbors=5)
    knn_dbscan.fit(X_train_t, X_train_o[feat_name])
    X_test_o[feat_name] = knn_dbscan.predict(X_test_t)

    # DBSCAN (eps=0.5, min_samples=100)
    eps = 0.5
    min_samples = 100
    feat_name = '3d_dbscan_05_100'
    dbs = DBSCAN(eps=eps, min_samples=min_samples)
    dbs.fit(X_train_t)
    X_train_o[feat_name] = dbs.labels_
    # Use KNN to determine the test clusters
    knn_dbscan = KNeighborsClassifier(n_neighbors=5)
    knn_dbscan.fit(X_train_t, X_train_o[feat_name])
    X_test_o[feat_name] = knn_dbscan.predict(X_test_t)

    # Merge with the original datasets
    X_train_r = X_train.merge(X_train_o, on=customer_feats, how='left')
    X_test_r = X_test.merge(X_test_o, on=customer_feats, how='left')

    # Join the new features with the old static dataset
    cluster_feats = ['3d_kmeans_3', '3d_ward_3', '3d_ward_9', '3d_ward_19',
                     '3d_gmm_3', '3d_gmm_16', '3d_dbscan_02_20',
                     '3d_dbscan_05_100']
    static_cluster3d = pd.concat([X_train_r.sort_values(by='time'),
                                  X_test_r.sort_values(by='time')])
    old_static = pd.read_pickle(static_dataset_path)
    id_feats = ['person', 'time', 'offer_id']
    cluster_info = static_cluster3d[id_feats + cluster_feats]
    static_cluster3d_dataset = old_static.merge(cluster_info, on=id_feats)

    # Save the new static dataset
    if save:
        static_cluster3d_dataset.to_pickle(output_path)

    return static_cluster3d_dataset, X_train_r, X_test_r, y_train, y_test
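# Usage sketch: this step expects the 4D cluster features to exist, so by
# default it reads static_cluster1.pkl (the output of create_cluster_feats_4d)
# and, when save=True, writes static_cluster3d.pkl:
#
#     static_cluster3d, X_train_c, X_test_c, y_train_c, y_test_c = \
#         create_cluster_feats_3d(save=True)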
def main():
    """ Runs data processing scripts to turn raw data from (../raw) into
    cleaned data ready to be analyzed (saved in ../processed).
    """
    static_dataset_path = os.path.join(DATA_INTERIM, 'static_data.pkl')
    static_cluster1_path = os.path.join(DATA_PROCESSED, 'static_cluster1.pkl')
    static_cluster3d_path = os.path.join(DATA_PROCESSED,
                                         'static_cluster3d.pkl')
    static_lagged_path = os.path.join(DATA_PROCESSED,
                                      'static_cluster_lagged.pkl')
    static_spent_10_days = os.path.join(DATA_PROCESSED,
                                        'static_spent_10_days.pkl')

    logger = logging.getLogger(__name__)
    logger.info('Making the final datasets from raw data (the whole process '
                'can take about an hour, depending on the available '
                'computational resources)')

    # Load the raw data
    logger.debug('Loading raw data from %s', DATA_RAW)
    portfolio = pd.read_json(os.path.join(DATA_RAW, 'portfolio.json'),
                             orient='records', lines=True)
    profile = pd.read_json(os.path.join(DATA_RAW, 'profile.json'),
                           orient='records', lines=True)
    transcript = pd.read_json(os.path.join(DATA_RAW, 'transcript.json'),
                              orient='records', lines=True)

    # Initial preprocessing
    logger.info('Preprocessing...')
    data, portfolio = pp.basic_preprocessing(portfolio, profile, transcript)

    # Generate the static dataset, and save it
    logger.info('Generating the static dataset. '
                'This may take several minutes...')
    static_data = pp.generate_static_dataset(data)
    static_data.to_pickle(static_dataset_path)

    # Add the 4D clustering features
    logger.info('Generating the 4D clustering features')
    clust.create_cluster_feats_4d(static_dataset_path=static_dataset_path,
                                  output_path=static_cluster1_path,
                                  save=True)

    # Add the 3D clustering features
    logger.info('Generating the 3D clustering features')
    clust.create_cluster_feats_3d(static_dataset_path=static_cluster1_path,
                                  output_path=static_cluster3d_path,
                                  save=True)

    # Add the lagged features
    logger.info('Generating the lagged features')
    # Re-read the raw portfolio, since the variable was overwritten during
    # preprocessing
    portfolio = pd.read_json(os.path.join(DATA_RAW, 'portfolio.json'),
                             orient='records', lines=True)
    static_data = pd.read_pickle(static_cluster3d_path)
    data_lag = lag.fill_lagged_success(static_data, portfolio)
    data_lag.to_pickle(static_lagged_path)

    # Create the offer-success datasets and save them
    logger.info('Creating the offer-success datasets...')
    X_train_sd, \
        X_test_sd, \
        y_train_sd, \
        y_test_sd, \
        encoder_sd = sd.get_success_data(basic_dataset_path=static_lagged_path)
    X_train_sd.to_pickle(os.path.join(DATA_PROCESSED, 'X_train_success.pkl'))
    X_test_sd.to_pickle(os.path.join(DATA_PROCESSED, 'X_test_success.pkl'))
    y_train_sd.to_pickle(os.path.join(DATA_PROCESSED, 'y_train_success.pkl'))
    y_test_sd.to_pickle(os.path.join(DATA_PROCESSED, 'y_test_success.pkl'))
    with open(os.path.join(DATA_PROCESSED, 'encoder_success.pkl'),
              'wb') as file:
        pickle.dump(encoder_sd, file)

    # Create the spent-10-days static dataset
    logger.info('Creating the spent-10-days static dataset')
    static_data = pd.read_pickle(static_lagged_path)
    filled = p10.get_spent_days_static(static_data, data)
    filled.to_pickle(static_spent_10_days)

    # Create the profit-10-days datasets and save them
    logger.info('Creating the profit-10-days datasets...')
    X_train_p10, \
        X_test_p10, \
        y_train_p10, \
        y_test_p10, \
        encoder_p10, \
        view_cols_p10, \
        profit_cols_p10 = p10.get_profit_10_days_data(
            basic_dataset_path=static_spent_10_days,
            fill_null=True,
            target=['viewed', 'profit_10_days'],
            drop_offer_id=False)
    X_train_p10.to_pickle(os.path.join(DATA_PROCESSED, 'X_train_profits.pkl'))
    X_test_p10.to_pickle(os.path.join(DATA_PROCESSED, 'X_test_profits.pkl'))
    y_train_p10.to_pickle(os.path.join(DATA_PROCESSED, 'y_train_profits.pkl'))
    y_test_p10.to_pickle(os.path.join(DATA_PROCESSED, 'y_test_profits.pkl'))
    with open(os.path.join(DATA_PROCESSED, 'encoder_profits.pkl'),
              'wb') as file:
        pickle.dump(encoder_p10, file)
    with open(os.path.join(DATA_PROCESSED, 'view_cols_profits.pkl'),
              'wb') as file:
        pickle.dump(view_cols_p10, file)
    with open(os.path.join(DATA_PROCESSED, 'profit_cols_profits.pkl'),
              'wb') as file:
        pickle.dump(profit_cols_p10, file)

    logger.info('All the datasets were created successfully!')
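# Entry-point sketch (the log level and format strings are illustrative
# choices, not fixed by this module):
#
#     if __name__ == '__main__':
#         logging.basicConfig(level=logging.INFO,
#                             format='%(asctime)s - %(name)s - %(message)s')
#         main()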