def lesson_6():
    # -------------------------
    # Code from previous lesson
    # -------------------------
    # Load data
    melbourne_data = pd.read_csv(melbourne_file_path)
    # Filter rows with missing price values
    filtered_melbourne_data = melbourne_data.dropna(axis=0)
    # Choose target and features
    y = filtered_melbourne_data.Price
    melbourne_features = [
        'Rooms', 'Bathroom', 'Landsize', 'BuildingArea', 'YearBuilt',
        'Lattitude', 'Longtitude'
    ]
    X = filtered_melbourne_data[melbourne_features]

    # Split data into training and validation data, for both features and target
    train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

    # -----------------
    # Start of tutorial
    # -----------------
    # Build a random forest model
    forest_model = RandomForestRegressor(random_state=1)
    forest_model.fit(train_X, train_y)
    melb_preds = forest_model.predict(val_X)
    print_("MAE for a Random Forest", 0)
    print_(mean_absolute_error(val_y, melb_preds))
def lesson_7():
    print_("LESSON 7: Data Leakage", 0, 1)
    X, y = load_data_from_ex_7()

    # Since there is no preprocessing, we don't strictly need a pipeline,
    # but we use one anyway as best practice
    my_pipeline = make_pipeline(RandomForestClassifier(n_estimators=100))
    cv_scores = cross_val_score(my_pipeline, X, y, cv=5, scoring='accuracy')
    print("Cross-validation accuracy: %f" % cv_scores.mean())

    expenditures_cardholders = X.expenditure[y]
    expenditures_noncardholders = X.expenditure[~y]
    print('\nFraction of those who did not receive a card and had no expenditures: %.2f' \
          % ((expenditures_noncardholders == 0).mean()))
    print('Fraction of those who received a card and had no expenditures: %.2f' \
          % ((expenditures_cardholders == 0).mean()))

    # We would run a model without target leakage as follows:

    # Drop leaky predictors from dataset
    potential_leaks = ['expenditure', 'share', 'active', 'majorcards']
    X2 = X.drop(potential_leaks, axis=1)

    # Evaluate the model with leaky predictors removed
    cv_scores = cross_val_score(my_pipeline, X2, y, cv=5, scoring='accuracy')
    print("\nCross-validation accuracy: %f" % cv_scores.mean())
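# ---------------------------------------------------------------------------
# Hedged sketch (added for illustration, not part of the original lesson):
# besides target leakage, this lesson's topic includes train-test
# contamination, where a preprocessor fit on the full dataset lets
# validation rows influence training. The names below (X, y) are
# hypothetical stand-ins for an all-numeric feature frame and a target.
# ---------------------------------------------------------------------------
def train_test_contamination_sketch(X, y):
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.impute import SimpleImputer
    from sklearn.model_selection import train_test_split
    from sklearn.pipeline import make_pipeline

    # WRONG: the imputer sees the validation rows before the split,
    # so the "held out" data has already leaked into preprocessing
    X_imputed = SimpleImputer().fit_transform(X)
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_imputed, y, random_state=0)

    # RIGHT: split first, then bundle preprocessing in a pipeline so each
    # fit only ever sees training data
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=0)
    pipeline = make_pipeline(SimpleImputer(),
                             RandomForestClassifier(n_estimators=100))
    pipeline.fit(X_train, y_train)
    return pipeline.score(X_valid, y_valid)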
def ex_5():
    print_("Exercise 5: Distributions", 0, 1)

    # ---------------------
    # Step 1: Load the data
    # ---------------------
    # Fill in the line below to read the (benign) file
    cancer_b_data = pd.read_csv(cancer_b_filepath, index_col="Id")
    # Fill in the line below to read the (malignant) file
    cancer_m_data = pd.read_csv(cancer_m_filepath, index_col="Id")

    # -----------------------
    # Step 2: Review the data
    # -----------------------
    # Print the first five rows of the (benign) data
    print_("First 5 rows of the benign data", 0)
    print_(cancer_b_data.head())
    # Print the first five rows of the (malignant) data
    print_("First 5 rows of the malignant data", 0)
    print_(cancer_m_data.head())

    # ---------------------------------
    # Step 3: Investigating differences
    # ---------------------------------
    # Part A
    # Histograms for benign and malignant tumors
    sns.distplot(a=cancer_b_data['Area (mean)'], label="Benign tumors", kde=False)
    sns.distplot(a=cancer_m_data['Area (mean)'], label="Malignant tumors", kde=False)
    plt.legend()
    plt.show()

    # ----------------------------
    # Step 4: A very useful column
    # ----------------------------
    # Part A
    # KDE plots for benign and malignant tumors
    sns.kdeplot(data=cancer_b_data['Radius (worst)'], label="Benign tumors", shade=True)
    sns.kdeplot(data=cancer_m_data['Radius (worst)'], label="Malignant tumors", shade=True)

    # Add title
    plt.title(
        "Distribution in values for 'Radius (worst)', for both benign and malignant tumors"
    )

    # Force legend to appear
    plt.legend()
    plt.show()
def lesson_6(): print_("LESSON 6: XGBoost", 0, 1) X_train, X_valid, y_train, y_valid = load_data_for_lesson_6() my_model = XGBRegressor() my_model.fit(X_train, y_train) predictions = my_model.predict(X_valid) print("Mean Absolute Error: " + str(mean_absolute_error(predictions, y_valid)))
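# ---------------------------------------------------------------------------
# Hedged sketch (added for illustration, not course code): the cycle that
# XGBoost automates, written by hand with shallow regression trees fit to
# the residuals of the current ensemble. X_train / y_train are hypothetical
# training arrays; this is a conceptual toy, and making predictions on new
# data would also require replaying the base value plus every tree.
# ---------------------------------------------------------------------------
def gradient_boosting_cycle_sketch(X_train, y_train, n_rounds=100,
                                   learning_rate=0.1):
    import numpy as np
    from sklearn.tree import DecisionTreeRegressor

    # Start the ensemble from the mean of the target
    prediction = np.full(len(y_train), y_train.mean())
    trees = []
    for _ in range(n_rounds):
        residuals = y_train - prediction      # what the ensemble still misses
        tree = DecisionTreeRegressor(max_depth=3)
        tree.fit(X_train, residuals)          # fit a new tree to the errors
        prediction += learning_rate * tree.predict(X_train)
        trees.append(tree)
    return trees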
def ex_6():
    print_("Exercise 6: XGBoost", 0, 1)
    X_train, X_valid, y_train, y_valid, X_test = load_data_for_ex_6()

    # -------------------
    # Step 1: Build model
    # -------------------
    # Part A
    # Define the model
    my_model_1 = XGBRegressor(random_state=0)
    # Fit the model
    my_model_1.fit(X_train, y_train)

    # Part B
    # Get predictions
    predictions_1 = my_model_1.predict(X_valid)

    # Part C
    # Calculate MAE
    mae_1 = mean_absolute_error(predictions_1, y_valid)
    print("Mean Absolute Error:", mae_1)

    # -------------------------
    # Step 2: Improve the model
    # -------------------------
    my_model_2 = XGBRegressor(random_state=0,
                              n_estimators=1000,
                              learning_rate=0.05,
                              n_jobs=4)
    # Fit the model
    my_model_2.fit(X_train, y_train,
                   early_stopping_rounds=5,
                   eval_set=[(X_valid, y_valid)],
                   verbose=False)
    # Get predictions
    predictions_2 = my_model_2.predict(X_valid)
    # Calculate MAE
    mae_2 = mean_absolute_error(predictions_2, y_valid)
    print("Mean Absolute Error:", mae_2)

    # -----------------------
    # Step 3: Break the model
    # -----------------------
    my_model_3 = XGBRegressor(random_state=0, n_estimators=10, learning_rate=0.9)
    # Fit the model
    my_model_3.fit(X_train, y_train)
    # Get predictions
    predictions_3 = my_model_3.predict(X_valid)
    # Calculate MAE
    mae_3 = mean_absolute_error(predictions_3, y_valid)
    print("Mean Absolute Error:", mae_3)
def lesson_5():
    print_("LESSON 5: Cross-Validation", 0, 1)
    X, y = load_data_for_lesson_5()

    my_pipeline = Pipeline(steps=[
        ('preprocessor', SimpleImputer()),
        ('model', RandomForestRegressor(n_estimators=50, random_state=0))
    ])

    # Multiply by -1 since sklearn calculates *negative* MAE
    scores = -1 * cross_val_score(
        my_pipeline, X, y, cv=5, scoring='neg_mean_absolute_error')

    print("MAE scores:\n", scores)
    print("\nAverage MAE score (across experiments):")
    print(scores.mean(), end="\n\n")
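# ---------------------------------------------------------------------------
# Hedged sketch (added for illustration): roughly what cross_val_score does
# under the hood, written as an explicit KFold loop. The parameter names
# match lesson_5 above (my_pipeline, X, y, with X and y as pandas objects),
# but the loop itself is illustrative rather than sklearn's actual internals.
# ---------------------------------------------------------------------------
def manual_cross_validation_sketch(my_pipeline, X, y):
    from sklearn.base import clone
    from sklearn.metrics import mean_absolute_error
    from sklearn.model_selection import KFold

    maes = []
    for train_idx, valid_idx in KFold(n_splits=5).split(X):
        fold_model = clone(my_pipeline)  # fresh, unfitted copy for each fold
        fold_model.fit(X.iloc[train_idx], y.iloc[train_idx])
        preds = fold_model.predict(X.iloc[valid_idx])
        maes.append(mean_absolute_error(y.iloc[valid_idx], preds))
    # Average MAE across the 5 folds, as printed by lesson_5
    return sum(maes) / len(maes)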
def ex_1(): print_("Exercise 1: Hello, Seaborn", 0, 1) # --------------------- # Step 2: Load the data # --------------------- fifa_data = pd.read_csv(fifa_filepath, index_col="Date", parse_dates=True) # --------------------- # Step 3: Plot the data # --------------------- # Set the width and height of the figure plt.figure(figsize=(16, 6)) # Line chart showing how FIFA rankings evolved over time sns.lineplot(data=fifa_data) plt.show()
def load_data_from_ex_7(): # Read the data data = pd.read_csv(credit_card_file_path, true_values=['yes'], false_values=['no']) # Select target y = data.card # Select predictors X = data.drop(['card'], axis=1) print("Number of rows in the dataset:", X.shape[0]) print() print_("First 5 rows from X", 0) print_(X.head()) return X, y
def ex_6(): print_("Exercise 6: Choosing Plot Types and Custom Styles", 0, 1) spotify_data = pd.read_csv(spotify_filepath, index_col="Date", parse_dates=True) # ---------------------- # Try out seaborn styles # ---------------------- # Change the style of the figure sns.set_style("white") # Line chart plt.figure(figsize=(12, 6)) sns.lineplot(data=spotify_data) plt.show()
def lesson_6(): print_("Lesson 6: Choosing Plot Types and Custom Styles", 0, 1) spotify_data = pd.read_csv(spotify_filepath, index_col="Date", parse_dates=True) # Line chart plt.figure(figsize=(12, 6)) sns.lineplot(data=spotify_data) plt.show() # Seaborn has five different themes: (1)"darkgrid", (2)"whitegrid", # (3)"dark", (4)"white", and (5)"ticks" # Change the style of the figure to the "dark" theme sns.set_style("dark") # Line chart plt.figure(figsize=(12, 6)) sns.lineplot(data=spotify_data) plt.show()
def lesson_4(): print_("LESSON 4: Pipelines", 0, 1) # ------- # Example # ------- X_train, X_valid, y_train, y_valid, numerical_cols, categorical_cols = load_data_for_lesson_4( ) print_( "First 5 rows from the train data", 0, ) print_(X_train.head()) # Build pipeline in 3 steps # ---------------------------------- # Step 1: Define Preprocessing Steps # ---------------------------------- # The code below: # # - imputes missing values in numerical data, and # - imputes missing values and applies a one-hot encoding to categorical data. # Preprocessing for numerical data numerical_transformer = SimpleImputer(strategy='constant') # Preprocessing for categorical data categorical_transformer = Pipeline( steps=[('imputer', SimpleImputer(strategy='most_frequent') ), ('onehot', OneHotEncoder(handle_unknown='ignore'))]) # Bundle preprocessing for numerical and categorical data preprocessor = ColumnTransformer(transformers=[( 'num', numerical_transformer, numerical_cols), ('cat', categorical_transformer, categorical_cols)]) # ------------------------ # Step 2: Define the Model # ------------------------ model = RandomForestRegressor(n_estimators=100, random_state=0) # ---------------------------------------- # Step 3: Create and Evaluate the Pipeline # ---------------------------------------- # Define a pipeline that bundles the preprocessing and modeling steps # Bundle preprocessing and modeling code in a pipeline my_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)]) # Preprocessing of training data, fit model my_pipeline.fit(X_train, y_train) # Preprocessing of validation data, get predictions preds = my_pipeline.predict(X_valid) # Evaluate the model score = mean_absolute_error(y_valid, preds) print('MAE:', score)
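# ---------------------------------------------------------------------------
# Hedged follow-up sketch (added for illustration): once preprocessing and
# model live in one Pipeline object, the same bundle can be handed to
# cross_val_score or pointed at raw test data with no separate preprocessing
# step. X_test is a hypothetical frame with the same columns as X_train.
# ---------------------------------------------------------------------------
def pipeline_reuse_sketch(my_pipeline, X, y, X_test):
    from sklearn.model_selection import cross_val_score

    # The whole bundle is cross-validated, so preprocessing is re-fit
    # inside every fold (no leakage across folds)
    scores = -1 * cross_val_score(my_pipeline, X, y, cv=5,
                                  scoring='neg_mean_absolute_error')

    # Refit on all the data, then feed raw test columns straight in
    my_pipeline.fit(X, y)
    preds_test = my_pipeline.predict(X_test)
    return scores.mean(), preds_test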
def ex_1():
    print_("Exercise 1: Introduction", 0, 1)

    # Read the data
    X_full = pd.read_csv(train_file_path, index_col='Id')
    X_test_full = pd.read_csv(test_file_path, index_col='Id')

    # Obtain target and predictors
    y = X_full.SalePrice
    features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath',
                'BedroomAbvGr', 'TotRmsAbvGrd']
    X = X_full[features].copy()
    X_test = X_test_full[features].copy()

    # Break off validation set from training data
    X_train, X_valid, y_train, y_valid = train_test_split(X, y,
                                                          train_size=0.8,
                                                          test_size=0.2,
                                                          random_state=0)

    print_("First 5 rows from the train dataset", 0)
    print_(X_train.head())

    # -------------------------------
    # Step 1: Evaluate several models
    # -------------------------------
    # Define five different random forest models
    # NOTE: criterion='mae' was renamed to 'absolute_error' in newer
    # scikit-learn versions
    model_1 = RandomForestRegressor(n_estimators=50, random_state=0)
    model_2 = RandomForestRegressor(n_estimators=100, random_state=0)
    model_3 = RandomForestRegressor(n_estimators=100, criterion='mae', random_state=0)
    model_4 = RandomForestRegressor(n_estimators=200, min_samples_split=20, random_state=0)
    model_5 = RandomForestRegressor(n_estimators=100, max_depth=7, random_state=0)

    models = [model_1, model_2, model_3, model_4, model_5]

    # Function for comparing different models
    def score_model(model, X_t=X_train, X_v=X_valid, y_t=y_train, y_v=y_valid):
        model.fit(X_t, y_t)
        preds = model.predict(X_v)
        return mean_absolute_error(y_v, preds)

    for i, model in enumerate(models, start=1):
        mae = score_model(model)
        print("Model %d MAE: %d" % (i, mae))

    # Fill in the best model
    best_model = model_3

    # ---------------------------------
    # Step 2: Generate test predictions
    # ---------------------------------
    # Create a Random Forest model
    my_model = RandomForestRegressor(n_estimators=100, criterion='mae', random_state=0)

    # Fit the model to the training data
    my_model.fit(X, y)

    # Generate test predictions
    preds_test = my_model.predict(X_test)

    # Save predictions in format used for competition scoring
    output = pd.DataFrame({'Id': X_test.index, 'SalePrice': preds_test})
    output.to_csv('ex1_submission.csv', index=False)
def lesson_3(): print_("Lesson 3: Bar Charts and Heatmaps", 0, 1) # ------------- # Load the data # ------------- flight_data = pd.read_csv(flight_filepath, index_col="Month") # ---------------- # Examine the data # ---------------- print_("The whole data", 0) print_(flight_data) # --------- # Bar chart # --------- # Set the width and height of the figure plt.figure(figsize=(10, 6)) # Add title plt.title("Average Arrival Delay for Spirit Airlines Flights, by Month") # Bar chart showing average arrival delay for Spirit Airlines flights by month sns.barplot(x=flight_data.index, y=flight_data['NK']) # Add label for vertical axis plt.ylabel("Arrival delay (in minutes)") plt.show() # Important: You must select the indexing column with flight_data.index, # and it is not possible to use flight_data['Month'] (which will return an # error). This is because when we loaded the dataset, the "Month" column # was used to index the rows. We always have to use this special notation # to select the indexing column. # ------- # Heatmap # ------- # Set the width and height of the figure plt.figure(figsize=(14, 7)) # Add title plt.title("Average Arrival Delay for Each Airline, by Month") # Heatmap showing average arrival delay for each airline by month # NOTE: annot=True - This ensures that the values for each cell appear on # the chart. (Leaving this out removes the numbers from each of the cells!) sns.heatmap(data=flight_data, annot=True) # Add label for horizontal axis plt.xlabel("Airline") plt.show()
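# ---------------------------------------------------------------------------
# Hedged micro-sketch (added for illustration, not part of the lesson): if
# you would rather address "Month" as an ordinary column instead of using
# the flight_data.index notation discussed above, reset the index first.
# ---------------------------------------------------------------------------
def month_as_column_sketch(flight_data):
    flat = flight_data.reset_index()            # "Month" becomes a column
    sns.barplot(x=flat['Month'], y=flat['NK'])  # now this selection works
    plt.show()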
def titanic(): # Load Titanic train dataset train_data = pd.read_csv(titanic_train_file_path) print_("First 5 rows from Titanic train dataset", 0) print_(train_data.head()) # Load test set test_data = pd.read_csv(titanic_test_file_path) print_("First 5 rows from Titanic test dataset", 0) print_(test_data.head()) # Part 3: Improve your score # Explore a pattern: assume that all female passengers survived (and all # male passengers died) women = train_data.loc[train_data.Sex == 'female']["Survived"] rate_women = sum(women) / len(women) # Based on the train set print("% of women who survived:", rate_women) men = train_data.loc[train_data.Sex == 'male']["Survived"] rate_men = sum(men) / len(men) # Based on the train set print("% of men who survived:", rate_men) # Your first machine learning model: a random forest model y = train_data["Survived"] features = ["Pclass", "Sex", "SibSp", "Parch"] X = pd.get_dummies(train_data[features]) X_test = pd.get_dummies(test_data[features]) model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1) model.fit(X, y) predictions = model.predict(X_test) output = pd.DataFrame({ 'PassengerId': test_data.PassengerId, 'Survived': predictions }) output.to_csv('my_submission.csv', index=False) print("Your submission was successfully saved!")
def lesson_4(): # Load data melbourne_data = pd.read_csv(melbourne_file_path) # Filter rows with missing price values filtered_melbourne_data = melbourne_data.dropna(axis=0) # Choose target and features y = filtered_melbourne_data.Price melbourne_features = [ 'Rooms', 'Bathroom', 'Landsize', 'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude' ] X = filtered_melbourne_data[melbourne_features] # Define model melbourne_model = DecisionTreeRegressor() # Fit model melbourne_model.fit(X, y) # Calculate the mean absolute error predicted_home_prices = melbourne_model.predict(X) mae = mean_absolute_error(y, predicted_home_prices) print_("Mean absolute error when using just train set", 0) print_(mae) # Split data into training and validation data, for both features and target # The split is based on a random number generator. Supplying a numeric value to # the random_state argument guarantees we get the same split every time we # run this script. train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0) # Define model melbourne_model = DecisionTreeRegressor() # Fit model melbourne_model.fit(train_X, train_y) # Get predicted prices on validation data val_predictions = melbourne_model.predict(val_X) print_("Mean absolute error when using train and validation sets", 0) print_(mean_absolute_error(val_y, val_predictions))
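# ---------------------------------------------------------------------------
# Hedged sketch (added for illustration): the point of holding out
# val_X / val_y above is to compare candidate models on data they never saw
# during fitting. A small helper like this one, which reuses the module's
# DecisionTreeRegressor and mean_absolute_error imports, makes such
# comparisons one-liners; the name and signature are hypothetical.
# ---------------------------------------------------------------------------
def get_mae_sketch(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds = model.predict(val_X)
    return mean_absolute_error(val_y, preds)
# Usage: compare e.g. get_mae_sketch(5, ...) against get_mae_sketch(500, ...)
# and keep the tree size with the lowest validation MAE.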
def ex_5(): print_("Exercise 5: Cross-validation", 0, 1) X, y, X_test = load_data_for_ex_5() print_("First 5 rows from X", 0) print_(X.head()) my_pipeline = Pipeline(steps=[ ('preprocessor', SimpleImputer()), ('model', RandomForestRegressor(n_estimators=50, random_state=0)) ]) # Multiply by -1 since sklearn calculates *negative* MAE scores = -1 * cross_val_score(my_pipeline, X, y, cv=5, scoring='neg_mean_absolute_error') print("Average MAE score:", scores.mean()) # ------------------------------- # Step 1: Write a useful function # ------------------------------- def get_score(n_estimators): """Return the average MAE over 3 CV folds of random forest model. Keyword argument: n_estimators -- the number of trees in the forest """ my_pipeline_ = Pipeline(steps=[ ('preprocessor', SimpleImputer()), ('model', RandomForestRegressor(n_estimators=n_estimators, random_state=0)) ]) # Multiply by -1 since sklearn calculates *negative* MAE scores = -1 * cross_val_score(my_pipeline_, X, y, cv=3, scoring='neg_mean_absolute_error') # print("\nAverage MAE score (across experiments):") # print(scores.mean(), end="\n\n") return scores.mean() # --------------------------------------- # Step 2: Test different parameter values # --------------------------------------- results = dict([(i, get_score(i)) for i in range(50, 300, 50)]) plt.plot(list(results.keys()), list(results.values())) plt.show()
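# Hedged follow-up to ex_5's Step 2 (added for illustration): the best
# parameter can be read straight off the results dict instead of from the
# plot, since results maps n_estimators to average MAE:
#
#     n_estimators_best = min(results, key=results.get)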
def lesson_4():
    print_("Lesson 4: Scatter Plots", 0, 1)

    # -------------------------
    # Load and examine the data
    # -------------------------
    insurance_data = pd.read_csv(insurance_filepath)
    print_("First 5 rows", 0)
    print_(insurance_data.head())

    # -------------
    # Scatter plots
    # -------------
    sns.scatterplot(x=insurance_data['bmi'], y=insurance_data['charges'])
    plt.show()

    # Draw a line that best fits the data
    sns.regplot(x=insurance_data['bmi'], y=insurance_data['charges'])
    plt.show()

    # -------------------------
    # Color-coded scatter plots
    # -------------------------
    # Use color-coded scatter plots to display the relationships between 3
    # variables
    sns.scatterplot(x=insurance_data['bmi'],
                    y=insurance_data['charges'],
                    hue=insurance_data['smoker'])
    plt.show()

    # sns.lmplot: adds two regression lines
    sns.lmplot(x="bmi", y="charges", hue="smoker", data=insurance_data)
    plt.show()

    # ------------------------
    # Categorical scatter plot
    # ------------------------
    # We use this sort of scatter plot to highlight the relationship between
    # a continuous variable and a categorical variable
    sns.swarmplot(x=insurance_data['smoker'], y=insurance_data['charges'])
    plt.show()
def lesson_1(): print_("Lesson 1: Hello, Seaborn", 0, 1) # ------------- # Load the data # ------------- fifa_data = pd.read_csv(fifa_filepath, index_col="Date", parse_dates=True) # ---------------- # Examine the data # ---------------- print_("The first 5 rows of the data", 0) print_(fifa_data.head()) # ------------- # Plot the data # ------------- # Set the width and height of the figure plt.figure(figsize=(16, 6)) # Line chart showing how FIFA rankings evolved over time sns.lineplot(data=fifa_data) plt.show()
def lessons_1_to_3(): # Load Melbourne Housing Snapshot dataset melbourne_data = pd.read_csv(melbourne_file_path) # Print a summary of the data in Melbourne data print_("Summary of dataset", 0) print_(melbourne_data.describe()) # List of all columns in the dataset print_("Columns", 0) print_(melbourne_data.columns) # Drop missing values melbourne_data = melbourne_data.dropna(axis=0) # Select the prediction target (price) y = melbourne_data.Price print_("y", 0) print_(y) # Select features melbourne_features = [ 'Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude' ] X = melbourne_data[melbourne_features] print_("X", 0) print_(X) print_("Summary of X", 0) print_(X.describe()) print_("First few rows of X", 0) print_(X.head()) # Define model. Specify a number for random_state to ensure same results each run melbourne_model = DecisionTreeRegressor(random_state=1) # Fit model melbourne_model.fit(X, y) print_("Making predictions for the following 5 houses:", 0) print_(X.head()) print_("The predictions are", 0) print_(melbourne_model.predict(X.head()))
def lesson_2():
    print_("Lesson 2: Categorical Encodings", 0, 1)
    ks = pd.read_csv(ks_projects_file_path,
                     parse_dates=['deadline', 'launched'])

    # Drop live projects
    ks = ks.query('state != "live"')

    # Add outcome column, "successful" == 1, others are 0
    ks = ks.assign(outcome=(ks['state'] == 'successful').astype(int))

    # Timestamp features
    ks = ks.assign(hour=ks.launched.dt.hour,
                   day=ks.launched.dt.day,
                   month=ks.launched.dt.month,
                   year=ks.launched.dt.year)

    # Label encoding
    cat_features = ['category', 'currency', 'country']
    encoder = LabelEncoder()
    encoded = ks[cat_features].apply(encoder.fit_transform)

    data_cols = ['goal', 'hour', 'day', 'month', 'year', 'outcome']
    data = ks[data_cols].join(encoded)

    # Defining functions that will help us test our encodings
    def get_data_splits(dataframe, valid_fraction=0.1):
        valid_size = int(len(dataframe) * valid_fraction)
        train = dataframe[:-valid_size * 2]
        # valid size == test size, last two sections of the data
        valid = dataframe[-valid_size * 2:-valid_size]
        test = dataframe[-valid_size:]
        return train, valid, test

    def train_model(train, valid):
        feature_cols = train.columns.drop('outcome')
        dtrain = lgb.Dataset(train[feature_cols], label=train['outcome'])
        dvalid = lgb.Dataset(valid[feature_cols], label=valid['outcome'])
        param = {
            'num_leaves': 64,
            'objective': 'binary',
            'metric': 'auc',
            'seed': 7,
            'verbose': -1
        }
        bst = lgb.train(param,
                        dtrain,
                        num_boost_round=1000,
                        valid_sets=[dvalid],
                        early_stopping_rounds=10,
                        verbose_eval=False)
        valid_pred = bst.predict(valid[feature_cols])
        valid_score = metrics.roc_auc_score(valid['outcome'], valid_pred)
        print(f"Validation AUC score: {valid_score:.4f}")

    # Train a model (on the baseline data)
    train, valid, test = get_data_splits(data)
    print_("Baseline (LightGBM with no categorical encoding)", 0)
    train_model(train, valid)
    print()

    # --------------
    # Count Encoding
    # --------------
    cat_features = ['category', 'currency', 'country']

    # Create the encoder
    count_enc = ce.CountEncoder()

    # Transform the features, rename the columns with the _count suffix, and
    # join to dataframe
    # TODO: calculating the counts on the whole dataset leaks information
    # from the validation and test rows into training; they should be
    # computed on the train split only, as done in Exercise 2 (see the
    # sketch after this lesson)
    count_encoded = count_enc.fit_transform(ks[cat_features])
    data = data.join(count_encoded.add_suffix("_count"))

    # Train a model
    train, valid, test = get_data_splits(data)
    print_("LightGBM with COUNT encoding", 0)
    train_model(train, valid)
    print()

    # ---------------
    # Target Encoding
    # ---------------
    # Create the encoder
    target_enc = ce.TargetEncoder(cols=cat_features)
    target_enc.fit(train[cat_features], train['outcome'])

    # Transform the features, rename the columns with _target suffix, and
    # join to dataframe
    train_TE = train.join(
        target_enc.transform(train[cat_features]).add_suffix('_target'))
    valid_TE = valid.join(
        target_enc.transform(valid[cat_features]).add_suffix('_target'))

    # Train a model
    print_("LightGBM with TARGET encoding", 0)
    train_model(train_TE, valid_TE)
    print()

    # -----------------
    # CatBoost Encoding
    # -----------------
    # Create the encoder
    cb_enc = ce.CatBoostEncoder(cols=cat_features)
    cb_enc.fit(train[cat_features], train['outcome'])

    # Transform the features, rename the columns with _cb suffix, and join
    # to dataframe
    train_CBE = train.join(
        cb_enc.transform(train[cat_features]).add_suffix('_cb'))
    valid_CBE = valid.join(
        cb_enc.transform(valid[cat_features]).add_suffix('_cb'))

    # Train a model
    print_("LightGBM with CatBoost encoding", 0)
    train_model(train_CBE, valid_CBE)
    print()
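# ---------------------------------------------------------------------------
# Hedged sketch addressing the TODO in lesson_2 above (added for
# illustration): fit the CountEncoder on the training split only, then
# apply it to both train and valid, mirroring what Exercise 2 does. The
# names (train, valid, cat_features) match the ones used in lesson_2.
# ---------------------------------------------------------------------------
def count_encoding_train_only_sketch(train, valid, cat_features):
    count_enc = ce.CountEncoder(cols=cat_features)
    count_enc.fit(train[cat_features])  # counts computed on train rows only
    train_CE = train.join(
        count_enc.transform(train[cat_features]).add_suffix('_count'))
    valid_CE = valid.join(
        count_enc.transform(valid[cat_features]).add_suffix('_count'))
    return train_CE, valid_CE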
def lesson_4():
    print_("Lesson 4: Feature Selection", 0, 1)
    ks = pd.read_csv(ks_projects_file_path,
                     parse_dates=['deadline', 'launched'])

    # Drop live projects
    ks = ks.query('state != "live"')

    # Add outcome column, "successful" == 1, others are 0
    ks = ks.assign(outcome=(ks['state'] == 'successful').astype(int))

    # Timestamp features
    ks = ks.assign(hour=ks.launched.dt.hour,
                   day=ks.launched.dt.day,
                   month=ks.launched.dt.month,
                   year=ks.launched.dt.year)

    # Label encoding
    cat_features = ['category', 'currency', 'country']
    encoder = LabelEncoder()
    encoded = ks[cat_features].apply(encoder.fit_transform)

    data_cols = ['goal', 'hour', 'day', 'month', 'year', 'outcome']
    baseline_data = ks[data_cols].join(encoded)

    cat_features = ['category', 'currency', 'country']
    interactions = pd.DataFrame(index=ks.index)
    for col1, col2 in itertools.combinations(cat_features, 2):
        new_col_name = '_'.join([col1, col2])
        # Convert to strings and combine
        new_values = ks[col1].map(str) + "_" + ks[col2].map(str)
        label_enc = LabelEncoder()
        interactions[new_col_name] = label_enc.fit_transform(new_values)
    baseline_data = baseline_data.join(interactions)

    launched = pd.Series(ks.index, index=ks.launched,
                         name="count_7_days").sort_index()
    count_7_days = launched.rolling('7d').count() - 1
    count_7_days.index = launched.values
    count_7_days = count_7_days.reindex(ks.index)
    baseline_data = baseline_data.join(count_7_days)

    def time_since_last_project(series):
        # Return the time in hours
        return series.diff().dt.total_seconds() / 3600.

    df = ks[['category', 'launched']].sort_values('launched')
    timedeltas = df.groupby('category').transform(time_since_last_project)
    timedeltas = timedeltas.fillna(timedeltas.max())
    baseline_data = baseline_data.join(
        timedeltas.rename({'launched': 'time_since_last_project'}, axis=1))

    def get_data_splits(dataframe, valid_fraction=0.1):
        valid_size = int(len(dataframe) * valid_fraction)
        train = dataframe[:-valid_size * 2]
        # valid size == test size, last two sections of the data
        valid = dataframe[-valid_size * 2:-valid_size]
        test = dataframe[-valid_size:]
        return train, valid, test

    def train_model(train, valid):
        feature_cols = train.columns.drop('outcome')
        dtrain = lgb.Dataset(train[feature_cols], label=train['outcome'])
        dvalid = lgb.Dataset(valid[feature_cols], label=valid['outcome'])
        param = {
            'num_leaves': 64,
            'objective': 'binary',
            'metric': 'auc',
            'seed': 7
        }
        print("Training model!")
        bst = lgb.train(param,
                        dtrain,
                        num_boost_round=1000,
                        valid_sets=[dvalid],
                        early_stopping_rounds=10,
                        verbose_eval=False)
        valid_pred = bst.predict(valid[feature_cols])
        valid_score = metrics.roc_auc_score(valid['outcome'], valid_pred)
        print(f"Validation AUC score: {valid_score:.4f}")
        return bst

    # ----------------------------
    # Univariate Feature Selection
    # ----------------------------
    feature_cols = baseline_data.columns.drop('outcome')

    # Keep 5 features
    selector = SelectKBest(f_classif, k=5)

    # NOTE: we should select features using only a training set, not the
    # whole dataset as we do here (this is fixed just below)
    X_new = selector.fit_transform(baseline_data[feature_cols],
                                   baseline_data['outcome'])
    print_("X_new (after selecting 5 best features)", 0)
    print_(X_new)

    # Fix: select features using only a training set
    feature_cols = baseline_data.columns.drop('outcome')
    train, valid, _ = get_data_splits(baseline_data)

    # Keep 5 features
    selector = SelectKBest(f_classif, k=5)
    X_new = selector.fit_transform(train[feature_cols], train['outcome'])
    print_("X_new FIXED [Using Train Only]", 0)
    print_(X_new)
    # Get back the features we've kept, zero out all other features
    selected_features = pd.DataFrame(selector.inverse_transform(X_new),
                                     index=train.index,
                                     columns=feature_cols)
    print_(
        "First 5 rows from the train set including the 5 best features only "
        "(others set at 0)", 0)
    print_(selected_features.head())

    # Dropped columns have values of all 0s, so var is 0, drop them
    selected_columns = selected_features.columns[selected_features.var() != 0]

    # Get the valid dataset with the selected features
    print_("Valid dataset with the selected features only", 0)
    print_(valid[selected_columns].head())

    # -----------------
    # L1 regularization
    # -----------------
    train, valid, _ = get_data_splits(baseline_data)
    X, y = train[train.columns.drop("outcome")], train['outcome']

    # Set the regularization parameter C=1
    logistic = LogisticRegression(C=1,
                                  penalty="l1",
                                  solver='liblinear',
                                  random_state=7).fit(X, y)
    model = SelectFromModel(logistic, prefit=True)

    X_new = model.transform(X)
    print_("X_new with L1 regularization", 0)
    print_(X_new)

    # Get back the kept features as a DataFrame with dropped columns as all 0s
    selected_features = pd.DataFrame(model.inverse_transform(X_new),
                                     index=X.index,
                                     columns=X.columns)

    # Dropped columns have values of all 0s, keep other columns
    selected_columns = selected_features.columns[selected_features.var() != 0]
    print_("Rejected columns: {}".format(
        selected_features.columns.difference(selected_columns).to_list()))

    # Get the valid dataset with the selected features
    print_("Valid dataset with the selected features using L1 regularization",
           0)
    print_(valid[selected_columns].head())
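# ---------------------------------------------------------------------------
# Hedged sketch (added for illustration): in L1 regularization, C is the
# inverse of the regularization strength, so smaller C zeroes out more
# coefficients and keeps fewer features. A quick scan, with the same X / y
# as the L1 section above, might look like this:
# ---------------------------------------------------------------------------
def l1_strength_sketch(X, y):
    for C in [0.01, 0.1, 1]:
        logistic = LogisticRegression(C=C,
                                      penalty="l1",
                                      solver='liblinear',
                                      random_state=7).fit(X, y)
        n_kept = (logistic.coef_ != 0).sum()  # non-zero coefficients survive
        print(f"C={C}: {n_kept} features kept")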
def lesson_1():
    print_("Lesson 1: Baseline Model", 0, 1)
    ks = pd.read_csv(ks_projects_file_path,
                     parse_dates=['deadline', 'launched'])
    print_("First 6 rows from the Kickstarter Projects dataset", 0)
    print_(ks.head(6))

    print('Unique values in `state` column:', list(ks.state.unique()))

    # Prepare the target column
    # Drop live projects
    ks = ks.query('state != "live"')

    # Add outcome column, "successful" == 1, others are 0
    ks = ks.assign(outcome=(ks['state'] == 'successful').astype(int))

    # Convert timestamps
    ks = ks.assign(hour=ks.launched.dt.hour,
                   day=ks.launched.dt.day,
                   month=ks.launched.dt.month,
                   year=ks.launched.dt.year)

    # Prep categorical variables
    cat_features = ['category', 'currency', 'country']
    encoder = LabelEncoder()

    # Apply the label encoder to each column
    encoded = ks[cat_features].apply(encoder.fit_transform)

    # Collect all of these features in a new dataframe that we can use to
    # train a model
    #
    # Since ks and encoded have the same index, we can easily join them
    data = ks[['goal', 'hour', 'day', 'month', 'year', 'outcome']].join(encoded)
    print_("First 5 rows from the baseline data", 0)
    print_(data.head())

    # Create training, validation, and test splits
    # Use 10% of the data as a validation set, 10% for testing, and the other
    # 80% for training.
    valid_fraction = 0.1
    valid_size = int(len(data) * valid_fraction)

    train = data[:-2 * valid_size]
    valid = data[-2 * valid_size:-valid_size]
    test = data[-valid_size:]

    # Train a model
    feature_cols = train.columns.drop('outcome')

    dtrain = lgb.Dataset(train[feature_cols], label=train['outcome'])
    dvalid = lgb.Dataset(valid[feature_cols], label=valid['outcome'])

    param = {'num_leaves': 64, 'objective': 'binary'}
    param['metric'] = 'auc'
    num_round = 1000
    bst = lgb.train(param,
                    dtrain,
                    num_round,
                    valid_sets=[dvalid],
                    early_stopping_rounds=10,
                    verbose_eval=False)

    # Make predictions & evaluate the model
    ypred = bst.predict(test[feature_cols])
    score = metrics.roc_auc_score(test['outcome'], ypred)
    print(f"Test AUC score: {score}")
def lesson_3():
    print_("Lesson 3: Feature Generation", 0, 1)

    # -----
    # Setup
    # -----
    ks = pd.read_csv(ks_projects_file_path,
                     parse_dates=['deadline', 'launched'])

    # Drop live projects
    ks = ks.query('state != "live"')

    # Add outcome column, "successful" == 1, others are 0
    ks = ks.assign(outcome=(ks['state'] == 'successful').astype(int))

    # Timestamp features
    ks = ks.assign(hour=ks.launched.dt.hour,
                   day=ks.launched.dt.day,
                   month=ks.launched.dt.month,
                   year=ks.launched.dt.year)

    # Label encoding
    cat_features = ['category', 'currency', 'country']
    encoder = LabelEncoder()
    encoded = ks[cat_features].apply(encoder.fit_transform)

    data_cols = ['goal', 'hour', 'day', 'month', 'year', 'outcome']
    baseline_data = ks[data_cols].join(encoded)

    # ------------
    # Interactions
    # ------------
    interactions = ks['category'] + "_" + ks['country']
    print_("Interactions: first 5 rows from category_country", 0)
    print_(interactions.head(5))

    # Label encode the interaction feature and add it to the data
    label_enc = LabelEncoder()
    data_interaction = baseline_data.assign(
        category_country=label_enc.fit_transform(interactions))
    print_("First 5 rows from data with the added interactions", 0)
    print_(data_interaction.head())

    # -----------------------------------
    # Number of projects in the last week
    # -----------------------------------
    # First, create a Series with a timestamp index
    launched = pd.Series(ks.index, index=ks.launched,
                         name="count_7_days").sort_index()
    print_("First 20 rows from series with the timestamp index", 0)
    print_(launched.head(20))

    count_7_days = launched.rolling('7d').count() - 1
    print_("First 20 rows from the rolling window of 7 days", 0)
    print_(count_7_days.head(20))

    # Ignore records with broken launch dates
    plt.plot(count_7_days[7:])
    plt.title("Number of projects launched over periods of 7 days")
    plt.show()

    # Adjust the index so we can join it with the other training data.
    count_7_days.index = launched.values
    count_7_days = count_7_days.reindex(ks.index)
    print_(
        "First 10 rows from the rolling window of 7 days (with index adjusted)",
        0)
    print_(count_7_days.head(10))

    # Now join the new feature with the other data again using .join since
    # we've matched the index.
    print_(
        "First 10 rows from baseline data with the new feature (count_7_days)",
        0)
    print_(baseline_data.join(count_7_days).head(10))

    # ------------------------------------------------
    # Time since the last project in the same category
    # ------------------------------------------------
    def time_since_last_project(series):
        # Return the time in hours
        return series.diff().dt.total_seconds() / 3600.

    df = ks[['category', 'launched']].sort_values('launched')
    timedeltas = df.groupby('category').transform(time_since_last_project)
    print_(
        "First 20 rows from timedeltas (time since the last project in "
        "the same category)", 0)
    print_(timedeltas.head(20))

    # We get NaNs here for projects that are the first in their category.
    # Fix NaNs by using the mean or median. We'll also need to reset the index
    # so we can join it with the other data.

    # Final time since last project
    timedeltas = timedeltas.fillna(timedeltas.median()).reindex(
        baseline_data.index)
    print_("First 20 rows from timedeltas (with NaNs fixed)", 0)
    print_(timedeltas.head(20))

    # -------------------------------
    # Transforming numerical features
    # -------------------------------
    # Some models work better when the features are normally distributed
    # Transform them with the square root or natural logarithm.
# Example: transform the goal feature using the square root and log functions # Square root transformation plt.hist(np.sqrt(ks.goal), range=(0, 400), bins=50) plt.title('Sqrt(Goal)') plt.show() # Log function transformation plt.hist(np.log(ks.goal), range=(0, 25), bins=50) plt.title('Log(Goal)') plt.show()
def lesson_2():
    print_("Lesson 2: Indexing, Selecting & Assigning", 0, 1)
    reviews = pd.read_csv(wine_file_path, index_col=0)

    # ----------------
    # Native accessors
    # ----------------
    print_("Country column from reviews", 0)
    print_(reviews.country)  # also reviews['country']

    print_("First country from the country Series", 0)
    print_(reviews.country[0])

    # ------------------
    # Indexing in pandas
    # ------------------
    # pandas' own accessor operators: loc and iloc
    #
    # NOTE: loc and iloc are row-first, column-second
    # This is the opposite of what we do in native Python, which is
    # column-first, row-second.

    # Index-based selection: iloc
    # NOTE 1: iloc requires numeric indexers,
    # NOTE 2: iloc indexes exclusively

    # Select the first row of data in a DataFrame
    print_("First row of data", 0)
    print_(reviews.iloc[0])

    print_("Get the first column from a DataFrame", 0)
    print_(reviews.iloc[:, 0])

    print_("Get the first 3 rows from the country column", 0)
    print_(reviews.iloc[:3, 0])

    print_("Get the 2nd and 3rd rows from the country column", 0)
    print_(reviews.iloc[1:3, 0])

    print_("Get the first 3 rows from the country column using a list", 0)
    print_(reviews.iloc[[0, 1, 2], 0])

    print_("Get the 5 last elements from the dataset", 0)
    print_(reviews.iloc[-5:])

    # Label-based selection: loc
    # NOTE 1: loc works with string indexers,
    # NOTE 2: loc, meanwhile, indexes inclusively
    print_("Get the first entry in reviews (using loc)", 0)
    print_(reviews.loc[0, 'country'])

    print_("Get columns from the dataset using loc", 0)
    print_(reviews.loc[:, ['taster_name', 'taster_twitter_handle', 'points']])

    # ----------------------
    # Manipulating the index
    # ----------------------
    print_("set_index to the title field", 0)
    print_(reviews.set_index("title"))

    # ---------------------
    # Conditional selection
    # ---------------------
    print_("Check if each wine is Italian or not", 0)
    print_(reviews.country == 'Italy')

    print_("Get Italian wines", 0)
    print_(reviews.loc[reviews.country == 'Italy'])

    # AND: &
    print_("Get Italian wines that are better than average", 0)
    print_(reviews.loc[(reviews.country == 'Italy') & (reviews.points >= 90)])

    # OR : | (pipe)
    print_("Get Italian or better than average wines", 0)
    print_(reviews.loc[(reviews.country == 'Italy') | (reviews.points >= 90)])

    # isin conditional selector
    print_("Get wines from Italy or France", 0)
    print_(reviews.loc[reviews.country.isin(['Italy', 'France'])])

    # isnull and notnull: is (or not) empty (NaN)
    print_("Get wines with a price tag", 0)
    print_(reviews.loc[reviews.price.notnull()])

    # --------------
    # Assigning data
    # --------------
    # Assign a constant value
    # Every row gets 'everyone'
    reviews['critic'] = 'everyone'
    print_("Assign a constant value", 0)
    print_(reviews['critic'])

    # Assign an iterable of values
    reviews['index_backwards'] = range(len(reviews), 0, -1)
    print_("Assign an iterable of values", 0)
    print_(reviews['index_backwards'])
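# ---------------------------------------------------------------------------
# Hedged micro-example (added for illustration) of the inclusivity
# difference noted in lesson_2: reviews has the default 0..n-1 integer
# index, so both accessors apply to the same labels.
# ---------------------------------------------------------------------------
def loc_vs_iloc_sketch(reviews):
    three_rows = reviews.iloc[0:3]  # rows 0, 1, 2 -> iloc excludes the stop
    four_rows = reviews.loc[0:3]    # rows 0, 1, 2, 3 -> loc includes it
    return len(three_rows), len(four_rows)  # (3, 4)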
def lesson_6(): # pd.set_option('max_rows', 5) print_("Lesson 6: Renaming and Combining", 0, 1) reviews = pd.read_csv(wine_file_path, index_col=0) # -------- # Renaming # -------- # rename(): lets you change index names and/or column names # Change column # Change the points column in our dataset to score print_("Change the points column to score", 0) print_(reviews.rename(columns={'points': 'score'})) # Change indexes print_("Rename some elements of the index", 0) print_(reviews.rename(index={0: 'firstEntry', 1: 'secondEntry'})) # IMPORTANT: set_index() is usually more convenient than using rename() # to change indexes # rename_axis(): change the names for the row index and the column index print_("Change the row index to wines and the column index to fields", 0) print_( reviews.rename_axis("wines", axis='rows').rename_axis("fields", axis='columns')) # --------- # Combining # --------- # Three core methods for combining DataFrames and Series (start less complex) # - concat() # - join() # - merge() # # NOTE: what merge() can do, join() can do it more simply # concat(): smush a given list of elements together along an axis # # Smush two datasets # Ref.: https://www.kaggle.com/datasnaek/youtube-new canadian_youtube = pd.read_csv( os.path.expanduser( "~/Data/kaggle_datasets/trending_youtube/CAvideos.csv")) british_youtube = pd.read_csv( os.path.expanduser( "~/Data/kaggle_datasets/trending_youtube/GBvideos.csv")) print_("Concat two datasets", 0) print_(pd.concat([canadian_youtube, british_youtube])) # join(): lets you combine different DataFrame objects which have an index # in common # # Pull down videos that happened to be trending on the same day in both # Canada and the UK print_( "videos that happened to be trending on the same day in both Canada " "and the UK", 0) left = canadian_youtube.set_index(['title', 'trending_date']) right = british_youtube.set_index(['title', 'trending_date']) print_(left.join(right, lsuffix='_CAN', rsuffix='_UK'))
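# ---------------------------------------------------------------------------
# Hedged micro-example (added for illustration) of merge(), for comparison
# with the join() call at the end of lesson_6. Note the results are not
# identical: merge() defaults to an inner join, while the join() above
# defaults to a left join that keeps every Canadian row.
# ---------------------------------------------------------------------------
def merge_sketch(canadian_youtube, british_youtube):
    return canadian_youtube.merge(british_youtube,
                                  on=['title', 'trending_date'],
                                  suffixes=('_CAN', '_UK'))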
def lesson_5():
    # pd.set_option('max_rows', 5)
    print_("Lesson 5: Data Types and Missing Values", 0, 1)
    reviews = pd.read_csv(wine_file_path, index_col=0)

    # ------
    # DTypes
    # ------
    # column.dtype
    print_("dtype of the price column", 0)
    print_(reviews.price.dtype)

    # DataFrame.dtypes: dtypes of every column
    print_("dtypes of every column", 0)
    print_(reviews.dtypes)

    # object type: for strings

    # astype(): converts a column of one type into another
    print_("Convert points from int64 to float64", 0)
    print_(reviews.points.astype('float64'))

    # ------------
    # Missing data
    # ------------
    # NaN values are always of the float64 dtype

    # Select NaN entries
    print_("Select NaN entries for country", 0)
    print_(reviews[pd.isnull(reviews.country)])

    # Replace missing values with fillna()
    print_("Replace missing values with Unknown", 0)
    print_(reviews.region_2.fillna("Unknown"))

    # Backfill strategy for filling missing values: fill each missing value
    # with the first non-null value that appears sometime after the given
    # record in the database (see the sketch after this lesson).

    # Replace a non-null value: replace()
    print_("Replace @kerinokeefe with @kerino", 0)
    print_(reviews.taster_twitter_handle.replace("@kerinokeefe", "@kerino"))
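# ---------------------------------------------------------------------------
# Hedged sketch (added for illustration) of the backfill strategy described
# in lesson_5: bfill() is the pandas method that implements it (older pandas
# used fillna(method='bfill') for the same thing).
# ---------------------------------------------------------------------------
def backfill_sketch(reviews):
    # Each NaN takes the next non-null value that appears after it
    return reviews.region_2.bfill()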
def lesson_4():
    pd.set_option("display.max_rows", 5)
    print_("Lesson 4: Grouping and Sorting", 0, 1)
    reviews = pd.read_csv(wine_file_path, index_col=0)

    # ------------------
    # Groupwise analysis
    # ------------------
    print_("Count occurrences of each point using group_by()", 0)
    print_(reviews.groupby('points').points.count())

    # Equivalent to using value_counts()
    print_("Count occurrences of each point using value_counts()", 0)
    print_(reviews.points.value_counts().sort_index())

    # Get the cheapest wine in each point value category
    print_("Cheapest wine in each point value category", 0)
    print_(reviews.groupby('points').price.min())

    # Select the name of the first wine reviewed from each winery
    print_(
        "Select the name of the first wine reviewed from each winery using apply()",
        0)
    print_(reviews.groupby('winery').apply(lambda df: df.title.iloc[0]))

    # You can also group by more than one column
    # Example: pick out the best wine by country and province:
    print_("Pick out the best wine by country and province", 0)
    print_(
        reviews.groupby(['country', 'province'
                         ]).apply(lambda df: df.loc[df.points.idxmax()]))

    # agg(): lets you run a bunch of different functions on your DataFrame
    # simultaneously
    # Example: generate a simple statistical summary of the dataset by country
    print_("Statistical summary by country", 0)
    print_(reviews.groupby(['country']).price.agg([len, min, max]))

    # -------------
    # Multi-indexes
    # -------------
    # vs single-level (regular) indices
    # More info about multi-indexes at
    # https://pandas.pydata.org/pandas-docs/stable/advanced.html
    countries_reviewed = reviews.groupby(['country',
                                          'province']).description.agg([len])
    print_("Multi-index: country and province", 0)
    print_(countries_reviewed)

    # reset_index(): important multi-index method that converts back to a
    # regular index
    print_("reset_index(): get back to the original single index", 0)
    print_(countries_reviewed.reset_index())

    # -------
    # Sorting
    # -------
    countries_reviewed = countries_reviewed.reset_index()
    print_("Sort by 'len' (ascending)", 0)
    print_(countries_reviewed.sort_values(by='len'))

    print_("Sort by 'len' (descending)", 0)
    print_(countries_reviewed.sort_values(by='len', ascending=False))

    # Sort by index values
    print_("Sort by index values", 0)
    print_(countries_reviewed.sort_index())

    # Sort by more than one column at a time
    print_("Sort by 2 columns: country and len", 0)
    print_(countries_reviewed.sort_values(by=['country', 'len']))
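# Hedged micro-example (added for illustration): before reset_index() is
# applied in lesson_4, rows of the multi-indexed countries_reviewed are
# addressed with tuples, which is the main practical difference from a
# regular index:
#
#     countries_reviewed.loc[('US', 'California')]
#
# reset_index(), as used above, is usually the simpler route.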
def lesson_1():
    print_("Lesson 1: Creating, Reading and Writing", 0, 1)

    # -------------
    # Creating data
    # -------------
    # DataFrame
    dt_int = pd.DataFrame({'Yes': [50, 21], 'No': [131, 2]})
    print_("Simple DataFrame with integers", 0)
    print_(dt_int)

    dt_str = pd.DataFrame({
        'Bob': ['I liked it.', 'It was awful.'],
        'Sue': ['Pretty good.', 'Bland.']
    })
    print_("Simple DataFrame with strings", 0)
    print_(dt_str)

    dt_index = pd.DataFrame(
        {
            'Bob': ['I liked it.', 'It was awful.'],
            'Sue': ['Pretty good.', 'Bland.']
        },
        index=['Product A', 'Product B'])
    print_("DataFrame with row labels", 0)
    print_(dt_index)

    # Series
    s_list = pd.Series([1, 2, 3, 4, 5])
    print_("Simple series with integers", 0)
    print_(s_list)

    # NOTE: a Series does not have a column name, it only has one overall name
    s_index_name = pd.Series([30, 35, 40],
                             index=['2015 Sales', '2016 Sales', '2017 Sales'],
                             name='Product A')
    print_("Series with row labels and a name", 0)
    print_(s_index_name)

    # ---------------
    # Read data files
    # ---------------
    wine_reviews = pd.read_csv(wine_file_path)
    print_("How large the Wine Reviews dataset is", 0)
    print("Shape: ", wine_reviews.shape)
    print("Number of entries: ", wine_reviews.shape[0] * wine_reviews.shape[1])
    print()
    print_("First 5 rows from the Wine Reviews dataset", 0)
    print_(wine_reviews.head())

    # Make pandas use the CSV's built-in index for the index (instead of
    # creating a new one from scratch) by specifying an index_col
    wine_reviews = pd.read_csv(wine_file_path, index_col=0)
    print_("First 5 rows from the Wine Reviews dataset [using index_col=0]", 0)
    print_(wine_reviews.head())
def lesson_3():
    print_("Lesson 3: Summary Functions and Maps", 0, 1)
    reviews = pd.read_csv(wine_file_path, index_col=0)
    print_("Reviews", 0)
    print_(reviews)

    # -----------------
    # Summary functions
    # -----------------
    # Describe with numerical data
    print_("Describe reviews.points (numerical data only)", 0)
    print_(reviews.points.describe())

    # Describe with string data
    print_("Describe reviews.taster_name (string data)", 0)
    print_(reviews.taster_name.describe())

    # Statistic: mean
    print_("Mean of reviews.points", 0)
    print_(reviews.points.mean())

    # Unique values
    print_("Unique values from reviews.taster_name", 0)
    print_(reviews.taster_name.unique())

    # Unique values and how often they occur in the dataset
    print_("Unique values and their counts from reviews.taster_name", 0)
    print_(reviews.taster_name.value_counts())

    # ----
    # Maps
    # ----
    # Two important mapping methods: map() and apply()
    # NOTE: they don't modify the original data they're called on

    # map()
    # Remean the scores the wines received to 0
    review_points_mean = reviews.points.mean()
    remeans = reviews.points.map(lambda p: p - review_points_mean)
    print_("Remean the wine scores to 0 using map()", 0)
    print_(remeans)

    # apply()
    # NOTE: apply() is way slower than map()
    def remean_points(row):
        row.points = row.points - review_points_mean
        return row

    # NOTE: if axis='index', we transform each column
    # Commented out because too slow
    """
    reviews_remeans = reviews.apply(remean_points, axis='columns')
    print_("Remean the wine scores to 0 using apply()", 0)
    print_(reviews_remeans.points)
    """

    # Faster way of remeaning the points column
    review_points_mean = reviews.points.mean()
    remeans = reviews.points - review_points_mean
    print_("Remean the wine scores to 0 using .mean() [Faster]", 0)
    print_(remeans)

    # Combining columns
    comb_cols = reviews.country + " - " + reviews.region_1
    print_("Combining country and region info", 0)
    print_(comb_cols)
def ex_2():
    print_("Exercise 2: Line Charts", 0, 1)

    # ---------------------
    # Step 1: Load the data
    # ---------------------
    museum_data = pd.read_csv(museum_filepath,
                              index_col="Date",
                              parse_dates=True)

    # -----------------------
    # Step 2: Review the data
    # -----------------------
    # Print the last five rows of the data
    print_("Last 5 rows", 0)
    print_(museum_data.tail())

    # How many visitors did the Chinese American Museum receive in July 2018?
    ca_museum_jul18 = museum_data.loc['2018-07-01', 'Chinese American Museum']
    print_(
        "Number of visitors the Chinese American Museum received in July 2018",
        0)
    print_(ca_museum_jul18)

    # In October 2018, how many more visitors did Avila
    # Adobe receive than the Firehouse Museum?
    subset = museum_data.loc['2018-10-01', ['Avila Adobe', 'Firehouse Museum']]
    avila_oct18 = subset['Avila Adobe'] - subset['Firehouse Museum']
    print_(
        "Number of visitors Avila Adobe received more than the Firehouse Museum (October 2018)",
        0)
    print_(avila_oct18)

    # ---------------------------------
    # Step 3: Convince the museum board
    # ---------------------------------
    # Set the width and height of the figure
    plt.figure(figsize=(14, 6))

    # Add title
    plt.title("Monthly visitors for 4 museums in LA")

    # Line chart showing number of visitors to each museum over time
    sns.lineplot(data=museum_data)
    plt.show()

    # --------------------------
    # Step 4: Assess seasonality
    # --------------------------
    # Part A
    # Line plot showing the number of visitors to Avila Adobe over time

    # Set the width and height of the figure
    plt.figure(figsize=(14, 6))

    # Add title
    plt.title("Monthly visitors to Avila Adobe museum")

    # Line chart showing number of visitors to Avila Adobe over time
    sns.lineplot(data=museum_data['Avila Adobe'], label="Avila Adobe")

    # Add label for horizontal axis
    plt.xlabel("Date")
    plt.show()