Code example #1
def remove_columns(df):
    """Removes columns not needed for prediction.
    
    Args:
        df (pandas dataframe obj): Pandas dataframe 

    Returns:
        pandas dataframe obj: Pandas dataframe with only shipper, std_weight, zone, 
            sender_state, recipient_state, distance, sender_pop, sender_pop_density, sender_houses,
            recipient_pop, recipient_pop_density, recipient_houses, same_msa,
            sender_in_msa, rec_in_msa, week_number, day_of_week, month, Y columns
            
    """
    print("Removing unnecessary columns...")
    print(f"Starting with {report_df_stats(df)}\nRemoving columns...")
    start_time = time.time()
    columns_kept = [
        'shipper', 'std_weight', 'zone', 'sender_state', 'recipient_state',
        'distance', 'sender_pop', 'sender_pop_density', 'sender_houses',
        'recipient_pop', 'recipient_pop_density', 'recipient_houses',
        'same_msa', 'sender_in_msa', 'rec_in_msa', 'week_number',
        'day_of_week', 'month', 'Y'
    ]
    df = df[columns_kept]
    # Save any rows that contain nulls for later inspection
    nan_rows = df[df.isnull().any(axis=1)]
    nan_rows.to_csv('null.csv')
    print(f"Ending with {report_df_stats(df)}.")
    utilities.print_elapsed_time(start_time)
    return df
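
A minimal sketch of the null-row extraction above, on a hypothetical toy frame: `isnull().any(axis=1)` builds a boolean mask that marks every row with at least one missing value.

import numpy as np
import pandas as pd

toy = pd.DataFrame({'zone': [2, np.nan, 4], 'distance': [10.0, 5.0, np.nan]})
nan_rows = toy[toy.isnull().any(axis=1)]  # rows with at least one null
print(nan_rows)                           # rows 1 and 2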
Code example #2
def add_features(df):
    """Combines other functions to add features to dataframe.

    Args:
        df (pandas dataframe obj): Pandas dataframe that must contain sender_zip, recipient_zip,
            shipment_date, weight, package_count columns.

    Returns:
        pandas dataframe obj: Pandas dataframe with new columns sender_pop, sender_pop_density, sender_houses,
            recipient_pop, recipient_pop_density, recipient_houses, week_number, day_of_week, month, std_weight,
            distance, sender_in_msa, rec_in_msa, same_msa.
    """
    print("Adding features...")
    print(f"Starting with {report_df_stats(df)}")
    start_time = time.time()
    # Create std_weight (weight/package count)
    df['std_weight'] = df['weight'] / df['package_count']
    # Add date time features based on shipment date
    print("Adding datetime features based on shipment date...")
    # .dt.week was removed in pandas 2.0; isocalendar().week is the replacement
    df['week_number'] = df['shipment_date'].dt.isocalendar().week
    df['day_of_week'] = df['shipment_date'].dt.dayofweek
    df['month'] = df['shipment_date'].dt.month
    # Add distance between sender and recipient zips
    print("Adding distance...")
    df['distance'] = get_distance(df['sender_zip'].values,
                                  df['recipient_zip'].values)
    # Add sender_in_MSA, rec_in_MSA, same_MSA bools
    df = add_MSA_features(df)
    # Add population, population density, no. housing units
    df = add_zip_details(df)
    print(f"Ending with {report_df_stats(df)}.")
    utilities.print_elapsed_time(start_time)
    return df
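
A self-contained sketch of the datetime feature extraction above, on a hypothetical two-row frame.

import pandas as pd

toy = pd.DataFrame({'shipment_date': pd.to_datetime(['2019-03-04', '2019-07-19'])})
toy['week_number'] = toy['shipment_date'].dt.isocalendar().week
toy['day_of_week'] = toy['shipment_date'].dt.dayofweek  # Monday=0 ... Sunday=6
toy['month'] = toy['shipment_date'].dt.month
print(toy)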
Code example #3
def split_scale_data(X, y):
    """Splits data into training and test sets, and scales features.

    Args:
        X (pandas dataframe obj): Pandas dataframe with dummified categorical features and other features.
        y (pandas dataframe obj): Pandas dataframe with target variable i.e. time window number.

    Returns:
        Dictionary of numpy arrays:
            X_train: the matrix of training data
            y_train: the array of training labels
            X_test: the matrix of testing data
            y_test: the array of testing labels
    """
    # Split data
    print("Splitting and scaling data...")
    print("Splitting data...")
    start_time = time.time()
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=0.8,
                                                        test_size=0.2,
                                                        random_state=71)
    # Scale the variables
    print("Scaling data...")
    scaler = preprocessing.MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    # Save scaler for use in prediction
    print("Saving scaler...")
    scaler_path = os.path.join(paths.model_scaler_dir,
                               "scaler_" + model_id + ".pkl.z")
    joblib.dump(scaler, scaler_path)
    print("Scaler saved in", scaler_path)
    # return training and testing data
    datadict = {
        'X_train': X_train,
        'y_train': y_train,
        'X_test': X_test,
        'y_test': y_test
    }
    # Save data dictionary file
    print("\nSaving data dictionary...")
    datadict_path = os.path.join(paths.data_delivery_prediction_datadict_dir,
                                 "datadict_" + model_id + ".npz")
    np.savez(datadict_path,
             X_train=datadict['X_train'],
             y_train=datadict['y_train'],
             X_test=datadict['X_test'],
             y_test=datadict['y_test'])
    print("Data dictionary saved in", datadict_path)
    # Print sizes
    print("\nTraining and testing dataset sizes")
    print("X_train", X_train.shape, "y_train", y_train.shape)
    print("X_test", X_test.shape, "y_test", y_test.shape)
    utilities.print_elapsed_time(start_time)
    return datadict
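
A minimal sketch of the fit-on-train/transform-on-test pattern plus the joblib round trip, with hypothetical data and file name: the scaler learns its min/max statistics from training data only, so no information about the test set leaks into training, and the saved scaler reproduces the exact same scaling at prediction time.

import joblib
import numpy as np
from sklearn import preprocessing

X_train = np.array([[0.0, 10.0], [5.0, 20.0], [10.0, 30.0]])
X_new = np.array([[2.5, 25.0]])

scaler = preprocessing.MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)  # learn min/max from training data only
joblib.dump(scaler, 'scaler_demo.pkl.z')        # hypothetical path

# At prediction time, reload and reuse the same min/max statistics
scaler_loaded = joblib.load('scaler_demo.pkl.z')
print(scaler_loaded.transform(X_new))           # [[0.25 0.75]]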
Code example #4
def one_hot_encode(df):
    """Categorize and apply one-hot-encoding to all categorical columns.

    Args:
        df (pandas dataframe obj): Pandas dataframe with only columns used for prediction

    Returns:
        X (pandas dataframe obj): Pandas dataframe with dummified categorical features and other features.
        y (pandas dataframe obj): Pandas dataframe with target variable i.e. time window number.
    """
    print("Categorizing columns...")
    print(f"Starting with {report_df_stats(df)}")
    start_time = time.time()
    cat_cols = [
        'shipper', 'zone', 'sender_state', 'recipient_state', 'week_number',
        'day_of_week', 'month'
    ]

    float_cols = [
        'std_weight', 'distance', 'sender_pop', 'sender_pop_density',
        'sender_houses', 'recipient_pop', 'recipient_pop_density',
        'recipient_houses'
    ]

    df[cat_cols] = df[cat_cols].astype('category')
    df[float_cols] = df[float_cols].astype('float64')

    print("Number of nulls in each column:")
    print(df.isnull().sum())
    print("\nReplacing nulls with mean/mode...")
    df[float_cols] = df[float_cols].fillna(df[float_cols].mean())
    # mode() returns a DataFrame (ties are possible), so take its first row;
    # fillna with the raw mode() DataFrame would align on row index and fill almost nothing
    df[cat_cols] = df[cat_cols].fillna(df[cat_cols].mode().iloc[0])
    print("Number of nulls in each column after replacement:")
    print(df.isnull().sum(), "\n")

    print("Applying one hot encoding on categorical columns...")
    y = df.Y
    df = df.drop(columns=['Y'])
    # Record feature names for current model
    feature_names = {'feature_names': df.columns.values}
    X = pd.get_dummies(df)
    feature_names['feature_names_dummified'] = X.columns.values
    print("\nFeature names:", feature_names['feature_names_dummified'])
    # Save feature names for use in prediction
    features_path = os.path.join(paths.data_delivery_prediction_features_dir,
                                 "feature_names_" + model_id + ".npz")
    np.savez(features_path,
             feature_names=feature_names['feature_names'],
             feature_names_dummified=feature_names['feature_names_dummified'])
    print(f"Feature names stored in {features_path}\n")
    print(f"Ending with {report_df_stats(X)}.")
    utilities.print_elapsed_time(start_time)
    return X, y
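
A small sketch of the `mode()` pitfall that the null-replacement line above guards against, using a hypothetical single-column frame.

import numpy as np
import pandas as pd

toy = pd.DataFrame({'zone': [2, 2, np.nan, 8]})
# mode() returns a DataFrame, so fillna(toy.mode()) aligns on the
# row index and leaves the null in row 2 untouched:
print(toy.fillna(toy.mode()))
# Taking the first row of mode() yields a Series keyed by column name,
# which fills every null in that column:
print(toy.fillna(toy.mode().iloc[0]))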
Code example #5
def add_time_windows(df):
    """Adds time windows as target variable, based on delivery date and time.

    Calculates the number of business days (excluding weekends, the shipment date, and federal holidays) for delivery.
    Adds delivery time to the number of business days to get time-in-transit.
    Creates window thresholds based on discussion with Jose.
    Lastly, assigns time windows to shipments based on time-in-transit.

    Args:
        df (pandas dataframe obj): Pandas dataframe that must contain shipment_date, delivery_date, delivery_time.

    Returns:
        pandas dataframe obj: Pandas dataframe with new columns days_in_transit, days_taken_float (time-in-transit),
            Y (target variable i.e. time window)

    """
    print("Adding time windows i.e. target variable...")
    print(f"Starting with {report_df_stats(df)}")
    start_time = time.time()
    # Calculate days in transit (exclude shipment date, holidays, weekends)
    start_date = df['shipment_date'].min()
    end_date = df['shipment_date'].max()
    calendar = USFederalHolidayCalendar()
    holidays = calendar.holidays(start_date, end_date).date.tolist()
    shipment_dates = [d.date() for d in df['shipment_date']]
    delivery_dates = [d.date() for d in df['delivery_date']]
    # -1 because we will add transit time
    df['days_in_transit'] = np.busday_count(
        shipment_dates, delivery_dates, holidays=holidays) - 1
    # Convert days in transit/delivery time to days taken (with decimals)
    # e.g. if a parcel arrives at 12:00pm on the 2nd business day, days taken is 1.5
    delivery_percentage_of_day = [
        (timedelta.total_seconds(d) / timedelta(days=1).total_seconds())
        for d in df['delivery_time']
    ]
    df['days_taken_float'] = df['days_in_transit'] + delivery_percentage_of_day
    # Keep rows with -1 to 4 days in transit (np.arange excludes the upper bound);
    # longer transit times are rare occurrences.
    max_days_to_keep = 5
    df = df[df['days_in_transit'].isin(np.arange(-1, max_days_to_keep))]

    # Assign time windows
    time_window_thresholds = create_time_window_thresholds()
    tqdm.pandas(desc="Assign time window")
    df['Y'] = df.progress_apply(
        lambda x: assign_time_window(x['days_taken_float'], time_window_thresholds),
        axis=1)

    print(f"Ending with {report_df_stats(df)}.")
    utilities.print_elapsed_time(start_time)
    return df
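
A worked sketch of the `np.busday_count` arithmetic above, with a hypothetical holiday date: the begin date is counted and the end date is not, which is why the code subtracts 1 to exclude the shipment date itself.

import numpy as np

# Shipped Friday 2019-07-05, delivered Tuesday 2019-07-09,
# with Monday 2019-07-08 treated as a holiday (hypothetical date)
holidays = [np.datetime64('2019-07-08')]
count = np.busday_count('2019-07-05', '2019-07-09', holidays=holidays)
print(count)      # 1 -> only the Friday ship date falls in the counted range
print(count - 1)  # 0 business days in transit once the ship date is excluded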
Code example #6
def format_cost_savings(pred, pred_proba, rates_df):
    """Formats dataframe for predict one output

    Args:
        pred (list): List of predicted ground delivery time window number based on
            mode of predicted probability distribution.
        pred_proba (2D array): Predicted probability distribution across time windows.
            Shape = (no. of test shipments, no. time windows).
        rates_df (pandas dataframe obj): Dataframe with service type, ship cost, ground cost, cost savings,
            scheduled window numbers, scheduled time windows as columns

    Returns:
        Pandas dataframe object: Dataframe with service type, ship cost, ground cost, cost savings,
            scheduled time windows, predicted time windows, cumulative probability as columns
    """
    start_time = time.time()
    print("Calculating cost savings and probabilities...")
    # Load dict that maps time window numbers to time windows
    windows_cmu = joblib.load(paths.windows_cmu)
    # Insert predicted time window into dataframe
    # windows_cmu maps time window number to corresponding time window
    try:
        rates_df['pred_ground_window'] = windows_cmu[pred]
    except TypeError:
        print("Shipper did not return any results")
    # Insert predicted probability distribution array object into each cell in column
    rates_df['pred_ground_window_pdf'] = 0
    rates_df['pred_ground_window_pdf'] = rates_df[
        'pred_ground_window_pdf'].astype(object)
    for i in range(len(rates_df)):
        rates_df.at[i, 'pred_ground_window_pdf'] = pred_proba
    # For each probability distribution array object, slice the array with scheduled window number
    # Sum of the sliced array is the predicted cumulative probability of shipment arriving before or in scheduled window
    rates_df['pred_probability'] = rates_df.apply(
        lambda x: np.sum(x['pred_ground_window_pdf'][:x['scheduled_window_no'] + 1]),
        axis=1)
    # Converts fraction into percentage for display
    rates_df['pred_probability'] = pd.Series(
        ["{0:.2f}%".format(val * 100) for val in rates_df['pred_probability']],
        index=rates_df.index)
    # Drop columns that are not needed for display
    rates_df = rates_df.drop(
        columns=['pred_ground_window_pdf', 'scheduled_window_no'])
    utilities.print_elapsed_time(start_time)
    print(" ")
    return rates_df
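
A one-shipment sketch of the cumulative-probability slice above, with a hypothetical five-window distribution: summing the distribution up to and including the scheduled window gives the probability of arriving on time or early.

import numpy as np

pred_proba = np.array([0.05, 0.20, 0.40, 0.25, 0.10])  # hypothetical distribution over 5 windows
scheduled_window_no = 2
# Probability of arriving in or before the scheduled window
print(np.sum(pred_proba[:scheduled_window_no + 1]))  # 0.65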
Code example #7
def full_query(start_year_month, end_year_month, frac):
    """Extracts a fraction of raw shipping records from the database without batch method.

    Does not query by batch. Faster but requires more RAM.

    Args:
        start_year_month (str): Start date in YYYY-MM format.
        end_year_month (str): End date in YYYY-MM format.
        frac (str): String representation of fraction of data to extract.

    Returns:
        pandas dataframe obj: Dataframe with queried records.
    """

    # Establishes connection to a MySQL db
    print(f"Connecting to {credentials.db}...")
    start_time = time.time()
    db = MySQLdb.connect(credentials.host, credentials.user,
                         credentials.password, credentials.db)
    utilities.print_elapsed_time(start_time)

    # Parse the start and end year-month strings into dates
    print("Extracting records...")
    extraction_start_time = time.time()
    # Append day 1 to year and month to create datetime object. Day does not affect result
    start_date = datetime.strptime(start_year_month + "-1", "%Y-%m-%d").date()
    end = datetime.strptime(end_year_month + "-1", "%Y-%m-%d").date()
    month = end.month
    year = end.year
    last_date_of_month = calendar.monthrange(year, month)[1]
    end_date = f"{year}-{month}-{last_date_of_month}"
    records = pd.DataFrame()
    # Silence pandas chained-assignment warnings during preprocessing
    pd.options.mode.chained_assignment = None
    results = query(db, start_date, end_date, frac)
    print(f"{len(results)} records extracted.")
    utilities.print_elapsed_time(extraction_start_time)
    if len(results) > 0:
        num_batches = 100
        # Guard against a zero chunk size when there are fewer rows than batches
        chunk_size = max(1, results.shape[0] // num_batches)
        print(f"Cleaning {len(results)} records...")
        # DataFrame.append was removed in pandas 2.0; collect chunks and concatenate once
        chunks = []
        for start in trange(0, results.shape[0], chunk_size):
            df_subset = results.iloc[start:start + chunk_size]
            chunks.append(preprocess(df_subset))
        records = pd.concat(chunks, ignore_index=True)
    print(f"Returning {len(records)} records.\n")
    return records
Code example #8
def store(records, frac):
    """Stores results from each query into a compressed pickle file.

    Args:
        records (pandas dataframe obj): Dataframe after general preprocessing.
        frac (str): String representation of fraction of data to extract.

    Returns:
        str: Output file path

    """
    print("Saving records...")
    start_time = time.time()
    filename = create_filename(frac)
    output_path = os.path.join(paths.data_extracted_dir, filename + ".pkl.z")
    joblib.dump(records, output_path)
    print(f"Data extracted and stored in {output_path}")
    utilities.print_elapsed_time(start_time)
    return output_path
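
A round-trip sketch of the compressed pickle pattern used above; joblib infers zlib compression from the `.z` extension. Data and path are hypothetical.

import joblib
import pandas as pd

records = pd.DataFrame({'weight': [1.2, 3.4], 'zone': [2, 5]})
joblib.dump(records, 'records_demo.pkl.z')  # .z suffix -> zlib-compressed pickle
restored = joblib.load('records_demo.pkl.z')
print(restored.equals(records))  # True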
Code example #9
def predict_time_windows(test, model):
    """Predicts time window probability distribution with saved model for test input.

    Args:
        test (array): Input for model prediction.
        model (obj): Loaded model for prediction.

    Returns:
        pred (list): List of predicted ground delivery time window numbers based on
            mode of predicted probability distribution.
        pred_proba (2D array): Predicted probability distribution across time windows.
            Shape = (no. of test shipments, no. time windows).
    """
    start_time = time.time()
    print("Loading trained model and predicting time windows...")
    pred = model.predict(test)
    pred_proba = model.predict_proba(test)
    utilities.print_elapsed_time(start_time)
    return pred, pred_proba
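
A sketch of how `predict` relates to `predict_proba` for a scikit-learn classifier, on hypothetical random data: the predicted label is the class with the highest predicted probability, and `predict_proba` returns one row per sample.

import numpy as np
from sklearn.ensemble import RandomForestClassifier

rng = np.random.default_rng(0)
X = rng.random((100, 4))
y = rng.integers(0, 3, 100)  # three toy time windows
model = RandomForestClassifier(n_estimators=10, random_state=0).fit(X, y)

pred = model.predict(X[:5])
pred_proba = model.predict_proba(X[:5])  # shape (5, 3): one row per sample
print(np.array_equal(pred, model.classes_[np.argmax(pred_proba, axis=1)]))  # True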
Code example #10
def remove_rows(df):
    """Removes unnecessary rows from dataframe.

    Args:
        df (pandas dataframe obj): Pandas dataframe

    Returns:
        pandas dataframe obj: Pandas dataframe with rows removed.
    """
    print("Removing unnecessary rows...")
    print(f"Starting with {report_df_stats(df)}")
    start_time = time.time()
    # Keep Ground and Home Delivery since both are day definite
    print(
        "Removing non-ground service types since we are only predicting for ground deliveries..."
    )
    services_kept = ["Ground", "Home Delivery"]
    df = df[df['std_service_type'].isin(services_kept)]
    # Keep rows with delivery time
    print("Removing rows without delivery time...")
    df = df.dropna(subset=['delivery_time'])
    # Remove rows that appear too often with abnormal delivery times 23:59:59 and 00:12:00
    print("Removing rows with anomalous delivery times 23:59:59, 00:12:00...")
    df = df[~df['delivery_time'].isin(['23:59:59', '00:12:00'])]
    # Trim empty spaces / Replace empty strings with NA / Remove NA
    print("Removing rows with malformed zipcodes...")
    df['recipient_zip'] = df['recipient_zip'].str.strip()
    df['recipient_zip'].replace('', np.nan, inplace=True)
    df['sender_zip'] = df['sender_zip'].str.strip()
    df['sender_zip'].replace('', np.nan, inplace=True)
    df.dropna(subset=['recipient_zip', 'sender_zip'], inplace=True)
    # Remove zipcodes containing letters
    df = df[df['recipient_zip'].apply(lambda x: x.isnumeric())]
    df = df[df['sender_zip'].apply(lambda x: x.isnumeric())]
    # Remove zipcodes if length != 5
    df = df[df['recipient_zip'].apply(lambda x: len(str(x)) == 5)]
    df = df[df['sender_zip'].apply(lambda x: len(str(x)) == 5)]
    print(f"Ending with {report_df_stats(df)}.")
    utilities.print_elapsed_time(start_time)
    return df
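
An alternative, more compact zipcode check (a sketch, assuming pandas >= 1.1 for `str.fullmatch`): strip, then keep only strings of exactly five digits, which covers the empty-string, letter, and length checks above in one pass.

import pandas as pd

zips = pd.Series([' 15213 ', '', '9410A', '152', '60614'])
cleaned = zips.str.strip()
valid = cleaned.str.fullmatch(r'\d{5}')  # five digits, nothing else
print(cleaned[valid])  # only '15213' and '60614' survive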
Code example #11
def format_batch_results(pred, pred_proba, df):
    """Formats dataframe for output.

    Args:
        pred (array): Predicted ground delivery time window numbers, one per shipment, based on
            the mode of the predicted probability distribution.
        pred_proba (np array object): Predicted probability distribution across time windows.
            Shape = (no. of test shipments, no. time windows).
        df (pandas dataframe obj): Pandas dataframe

    Returns:
        Pandas dataframe object: Dataframe with scheduled time windows, predicted time windows, cumulative probability
            as columns.
    """
    print("Formatting batch results...")
    start_time = time.time()
    df_features = df.copy(deep=False)
    # Load dict that maps time window numbers to time windows
    windows_cmu = joblib.load(paths.windows_cmu)
    # Insert predicted time window into dataframe
    # windows_cmu maps time window number to corresponding time window
    df['scheduled_window'] = df.apply(
        lambda x: windows_cmu[x['scheduled_window_no']], axis=1)
    # Insert predicted probability distribution array object into each cell in column
    df['pred_ground_window_pdf'] = 0
    df['pred_ground_window_pdf'] = df['pred_ground_window_pdf'].astype(object)
    for i in range(len(df)):
        df.at[i, 'pred_ground_window'] = windows_cmu[pred[i]]
        df.at[i, 'pred_ground_window_pdf'] = pred_proba[i]
    # For each probability distribution array object, slice the array with scheduled window number
    # Sum of the sliced array is the predicted cumulative probability of shipment arriving before or in scheduled window
    df['prob_arrive_by_scheduled_window'] = df.apply(
        lambda x: np.sum(x['pred_ground_window_pdf'][:x['scheduled_window_no'] + 1]),
        axis=1)
    # Get cumulative probability for all time windows for each shipment
    col_to_format = ['prob_arrive_by_scheduled_window']
    for i in list(windows_cmu.keys()):
        col_name = "prob_arrive_by_window_" + str(i)
        df[col_name] = df.apply(
            lambda x: np.sum(x['pred_ground_window_pdf'][:i + 1]), axis=1)
        col_to_format.append(col_name)
    # Converts fraction into percentage for display
    df[col_to_format] = df[col_to_format].astype('float64')
    df[col_to_format] = df[col_to_format].values * 100
    df[col_to_format] = df[col_to_format].applymap("{0:.2f}%".format)
    # Drop columns that are not needed for prediction display
    df = df.drop(columns=[
        'pred_ground_window_pdf', 'scheduled_window_no', 'week_number',
        'day_of_week', 'month', 'distance', 'sender_in_msa', 'rec_in_msa',
        'same_msa', 'sender_pop', 'sender_pop_density', 'sender_houses',
        'sender_state', 'recipient_pop', 'recipient_pop_density',
        'recipient_houses', 'recipient_state'
    ])
    # Create df just for display
    df_display = df.copy(deep=False)
    col_to_format.pop(0)
    df_display = df_display.drop(columns=col_to_format)
    print(
        tabulate(df_display,
                 headers='keys',
                 showindex=False,
                 floatfmt=".2f",
                 tablefmt='psql'))
    utilities.print_elapsed_time(start_time)
    # Save to xlsx format
    timestamp = utilities.get_timestamp()
    output_path = os.path.join(paths.output_delivery_prediction_dir,
                               timestamp + "_predict_batch.xlsx")
    print("\nSaving results...")
    with pd.ExcelWriter(output_path) as writer:
        df.to_excel(writer, sheet_name='Predicted Window Probability')
        df_features.to_excel(writer, sheet_name='Features')
    print("Results saved to " + output_path + "\n")
    return df
Code example #12
def preprocess_batch(df, feature_names, scaler):
    """Combines all preprocessing functions to preprocess data for prediction.

    Args:
        df (pandas dataframe obj): Pandas dataframe after validation passed
        feature_names (npz object): Dictionary of numpy arrays containing feature names.
        scaler (obj): Scaler corresponding to selected model.

    Returns:
        df (pandas dataframe obj): Preprocessed pandas dataframe with new features.
        X_test (array): Test data to be used for prediction.
    """
    print("Preprocessing batch...")
    start_time = time.time()
    print("Adding datetime features")
    # Get date time features
    df['shipment_date'] = pd.to_datetime(df['shipment_date'])
    # .dt.week was removed in pandas 2.0; isocalendar().week is the replacement
    df['week_number'] = df['shipment_date'].dt.isocalendar().week
    df['day_of_week'] = df['shipment_date'].dt.dayofweek
    df['month'] = df['shipment_date'].dt.month
    print("Adding distance...")
    # Get distance
    df['distance'] = delivery_prediction_preprocess.get_distance(
        df['sender_zip'].values, df['recipient_zip'].values)
    # Get MSA details
    df = delivery_prediction_preprocess.add_MSA_features(df)
    # Get zip details
    df = add_zip_details(df)
    # Add scheduled time window
    print("Adding scheduled time windows")
    df = add_scheduled_windows(df)

    # Decide on columns to keep
    columns_kept = [
        'shipper', 'weight', 'zone', 'sender_state', 'recipient_state',
        'distance', 'sender_pop', 'sender_pop_density', 'sender_houses',
        'recipient_pop', 'recipient_pop_density', 'recipient_houses',
        'same_msa', 'sender_in_msa', 'rec_in_msa', 'week_number',
        'day_of_week', 'month'
    ]

    predict_df = df.copy(deep=False)
    predict_df = predict_df[columns_kept]

    predict_df = predict_df.fillna(0)

    cat_cols = [
        'shipper', 'zone', 'week_number', 'day_of_week', 'sender_state',
        'recipient_state', 'month'
    ]

    float_cols = [
        'weight', 'distance', 'sender_pop', 'sender_pop_density', 'same_msa',
        'sender_in_msa', 'rec_in_msa', 'sender_houses', 'recipient_pop',
        'recipient_pop_density', 'recipient_houses'
    ]

    predict_df[cat_cols] = predict_df[cat_cols].astype('category')
    predict_df[float_cols] = predict_df[float_cols].astype('float64')

    print("One-hot-encoding features...")
    # Dummify dataframe
    predict_df = pd.get_dummies(predict_df)
    # Create empty dataframe in same shape as the one used in model, fill with 0s
    df_full = pd.DataFrame(columns=feature_names['feature_names_dummified'])
    # Execute a right join to align our test dataframe with full dataframe
    predict_df, df_full = predict_df.align(df_full,
                                           join='right',
                                           axis=1,
                                           fill_value=0)
    # Convert dataframe to numpy array for prediction
    X_test = predict_df.values
    print("Scaling data with saved scaler...")
    # Scale data with saved min-max scaler
    X_test = scaler.transform(X_test)
    utilities.print_elapsed_time(start_time)
    return df, X_test
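
A minimal sketch of the `align(join='right', ...)` trick above, with hypothetical dummy columns: training-time columns missing from the batch are filled with 0, and any extra dummies the batch produced that the model never saw are dropped.

import pandas as pd

predict_df = pd.DataFrame({'shipper_UPS': [1], 'zone_2': [1]})  # dummified batch row
trained_cols = ['shipper_FedEx', 'shipper_UPS', 'zone_2', 'zone_5']  # hypothetical training columns
df_full = pd.DataFrame(columns=trained_cols)

predict_df, df_full = predict_df.align(df_full, join='right', axis=1, fill_value=0)
print(predict_df)
#    shipper_FedEx  shipper_UPS  zone_2  zone_5
# 0              0            1       1       0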
Code example #13
def preprocess_one(shipment_date, shipper, std_weight, sender_zip,
                   recipient_zip, scaler, feature_names):
    """Preprocesses input to create features that model will predict on.

    Args:
        shipment_date (str): Shipment date in YYYY-MM-DD format
        shipper (str): Shipper name; only accepts FedEx or UPS.
        std_weight (float): Shipment weight
        sender_zip (str): Sender 5-digit zipcode
        recipient_zip (str): Recipient 5-digit zipcode
        scaler (obj): Scaler based on model
        feature_names (npz object): Dictionary of numpy arrays that contain feature names

    Returns:
        test (np array object): Input for model prediction.
        rates_df (pandas dataframe obj): Dataframe with service type, ship cost, ground cost, cost savings,
            scheduled window numbers, scheduled time windows as columns
    """
    print("Preprocessing input...")
    start_time = time.time()
    # Get datetime features
    week_number, day_of_week, month = get_date_details(shipment_date)
    # Get sender_in_msa and recipient_in_MSA and same_msa booleans
    sender_in_msa, rec_in_msa, same_msa = get_msa_details(
        sender_zip, recipient_zip)
    # Get distance
    distance = round(get_distance(sender_zip, recipient_zip), 5)
    # Get population, density, no. houses, state code for recipient and sender
    search = SearchEngine()
    recipient_pop, recipient_pop_density, recipient_houses, recipient_state = get_zip_details(
        recipient_zip, search)
    sender_pop, sender_pop_density, sender_houses, sender_state = get_zip_details(
        sender_zip, search)
    # Get rates dataframe and zone
    rates_df, zone = get_shippo_details(shipper, std_weight, sender_zip,
                                        sender_state, recipient_zip,
                                        recipient_state)
    if rates_df is None:
        print('Shippo API error occurred: please try again')
        os.system('python main.py')

    # Create empty dataframe with correct columns that model was trained on
    df = pd.DataFrame(columns=feature_names['feature_names'])
    # Add new row into df with test data
    df.loc[0] = [
        shipper, std_weight, zone, sender_state, recipient_state, distance,
        sender_pop, sender_pop_density, sender_houses, recipient_pop,
        recipient_pop_density, recipient_houses, same_msa, sender_in_msa,
        rec_in_msa, week_number, day_of_week, month
    ]
    # Define categorical and float columns for one-hot-encoding purposes
    cat_cols = [
        'shipper', 'zone', 'week_number', 'day_of_week', 'sender_state',
        'recipient_state', 'month'
    ]
    float_cols = [
        'std_weight', 'distance', 'sender_pop', 'sender_pop_density',
        'sender_houses', 'recipient_pop', 'recipient_pop_density',
        'recipient_houses'
    ]
    df[cat_cols] = df[cat_cols].astype('category')
    df[float_cols] = df[float_cols].astype('float64')
    # Create one-hot-encoded variables from categorical columns
    df = pd.get_dummies(df)
    # Create empty dataframe in same shape as the one used in model, fill with 0s
    df_full = pd.DataFrame(columns=feature_names['feature_names_dummified'])
    # Execute a right join to align test dataframe with dataframe that model was trained on
    df, df_full = df.align(df_full, join='right', axis=1, fill_value=0)
    # Convert dataframe to numpy array for prediction
    test = df.loc[0].values
    # Scale data with saved min-max scaler
    test = test.reshape(1, -1)
    test = scaler.transform(test)
    utilities.print_elapsed_time(start_time)
    return test, rates_df
Code example #14
def train(datadict, model_id, n_estimators=25, max_depth=50):
    """Random forest model training.

    Trains random forest model. Only n_estimators, max_depth hyperparameters are available to the user for training.
    The rest of the hyperparameters have been tuned by the CMU team.
    Saves model and statistics after training.

    Args:
        datadict (dict): Dictionary of numpy arrays containing preprocessed train and test data.
        model_id (str): Timestamp used to identify model, scaler and feature names files.
        n_estimators (int or str): Number of trees in the forest. Less likely to overfit with more trees.
        max_depth (int or str): The maximum depth of each tree. More likely to overfit if depth is large.

    """
    # Convert n_estimators and max_depth from string to int since model only accepts int
    n_estimators = int(n_estimators)
    max_depth = int(max_depth)
    print("\nTraining model...")
    start_time = time.time()
    X_train = datadict['X_train']
    y_train = datadict['y_train']
    X_test = datadict['X_test']
    y_test = datadict['y_test']
    model_rf = RandomForestClassifier(n_estimators=n_estimators,
                                      max_depth=max_depth,
                                      verbose=1,
                                      n_jobs=-1,
                                      bootstrap=False)
    print("\nFitting model...")
    print("Parameters used:", model_rf.get_params())
    model_rf.fit(X_train, y_train)
    print("\nPredicting results...")
    y_pred_rf = model_rf.predict(X_test)
    # y_pred_proba_rf = model_rf.predict_proba(X_test)
    print("\nCalculating accuracy...")
    accuracy_df = get_accuracy_windows(1, y_test, y_pred_rf)
    accuracy = accuracy_score(y_test, y_pred_rf) * 100
    # Save model
    print("\nSaving model...")
    model_path = os.path.join(
        paths.model_dir,
        "acc-" + f"{accuracy:.2f}" + "-model_" + model_id + ".pkl.z")
    print(f"Model saved in {model_path}")
    joblib.dump(model_rf, model_path)
    # Get model stats
    feature_importance_df = get_feature_importance(model_rf, model_id)
    classification_report_df = get_classification_report(y_test, y_pred_rf)
    params_df = get_params(model_rf)
    # Save stats to excel
    print("\nSaving model stats...")
    stats_path = os.path.join(
        paths.output_delivery_prediction_stats_dir,
        "acc-" + f"{accuracy:.2f}" + "-stats_" + model_id + ".xlsx")
    print(f"Stats saved in {stats_path}")
    with pd.ExcelWriter(stats_path) as writer:
        accuracy_df.to_excel(writer, sheet_name='Accuracy')
        feature_importance_df.to_excel(writer, sheet_name='Feature Importance')
        classification_report_df.to_excel(writer,
                                          sheet_name='Classification Report')
        params_df.to_excel(writer, sheet_name='Model Parameters')

    utilities.print_elapsed_time(start_time)
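
A hypothetical usage sketch: rebuild the datadict that `split_scale_data` saved with `np.savez` and pass it to `train`. The file name and model_id are placeholders.

import numpy as np

data = np.load('datadict_20190801.npz')  # placeholder model_id timestamp
datadict = {key: data[key] for key in data.files}
train(datadict, model_id='20190801', n_estimators=25, max_depth=50)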
Code example #15
def batch_query(start_year_month, end_year_month, frac):
    """Extracts a fraction of raw shipping records from the database with batch method.

    Args:
        start_year_month (str): Start date in YYYY-MM format.
        end_year_month (str): End date in YYYY-MM format.
        frac (str): String representation of fraction of data to extract.

    Returns:
        pandas dataframe obj: Dataframe with batch query records.
    """

    # Establishes connection to a MySQL db
    print(f"Connecting to {credentials.db}...")
    start_time = time.time()
    db = MySQLdb.connect(credentials.host, credentials.user,
                         credentials.password, credentials.db)
    utilities.print_elapsed_time(start_time)

    # Parse the start and end year-month strings into dates
    print("Extracting and preprocessing records...")
    extraction_start_time = time.time()
    # Append day 1 to year and month to create datetime object. Day does not affect result
    start = datetime.strptime(start_year_month + "-1", "%Y-%m-%d").date()
    end = datetime.strptime(end_year_month + "-1", "%Y-%m-%d").date()
    # Collect per-month batches in a list; DataFrame.append was removed in pandas 2.0
    batches = []
    # Silence pandas chained-assignment warnings during preprocessing
    pd.options.mode.chained_assignment = None
    # Calculate number of months between start and end year/month
    delta = relativedelta.relativedelta(end, start)
    num_batches = delta.years * 12 + delta.months + 1
    month = start.month
    year = start.year
    # tqdm progress bar reference: https://github.com/tqdm/tqdm
    pbar = trange(num_batches)
    for i in pbar:
        pbar.set_description(f"Querying {year} {calendar.month_abbr[month]}")
        # First day of month in batch
        first_date_of_month = 1
        start_date = f"{year}-{month}-{first_date_of_month}"
        # Last day of month in batch
        last_date_of_month = calendar.monthrange(year, month)[1]
        end_date = f"{year}-{month}-{last_date_of_month}"
        # Query from first day to last day of given month
        results = query(db, start_date, end_date, frac)
        pbar.set_description(
            f"Preprocessing {len(results)} records for {year} {calendar.month_abbr[month]}"
        )
        # Only preprocess if batch has records
        if len(results) > 0:
            batches.append(preprocess(results))
        # If current month is 12, increment year by 1 and reset month to 1 for next batch
        if month == 12:
            month = 1
            year += 1
        # Else, increment month by 1
        else:
            month += 1
    print(f"{len(records)} records extracted and preprocessed")
    utilities.print_elapsed_time(extraction_start_time)
    return records
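
A worked check of the batch-count formula above, with hypothetical endpoints: January through March 2019 yields three monthly batches.

from datetime import datetime
from dateutil import relativedelta

start = datetime.strptime('2019-01-1', '%Y-%m-%d').date()
end = datetime.strptime('2019-03-1', '%Y-%m-%d').date()
delta = relativedelta.relativedelta(end, start)
print(delta.years * 12 + delta.months + 1)  # 3 -> Jan, Feb, Mar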