Example no. 1
def make_predictions(data):
	# if os.path.exists('models/{0}_{1}.sav'.format(project_name, standard_service)):
	# 	# TODO Load model and predict
	# 	return

	new_data = pd.DataFrame(index=range(0,len(data)),columns=['Date', 'Total Count'])
	for i in range(0, len(data)):
		new_data['Date'][i] = data.index[i]
		new_data['Total Count'][i] = data['Total Count'][i]
	add_datepart(new_data, 'Date')
	new_data.drop(['Dayofyear', 'Dayofweek', 'Elapsed', 'Is_quarter_end', 'Week', 'Is_month_end', 'Is_month_start', 'Is_quarter_start', 'Is_year_end', 'Is_year_start'], axis=1, inplace=True)

	train_size = len(new_data)
	train = new_data[:train_size]
	x_train = train.drop('Total Count', axis=1)
	y_train = train['Total Count']

	model = LinearRegression()
	model.fit(x_train, y_train)
	
	# TODO Save model

	future_dates = pd.date_range(data.index[-1], data.index[-1] + timedelta(days=365), freq='D')
	test = pd.DataFrame(future_dates, columns=['Date'])
	test_date = test.copy()
	add_datepart(test, 'Date')
	test.drop(['Dayofyear', 'Dayofweek', 'Elapsed', 'Is_quarter_end', 'Week', 'Is_month_end', 'Is_month_start', 'Is_quarter_start', 'Is_year_end', 'Is_year_start'], axis=1, inplace=True)

	predictions = model.predict(test)
	test['Predictions'] = predictions
	test.index = test_date.Date
	
	return test
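
A usage sketch for the helper above (the sample daily-count frame is an assumption; the function itself relies on pandas, datetime.timedelta, sklearn's LinearRegression and fastai's add_datepart being imported in the surrounding module):

import pandas as pd

# hypothetical daily totals indexed by date
data = pd.DataFrame({'Total Count': range(120)},
                    index=pd.date_range('2020-01-01', periods=120, freq='D'))

forecast = make_predictions(data)        # one year of daily predictions
print(forecast['Predictions'].head())
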
def preprocess(inp_df: pd.DataFrame) -> pd.DataFrame:
    """Preprocess the dataframe for modeling. The data, along with the data
    from the gather_args() function will get passed to either the training or
    prediction method.

    Inputs: a raw dataframe
    Output: a processed dataframe to pass to .train() or .get_preds()
    """

    df = inp_df.copy()

    # Sort by date since we have a timeseries
    df.sort_values(by=['date', 'store'], inplace=True)

    # Drop week_start and day_of_week since add_datepart() will do that
    df.drop('week_start', axis='columns', inplace=True)
    df.drop('day_of_week', axis='columns', inplace=True)

    # If our whole df has sales == 0, it must be a single-row df used for a
    # single prediction, so just take the first row (kept as a one-row frame
    # so add_datepart below still receives a DataFrame)
    if (df.sales == 0).all():
        df = df.iloc[[0]]
    else:
        # Drop any sales == 0 since they'll mess up rmspe (div by zero)
        df = df[df.sales != 0]

    tabular.add_datepart(df, 'date', drop=True, time=False)

    return df
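
For context, a minimal sketch of the input preprocess() expects (column names are taken from the function body, the sample values are made up, and tabular is assumed to be fastai's tabular module imported elsewhere):

import pandas as pd

raw = pd.DataFrame({
    'date': pd.to_datetime(['2015-07-01', '2015-07-02']),
    'store': [1, 1],
    'day_of_week': [3, 4],
    'week_start': pd.to_datetime(['2015-06-29', '2015-06-29']),
    'sales': [5263, 6064],
})
processed = preprocess(raw)  # ready to hand to .train() or .get_preds()
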
def LinearRegression(df, type, split):
    #creating dataframe with date and the target variable
    data = df.sort_index(ascending=True, axis=0)
    new_data = pd.DataFrame(index=range(0, len(df)), columns=['Date', type])

    for i in range(0, len(data)):
        new_data['Date'][i] = data['Date'][i]
        new_data[type][i] = data[type][i]

    #create features
    from fastai.tabular import add_datepart
    add_datepart(new_data, 'Date')
    new_data.drop('Elapsed', axis=1,
                  inplace=True)  #elapsed will be the time stamp

    new_data['mon_fri'] = 0
    for i in range(0, len(new_data)):
        if (new_data['Dayofweek'][i] == 0
                or new_data['Dayofweek'][i] == 4):  # if it is Monday or Friday
            new_data['mon_fri'][i] = 1
        else:
            new_data['mon_fri'][i] = 0

    #split into train and validation
    train = new_data[:split]
    valid = new_data[split:]

    x_train = train.drop(type, axis=1)
    y_train = train[type]
    x_valid = valid.drop(type, axis=1)
    y_valid = valid[type]

    #implement linear regression
    from sklearn.linear_model import LinearRegression
    model = LinearRegression()
    model.fit(x_train, y_train)

    preds = model.predict(x_valid)
    rmse = np.sqrt(np.mean(np.power((np.array(y_valid) - np.array(preds)), 2)))
    st.write('RMSE value on validation set:')
    st.write(rmse)

    valid['Predictions'] = 0
    valid['Predictions'] = preds

    valid.index = new_data[split:].index
    train.index = new_data[:split].index

    append_data = pd.DataFrame(data={type: [], 'Predictions': []})

    append_data[type] = train[type]
    append_data['Predictions'] = train[type]

    pic = pd.concat(
        [append_data[[type, 'Predictions']], valid[[type, 'Predictions']]],
        axis=0)

    st.line_chart(pic)
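
A hedged usage sketch for this Streamlit helper (the CSV path is hypothetical; the frame needs 'Date' and 'Close' columns, and st, pd, np and DataFrame are assumed to be imported at module level as the body implies):

prices = pd.read_csv('prices.csv', parse_dates=['Date'])  # hypothetical file
LinearRegression(prices, 'Close', split=int(len(prices) * 0.8))  # writes the RMSE and a line chart to the app
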
Example no. 4
def pred_single(date, prev, learn=infer):
  print(f'Getting predictions for date {date} with prev closing price of {prev}')
  df = pd.DataFrame(dict(Date=date, prev=prev), index=[0])
  add_datepart(df, 'Date')
  pred = learn.predict(df.iloc[0])
  print(pred)
  res = round(np.exp(pred[0].data.item()), 2)
  print(res)
  return res
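
A usage sketch; infer is assumed to be a fastai tabular Learner loaded earlier in the script, and the np.exp on the prediction suggests the model was trained on log prices:

# infer = load_learner(...)  # assumed to have been created/loaded earlier
next_close = pred_single('2020-06-01', prev=153.20)
print(f'predicted close: {next_close}')
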
Example no. 5
def MLPRegression(data, startAt, stopAt=None, **kwargs):
    """
    Applies the Multi-Layer Perceptron regression to data to forecast values
    between startAt and stopAt.
    If stopAt is not provided, forecast until the end of data.

    Parameters:
        data (pandas.DataFrame): Data returned by prepare_data (may be
                                 differentiated)
        startAt (int): Index where the forecast starts
        stopAt (int): Index where the forecast stops
            (default is None)
        **kwargs: Additional arguments for the MLPRegressor class

    Returns:
        predictions (list): The forecast from startAt up to stopAt
    """

    data_copy = data.copy()

    if (stopAt is None):
        stopAt = len(data_copy)

    periods = stopAt - startAt

    from fastai.tabular import add_datepart
    add_datepart(data_copy, 'Date')
    data_copy.drop('Elapsed', axis=1, inplace=True)

    # setting importance of days before and after weekends
    # we assume that fridays and mondays are more important
    # 0 is Monday, 1 is Tuesday...
    data_copy['mon_fri'] = 0
    data_copy['mon_fri'].mask(data_copy['Dayofweek'].isin([0, 4]),
                              1,
                              inplace=True)
    data_copy['mon_fri'].where(data_copy['Dayofweek'].isin([0, 4]),
                               0,
                               inplace=True)

    train = data_copy[:startAt]
    valid = data_copy[startAt:stopAt]

    x_train = train.drop('Close', axis=1)
    y_train = train['Close']
    x_valid = valid.drop('Close', axis=1)
    y_valid = valid['Close']

    from sklearn.neural_network import MLPRegressor
    model = MLPRegressor(**kwargs)
    model.fit(x_train, y_train)

    predictions = model.predict(x_valid)

    return predictions
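
The same calling convention (a frame from prepare_data plus integer startAt/stopAt indices) is shared by the svm, linear_regression and knn helpers below; a minimal sketch, assuming prepare_data returns a frame with 'Date' and 'Close' columns:

import numpy as np

df = prepare_data('AAPL')  # helper mentioned in the docstring; the ticker argument is an assumption
split = int(len(df) * 0.8)
preds = MLPRegression(df, startAt=split, hidden_layer_sizes=(100,), max_iter=500)
actual = df['Close'].values[split:]
print('RMSE:', np.sqrt(np.mean((actual - preds) ** 2)))
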
Example no. 6
def svm(data, startAt, stopAt=None):
    """
    Applies the Support Vector Machine to forecast data whose index is between
    startAt and stopAt.
    If stopAt is not provided, forecasts until the end of data.

    Parameters:
        data (pandas.DataFrame): Data returned by prepare_data (may be
                                 differentiated)
        startAt (int): Index where the forecast starts
        stopAt (int): Index where the forecast stops
            (default is None)

    Returns:
        predictions (list): The forecast from startAt up to stopAt
    """

    data_copy = data.copy()

    if (stopAt is None):
        stopAt = len(data_copy)

    periods = stopAt - startAt

    from fastai.tabular import add_datepart
    add_datepart(data_copy, 'Date')
    data_copy.drop('Elapsed', axis=1, inplace=True)

    # setting importance of days before and after weekends
    # we assume that fridays and mondays are more important
    # 0 is Monday, 1 is Tuesday...
    data_copy['mon_fri'] = 0
    data_copy['mon_fri'].mask(data_copy['Dayofweek'].isin([0, 4]),
                              1,
                              inplace=True)
    data_copy['mon_fri'].where(data_copy['Dayofweek'].isin([0, 4]),
                               0,
                               inplace=True)

    train = data_copy[:startAt]
    valid = data_copy[startAt:stopAt]

    x_train = train.drop('Close', axis=1)
    y_train = train['Close']
    x_valid = valid.drop('Close', axis=1)
    y_valid = valid['Close']

    from sklearn import svm
    model = svm.SVR(gamma='scale', kernel='linear', degree=2, coef0=1)
    model.fit(x_train, y_train)

    predictions = model.predict(x_valid)

    return predictions
Example no. 7
def linear_regression(data, startAt, stopAt=None):
    """
    Applies the linear regression method to data to predict points whose index
    is between startAt and stopAt.
    If stopAt is not provided, default value is the length of data.

    Parameters:
        data (pandas.DataFrame): Data returned by prepare_data (may be
                                 differentiated)
        startAt (int): Index where the forecast starts
        stopAt (int): Index where the forecast stops
            (default is None)

    Returns:
        predictions (list): The forecast from startAt up to stopAt
    """

    data_copy = data.copy()

    if (stopAt is None):
        stopAt = len(data_copy)

    periods = stopAt - startAt

    from fastai.tabular import add_datepart
    add_datepart(data_copy, 'Date')
    data_copy.drop('Elapsed', axis=1, inplace=True)

    # setting importance of days before and after weekends
    # we assume that fridays and mondays are more important
    # 0 is Monday, 1 is Tuesday...
    data_copy['mon_fri'] = 0
    data_copy['mon_fri'].mask(data_copy['Dayofweek'].isin([0, 4]),
                              1,
                              inplace=True)
    data_copy['mon_fri'].where(data_copy['Dayofweek'].isin([0, 4]),
                               0,
                               inplace=True)

    train = data_copy[:startAt]
    valid = data_copy[startAt:stopAt]

    x_train = train.drop('Close', axis=1)
    y_train = train['Close']
    x_valid = valid.drop('Close', axis=1)
    y_valid = valid['Close']

    from sklearn.linear_model import LinearRegression
    model = LinearRegression()
    model.fit(x_train, y_train)  # calculating coefficients

    predictions = model.predict(x_valid)  # Applies the regression

    return predictions
Example no. 8
def ticker_svm(raw_data):
    data_copy = raw_data.copy()
    add_datepart(data_copy, 'date')
    data_copy.drop('Elapsed', axis=1, inplace=True)

    # setting importance of days before and after weekends
    # we assume that Fridays and Mondays are more important
    # 0 is Monday, 1 is Tuesday
    data_copy['mon_fri'] = 0
    data_copy['mon_fri'].mask(data_copy['Dayofweek'].isin([0, 4]), 1, inplace=True)
    data_copy['mon_fri'].where(data_copy['Dayofweek'].isin([0, 4]), 0, inplace=True)

    x_features = data_copy.drop('close', axis=1)
    y_features = data_copy['close']
    return x_features, y_features
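
ticker_svm only builds the feature matrix and target; a follow-up sketch of fitting an SVR on its output (kernel settings mirror the svm example above, and raw_data is assumed to have 'date' and 'close' columns):

from sklearn.svm import SVR

x_features, y_features = ticker_svm(raw_data)
model = SVR(gamma='scale', kernel='linear', degree=2, coef0=1)
model.fit(x_features, y_features)
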
Example no. 9
    def _get_df_from_file(self):
        all_data_list = StockController.load_data_from_file(self.stock_symbol)
        df = pd.DataFrame(all_data_list).iloc[::-1]
        df["adj_close"] = df["adj_close"].astype(float)

        # add trend data
        df = df.set_index("date")
        df_trend = pd.read_csv(f"data/trends/{self.stock_symbol}.csv")
        df_merged = df.merge(df_trend.set_index("date"),
                             how="inner",
                             left_index=True,
                             right_index=True)
        df = df_merged.reset_index()[["date", "adj_close", "bmw stock"]]

        add_datepart(df, "date")
        return df
Example no. 10
def readRawData(relativeDataFolderPath, outputFileName="processedRawData.csv"):

    import os, sys, traceback

    import pandas as pd
    import glob

    from myUtilities import getParentFolder
    from myUtilities import createFolder

    from fastai.tabular import add_datepart

    # Variable to hold the deployment directory path of the current jupyter node,
    # calculated with python commands like os.path.abspath and os.path.join
    jupyterNodePath = None

    # Variable to hold a dataframe created with the data from input data files in the relativeDataFolderPath provided
    inputRawDataDF = None

    # Variable to hold the original source folder path which is calculated from the input relative path of the source folder (relativeDataFolderPath)
    # using various python commands like os.path.abspath and os.path.join
    dataFolderPath = None

    # Variable to hold the glob pattern used to find all json file names in the source folder (dataFolderPath).
    # Will be used in the glob function to execute the query
    json_pattern = None

    # Variable to contain the list of all input json file names in the source folder (dataFolderPath)
    file_list = None

    # return values of this method
    # -------------------------------------------------------------------------------
    # Current method's return value, initialized to False. Will be marked as True
    # after every line in the method has executed without errors
    returnValue = False
    # complete filepath of the csv file with the processed raw data
    output_file_name = None

    # -------------------------------------------------------------------------------
    try:
        # calculate the deployment directory path of the current jupyter node in the operating system
        jupyterNodePath = os.path.abspath(os.path.join('.'))

        # TO BE MODIFIED - NOT SURE WHY I USED THIS - WILL HAVE TO CHECK
        pd.set_option('display.max_columns', None)

        # creating pandas dataframe references for further modification
        inputRawDataDF = pd.DataFrame()

        # calculating the complete data folder path from the relative path provided as a parameter
        dataFolderPath = jupyterNodePath + '/' + relativeDataFolderPath

        # creating a glob pattern for python to find json files in the dataFolderPath calculated in the previous step
        json_pattern = os.path.join(dataFolderPath, '*.json')

        # store all the json file paths in the dataFolderPath for further processing
        file_list = glob.glob(json_pattern)

        # execution assertion/ui progress update info
        print('looping through all the files to create input data')
        # loop through all the files in the folder and create the inputRawDataDF pandas dataframe
        for file in file_list:
            data = pd.read_json(file, lines=True)
            data = data.values[0][0]['candles']
            inputRawDataDF = inputRawDataDF.append(data, ignore_index=True)

        inputRawDataDF.columns = [
            'date-time', 'open', 'high', 'low', 'close', 'quantity',
            'dont-know'
        ]

        buffer = inputRawDataDF['date-time']
        add_datepart(inputRawDataDF, 'date-time')

        inputRawDataDF = pd.concat([buffer, inputRawDataDF], axis=1)

        #create prior_holidays feature
        priorHolidaysStamps = getPriorHoliDaysStamps(
            inputRawDataDF['date-timeDayofyear'])
        priorHolidaysStamps_df = pd.DataFrame(
            {'prior_holidays': priorHolidaysStamps[:]})

        inputRawDataDF = pd.concat([inputRawDataDF, priorHolidaysStamps_df],
                                   axis=1)

        #create following_holidays feature
        followingHolidaysStamps = getFollowingHolidaysDaysStamp(
            inputRawDataDF['date-timeDayofyear'])
        followingHolidaysStamps_df = pd.DataFrame(
            {'following_holidays': followingHolidaysStamps[:]})

        inputRawDataDF = pd.concat(
            [inputRawDataDF, followingHolidaysStamps_df], axis=1)
        '''
        w  write mode
        r  read mode
        a  append mode

        w+  create file if it doesn't exist and open it in (over)write mode
            [it overwrites the file if it already exists]
        r+  open an existing file in read+write mode
        a+  create file if it doesn't exist and open it in append mode
        '''

        output_csvdata_path = os.path.join(getParentFolder(dataFolderPath, 2),
                                           'processed')
        print('Attempting to create folder if it does not exist >>>' +
              output_csvdata_path)
        createFolder(output_csvdata_path)

        output_file_name = output_csvdata_path + '/' + outputFileName
        print('Attempting to create/update file >>>' + output_file_name)
        #f = open(output_file_name, 'w+')  # open file in append mode
        #f.write('')
        #f.close()
        #np.savetxt(output_file_name, inputRawDataDF, delimiter=",")
        inputRawDataDF.to_csv(output_file_name, sep=',', index=False)

        print(
            'created easy-to-use raw csv data for preparing training data at location >>> '
            + output_file_name)
        returnValue = True
    except:
        print("Error executing method >>> ")
        # exc_type, exc_obj, exc_tb = sys.exc_info()
        # fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        # print("Unexpected error:", sys.exc_info())
        # print(exc_type, fname, exc_tb.tb_lineno)

        # http://docs.python.org/2/library/sys.html#sys.exc_info
        exc_type, exc_value, exc_traceback = sys.exc_info()  # most recent (if any) by default
        '''
        Reason this _can_ be bad: If an (unhandled) exception happens AFTER this,
        or if we do not delete the labels on (not much) older versions of Py, the
        reference we created can linger.

        traceback.format_exc/print_exc do this very thing, BUT note this creates a
        temp scope within the function.
        '''

        traceback_details = {
            'filename': exc_traceback.tb_frame.f_code.co_filename,
            'lineno': exc_traceback.tb_lineno,
            'name': exc_traceback.tb_frame.f_code.co_name,
            'type': exc_type.__name__,
            'message': traceback.extract_tb(exc_traceback)
        }

        del exc_type, exc_value, exc_traceback  # So we don't leave our local labels/objects dangling
        # This still isn't "completely safe", though!
        # "Best (recommended) practice: replace all exc_type, exc_value, exc_traceback
        # with sys.exc_info()[0], sys.exc_info()[1], sys.exc_info()[2]

        print()
        print(traceback.format_exc())
        print()
        print(traceback_template % traceback_details)
        print()

        #traceback.print_exception()
        raise

    finally:
        return [returnValue, output_file_name, outputFileName, inputRawDataDF]
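
Given the list returned from the finally block, a call might be unpacked like this (the relative folder name is purely illustrative):

ok, csv_path, csv_name, raw_df = readRawData('data/raw/candles')
if ok:
    print('processed csv written to ' + csv_path + ' with ' + str(len(raw_df)) + ' rows')
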
Example no. 11
    def _process_layer(input_features, date_field, distance_layers, rasters):

        if isinstance(input_features, FeatureLayer):
            input_layer = input_features
            sdf = input_features.query().sdf
        else:
            sdf = input_features
            input_layer = sdf.spatial.to_feature_collection()

        if distance_layers:
            # Use proximity tool
            print("Calculating Distances.")
            count = 1
            for distance_layer in distance_layers:
                output = arcgis.features.use_proximity.find_nearest(
                    input_layer, distance_layer, max_count=1)
                connecting_df = output['connecting_lines_layer'].query().sdf
                near_dist = []

                for i in range(len(connecting_df)):
                    near_dist.append(connecting_df.iloc[i]['Total_Miles'])

                sdf[f'NEAR_DIST_{count}'] = near_dist
                count = count + 1

        # Process Raster Data to get information.
        rasters_data = {}

        original_points = []
        for i in range(len(sdf)):
            original_points.append(sdf.iloc[i]["SHAPE"])

        input_layer_spatial_reference = sdf.spatial._sr
        for raster in rasters:
            raster_type = 0

            if isinstance(raster, tuple):
                if raster[1] is True:
                    raster_type = 1
                raster = raster[0]
            rasters_data[raster.name] = []

            shape_objects_transformed = arcgis.geometry.project(
                original_points, input_layer_spatial_reference,
                raster.extent['spatialReference'])
            for shape in shape_objects_transformed:
                shape['spatialReference'] = raster.extent['spatialReference']
                if isinstance(shape, arcgis.geometry._types.Point):
                    raster_value = raster.read(origin_coordinate=(shape['x'],
                                                                  shape['y']),
                                               ncols=1,
                                               nrows=1)
                    value = raster_value[0][0][0]
                elif isinstance(shape, arcgis.geometry._types.Polygon):
                    xmin, ymin, xmax, ymax = shape.extent
                    start_x, start_y = xmin + (raster.mean_cell_width /
                                               2), ymin + (
                                                   raster.mean_cell_height / 2)
                    values = []
                    while start_y < ymax:
                        while start_x < xmax:
                            if shape.contains(
                                    arcgis.geometry._types.Point({
                                        'x':
                                        start_x,
                                        'y':
                                        start_y,
                                        'sr':
                                        raster.extent['spatialReference']
                                    })):
                                values.append(
                                    raster.read(origin_coordinate=(
                                        start_x - raster.mean_cell_width,
                                        start_y),
                                                ncols=1,
                                                nrows=1)[0][0][0])
                            start_x = start_x + raster.mean_cell_width
                        start_y = start_y + raster.mean_cell_height
                        start_x = xmin + (raster.mean_cell_width / 2)

                    if len(values) == 0:
                        values.append(
                            raster.read(
                                origin_coordinate=(shape.true_centroid['x'] -
                                                   raster.mean_cell_width,
                                                   shape.true_centroid['y']),
                                ncols=1,
                                nrows=1)[0][0][0])
                    if raster_type == 0:
                        value = sum(values) / len(values)
                    else:
                        value = max(values, key=values.count)
                else:
                    raise Exception(
                        "Input features can be point or polygon only.")

                rasters_data[raster.name].append(value)

        # Append Raster data to sdf
        for key, value in rasters_data.items():
            sdf[key] = value

        if date_field:
            try:
                add_datepart(sdf, date_field)
            except:
                pass

        return sdf
Example no. 12
def knn(data, startAt, stopAt=None):
    """
    Predicts the points between startAt and stopAt with the k-nearest
    neighbors method. Automatically finds the best number of neighbors.
    If stopAt is not provided, default value is the length of data.

    Parameters:
        data (pandas.DataFrame): Data returned by prepare_data (may be
                                 differentiated)
        startAt (int): Index where the forecast starts
        stopAt (int): Index where the forecast stops
            (default is None)

    Returns:
        predictions (list): The forecast from startAt up to stopAt
    """

    data_copy = data.copy()

    if (stopAt is None):
        stopAt = len(data_copy)

    periods = stopAt - startAt

    from fastai.tabular import add_datepart
    add_datepart(data_copy, 'Date')
    data_copy.drop('Elapsed', axis=1, inplace=True)

    # setting importance of days before and after weekends
    # we assume that fridays and mondays are more important
    # 0 is Monday, 1 is Tuesday...
    data_copy['mon_fri'] = 0
    data_copy['mon_fri'].mask(data_copy['Dayofweek'].isin([0, 4]),
                              1,
                              inplace=True)
    data_copy['mon_fri'].where(data_copy['Dayofweek'].isin([0, 4]),
                               0,
                               inplace=True)

    train = data_copy[:startAt]
    valid = data_copy[startAt:stopAt]

    from sklearn import neighbors
    from sklearn.model_selection import GridSearchCV
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler(feature_range=(0, 1))

    x_train_scaled = scaler.fit_transform(train.drop('Close', axis=1))
    x_train = pd.DataFrame(x_train_scaled)
    y_train = train['Close']

    x_valid_scaled = scaler.transform(valid.drop('Close', axis=1))  # reuse the scaler fitted on train
    x_valid = pd.DataFrame(x_valid_scaled)
    y_valid = valid['Close']

    params = {'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9]}
    knn = neighbors.KNeighborsRegressor()
    model = GridSearchCV(knn, params, cv=5)  # note: the iid argument was removed in newer scikit-learn

    model.fit(x_train, y_train)
    predictions = model.predict(x_valid)

    return predictions
    parser.set_defaults(calculate_time_since_purchase_with_merchant=True)
    args = vars(parser.parse_args())

    trans_df = pd.read_csv(args['transactions_csv'],
                           parse_dates=['purchase_date'])

    # Suppress an annoying warning.
    pd.options.mode.chained_assignment = None  # default='warn'

    # Treat categorical fields as categorical.
    for v in [
            'authorized_flag', 'category_1', 'category_2', 'category_3',
            'merchant_id', 'merchant_category_id', 'subsector_id', 'city_id',
            'state_id'
    ]:
        trans_df[v] = trans_df[v].astype('category').cat.as_ordered()

    # This function takes a date field and turns it into a bunch of useful
    # columns, such as "day of week", "is month end", etc.
    add_datepart(trans_df, 'purchase_date')

    # Sort by date.
    trans_df.sort_values(by=['purchase_Elapsed'], inplace=True)

    # Add new column: time since last purchase (in general or per merchant).
    add_time_since_last_purchase(trans_df)
    if args['calculate_time_since_purchase_with_merchant']:
        add_time_since_last_purchase_with_merchant(trans_df)

    trans_df.to_csv(args['outfile'])
def process_data(df, hist_trans_df, merch_trans_df):
    # Extract more useful information from the `first_active_month` date field.
    add_datepart(df, 'first_active_month')
    df.drop([
        'first_active_monthDay', 'first_active_monthDayofweek',
        'first_active_monthDayofyear', 'first_active_monthIs_month_end',
        'first_active_monthIs_month_start', 'first_active_monthIs_quarter_end',
        'first_active_monthIs_year_end'
    ],
            axis=1,
            inplace=True)

    # Do feature engineering by aggregating data from the transactions tables.

    aggs = {
        'purchase_amount': ['sum', 'mean', 'min', 'max', 'std'],
        'installments': ['sum', 'mean', 'min', 'max', 'std'],
        'month_lag': ['mean', 'min', 'max'],
        'merchant_id': ['nunique'],
        'state_id': ['nunique'],
        'city_id': ['nunique'],
        'numerical_1': ['sum', 'mean', 'min', 'max', 'std'],
        'numerical_2': ['sum', 'mean', 'min', 'max', 'std'],
        'avg_sales_lag3': ['sum', 'mean', 'min', 'max', 'std'],
        'avg_sales_lag6': ['sum', 'mean', 'min', 'max', 'std'],
        'avg_sales_lag12': ['sum', 'mean', 'min', 'max', 'std'],
        'avg_purchases_lag3': ['sum', 'mean', 'min', 'max', 'std'],
        'avg_purchases_lag6': ['sum', 'mean', 'min', 'max', 'std'],
        'avg_purchases_lag12': ['sum', 'mean', 'min', 'max', 'std'],
        'active_months_lag3': ['sum', 'mean', 'min', 'max', 'std'],
        'active_months_lag6': ['sum', 'mean', 'min', 'max', 'std'],
        'active_months_lag12': ['sum', 'mean', 'min', 'max', 'std'],
        'merchant_category_id_transaction': ['nunique'],
        'merchant_category_id_merchant': ['nunique'],
        'subsector_id_transaction': ['nunique'],
        'subsector_id_merchant': ['nunique'],
        'merchant_group_id': ['nunique'],
        'most_recent_sales_range': ['nunique'],
        'most_recent_purchases_range': ['nunique'],
        'elapsed_since_last_purchase': ['sum', 'mean', 'min', 'max', 'std'],
    }

    # First up we aggregate the data in the `historical_transactions` table.
    hist_trans_aggs = {
        'elapsed_since_last_merch_purchase':
        ['sum', 'mean', 'min', 'max', 'std'],
    }
    print('Aggregating numerical fields from the historical transactions ...')
    add_aggregated_numerical_fields(df,
                                    hist_trans_df,
                                    aggregators={
                                        **aggs,
                                        **hist_trans_aggs
                                    })

    # For the categorical fields, we can't aggregate by taking the mean or sum
    # values, so let's count the occurrences of each possible categorical value
    # instead. (In other words, for a category that can be either YES or NO, we
    # count the number of YESes and the number of NOs and use those values.)
    print(
        'Aggregating categorical fields from the historical transactions ...')
    add_aggregated_categorical_fields(
        df,
        hist_trans_df,
        column_names=[
            'authorized_flag', 'category_1_transaction', 'category_1_merchant',
            'category_2', 'category_3', 'category_4',
            'purchase_Is_month_start', 'purchase_Is_month_end',
            'purchase_Year', 'most_recent_sales_range',
            'most_recent_purchases_range'
        ])

    print(
        'Getting top values for categorical fields from the historical transactions ...'
    )
    add_top_categories(
        df,
        hist_trans_df,
        column_names=[
            'authorized_flag', 'category_1_transaction', 'category_1_merchant',
            'category_2', 'category_3', 'category_4',
            'subsector_id_transaction', 'subsector_id_merchant', 'city_id',
            'state_id', 'purchase_Year', 'purchase_Month', 'purchase_Week',
            'purchase_Day', 'purchase_Dayofweek', 'most_recent_sales_range',
            'most_recent_purchases_range'
        ])

    # Next we aggregate the data in the `new_merchants_transactions` table.
    print(
        'Aggregating numerical fields from the new merchant transactions ...')
    add_aggregated_numerical_fields(df,
                                    merch_trans_df,
                                    aggregators=aggs,
                                    prefix='merch_')

    # These ones don't work for the new_merchant_transactions for some reason
    # (missing data?), so let's skip them for now ...
    print(
        'Aggregating categorical fields from the new merchant transactions ...'
    )
    add_aggregated_categorical_fields(
        df,
        merch_trans_df,
        column_names=[
            'authorized_flag', 'category_1_transaction', 'category_1_merchant',
            'category_2', 'category_3', 'category_4',
            'purchase_Is_month_start', 'purchase_Is_month_end',
            'purchase_Year', 'most_recent_sales_range',
            'most_recent_purchases_range'
        ],
        prefix='merch_')

    print(
        'Getting top values for categorical fields from the new merchant transactions ...'
    )
    add_top_categories(
        df,
        merch_trans_df,
        column_names=[
            'authorized_flag', 'category_1_transaction', 'category_1_merchant',
            'category_2', 'category_3', 'category_4',
            'subsector_id_transaction', 'subsector_id_merchant', 'city_id',
            'state_id', 'purchase_Year', 'purchase_Month', 'purchase_Week',
            'purchase_Day', 'purchase_Dayofweek', 'most_recent_sales_range',
            'most_recent_purchases_range'
        ],
        prefix='merch_')
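
The aggregation helpers used above are not shown in this excerpt; as a rough illustration of the occurrence-counting idea the comments describe (the card_id grouping key is an assumption), one could write something like:

import pandas as pd

def count_category_values(trans_df, column, key='card_id'):
    # one output column per category value, holding its occurrence count per key
    return (trans_df.groupby(key)[column]
                    .value_counts()
                    .unstack(fill_value=0)
                    .add_prefix(f'{column}_count_'))

# e.g.: df = df.merge(count_category_values(hist_trans_df, 'authorized_flag'),
#                     left_on='card_id', right_index=True, how='left')
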
Example no. 15
plt.figure(figsize=(16, 8))
plt.plot(data['Close'], label='Closing price history')

# Creating new dataframe with only Date and Close
data = data.sort_index(ascending=True, axis=0)
new_data = pd.DataFrame(index=range(0, len(data)), columns=['Date', 'Close'])

for i in range(0, len(data)):
    new_data['Date'][i] = data.index[i]
    new_data['Close'][i] = data['Close'][i]

# Setting index
new_data.index = new_data.Date
new_data.drop('Date', axis=1, inplace=True)
"""
# Adding date features using 'fastai'
add_datepart(new_data, 'Date')
new_data.drop('Elapsed', axis=1, inplace=True)  # elapsed will be the time stamp

# Flagging if date is Monday/Friday
new_data['mon_fri'] = 0
for i in range(0, len(new_data)):
    if (new_data['Dayofweek'][i] == 0 or new_data['Dayofweek'][i] == 4):
        new_data['mon_fri'][i] = 1
    else:
        new_data['mon_fri'][i] = 0
"""

# 80% train, 20% test
dataset = new_data.values
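
A sketch of the 80/20 split the comment above refers to (the boundary index is simply 80% of the array length):

train_size = int(len(dataset) * 0.8)
train, test = dataset[:train_size], dataset[train_size:]
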
Example no. 16

# # Plot with plotly
# data = [go.Scatter(
#             x = df['date'],
#             y = df['price'],
#             mode = 'lines')]
#
# layout = dict(xaxis = dict(title = 'date'),
#               yaxis = dict(title = 'USD'))
#
# fig = dict(data=data, layout=layout)
# py.plot(fig, filename='price_plot')

### Feature engineering:
add_datepart(df, 'date', drop=False)
df.drop('Elapsed', axis=1, inplace=True)  # don't need this
df.head(50)

df.columns = [str(x).lower().replace(' ', '_') for x in df.columns]

df.loc[:, 'year'] = LabelEncoder().fit_transform(df['year'])
df[15:25]

df.isnull().sum()


# # Compute the average price for each month
# avg_price_mth = df.groupby("month").agg({'price': 'mean'}).reset_index()
# # Plot
# data = [go.Scatter(
Example no. 17
"""
Linear Regression
"""
#sorting
data = df.sort_index(ascending=True, axis=0)

#creating a separate dataset
new_data = pd.DataFrame(index=range(0,len(df)),columns=['Date', 'Close'])

for i in range(0,len(data)):
    new_data['Date'][i] = data['Date'][i]
    new_data['Close'][i] = data['Close'][i]

#create features
add_datepart(new_data, 'Date')
new_data.drop('Elapsed', axis=1, inplace=True)  #elapsed will be the time stamp


#split into train and validation

train = new_data[:7080]
valid = new_data[7080:]

x_train = train.drop('Close', axis=1)
y_train = train['Close']
x_valid = valid.drop('Close', axis=1)
y_valid = valid['Close']


#implement linear regression (completing the snippet with the same pattern used in the other examples here)
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(x_train, y_train)
preds = model.predict(x_valid)

def KNearestNeighbours(df, type, split):
    # creating dataframe with date and the target variable
    data = df.sort_index(ascending=True, axis=0)
    new_data = pd.DataFrame(index=range(0, len(df)), columns=['Date', type])

    for i in range(0, len(data)):
        new_data['Date'][i] = data['Date'][i]
        new_data[type][i] = data[type][i]

    # create features
    from fastai.tabular import add_datepart
    add_datepart(new_data, 'Date')
    new_data.drop('Elapsed', axis=1,
                  inplace=True)  # elapsed will be the time stamp; axis=1 drops the column

    new_data['mon_fri'] = 0
    for i in range(0, len(new_data)):
        if (new_data['Dayofweek'][i] == 0 or new_data['Dayofweek'][i] == 4):
            new_data['mon_fri'][i] = 1
        else:
            new_data['mon_fri'][i] = 0

    # split into train and validation
    train = new_data[:split]
    valid = new_data[split:]

    x_train = train.drop(type, axis=1)
    y_train = train[type]
    x_valid = valid.drop(type, axis=1)
    y_valid = valid[type]

    # importing libraries
    from sklearn import neighbors
    from sklearn.model_selection import GridSearchCV
    from sklearn.preprocessing import MinMaxScaler

    # scaling data
    scaler = MinMaxScaler(feature_range=(0, 1))
    x_train_scaled = scaler.fit_transform(x_train)  # fit the scaler on the training data and scale it
    x_train = pd.DataFrame(x_train_scaled)
    x_valid_scaled = scaler.transform(x_valid)  # reuse the scaler fitted on the training data
    x_valid = pd.DataFrame(x_valid_scaled)

    # using gridsearch to find the best parameter
    params = {'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9]}
    knn = neighbors.KNeighborsRegressor()
    model = GridSearchCV(knn, params, cv=5)

    # fit the model and make predictions
    model.fit(x_train, y_train)

    preds = model.predict(x_valid)
    rmse = np.sqrt(np.mean(np.power((np.array(y_valid) - np.array(preds)), 2)))
    st.write('RMSE value on validation set:')
    st.write(rmse)

    # plot
    valid['Predictions'] = 0
    valid['Predictions'] = preds

    valid.index = new_data[split:].index
    train.index = new_data[:split].index

    append_data = pd.DataFrame(data={type: [], 'Predictions': []})

    append_data[type] = train[type]
    append_data['Predictions'] = train[type]

    pic = pd.concat(
        [append_data[[type, 'Predictions']], valid[[type, 'Predictions']]],
        axis=0)

    st.line_chart(pic)
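
Usage mirrors the LinearRegression helper earlier in this file; a minimal sketch with a hypothetical price frame:

prices = pd.read_csv('prices.csv', parse_dates=['Date'])  # hypothetical file
KNearestNeighbours(prices, 'Close', split=int(len(prices) * 0.8))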