Example #1
    def __init__(self, request, context, variant="standard"):
        """
        Class initializer.
        :param request: an iterable sequence of RowData
        :param context:
        :param variant: a string to indicate the request format
        :Sets up the input data frame and parameters based on the request
        """

        # Set the request, context and variant variables for this object instance
        self.request = request
        self.context = context
        self.variant = variant

        if variant == "two_dims":
            row_template = ['strData', 'strData', 'numData', 'strData']
            col_headers = ['key', 'dim', 'measure', 'kwargs']
        elif variant == "lat_long":
            row_template = ['strData', 'numData', 'numData', 'strData']
            col_headers = ['key', 'lat', 'long', 'kwargs']
        else:
            row_template = ['strData', 'strData', 'strData']
            col_headers = ['key', 'measures', 'kwargs']

        # Create a Pandas Data Frame for the request data
        self.request_df = utils.request_df(request, row_template, col_headers)

        # Handle null value rows in the request dataset
        self.NaN_df = self.request_df.loc[self.request_df['key'].str.len() ==
                                          0].copy()

        # If null rows exist they will be sliced off and then added back to the response
        if len(self.NaN_df) > 0:
            self.request_df = self.request_df.loc[
                self.request_df['key'].str.len() != 0]

        # Get additional arguments from the 'kwargs' column in the request data
        # Arguments should take the form of a comma separated string: 'arg1=value1, arg2=value2'
        kwargs = self.request_df.loc[0, 'kwargs']
        self._set_params(kwargs)

        # Additional information is printed to the terminal and logs if the parameter debug = true
        if self.debug:
            # Increment log counter for the class. Each instance of the class generates a new log.
            self.__class__.log_no += 1

            # Create a log file for the instance
            # Logs will be stored in ..\logs\Cluster Log <n>.txt
            self.logfile = os.path.join(
                os.getcwd(), 'logs', 'Cluster Log {}.txt'.format(self.log_no))

            self._print_log(1)

        # Set up an input Data Frame, excluding the arguments column
        self.input_df = self.request_df.loc[:,
                                            self.request_df.columns.
                                            difference(['kwargs'])]

        # For the two_dims variant we pivot the data to turn the dim values into columns, with key as the index
        if variant == "two_dims":
            self.input_df = self.input_df.pivot(index='key', columns='dim')
        # For the other two variants we also set the index as the 'key' column
        else:
            self.input_df = self.input_df.set_index('key')

            # For the standard variant we split the measures string into multiple columns and make the values numeric
            if variant == "standard":
                self.input_df = pd.DataFrame(
                    [s.split(';') for r in self.input_df.values for s in r],
                    index=self.input_df.index)

                # Convert strings to numbers using locale settings
                self.input_df = self.input_df.applymap(lambda s: utils.atof(s)
                                                       if s else np.NaN)

        # Finally we prepare the data for the clustering algorithm:

        # If scaling does not need to be applied, we just fill in missing values
        if self.scaler == "none":
            self.input_df = utils.fillna(self.input_df, method=self.missing)
        # Otherwise we apply strategies for both filling missing values and then scaling the data
        else:
            self.input_df = utils.scale(self.input_df,
                                        missing=self.missing,
                                        scaler=self.scaler,
                                        **self.scaler_kwargs)

        # For the lat_long variant we do some additional transformations
        if self.variant == "lat_long":
            # The input values are converted to radians
            self.input_df = self.input_df.apply(np.radians)

        if self.debug:
            self._print_log(2)
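The 'kwargs' column read in the initializer above carries execution arguments as a comma separated string such as 'arg1=value1, arg2=value2'. A minimal, standalone sketch of how such a string can be turned into a dictionary, mirroring the translate/split approach used later in _set_params; the sample string is hypothetical:

import string

sample_kwargs = "scaler=standard, min_cluster_size=10, debug=true"

# Strip all whitespace, split into 'key=value' tokens and build a dict with lower case keys
args = sample_kwargs.translate(str.maketrans('', '', string.whitespace)).split(",")
params = {k.lower(): v for k, v in (arg.split("=") for arg in args)}

print(params)  # {'scaler': 'standard', 'min_cluster_size': '10', 'debug': 'true'}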
Example #2
    def _correlation(request, context):
        """
        Calculate the correlation coefficient for two columns. Scalar function.
        :param request: an iterable sequence of RowData
        :param context:
        :return: the correlation coefficient for each row
        :Qlik expression examples:
        :<AAI Connection Name>.Pearson('1;NA;3;4;5;6.9', ';11;12;;14;')
        :<AAI Connection Name>.Correlation('1;NA;3;4;5;6.9', ';11;12;;14;', 'pearson')
        :Possible values for the third argument are 'pearson', 'kendall' or 'spearman'
        """
        # Iterate over bundled rows
        for request_rows in request:
            response_rows = []

            # Set to True for additional info in terminal and log file
            debug = False

            if debug:
                # Create a log file for the function
                logfile = os.path.join(os.getcwd(), 'logs',
                                       'Correlation Log.txt')

                sys.stdout.write("Function Call: {0} \n\n".format(
                    time.ctime(time.time())))
                with open(logfile, 'a') as f:
                    f.write("Function Call: {0} \n\n".format(
                        time.ctime(time.time())))

            # Iterating over rows
            for row in request_rows.rows:
                # Retrieve the value of the parameters
                # Two or three columns are sent from the client, hence the length of params will be 2 or 3
                params = [col.strData for col in row.duals]

                if debug:
                    sys.stdout.write("\nPARAMETERS:\n\n{0}\n".format(
                        "\n\n".join(str(x) for x in params)))
                    with open(logfile, 'a') as f:
                        f.write("\nPARAMETERS:\n\n{0}\n".format("\n\n".join(
                            str(x) for x in params)))

                # Create lists for the two series
                x = params[0].split(";")
                y = params[1].split(";")

                # Set the correlation type based on the third argument.
                # Default is Pearson if the arg is missing.
                try:
                    corr_type = params[2].lower()
                except IndexError:
                    corr_type = 'pearson'

                if debug:
                    sys.stdout.write(
                        "\n\nx ({0:d} data points):\n{1}\n".format(
                            len(x), " ".join(str(v) for v in x)))
                    sys.stdout.write("\ny ({0:d} data points):\n{1}\n".format(
                        len(y), " ".join(str(v) for v in y)))
                    sys.stdout.write(
                        "\nCorrelation Type: {0}\n\n".format(corr_type))

                    with open(logfile, 'a') as f:
                        f.write("\n\nx ({0:d} data points):\n{1}\n".format(
                            len(x), " ".join(str(v) for v in x)))
                        f.write("\ny ({0:d} data points):\n{1}\n".format(
                            len(y), " ".join(str(v) for v in y)))
                        f.write(
                            "\nCorrelation Type: {0}\n\n".format(corr_type))

                # Check that the lists are of equal length
                if len(x) == len(y) and len(x) > 0:
                    # Create a Pandas data frame using the lists
                    df = pd.DataFrame({'x': [utils.atof(d) for d in x], \
                                       'y': [utils.atof(d) for d in y]})

                    # Calculate the correlation matrix for the two series in the data frame
                    corr_matrix = df.corr(method=corr_type)

                    if debug:
                        sys.stdout.write(
                            "\n\nCorrelation Matrix:\n{}\n".format(
                                corr_matrix.to_string()))
                        with open(logfile, 'a') as f:
                            f.write("\n\nCorrelation Matrix:\n{}\n".format(
                                corr_matrix.to_string()))

                    # Prepare the result
                    if corr_matrix.size > 1:
                        result = corr_matrix.iloc[0, 1]
                    else:
                        result = None
                else:
                    result = None

                # Create an iterable of Dual with a numerical value
                duals = iter([SSE.Dual(numData=result)])

                # Append the row data constructed to response_rows
                response_rows.append(SSE.Row(duals=duals))

            # Yield Row data as Bundled rows
            yield SSE.BundledRows(rows=response_rows)
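Outside the SSE plumbing, the core of the calculation is a pandas correlation on two semicolon separated series. A simplified, self-contained sketch with a stand-in for utils.atof; the inputs match the Qlik expression examples in the docstring:

import pandas as pd

x = '1;NA;3;4;5;6.9'.split(';')
y = ';11;12;;14;'.split(';')

def to_float(s):
    # Stand-in for utils.atof(): returns NaN for anything that is not a number
    try:
        return float(s)
    except ValueError:
        return float('nan')

df = pd.DataFrame({'x': [to_float(d) for d in x], 'y': [to_float(d) for d in y]})
corr_matrix = df.corr(method='pearson')   # NaN values are excluded pairwise
result = corr_matrix.iloc[0, 1]           # the correlation coefficient between x and y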
Example #3
    def _set_params(self, kwargs):
        """
        Set input parameters based on the request.
        :
        :Parameters implemented for the HDBSCAN() function are: algorithm, metric, min_cluster_size, min_samples,
        :p, alpha, cluster_selection_method, allow_single_cluster, match_reference_implementation.
        :More information here: https://hdbscan.readthedocs.io/en/latest/api.html#hdbscan
        :
        :Scaler types implemented for preprocessing data are: StandardScaler, MinMaxScaler, MaxAbsScaler,
        :RobustScaler and QuantileTransformer.
        :More information here: http://scikit-learn.org/stable/modules/preprocessing.html
        :
        :Additional parameters used are: load_script, return, missing, scaler, debug
        """

        # Set the row count in the original request
        self.request_row_count = len(self.request_df) + len(self.NaN_df)

        # Set default values which will be used if arguments are not passed

        # SSE parameters:
        self.load_script = False
        self.result_type = 'labels_'
        self.missing = 'zeros'
        self.scaler = 'robust'
        self.debug = False
        # HDBSCAN parameters:
        self.algorithm = None
        self.metric = None
        self.min_cluster_size = None
        self.min_samples = None
        self.p = None
        self.alpha = None
        self.cluster_selection_method = None
        self.allow_single_cluster = None
        self.match_reference_implementation = None
        # Standard scaler parameters:
        self.with_mean = None
        self.with_std = None
        # MinMaxScaler scaler parameters:
        self.feature_range = None
        # Robust scaler parameters:
        self.with_centering = None
        self.with_scaling = None
        self.quantile_range = None
        # Quantile Transformer parameters:
        self.n_quantiles = None
        self.output_distribution = None
        self.ignore_implicit_zeros = None
        self.subsample = None
        self.random_state = None

        # Adjust default options if variant is two_dims
        if self.variant == "two_dims":
            self.load_script = True

        # Adjust default options if variant is lat_long
        elif self.variant == "lat_long":
            self.scaler = "none"
            self.metric = "haversine"

        # Set optional parameters

        # If the key word arguments were included in the request, get the parameters and values
        if len(kwargs) > 0:

            # The parameter and values are transformed into key value pairs
            args = kwargs.translate(str.maketrans(
                '', '', string.whitespace)).split(",")
            self.kwargs = dict([arg.split("=") for arg in args])

            # Make sure the key words are in lower case
            self.kwargs = {k.lower(): v for k, v in self.kwargs.items()}

            # Set the load_script parameter to determine the output format
            # Set to 'true' if calling the functions from the load script in the Qlik app
            if 'load_script' in self.kwargs:
                self.load_script = 'true' == self.kwargs['load_script'].lower()

            # Set the return type
            # Valid values are: labels, probabilities, cluster_persistence, outlier_scores
            if 'return' in self.kwargs:
                self.result_type = self.kwargs['return'].lower() + '_'

            # Set the strategy for missing data
            # Valid values are: zeros, mean, median, mode
            if 'missing' in self.kwargs:
                self.missing = self.kwargs['missing'].lower()

            # Set the standardization strategy for the data
            # Valid values are: standard, minmax, maxabs, robust, quantile, none
            if 'scaler' in self.kwargs:
                self.scaler = self.kwargs['scaler'].lower()

            # Set the debug option for generating execution logs
            # Valid values are: true, false
            if 'debug' in self.kwargs:
                self.debug = 'true' == self.kwargs['debug'].lower()

            # Set optional parameters for the HDBSCAN algorithm
            # For documentation see here: https://hdbscan.readthedocs.io/en/latest/api.html#id20

            # Options are: best, generic, prims_kdtree, prims_balltree, boruvka_kdtree, boruvka_balltree
            # Default is 'best'.
            if 'algorithm' in self.kwargs:
                self.algorithm = self.kwargs['algorithm'].lower()

            # The metric to use when calculating distance between instances in a feature array.
            # More information here: https://hdbscan.readthedocs.io/en/latest/basic_hdbscan.html#what-about-different-metrics
            # And here: http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.DistanceMetric.html
            # Default is 'euclidean' for 'standard' and 'two_dims' variants, and 'haversine' for the lat_long variant.
            if 'metric' in self.kwargs:
                self.metric = self.kwargs['metric'].lower()

            # The minimum size of clusters.
            # The default value is 5.
            if 'min_cluster_size' in self.kwargs:
                self.min_cluster_size = utils.atoi(
                    self.kwargs['min_cluster_size'])

            # The number of samples in a neighbourhood for a point to be considered a core point.
            if 'min_samples' in self.kwargs:
                self.min_samples = utils.atoi(self.kwargs['min_samples'])

            # p value to use if using the minkowski metric.
            if 'p' in self.kwargs:
                self.p = utils.atoi(self.kwargs['p'])

            # A distance scaling parameter as used in robust single linkage.
            if 'alpha' in self.kwargs:
                self.alpha = utils.atof(self.kwargs['alpha'])

            # The method used to select clusters from the condensed tree.
            # Options are: eom, leaf.
            if 'cluster_selection_method' in self.kwargs:
                self.cluster_selection_method = self.kwargs[
                    'cluster_selection_method'].lower()

            # By default HDBSCAN* will not produce a single cluster.
            # Setting this to True will override this and allow single cluster results.
            if 'allow_single_cluster' in self.kwargs:
                self.allow_single_cluster = 'true' == self.kwargs[
                    'allow_single_cluster'].lower()

            # There exist some interpretational differences between this HDBSCAN implementation
            # and the original authors' reference implementation in Java.
            # Note that there is a performance cost for setting this to True.
            if 'match_reference_implementation' in self.kwargs:
                self.match_reference_implementation = 'true' == self.kwargs[
                    'match_reference_implementation']

            # Set optional parameters for the scaler functions

            # Parameters for the Standard scaler
            # http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
            if self.scaler == 'standard':
                if 'with_mean' in self.kwargs:
                    self.with_mean = 'true' == self.kwargs['with_mean'].lower()
                if 'with_std' in self.kwargs:
                    self.with_std = 'true' == self.kwargs['with_std'].lower()

            # Parameters for the MinMax scaler
            # http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
            if self.scaler == 'minmax':
                if 'feature_range' in self.kwargs:
                    self.feature_range = ''.join(
                        c for c in self.kwargs['feature_range']
                        if c not in '()').split(';')
                    self.feature_range = (utils.atoi(self.feature_range[0]),
                                          utils.atoi(self.feature_range[1]))

            # Parameters for the Robust scaler
            # http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html
            if self.scaler == 'robust':
                if 'with_centering' in self.kwargs:
                    self.with_centering = 'true' == self.kwargs[
                        'with_centering'].lower()
                if 'with_scaling' in self.kwargs:
                    self.with_scaling = 'true' == self.kwargs[
                        'with_scaling'].lower()
                if 'quantile_range' in self.kwargs:
                    self.quantile_range = ''.join(
                        c for c in self.kwargs['quantile_range']
                        if c not in '()').split(';')
                    self.quantile_range = (utils.atof(self.quantile_range[0]),
                                           utils.atof(self.quantile_range[1]))

            # Parameters for the Quantile Transformer
            # http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.QuantileTransformer.html
            if self.scaler == 'quantile':
                if 'n_quantiles' in self.kwargs:
                    self.n_quantiles = utils.atoi(self.kwargs['n_quantiles'])
                if 'output_distribution' in self.kwargs:
                    self.output_distribution = self.kwargs[
                        'output_distribution'].lower()
                if 'ignore_implicit_zeros' in self.kwargs:
                    self.ignore_implicit_zeros = 'true' == self.kwargs[
                        'ignore_implicit_zeros'].lower()
                if 'subsample' in self.kwargs:
                    self.subsample = utils.atoi(self.kwargs['subsample'])
                if 'random_state' in self.kwargs:
                    self.random_state = utils.atoi(self.kwargs['random_state'])

        # Set up a list of possible key word arguments for the HDBSCAN() function
        hdbscan_params = ['algorithm', 'metric', 'min_cluster_size', 'min_samples', 'p', 'alpha',\
                          'cluster_selection_method', 'allow_single_cluster', 'match_reference_implementation']

        # Create dictionary of key word arguments for the HDBSCAN() function
        self.hdbscan_kwargs = self._populate_dict(hdbscan_params)

        # Set up a list of possible key word arguments for the sklearn preprocessing functions
        scaler_params = ['with_mean', 'with_std', 'feature_range', 'with_centering', 'with_scaling',\
                        'quantile_range', 'n_quantiles', 'output_distribution', 'ignore_implicit_zeros',\
                        'subsample', 'random_state']

        # Create dictionary of key word arguments for the scaler functions
        self.scaler_kwargs = self._populate_dict(scaler_params)
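For context, a hedged sketch of how a dictionary like self.hdbscan_kwargs could be passed to HDBSCAN(), assuming the hdbscan package is installed. The feature matrix and parameter values below are illustrative only, and parameters left as None above are presumably omitted by _populate_dict so that HDBSCAN falls back to its own defaults:

import numpy as np
import hdbscan

X = np.random.rand(100, 3)                                        # illustrative feature matrix
hdbscan_kwargs = {'min_cluster_size': 10, 'metric': 'euclidean'}  # example keyword arguments

clusterer = hdbscan.HDBSCAN(**hdbscan_kwargs)
clusterer.fit(X)
labels = clusterer.labels_          # corresponds to the default result_type 'labels_'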
Example #4
    def _set_params(self):
        """
        Set input parameters based on the request.
        Parameters implemented for the Prophet() function are: growth, cap, floor, changepoint_prior_scale, interval_width 
        Parameters implemented for the make_future_dataframe() function are: freq, periods
        Parameters implemented for seasonality are: add_seasonality, seasonality_period, seasonality_fourier, seasonality_prior_scale
        Parameters implemented for holidays are: holidays_prior_scale, lower_window, upper_window
        Additional parameters for seasonality requests are: weekly_start, yearly_start
        Additional parameters used are: return, take_log, seasonality, debug
        """

        # Calculate the forecast periods based on the number of placeholders in the data
        self.periods = utils.count_placeholders(self.request_df.loc[:, 'y'])

        # Set the row count in the original request
        self.request_row_count = len(self.request_df) + len(self.NaT_df)

        # Set default values which will be used if an argument is not passed
        self.load_script = False
        self.result_type = 'yhat'
        self.take_log = False
        self.seasonality = 'yearly'
        self.seasonality_mode = None
        self.debug = False
        self.freq = 'D'
        self.cap = None
        self.floor = None
        self.growth = None
        self.changepoint_prior_scale = None
        self.interval_width = None
        self.name = None
        self.period = None
        self.fourier_order = None
        self.mode = None
        self.seasonality_prior_scale = None
        self.holidays_prior_scale = None
        self.mcmc_samples = None
        self.seed = None
        self.n_changepoints = None
        self.changepoint_range = None
        self.uncertainty_samples = None
        self.is_seasonality_request = False
        self.weekly_start = 6  # Defaulting to a Monday start for the week as used in Qlik
        self.yearly_start = 0
        self.lower_window = None
        self.upper_window = None

        # Set optional parameters

        # Check if there is a fourth column in the request
        try:
            # If there is a fourth column, it is assumed to contain the key word arguments
            args = self.request[0].rows[0].duals[3].strData

            # The third column should then provide the holiday name or null for each row
            self.has_holidays = True

        except IndexError:
            # If there is no fourth column, the request does not include holidays
            self.has_holidays = False

        # If the fourth column did not exist, we try again with the third column
        if not self.has_holidays:
            try:
                args = self.request[0].rows[0].duals[2].strData
            except IndexError:
                args = None

        # If the key word arguments were included in the request, get the parameters and values
        if args is not None:

            # The parameter and values are transformed into key value pairs
            args = args.translate(str.maketrans('', '',
                                                string.whitespace)).split(",")
            self.kwargs = dict([arg.split("=") for arg in args])

            # Make sure the key words are in lower case
            self.kwargs = {k.lower(): v for k, v in self.kwargs.items()}

            # Set the load_script parameter to determine the output format
            # Set to 'true' if calling the functions from the load script in the Qlik app
            if 'load_script' in self.kwargs:
                self.load_script = 'true' == self.kwargs['load_script'].lower()

            # Set the return type
            # Valid values are: yhat, trend, seasonal, seasonalities.
            # Add _lower or _upper to the series name to get lower or upper limits.
            if 'return' in self.kwargs:
                self.result_type = self.kwargs['return'].lower()

            # Set the option to take a logarithm of y values before forecast calculations
            # Valid values are: true, false
            if 'take_log' in self.kwargs:
                self.take_log = 'true' == self.kwargs['take_log'].lower()

            # Set the type of seasonality requested. Used only for seasonality requests
            # Valid values are: yearly, weekly, monthly, holidays
            if 'seasonality' in self.kwargs:
                self.seasonality = self.kwargs['seasonality'].lower()

            # Set the seasonality mode. Useful if the seasonality is not a constant additive factor as assumed by Prophet
            # Valid values are: additive, multiplicative
            if 'seasonality_mode' in self.kwargs:
                self.seasonality_mode = self.kwargs['seasonality_mode'].lower()

            # Set the debug option for generating execution logs
            # Valid values are: true, false
            if 'debug' in self.kwargs:
                self.debug = 'true' == self.kwargs['debug'].lower()

            # Set the frequency of the timeseries
            # Any valid frequency for pd.date_range, such as 'D' or 'M'
            # For options see: http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases
            if 'freq' in self.kwargs:
                self.freq = self.kwargs['freq']

            # Set the cap which adds an upper limit at which the forecast will saturate
            # This changes the default linear growth model to a logistic growth model
            if 'cap' in self.kwargs:
                self.cap = utils.atof(self.kwargs['cap'])
                self.growth = 'logistic'

                # Set the floor which adds a lower limit at which the forecast will saturate
                # To use a logistic growth trend with a floor, a cap must also be specified
                if 'floor' in self.kwargs:
                    self.floor = utils.atof(self.kwargs['floor'])

            # Set the changepoint_prior_scale to adjust the trend flexibility
            # If the trend changes are being overfit (too much flexibility) or underfit (not enough flexibility),
            # you can adjust the strength of the sparse prior.
            # Default value is 0.05. Increasing it will make the trend more flexible.
            if 'changepoint_prior_scale' in self.kwargs:
                self.changepoint_prior_scale = utils.atof(
                    self.kwargs['changepoint_prior_scale'])

            # Set the width for the uncertainty intervals
            # Default value is 0.8 (i.e. 80%)
            if 'interval_width' in self.kwargs:
                self.interval_width = utils.atof(self.kwargs['interval_width'])

            # Set additional seasonality to be added to the model
            # Default seasonalities are yearly and weekly, as well as daily for sub daily data
            if 'add_seasonality' in self.kwargs:
                self.name = self.kwargs['add_seasonality'].lower()

            # Set 'additive' or 'multiplicative' mode for the additional seasonality
            # Default value follows the seasonality_mode parameter
            if 'add_seasonality_mode' in self.kwargs:
                self.mode = self.kwargs['add_seasonality_mode'].lower()

            # Set the seasonality period
            # e.g. 30.5 for 'monthly' seasonality
            if 'seasonality_period' in self.kwargs:
                self.period = utils.atof(self.kwargs['seasonality_period'])

            # Set the seasonality fourier terms
            # Increasing the number of Fourier terms allows the seasonality to fit faster changing cycles,
            # but can also lead to overfitting
            if 'seasonality_fourier' in self.kwargs:
                self.fourier_order = int(self.kwargs['seasonality_fourier'])

            # Set the seasonality prior scale to smooth seasonality effects.
            # Reducing this parameter dampens seasonal effects
            if 'seasonality_prior_scale' in self.kwargs:
                self.seasonality_prior_scale = utils.atof(
                    self.kwargs['seasonality_prior_scale'])

            # Set the holiday prior scale to smooth holiday effects.
            # Reducing this parameter dampens holiday effects. Default is 10, which provides very little regularization.
            if 'holidays_prior_scale' in self.kwargs:
                self.holidays_prior_scale = utils.atof(
                    self.kwargs['holidays_prior_scale'])

            # Set the number of MCMC samples.
            # If greater than 0, Prophet will do full Bayesian inference with the specified number of MCMC samples.
            # If 0, Prophet will do MAP estimation. Default is 0.
            if 'mcmc_samples' in self.kwargs:
                self.mcmc_samples = utils.atoi(self.kwargs['mcmc_samples'])

            # Random seed that can be used to control stochasticity.
            # Used for setting the numpy random seed used in predict and also for pystan when using mcmc_samples>0.
            if 'random_seed' in self.kwargs:
                self.seed = utils.atoi(self.kwargs['random_seed'])

                # Set the random seed for numpy
                np.random.seed(self.seed)

            # Number of potential changepoints to include. Default value is 25.
            # Potential changepoints are selected uniformly from the first `changepoint_range` proportion of the history.
            if 'n_changepoints' in self.kwargs:
                self.n_changepoints = utils.atoi(self.kwargs['n_changepoints'])

            # Proportion of history in which trend changepoints will be estimated.
            # Defaults to 0.8 for the first 80%.
            if 'changepoint_range' in self.kwargs:
                self.changepoint_range = utils.atof(
                    self.kwargs['changepoint_range'])

            # Number of simulated draws used to estimate uncertainty intervals.
            if 'uncertainty_samples' in self.kwargs:
                self.uncertainty_samples = utils.atoi(
                    self.kwargs['uncertainty_samples'])

            # Set the weekly start for 'weekly' seasonality requests
            # Default week start is 0 which represents Sunday. Add offset as required.
            if 'weekly_start' in self.kwargs:
                self.weekly_start = utils.atoi(self.kwargs['weekly_start'])

            # Set the yearly start for 'yearly' seasonality requests
            # Default yearly start is 0 which represents the 1st of Jan. Add offset as required.
            if 'yearly_start' in self.kwargs:
                self.yearly_start = utils.atoi(self.kwargs['yearly_start'])

            # Set a period to extend the holidays by lower_window number of days before the date.
            # This can be used to extend the holiday effect
            if 'lower_window' in self.kwargs:
                self.lower_window = utils.atoi(self.kwargs['lower_window'])

            # Set a period to extend the holidays by upper_window number of days after the date.
            # This can be used to extend the holiday effect
            if 'upper_window' in self.kwargs:
                self.upper_window = utils.atoi(self.kwargs['upper_window'])

        # Create dictionary of arguments for the Prophet(), make_future_dataframe(), add_seasonality() and fit() functions
        self.prophet_kwargs = {}
        self.make_kwargs = {}
        self.add_seasonality_kwargs = {}
        self.fit_kwargs = {}

        # Populate the parameters in the corresponding dictionary:

        # Set up a list of possible key word arguments for the Prophet() function
        prophet_params = ['seasonality_mode', 'growth', 'changepoint_prior_scale', 'interval_width',\
                          'seasonality_prior_scale', 'holidays_prior_scale', 'mcmc_samples', 'n_changepoints',\
                          'changepoint_range', 'uncertainty_samples']

        # Create dictionary of key word arguments for the Prophet() function
        self.prophet_kwargs = self._populate_dict(prophet_params)

        # Set up a list of possible key word arguments for the make_future_dataframe() function
        make_params = ['periods', 'freq']

        # Create dictionary of key word arguments for the make_future_dataframe() function
        self.make_kwargs = self._populate_dict(make_params)

        # Set up a list of possible key word arguments for the add_seasonality() function
        seasonality_params = ['name', 'period', 'fourier_order', 'mode']

        # Create dictionary of key word arguments for the add_seasonality() function
        self.add_seasonality_kwargs = self._populate_dict(seasonality_params)

        # Pass the random seed to the fit method if MCMC is being used
        if self.mcmc_samples is not None and self.mcmc_samples > 0:
            # Set up a list of possible key word arguments for the fit() function
            fit_params = ['seed']
            # Create dictionary of key word arguments for the fit() function
            self.fit_kwargs = self._populate_dict(fit_params)
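To illustrate how the dictionaries built above are typically consumed, here is a hedged sketch using the Prophet API (the import path varies between prophet and fbprophet depending on the version installed); the history data frame and parameter values are examples only:

import pandas as pd
from prophet import Prophet   # 'from fbprophet import Prophet' on older versions

history = pd.DataFrame({
    'ds': pd.date_range('2020-01-01', periods=60, freq='D'),
    'y': [float(i) for i in range(60)],
})

prophet_kwargs = {'changepoint_prior_scale': 0.05, 'interval_width': 0.8}
make_kwargs = {'periods': 30, 'freq': 'D'}

model = Prophet(**prophet_kwargs)
model.fit(history)
future = model.make_future_dataframe(**make_kwargs)
forecast = model.predict(future)    # the 'yhat' column matches the default result_type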
Example #5
    def init_seasonality(cls, request, context):
        """
        Alternative initialization method for this class
        Used when the request contains the timeseries as a concatenated string, repeated for every row
        This is used when the number of input data points differs from the output rows required for seasonality plots
        """

        # The rows are duplicates in this kind of request, so inputs are simply taken from the first row
        # First we store the correct number of rows to be output.
        request_row_count = len(
            [row for request_rows in request for row in request_rows.rows])
        # The timeseries is accepted as a string from the second column of the first row
        timeseries = request[0].rows[0].duals[1].strData
        # The holidays are taken from the third column of the first row
        holidays = request[0].rows[0].duals[2].strData
        # The key word arguments are taken from the fourth column of the first row
        args = request[0].rows[0].duals[3]

        # The data may be sent unsorted by Qlik, so we have to store the order to use when sending the results
        sort_order = pd.DataFrame([(row.duals[0].numData, row.duals[0].strData) \
                                        for request_rows in request \
                                        for row in request_rows.rows], \
                                       columns=['seasonality_num', 'seasonality_str'])

        # We ignore Null values here as these are handled separately in the response
        sort_order = sort_order.loc[sort_order.seasonality_num.notnull()]

        # The correct sort order is based on the data frame's index after sorting on the seasonality field
        sort_order = sort_order.sort_values('seasonality_num')

        # Re-create the request with ds and y columns
        pairs = timeseries.split(";")
        request_df = pd.DataFrame([p.split(":") for p in pairs],
                                  columns=['ds', 'y'])

        # Convert strings to numeric values, replace conversion errors with Null values
        request_df = request_df.applymap(lambda s: utils.atof(s)
                                         if s else np.NaN)

        # Check if the holidays column is populated
        if len(holidays) > 0:
            # Create a holidays data frame
            pairs = holidays.split(";")
            holiday_df = pd.DataFrame([p.split(":") for p in pairs],
                                      columns=['ds', 'holiday'])

            # Merge the holidays with the request data frame using column ds as key
            request_df = pd.merge(request_df, holiday_df, on='ds', how='left')

            # Replace null values in the holiday column with empty strings
            request_df = request_df.fillna(value={'holiday': ''})

        # Values in the data frame are converted to type SSE.Dual
        request_df.loc[:, 'ds'] = request_df.loc[:, 'ds'].apply(
            lambda result: SSE.Dual(numData=result))
        request_df.loc[:, 'y'] = request_df.loc[:, 'y'].apply(
            lambda result: SSE.Dual(numData=result))
        if 'holiday' in request_df.columns:
            request_df.loc[:, 'holiday'] = request_df.loc[:, 'holiday'].apply(
                lambda result: SSE.Dual(strData=result))

        # Add the keyword arguments to the data frame as well, already of type SSE.Dual
        request_df.loc[:, 'args'] = args

        # Create the updated request list and convert to SSE data types
        request_list = request_df.values.tolist()
        request_list = [SSE.Row(duals=duals) for duals in request_list]
        updated_request = [SSE.BundledRows(rows=request_list)]

        # Call the default initialization method
        instance = ProphetForQlik(updated_request, context)

        # Handle null value row in the request dataset
        instance.NaT_df = request_df.loc[request_df.ds.isnull()].copy()

        # If such a row exists it will be sliced off and then added back to the response
        if len(instance.NaT_df) > 0:
            instance.NaT_df.loc[:, 'y'] = 0

        # Set a property that lets us know this instance was created for seasonality forecasts
        instance.is_seasonality_request = True

        # Set a property that lets us know the row count in the original request as this will be different from request_df
        instance.request_row_count = request_row_count

        # Update the default result type if this was not passed in arguments
        if instance.result_type == 'yhat':
            instance.result_type = instance.seasonality

        # Set the sort order to be used when returning the results
        instance.sort_order = sort_order

        # Return the initialized ProphetForQlik instance
        return instance
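The reshaping of the concatenated timeseries string into ds and y columns can be shown in isolation. A small sketch with a hypothetical sample string of 'ds:y' pairs:

import pandas as pd

timeseries = '43831:10.5;43832:11.2;43833:'   # 'ds:y' pairs; the last y is a placeholder

pairs = timeseries.split(';')
request_df = pd.DataFrame([p.split(':') for p in pairs], columns=['ds', 'y'])

# Convert strings to numbers, mapping empty strings (the forecast placeholders) to NaN
request_df = request_df.applymap(lambda s: float(s) if s else float('nan'))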
Example #6
    def _set_params(self, kwargs):
        """
        Set input parameters based on the request.
        :
        :For details refer to the GitHub project: https://github.com/nabeel-oz/qlik-py-tools
        """

        # Set default values which will be used if execution arguments are not passed

        # Default parameters:
        self.debug = False
        self.model = 'en_core_web_sm'
        self.custom = False
        self.base_model = 'en_core_web_sm'
        self.blank = False
        self.epochs = 100
        self.batch_size = compounding(4.0, 32.0, 1.001)
        self.drop = 0.25
        self.test = 0

        # Extract the model path if required
        try:
            # Get the model name from the first row in the request_df
            self.model = self.request_df.loc[0, 'model_name']

            # Remove the model_name column from the request_df
            self.request_df = self.request_df.drop(['model_name'], axis=1)
        except KeyError:
            pass

        # If key word arguments were included in the request, get the parameters and values
        if len(kwargs) > 0:

            # Transform the string of arguments into a dictionary
            self.kwargs = utils.get_kwargs(kwargs)

            # Set the debug option for generating execution logs
            # Valid values are: true, false
            if 'debug' in self.kwargs:
                self.debug = 'true' == self.kwargs['debug'].lower()

                # Additional information is printed to the terminal and logs if the parameter debug = true
                if self.debug:
                    # Increment log counter for the class. Each instance of the class generates a new log.
                    self.__class__.log_no += 1

                    # Create a log file for the instance
                    # Logs will be stored in ..\logs\SpaCy Log <n>.txt
                    self.logfile = os.path.join(
                        os.getcwd(), 'logs',
                        'SpaCy Log {}.txt'.format(self.log_no))

                    self._print_log(1)

            # Set whether the model (if getting named entities) or base model (if retraining) is a custom model
            # i.e. not one of the pre-trained models provided by spaCy
            if 'custom' in self.kwargs:
                self.custom = 'true' == self.kwargs['custom'].lower()

            # Set the base model, i.e. an existing spaCy model to be retrained.
            if 'base_model' in self.kwargs:
                self.base_model = self.kwargs['base_model'].lower()

            # Set the retraining to be done on a blank Language class
            if 'blank' in self.kwargs:
                self.blank = 'true' == self.kwargs['blank'].lower()

            # Set the epochs for training the model.
            # This is the number of times that the learning algorithm will work through the entire training dataset.
            # Valid values are an integer e.g. 200
            if 'epochs' in self.kwargs:
                self.epochs = utils.atoi(self.kwargs['epochs'])

            # Set the batch size to be used during model training.
            # The model's internal parameters will be updated at the end of each batch.
            # Valid values are a single integer or compounding or decaying parameters.
            if 'batch_size' in self.kwargs:
                # The batch size may be a single integer
                try:
                    self.batch_size = utils.atoi(self.kwargs['batch_size'])
                # Or a list of floats
                except ValueError:
                    sizes = utils.get_kwargs_by_type(self.kwargs['batch_size'])

                    # If the start < end, batch sizes will be compounded
                    if sizes[0] < sizes[1]:
                        self.batch_size = compounding(sizes[0], sizes[1],
                                                      sizes[2])
                    # else batch sizes will decay during training
                    else:
                        self.batch_size = decaying(sizes[0], sizes[1],
                                                   sizes[2])

            # Set the dropout rate for retraining the model
            # This determines the likelihood that a feature or internal representation in the model will be dropped,
            # making it harder for the model to memorize the training data.
            # Valid values are a float less than 1.0, e.g. 0.35
            if 'drop' in self.kwargs:
                self.drop = utils.atof(self.kwargs['drop'])

            # Set the ratio of data to be used for testing.
            # This data will be held out from training and just used to provide evaluation metrics.
            # Valid values are a float >= zero and < 1.0 e.g. 0.3
            if 'test' in self.kwargs:
                self.test = utils.atof(self.kwargs['test'])

        # Debug information is printed to the terminal and logs if the parameter debug = true
        if self.debug:
            self._print_log(2)

        # Remove the kwargs column from the request_df
        self.request_df = self.request_df.drop(['kwargs'], axis=1)
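The default batch_size above relies on spaCy's compounding schedule. A short illustration of the compounding and decaying generators, assuming a spaCy v2-style environment where these helpers are importable from spacy.util:

from itertools import islice
from spacy.util import compounding, decaying

sizes = compounding(4.0, 32.0, 1.001)   # start=4, stop=32, compound rate=1.001
print(list(islice(sizes, 5)))           # batch sizes grow gradually towards 32

sizes = decaying(32.0, 4.0, 0.001)      # start=32, stop=4, decay rate=0.001
print(list(islice(sizes, 5)))           # batch sizes shrink gradually towards 4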
Example #7
    def _prep_regressors(self):
        """
        Parse the request for additional regressors and arguments.
        The regressors are expected as a string of pipe separated values.
        e.g. a single entry with three regressors could be '1.2|200|3'
        
        Arguments for the regressors can be passed in a separate string of keyword arguments.
        The keyword and the value should be separated by an equals sign, different keywords by commas, and arguments for different regressors by a pipe character.
        If a single set of arguments is provided (i.e. no pipe characters are found), we apply the same arguments to all regressors.
        e.g. 'prior_scale=10, mode=additive| mode=multiplicative| mode=multiplicative' for specifying different arguments per regressor
              or 'mode=additive' for using the same arguments for all regressors.

        Returns a data frame with the additional regressors.
        """

        # Create a Pandas Data Frame with additional regressors and their keyword arguments
        self.regressors_df = pd.DataFrame([(row.duals[0].numData, row.duals[3].strData, row.duals[4].strData) \
            for request_rows in self.request \
                for row in request_rows.rows], \
                    columns=['ds', 'regressors', 'kwargs'])

        # Handle null value rows in the request dataset
        self.regressors_df = self.regressors_df.loc[
            self.regressors_df.ds.notnull()]

        # Check if the regressors column is empty
        if len(self.regressors_df.regressors.unique()) == 1:
            # Return without further processing
            self.has_regressors = False
            if self.debug:
                self._print_log(7)
            return None

        # Get the regressor arguments as a string
        arg_string = self.regressors_df.loc[0, 'kwargs']

        # Add kwargs for regressors to a list of dictionaries
        self.regressor_kwargs = []
        for kwargs_string in arg_string.replace(' ', '').split('|'):
            if len(kwargs_string) > 0:
                kwargs = {}
                for kv in kwargs_string.split(','):
                    pair = kv.split('=')
                    if 'prior_scale' in pair[0]:
                        pair[1] = utils.atof(pair[1])
                    if 'standardize' in pair[0] and pair[1].lower() != 'auto':
                        pair[1] = 'true' == pair[1].lower()
                    kwargs[pair[0]] = pair[1]
                self.regressor_kwargs.append(kwargs)

        # Split up the additional regressors into multiple columns
        self.regressors_df = pd.DataFrame(self.regressors_df.regressors.str.split('|', expand=True).values, \
            index=self.regressors_df.index).add_prefix('regressor_')

        # Convert the strings to floats
        self.regressors_df = self.regressors_df.applymap(utils.atof)

        # Copy dates from the request_df
        self.regressors_df.loc[:, 'ds'] = self.request_df.loc[:, 'ds'].copy()

        # Sort by the ds column and reset indexes
        self.regressors_df = self.regressors_df.sort_values('ds').reset_index(
            drop=True).drop(columns=['ds'])

        # If there are no regressor kwargs add empty dictionaries
        if len(self.regressor_kwargs) == 0:
            self.regressor_kwargs = [{} for c in self.regressors_df.columns]
        # If there is just 1 dictionary, replicate it for each regressor
        elif len(self.regressor_kwargs) == 1:
            kwargs = self.regressor_kwargs[0].copy()
            self.regressor_kwargs = [
                kwargs for c in self.regressors_df.columns
            ]
        elif len(self.regressor_kwargs) != len(self.regressors_df.columns):
            err = "The number of additional regressors does not match the keyword arguments provided for the regressors."
            raise IndexError(err)

        return self.regressors_df
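The two string formats described in the docstring can be exercised on their own. A self-contained sketch with illustrative values for three regressors and their per-regressor keyword arguments:

import pandas as pd

regressors = pd.Series(['1.2|200|3', '1.5|210|3', '1.1|190|2'])
kwargs_string = 'prior_scale=10, mode=additive| mode=multiplicative| mode=multiplicative'

# One column per regressor: regressor_0, regressor_1, regressor_2
regressors_df = regressors.str.split('|', expand=True).astype(float).add_prefix('regressor_')

# One kwargs dictionary per regressor, with prior_scale converted to a float
regressor_kwargs = []
for chunk in kwargs_string.replace(' ', '').split('|'):
    kwargs = dict(kv.split('=') for kv in chunk.split(',')) if chunk else {}
    kwargs = {k: (float(v) if k == 'prior_scale' else v) for k, v in kwargs.items()}
    regressor_kwargs.append(kwargs)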