Example #1
    def retrain(self):
        """
        Retrain a spacy model for NER using training data.
        """

        # The request provides the training texts, entities and entity types, together with the model name and any other arguments
        row_template = ['strData', 'strData', 'strData', 'strData', 'strData']
        col_headers = ['text', 'entity', 'entity_type', 'model_name', 'kwargs']

        # Create a Pandas DataFrame for the request data
        self.request_df = utils.request_df(self.request, row_template,
                                           col_headers)

        # Get the argument strings from the request dataframe
        kwargs = self.request_df.loc[0, 'kwargs']
        # Set the relevant parameters using the argument strings
        self._set_params(kwargs)

        # Check that a custom model name has been set, i.e. that spaCy's default pre-trained model is not being overwritten
        if self.model in ["en_core_web_sm"]:
            err = "Incorrect usage: A name for the custom model needs to be specified."
            raise Exception(err)

        # Transform the training data to spaCy's training data format
        # This call populates self.train and, if a test set is specified in the request arguments, self.validation
        self._prep_data()

        # Retrain the model and calculate evaluation metrics
        # This call saves the retrained model to disk and prepares the self.metrics dataframe for the response
        self._retrain_model()

        # Prepare the response: the evaluation metrics calculated during retraining
        self.response_df = self.metrics

        # Debug information is printed to the terminal and logs if the parameter debug=true
        if self.debug:
            self._print_log(11)

        # Send the response table description to Qlik
        self._send_table_description("metrics")

        # Finally, return the response dataframe
        return self.response_df
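
All of these examples apply execution parameters that arrive as a single comma-separated string in the kwargs column ('arg1=value1, arg2=value2', per the comments in Example #4), handed to _set_params. A minimal sketch of that parsing, assuming string flags such as debug=true; the defaults and parameter names here are illustrative, not the project's actual set:

import copy

def parse_kwargs(kwargs):
    """Parse 'arg1=value1, arg2=value2' into a dict of parameters."""
    params = {"debug": False, "model": "en_core_web_sm"}  # assumed defaults
    if kwargs:
        for item in kwargs.split(","):
            key, value = item.split("=", 1)
            params[key.strip()] = value.strip()
    # Interpret known flags; 'debug' arrives as the string 'true'/'false'
    if isinstance(params["debug"], str):
        params["debug"] = params["debug"].lower() == "true"
    return params

# e.g. parse_kwargs('debug=true, model=my_ner')
# -> {'debug': True, 'model': 'my_ner'}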
Example #2
    def _initiate(self, row_template, col_headers):
        """
        Interpret the request data and set up execution parameters
        :
        :row_template: a list of data types expected in the request e.g. ['strData', 'numData']
        :col_headers: a list of column headers for interpreting the request data e.g. ['group', 'item']
        """
                
        # Create a Pandas DataFrame for the request data
        self.request_df = utils.request_df(self.request, row_template, col_headers)

        # Get the argument strings from the request dataframe
        kwargs = self.request_df.loc[0, 'kwargs']
        # Set the relevant parameters using the argument strings
        self._set_params(kwargs)

        # Print the request dataframe to the logs
        if self.debug:
            self._print_log(3)
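
Every example funnels the gRPC request through utils.request_df before doing anything else. The helper itself is not shown; below is a rough sketch of what it plausibly does, assuming the Qlik SSE structure in which the request is an iterable of bundled rows and each row carries a list of duals exposing strData and numData fields:

import pandas as pd

def request_df(request, row_template, col_headers):
    # Pick strData or numData from each dual, as dictated by the template,
    # and flatten the bundled rows into a single list of records
    records = [
        [getattr(dual, dtype) for dual, dtype in zip(row.duals, row_template)]
        for bundle in request
        for row in bundle.rows
    ]
    return pd.DataFrame(records, columns=col_headers)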
Example #3
    def get_entities(self, default=True):
        """
        Use spaCy NER to return named entities from text.
        :
        :default=True uses the pre-trained English language models provided by spaCy. 
        :default=False allows the use of a re-trained spaCy model.
        """

        if default:
            # Interpret the request data based on the expected row and column structure
            row_template = ['strData', 'strData', 'strData']
            col_headers = ['key', 'text', 'kwargs']
        else:
            # A model name is required if using a custom spaCy model
            row_template = ['strData', 'strData', 'strData', 'strData']
            col_headers = ['key', 'text', 'model_name', 'kwargs']

        # Create a Pandas DataFrame for the request data
        self.request_df = utils.request_df(self.request, row_template,
                                           col_headers)

        # Get the argument strings from the request dataframe
        kwargs = self.request_df.loc[0, 'kwargs']
        # Set the relevant parameters using the argument strings
        self._set_params(kwargs)

        # Print the request dataframe to the logs
        if self.debug:
            self._print_log(3)

        # Extract named entities for each text in the request dataframe
        self.response_df = self._entity_tagger()

        # Print the response dataframe to the logs
        if self.debug:
            self._print_log(4)

        # Send the response table description to Qlik
        self._send_table_description("entities")

        return self.response_df
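
The _entity_tagger helper is not shown here, but the core spaCy call it wraps is simple: run each text through the pipeline and read doc.ents. A self-contained sketch returning one row per detected entity, keyed back to the request (the exact response columns used by the project are an assumption):

import pandas as pd
import spacy

def tag_entities(request_df, model="en_core_web_sm"):
    nlp = spacy.load(model)
    records = []
    for key, text in zip(request_df["key"], request_df["text"]):
        # doc.ents holds the named entities spaCy detected in the text
        for ent in nlp(text).ents:
            records.append({"key": key, "entity": ent.text,
                            "entity_type": ent.label_})
    return pd.DataFrame(records, columns=["key", "entity", "entity_type"])

# e.g. a text like 'Apple was founded by Steve Jobs.' would typically yield
# rows such as ('1', 'Apple', 'ORG') and ('1', 'Steve Jobs', 'PERSON')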
Example #4
    def __init__(self, request, context, variant="standard"):
        """
        Class initializer.
        :param request: an iterable sequence of RowData
        :param context:
        :param variant: a string to indicate the request format
        :Sets up the input data frame and parameters based on the request
        """

        # Set the request, context and variant variables for this object instance
        self.request = request
        self.context = context
        self.variant = variant

        if variant == "two_dims":
            row_template = ['strData', 'strData', 'numData', 'strData']
            col_headers = ['key', 'dim', 'measure', 'kwargs']
        elif variant == "lat_long":
            row_template = ['strData', 'numData', 'numData', 'strData']
            col_headers = ['key', 'lat', 'long', 'kwargs']
        else:
            row_template = ['strData', 'strData', 'strData']
            col_headers = ['key', 'measures', 'kwargs']

        # Create a Pandas Data Frame for the request data
        self.request_df = utils.request_df(request, row_template, col_headers)

        # Handle null value rows in the request dataset
        self.NaN_df = self.request_df.loc[self.request_df['key'].str.len() ==
                                          0].copy()

        # If null rows exist they will be sliced off and then added back to the response
        if len(self.NaN_df) > 0:
            self.request_df = self.request_df.loc[
                self.request_df['key'].str.len() != 0]

        # Get additional arguments from the 'kwargs' column in the request data
        # Arguments should take the form of a comma separated string: 'arg1=value1, arg2=value2'
        kwargs = self.request_df.loc[0, 'kwargs']
        self._set_params(kwargs)

        # Additional information is printed to the terminal and logs if the parameter debug=true
        if self.debug:
            # Increment log counter for the class. Each instance of the class generates a new log.
            self.__class__.log_no += 1

            # Create a log file for the instance
            # Logs will be stored in ..\logs\Cluster Log <n>.txt
            self.logfile = os.path.join(
                os.getcwd(), 'logs', 'Cluster Log {}.txt'.format(self.log_no))

            self._print_log(1)

        # Set up an input Data Frame, excluding the arguments column
        self.input_df = self.request_df.loc[
            :, self.request_df.columns.difference(['kwargs'])]

        # For the two_dims variant we pivot the data, turning the dim values into columns with key as the index
        if variant == "two_dims":
            self.input_df = self.input_df.pivot(index='key', columns='dim')
        # For the other variants we likewise set the 'key' column as the index
        else:
            self.input_df = self.input_df.set_index('key')

            # For the standard variant we split the measures string into multiple columns and make the values numeric
            if variant == "standard":
                self.input_df = pd.DataFrame(
                    [s.split(';') for r in self.input_df.values for s in r],
                    index=self.input_df.index)

                # Convert strings to numbers using locale settings
                self.input_df = self.input_df.applymap(lambda s: utils.atof(s)
                                                       if s else np.NaN)

        # Finally we prepare the data for the clustering algorithm:

        # If scaling does not need to be applied, we just fill in missing values
        if self.scaler == "none":
            self.input_df = utils.fillna(self.input_df, method=self.missing)
        # Otherwise we apply strategies for both filling missing values and then scaling the data
        else:
            self.input_df = utils.scale(self.input_df,
                                        missing=self.missing,
                                        scaler=self.scaler,
                                        **self.scaler_kwargs)

        # For the lat_long variant we do some additional transformations
        if self.variant == "lat_long":
            # The input values are converted to radians
            self.input_df = self.input_df.apply(np.radians)

        if self.debug:
            self._print_log(2)
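
The densest step in this initializer is the standard-variant reshaping: each key arrives with one semicolon-delimited measures string, which is split into columns and made numeric before scaling. A toy run of the same two statements, with the locale-aware utils.atof swapped for plain float:

import numpy as np
import pandas as pd

raw = pd.DataFrame({"key": ["a", "b"], "measures": ["1;2;3", "4;;6"]})
input_df = raw.set_index("key")

# Split each 'measures' string into one column per value
input_df = pd.DataFrame(
    [s.split(";") for r in input_df.values for s in r],
    index=input_df.index)

# Convert to numbers, treating empty strings as missing values
input_df = input_df.applymap(lambda s: float(s) if s else np.nan)
print(input_df)
#        0    1    2
# key
# a    1.0  2.0  3.0
# b    4.0  NaN  6.0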