def retrain(self):
    """
    Retrain a spaCy model for NER using training data.
    """

    # The request provides training data texts, entities and entity types, together with the model name and any other arguments
    row_template = ['strData', 'strData', 'strData', 'strData', 'strData']
    col_headers = ['text', 'entity', 'entity_type', 'model_name', 'kwargs']

    # Create a Pandas DataFrame for the request data
    self.request_df = utils.request_df(self.request, row_template, col_headers)

    # Get the argument strings from the request dataframe
    kwargs = self.request_df.loc[0, 'kwargs']
    # Set the relevant parameters using the argument strings
    self._set_params(kwargs)

    # Check that a model name has been set
    if self.model in ["en_core_web_sm"]:
        err = "Incorrect usage: A name for the custom model needs to be specified."
        raise Exception(err)

    # Transform the training data to spaCy's training data format
    # This call populates the self.train and self.validation (if a test set is specified in the request arguments) objects
    self._prep_data()

    # Retrain the model and calculate evaluation metrics
    # This call saves the retrained model to disk and prepares the self.metrics dataframe for the response
    self._retrain_model()

    # Prepare the response, which will be the evaluation metrics prepared during retraining
    self.response_df = self.metrics

    # Debug information is printed to the terminal and logs if the parameter debug = true
    if self.debug:
        self._print_log(11)

    # Send the response table description to Qlik
    self._send_table_description("metrics")

    # Finally send the response
    return self.response_df
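# For illustration, a minimal sketch of how a comma separated argument string
# such as 'arg1=value1, arg2=value2' (the format noted in the class initializer
# below) might be parsed. This is a hypothetical helper, not the actual
# _set_params implementation, which also maps each argument onto instance
# attributes such as self.debug and self.model.
def _parse_kwargs_sketch(kwargs_string):
    """Parse a comma separated 'arg=value' string into a dict (sketch only)."""
    params = {}
    if kwargs_string:
        for pair in kwargs_string.split(','):
            key, _, value = pair.strip().partition('=')
            params[key] = value
    return params

# Example (hypothetical arguments): _parse_kwargs_sketch('debug=true, model=my_ner')
# returns {'debug': 'true', 'model': 'my_ner'}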
def _initiate(self, row_template, col_headers):
    """
    Interpret the request data and set up execution parameters.
    :
    :row_template: a list of data types expected in the request e.g. ['strData', 'numData']
    :col_headers: a list of column headers for interpreting the request data e.g. ['group', 'item']
    """

    # Create a Pandas DataFrame for the request data
    self.request_df = utils.request_df(self.request, row_template, col_headers)

    # Get the argument strings from the request dataframe
    kwargs = self.request_df.loc[0, 'kwargs']
    # Set the relevant parameters using the argument strings
    self._set_params(kwargs)

    # Print the request dataframe to the logs
    if self.debug:
        self._print_log(3)
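# A minimal sketch of what utils.request_df presumably does: flatten the
# iterable of Qlik row data into a DataFrame, using the row template to decide
# whether to read the string or numeric value of each dual. The
# .rows/.duals/.strData/.numData attribute names follow the Qlik SSE protobuf
# messages; treat this as an assumption about the actual utils implementation.
import pandas as pd

def request_df_sketch(request, row_template, col_headers):
    rows = []
    for bundled_rows in request:
        for row in bundled_rows.rows:
            # Pick strData or numData for each column as per the template
            rows.append([dual.strData if dtype == 'strData' else dual.numData
                         for dual, dtype in zip(row.duals, row_template)])
    return pd.DataFrame(rows, columns=col_headers)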
def get_entities(self, default=True):
    """
    Use spaCy NER to return named entities from text.
    :
    :default=True uses the pre-trained English language models provided by spaCy.
    :default=False allows the use of a re-trained spaCy model.
    """

    if default:
        # Interpret the request data based on the expected row and column structure
        row_template = ['strData', 'strData', 'strData']
        col_headers = ['key', 'text', 'kwargs']
    else:
        # A model name is required if using a custom spaCy model
        row_template = ['strData', 'strData', 'strData', 'strData']
        col_headers = ['key', 'text', 'model_name', 'kwargs']

    # Create a Pandas DataFrame for the request data
    self.request_df = utils.request_df(self.request, row_template, col_headers)

    # Get the argument strings from the request dataframe
    kwargs = self.request_df.loc[0, 'kwargs']
    # Set the relevant parameters using the argument strings
    self._set_params(kwargs)

    # Print the request dataframe to the logs
    if self.debug:
        self._print_log(3)

    # Extract named entities for each text in the request dataframe
    self.response_df = self._entity_tagger()

    # Print the response dataframe to the logs
    if self.debug:
        self._print_log(4)

    # Send the response table description to Qlik
    self._send_table_description("entities")

    return self.response_df
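# A minimal sketch of the kind of logic _entity_tagger presumably applies:
# load a spaCy model, run it over each text and collect the entities and their
# predicted types. The response column names and the function signature here
# are illustrative assumptions, not the actual implementation.
import pandas as pd
import spacy

def entity_tagger_sketch(texts_df, model="en_core_web_sm"):
    nlp = spacy.load(model)  # a pre-trained model, or a path to a retrained one
    entities = []
    for key, text in zip(texts_df['key'], texts_df['text']):
        doc = nlp(text)
        for ent in doc.ents:
            # ent.text is the entity string, ent.label_ its predicted type
            entities.append([key, ent.text, ent.label_])
    return pd.DataFrame(entities, columns=['key', 'entity', 'entity_type'])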
def __init__(self, request, context, variant="standard"):
    """
    Class initializer.
    :param request: an iterable sequence of RowData
    :param context:
    :param variant: a string to indicate the request format
    :Sets up the input data frame and parameters based on the request
    """

    # Set the request, context and variant variables for this object instance
    self.request = request
    self.context = context
    self.variant = variant

    if variant == "two_dims":
        row_template = ['strData', 'strData', 'numData', 'strData']
        col_headers = ['key', 'dim', 'measure', 'kwargs']
    elif variant == "lat_long":
        row_template = ['strData', 'numData', 'numData', 'strData']
        col_headers = ['key', 'lat', 'long', 'kwargs']
    else:
        row_template = ['strData', 'strData', 'strData']
        col_headers = ['key', 'measures', 'kwargs']

    # Create a Pandas DataFrame for the request data
    self.request_df = utils.request_df(request, row_template, col_headers)

    # Handle null value rows in the request dataset
    self.NaN_df = self.request_df.loc[self.request_df['key'].str.len() == 0].copy()

    # If null rows exist they will be sliced off and then added back to the response
    if len(self.NaN_df) > 0:
        self.request_df = self.request_df.loc[self.request_df['key'].str.len() != 0]

    # Get additional arguments from the 'kwargs' column in the request data
    # Arguments should take the form of a comma separated string: 'arg1=value1, arg2=value2'
    kwargs = self.request_df.loc[0, 'kwargs']
    self._set_params(kwargs)

    # Additional information is printed to the terminal and logs if the parameter debug = true
    if self.debug:
        # Increment the log counter for the class. Each instance of the class generates a new log.
        self.__class__.log_no += 1

        # Create a log file for the instance
        # Logs will be stored in ..\logs\Cluster Log <n>.txt
        self.logfile = os.path.join(os.getcwd(), 'logs', 'Cluster Log {}.txt'.format(self.log_no))

        self._print_log(1)

    # Set up an input DataFrame, excluding the arguments column
    self.input_df = self.request_df.loc[:, self.request_df.columns.difference(['kwargs'])]

    # For the two_dims variant we pivot the data to turn the dim values into columns, with key as the index
    if variant == "two_dims":
        self.input_df = self.input_df.pivot(index='key', columns='dim')
    # For the other two variants we simply set the index to the 'key' column
    else:
        self.input_df = self.input_df.set_index('key')

    # For the standard variant we split the measures string into multiple columns and make the values numeric
    if variant == "standard":
        self.input_df = pd.DataFrame([s.split(';') for r in self.input_df.values for s in r],
                                     index=self.input_df.index)

        # Convert strings to numbers using locale settings
        self.input_df = self.input_df.applymap(lambda s: utils.atof(s) if s else np.NaN)

    # Finally we prepare the data for the clustering algorithm:

    # If scaling does not need to be applied, we just fill in missing values
    if self.scaler == "none":
        self.input_df = utils.fillna(self.input_df, method=self.missing)
    # Otherwise we apply strategies for both filling missing values and then scaling the data
    else:
        self.input_df = utils.scale(self.input_df, missing=self.missing, scaler=self.scaler, **self.scaler_kwargs)

    # For the lat_long variant we do some additional transformations
    if self.variant == "lat_long":
        # The input values are converted to radians
        self.input_df = self.input_df.apply(np.radians)

    if self.debug:
        self._print_log(2)
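# A minimal, self-contained sketch of the standard-variant preparation above:
# split the semicolon-delimited measures into columns, convert to numbers,
# fill missing values, then scale. sklearn's StandardScaler and the
# column-mean fill are illustrative assumptions; the actual utils.scale and
# utils.fillna support other scalers and missing-value strategies.
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

raw = pd.DataFrame({'key': ['a', 'b'], 'measures': ['1,5;2', ';3,0']}).set_index('key')

# Split each 'measures' string into one column per measure
df = pd.DataFrame([s.split(';') for r in raw.values for s in r], index=raw.index)

# Convert strings to floats, treating empty strings as missing
# (utils.atof would also honour locale settings, e.g. ',' as the decimal separator)
df = df.applymap(lambda s: float(s.replace(',', '.')) if s else np.nan)

# Fill missing values (here with column means) and scale to zero mean, unit variance
df = df.fillna(df.mean())
scaled = pd.DataFrame(StandardScaler().fit_transform(df), index=df.index, columns=df.columns)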