class SeqInference():
    """Handles the inference loop for a single message.

    Wraps a trained encoder/decoder model pair plus a text-preprocessing
    pipeline, and decodes a response for an incoming message string using
    either greedy (arg-max) or beam-search decoding.
    """

    def __init__(
        self,
        vocab_filepath,
        encoder_model,
        decoder_model,
        model_spec_file,
        data_spec_file,
        method='arg_max',
        beam_width=3,
        dictionary_dir=None,
        max_decoder_seq_length=28,
        verbose=0,
        pipeline='word2vec',
        word2vec_fpath='',
        **kwargs,
    ):
        """Build the preprocessing pipeline and store the models.

        Args:
            vocab_filepath: path to the vocabulary file used by the
                integer tokenizer.
            encoder_model: trained encoder model (produces initial states).
            decoder_model: trained decoder model (predicts one step at a time).
            model_spec_file: path to a YAML model-spec file.
            data_spec_file: path to a YAML data-spec file (must provide
                'punc_list', 'tokens' and 'tags' entries).
            method: decoding strategy, 'arg_max' or 'beam_search'.
            beam_width: number of beams kept during beam search.
            dictionary_dir: if not None, a spelling step is appended to the
                pipeline (currently broken — see NOTE below).
            max_decoder_seq_length: hard cap on decoded sequence length.
            verbose: 0 = silent, 1 = print, >1 = log via logging.
            pipeline: 'integertokenizer' or 'word2vec'.
            word2vec_fpath: path to pretrained word2vec vectors; only read
                when pipeline == 'word2vec'.

        Raises:
            KeyError: if `pipeline` is not one of the available options.
        """
        self.data_spec_file = load_yaml(data_spec_file)
        self.model_spec_file = load_yaml(model_spec_file)
        self.encoder_model = encoder_model
        self.decoder_model = decoder_model
        self.method = method
        self.beam_width = beam_width
        self.max_decoder_seq_length = max_decoder_seq_length
        self.dictionary_dir = dictionary_dir
        self.verbose = verbose

        log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        logging.basicConfig(level=logging.INFO, format=log_fmt)
        self.logger = logging.getLogger(__name__)

        self._get_tags_from_spec()

        # Pipeline Elements
        self.int_tokenizer = IntegerTokenizer(vocab_filepath)
        rs = RemoveCharsTransformer(self.data_spec_file['punc_list'])
        ws = WhiteSpaceTokenizer()
        self.tg = Tagger()
        self.available_pipelines = ['integertokenizer', 'word2vec']
        self.pipe = Pipeline(steps=[('remove_chars', rs),
                                    ('white_space_tokenizer', ws)])
        if self.dictionary_dir is not None:
            # NOTE(review): `st` is not defined anywhere in this scope, so
            # passing a dictionary_dir raises NameError. The spelling
            # transformer still needs to be constructed here — TODO fix.
            self.pipe.append(('spelling', st))
        # self.pipe.append(('tagger', tg))
        if pipeline == 'integertokenizer':
            self.pipe.append(('integer_tokenizer', self.int_tokenizer))
        elif pipeline == 'word2vec':
            # Bug fix: the vectorizer was built as a local variable but the
            # (never-assigned) attribute `self.word2vec` was appended to the
            # pipeline, raising AttributeError. Store it on self instead.
            self.word2vec = Word2Vec()
            self.word2vec.set_model(
                KeyedVectors.load_word2vec_format(word2vec_fpath))
            # Step name kept as 'integer_tokenizer' for backward
            # compatibility with any code that addresses the step by name.
            self.pipe.append(('integer_tokenizer', self.word2vec))
        else:
            raise KeyError(
                f'Unavailable pipeline specified, please choose one of '
                f'{self.available_pipelines}')

    def _log(self, message):
        """Emit `message` via the logger (verbose > 1) or print (verbose == 1)."""
        if self.verbose > 1:
            self.logger.info(message)
        elif self.verbose == 1:
            print(message)

    def _get_tags_from_spec(self):
        """Cache the special start/end tokens and their tags from the data spec."""
        self.START_TOKEN = self.data_spec_file['tokens']['START_TOKEN']
        self.END_TOKEN = self.data_spec_file['tokens']['END_TOKEN']
        self.START_TAG = self.data_spec_file['tags'][self.START_TOKEN]
        self.END_TAG = self.data_spec_file['tags'][self.END_TOKEN]

    def predict_response_from_text(self, message):
        """Tokenize `message`, run the decoder loop and return the response text."""
        reverse = self.model_spec_file['tf_dataset_params']['reverse_context']
        self._log(f'Tokenizing message: {message}')
        input_vectors = np.squeeze(self.process_message([message], reverse))
        output_tokens = self._predict_response_from_tokens(input_vectors)
        response = " ".join(
            self.int_tokenizer.inverse_transform(output_tokens))
        response = np.squeeze(self.tg.inverse_transform([response]))
        return response

    def process_message(self, input_string, reverse):
        """Turn a string into a tokenized array.

        Args:
            input_string: list containing the raw message string(s).
            reverse: if True, reverse the token order (matches a model
                trained on reversed context).
        """
        input_as_int = self.pipe.transform(input_string)
        if reverse:
            input_as_int = input_as_int[::-1]
        return np.asarray(input_as_int)

    def strip_tags_from_text(self, message):
        """Remove three-letter '<xxx>' tags from `message` and squeeze whitespace."""
        tag_match = re.compile('<[a-z]{3}>')
        message = tag_match.sub('', message).strip()
        message = re.sub(' +', ' ', message)
        return message

    def _predict_response_from_tokens(self, context):
        """Encode `context` and decode a token sequence with the chosen method.

        Raises:
            ValueError: if `self.method` is not a known decoding strategy
                (previously this fell through to an UnboundLocalError).
        """
        self._log(f'Sending context: {context} to encoder')
        # Encode the context (messages up to this point)
        encoder_states = get_encoder_state(self.encoder_model, context)
        # Initial value of target sequence is the start sequence tag
        start_token = np.array(self.START_TOKEN)
        if self.method == 'arg_max':
            decoded_tokens = self._argmax_loop(start_token, encoder_states)
        elif self.method == 'beam_search':
            decoded_tokens = self._beam_search_loop(start_token,
                                                    encoder_states)
        else:
            raise ValueError(
                f'Unknown decoding method: {self.method}')
        return decoded_tokens

    def _argmax_loop(self, target_seq, states):
        """Greedy decoding: repeatedly pick the most likely next token.

        Returns the list of decoded token indices (without the end token).
        """
        stop_condition = False
        decoded_tokens = []
        while not stop_condition:
            output_tokens, states = self._predict_next_char(target_seq,
                                                            states)
            # Collapse the output tokens to a single vector
            probs = tf.squeeze(output_tokens)
            sampled_index, p = argmax_select(probs)
            sampled_word = self.int_tokenizer.inverse_transform(sampled_index)
            sampled_vector = self.pipe.transform(sampled_word)
            target_seq = tf.convert_to_tensor([[sampled_vector]])
            # Exit condition: either hit max length or find stop character.
            # NOTE(review): this compares against END_TAG while the beam
            # search loop compares against END_TOKEN — one of the two is
            # probably wrong; confirm which value is the integer index.
            if (sampled_index == self.END_TAG
                    or len(decoded_tokens) > self.max_decoder_seq_length):
                stop_condition = True
            else:
                decoded_tokens.append(sampled_index)
        return decoded_tokens

    def _beam_search_loop(self, inital_target_seq, encoder_states):
        """Beam-search decoding: keep the `beam_width` most probable sequences.

        Returns the completed token sequence with the highest log probability.
        """
        beam_width = self.beam_width
        # Bug fix: `self.tokenizer` does not exist — the tokenizer is stored
        # as `self.int_tokenizer` in __init__.
        vocab_length = len(self.int_tokenizer.vocab)
        # prob beam sequence keeps track of P(seq) for each beam, updated
        # at each iteration
        log_prob_beam_seq = np.log(np.ones((beam_width, 1)))
        # prob_char_given_prev tracks conditional probability of all
        # characters given the previous one
        log_prob_char_given_prev = np.empty((beam_width, vocab_length))
        beam_seq = np.empty((beam_width, 1), dtype=np.int32)
        # Bug fix: the broadcast list was hard-coded to length 3, which
        # crashes for any beam_width != 3.
        beam_seq[:, 0] = [inital_target_seq] * beam_width
        beam_has_ended = [False] * beam_width
        final_tokens = []
        final_log_probs = []
        first_char = True
        stop_condition = False
        while not stop_condition:
            if first_char:
                # All beams share the very first decoder step/state.
                decoder_output, states_values = self._predict_next_char(
                    np.asarray([inital_target_seq]), encoder_states)
                log_prob_char_given_prev[0] = np.log(
                    tf.squeeze(decoder_output).numpy())
                beam_states = [states_values] * beam_width
            else:
                for beam in range(beam_width):
                    if not beam_has_ended[beam]:
                        # Last character of the beam sequence is the input
                        # for the decoder. Convert beam_seq, which holds
                        # integer words, into vectors.
                        prev_word = self.int_tokenizer.inverse_transform(
                            beam_seq[beam][-1])
                        prev_word_as_vectors = self.pipe.transform(prev_word)
                        decoder_input = np.asarray([prev_word_as_vectors])
                        decoder_output, states_values = \
                            self._predict_next_char(decoder_input,
                                                    beam_states[beam])
                        log_prob_char_given_prev[beam] = np.log(
                            tf.squeeze(decoder_output).numpy())
                        beam_states[beam] = states_values
                    else:
                        # Dead beam: -inf keeps it from being extended.
                        log_prob_char_given_prev[beam] = [-np.inf
                                                          ] * vocab_length
            # Probability of all characters
            if first_char:
                log_prob_seq = (log_prob_char_given_prev[0]
                                + log_prob_beam_seq[0])
                log_prob_seq = log_prob_seq.reshape((1, -1))
            else:
                log_prob_seq = log_prob_char_given_prev + log_prob_beam_seq
                assert log_prob_seq.shape == (beam_width, vocab_length)
            # Carry forward the top beam_width
            top_n, log_p = get_top_n(log_prob_seq, beam_width)
            log_prob_beam_seq = log_p.reshape((-1, 1))
            # Add top_n to the beam_seq
            beam_states = [beam_states[i] for i in top_n[0]]
            beam_seq[:, :] = [beam_seq[:, :][i] for i in top_n[0]]
            beam_seq = np.hstack((beam_seq, top_n[1].reshape(beam_width, 1)))
            self._log(f'Current beam sequence \n {beam_seq}')
            for beam in range(beam_width):
                if beam_seq[beam, -1] == self.END_TOKEN or len(
                        beam_seq[beam, :]) >= self.max_decoder_seq_length:
                    beam_has_ended[beam] = True
                    self._log(f'Appending {beam_seq[beam]} to final tokens')
                    # Only keep sequences with more than 3 tokens.
                    if len(beam_seq[beam]) > 3:
                        final_tokens.append(np.array(beam_seq[beam]))
                        final_log_probs.append(
                            np.array(log_prob_beam_seq[beam]))
                    if len(final_tokens) >= beam_width:
                        stop_condition = True
            first_char = False
        # End of while loop
        return final_tokens[np.argmax(final_log_probs)]

    def _predict_next_char(self, target_seq, states_value):
        """Take a single input and lstm state and predict the next item and state"""
        output_tokens, h, c = self.decoder_model.predict([target_seq]
                                                         + states_value)
        states_value = [h, c]
        return output_tokens, states_value
def fit_transform(self, X: dt.Frame, y: np.array = None, append=False):
    """Fit (or incrementally refit) the text transformer and transform X.

    When `self.loaded` is True, a previously-fitted transformer is updated
    in place: for each input text column the vectorizer vocabulary is
    expanded with terms from the new data, idf terms are re-derived (TF-IDF
    case), and the truncated-SVD projection is refitted/adjusted.
    Otherwise a fresh transformer is fitted from scratch.

    Args:
        X: input frame with the text columns listed in
            `self.input_feature_names`.
        y: optional target array; when `append` is True it is stacked onto
            the saved `self.target`.
        append: when updating a loaded transformer, also return the
            transformed combined (old + new) data instead of only the
            transform of X.

    Returns:
        Transformed data; when `self.loaded` and `append` are both True,
        a `(new_data, y_)` tuple for the combined matrix instead.
    """
    y_ = y
    new_data = []
    if self.loaded:
        X_ = X.to_pandas()
        N_ = len(X_)
        for col in self.input_feature_names:
            if self.TextTransformer.tf_idf:
                # Train a new TfidfVectorizer in order to expand the
                # vocabulary of the old one and adjust the idf terms.
                cv = TfidfVectorizer()
                pre_trained = self.TextTransformer.pipes[col][0]["model"]
                cv.set_params(**pre_trained.get_params())
                # Rebuild the column pipeline up to (and including) the
                # model step, with the fresh vectorizer swapped in.
                pipe_ = copy.deepcopy(self.TextTransformer.pipes[col][0])
                new_pipe = []
                for step in pipe_.steps:
                    if step[0] != 'model':
                        new_pipe.append(step)
                    else:
                        new_pipe.append(('model', cv))
                        break
                new_pipe = Pipeline(new_pipe)
                new_pipe.fit(self.TextTransformer.stringify_col(X_[col]))
                # Recover raw document frequencies from the idf terms of
                # both the new and the previously-trained vectorizer.
                freq2 = self.inverse_idf(cv.idf_, N_)
                freq = self.inverse_idf(
                    pre_trained.idf_, self.TextTransformer.N_
                )
                # Adjust vocabulary and stop-word list based on new data;
                # adjust frequency terms and idf terms.
                new_freq = []
                remapped_freq = np.zeros(len(freq))
                dict_ = copy.copy(pre_trained.vocabulary_)
                stop_list = copy.copy(pre_trained.stop_words_)
                max_val = len(dict_)
                for k in cv.vocabulary_:
                    val = dict_.get(k, -1)
                    if val == -1:
                        # Unseen term: append to vocabulary and make sure
                        # it is no longer treated as a stop word.
                        # (Fix: set.discard returns None — the previous
                        # `existed = ...` assignment was meaningless.)
                        dict_[k] = max_val
                        stop_list.discard(k)
                        max_val += 1
                        new_freq.append(freq2[cv.vocabulary_[k]])
                    else:
                        # Known term: remap its new frequency onto the old
                        # vocabulary index.
                        remapped_freq[val] = freq2[cv.vocabulary_[k]]
                pre_trained.vocabulary_ = dict_
                pre_trained.stop_words_ = stop_list
                # Combine old and new document frequencies, then rebuild
                # idf with the smoothed formula log((N+1)/(1+df)) + 1.
                freq = freq + remapped_freq
                freq = np.hstack([freq, new_freq])
                self.TextTransformer.N_ = self.TextTransformer.N_ + N_
                freq = np.log((self.TextTransformer.N_ + 1) / (1 + freq)) + 1
                pre_trained.idf_ = freq
            else:
                # Train a new CountVectorizer in order to expand the
                # vocabulary of the old one (no idf terms to adjust).
                cv = CountVectorizer()
                pre_trained = self.TextTransformer.pipes[col][0]["model"]
                cv.set_params(**pre_trained.get_params())
                pipe_ = copy.deepcopy(self.TextTransformer.pipes[col][0])
                new_pipe = []
                for step in pipe_.steps:
                    if step[0] != 'model':
                        new_pipe.append(step)
                    else:
                        new_pipe.append(('model', cv))
                        break
                new_pipe = Pipeline(new_pipe)
                new_pipe.fit(self.TextTransformer.stringify_col(X_[col]))
                # Adjust vocabulary and stop-word list based on new data.
                dict_ = copy.copy(pre_trained.vocabulary_)
                stop_list = copy.copy(pre_trained.stop_words_)
                max_val = len(dict_)
                for k in cv.vocabulary_:
                    val = dict_.get(k, -1)
                    if val == -1:
                        dict_[k] = max_val
                        # Fix: drop the meaningless `existed = ...` binding.
                        stop_list.discard(k)
                        max_val += 1
                pre_trained.vocabulary_ = dict_
                pre_trained.stop_words_ = stop_list
            # Get transformed data in order to adjust the SVD matrix.
            svd_ = self.TextTransformer.pipes[col][1]
            if isinstance(svd_, CPUTruncatedSVD):
                X_transformed = self.TextTransformer.pipes[col][0].transform(
                    self.TextTransformer.stringify_col(X_[col])
                )
                if col in self.tf_idf:
                    # Combine the saved matrix with the new one, padding
                    # the old matrix with zero columns for new vocabulary.
                    newCols = X_transformed.shape[1] - self.tf_idf[col].shape[1]
                    if newCols > 0:
                        newCols = np.zeros((self.tf_idf[col].shape[0], newCols))
                        new_tf_idf = sc.sparse.hstack([self.tf_idf[col], newCols])
                    else:
                        new_tf_idf = self.tf_idf[col]
                    new_tf_idf = sc.sparse.vstack([new_tf_idf, X_transformed])
                    self.tf_idf[col] = new_tf_idf
                    # Fit SVD on the combined matrix.
                    new_svd = CPUTruncatedSVD()
                    new_svd.set_params(**svd_.get_params())
                    new_svd.fit(self.tf_idf[col])
                    # Replace old SVD matrix with the new one.
                    svd_.components_ = new_svd.components_
                    if append:
                        data_ = svd_.transform(self.tf_idf[col])
                        data_ = self.TextTransformer.pipes[col][2].transform(data_)
                        data_ = pd.DataFrame(
                            data_,
                            columns=self.TextTransformer.get_names(col, data_.shape[1]))
                        new_data.append(data_)
                else:
                    self.tf_idf[col] = X_transformed
                    # Train a new SVD to get a new transform matrix.
                    new_svd = CPUTruncatedSVD()
                    new_svd.set_params(**svd_.get_params())
                    new_svd.fit(X_transformed)
                    # Adjust the old transform matrix toward the new one by
                    # a step-sized gradient, then append columns for the
                    # newly-added vocabulary dimensions.
                    grad = svd_.components_ - new_svd.components_[:, :svd_.components_.shape[1]]
                    grad = self.step * grad
                    svd_.components_ = svd_.components_ - grad
                    svd_.components_ = np.hstack([
                        svd_.components_,
                        new_svd.components_[:, svd_.components_.shape[1]:]
                    ])
        if append:
            new_data = pd.concat(new_data, axis=1)
            if self.target is not None:
                y_ = np.hstack([self.target, y_])
            if self.save_path:
                joblib.dump({
                    "txtTransformer": self.TextTransformer,
                    "tf_idf": self.tf_idf,
                    "target": y_,
                }, self.save_path
                )
            return new_data, y_
        result = self.TextTransformer.transform(X.to_pandas())
    else:
        # First fit: record the corpus size and fit from scratch.
        self.TextTransformer.N_ = X.shape[0]
        result = self.TextTransformer.fit_transform(X.to_pandas())
    # Cache the per-column vectorizer output so later incremental updates
    # can stack new rows onto it, and persist the fitted state.
    X_ = X.to_pandas()
    self.tf_idf = {}
    for col in self.input_feature_names:
        self.tf_idf[col] = self.TextTransformer.pipes[col][0].transform(
            self.TextTransformer.stringify_col(X_[col])
        )
    if self.save_path:
        joblib.dump({
            "txtTransformer": self.TextTransformer,
            "tf_idf": self.tf_idf,
            "target": y_,
        }, self.save_path
        )
    return result