Code example #1
File: predict_model.py  Project: erees1/TextGen
import logging
import re

import numpy as np
import tensorflow as tf
from gensim.models import KeyedVectors

# Project-specific helpers (load_yaml, IntegerTokenizer, RemoveCharsTransformer,
# WhiteSpaceTokenizer, Tagger, Pipeline, Word2Vec, get_encoder_state,
# argmax_select, get_top_n) are assumed to be imported from the TextGen
# project's own modules.


class SeqInference():
    """Handles the inference loop for a single message"""
    def __init__(
        self,
        vocab_filepath,
        encoder_model,
        decoder_model,
        model_spec_file,
        data_spec_file,
        method='arg_max',
        beam_width=3,
        dictionary_dir=None,
        max_decoder_seq_length=28,
        verbose=0,
        pipeline='word2vec',
        word2vec_fpath='',
        **kwargs,
    ):

        self.data_spec_file = load_yaml(data_spec_file)
        self.model_spec_file = load_yaml(model_spec_file)
        self.encoder_model = encoder_model
        self.decoder_model = decoder_model
        self.method = method
        self.beam_width = beam_width
        self.max_decoder_seq_length = max_decoder_seq_length
        self.dictionary_dir = dictionary_dir
        self.verbose = verbose
        log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        logging.basicConfig(level=logging.INFO, format=log_fmt)
        self.logger = logging.getLogger(__name__)

        self._get_tags_from_spec()

        # Pipeline Elements
        self.int_tokenizer = IntegerTokenizer(vocab_filepath)
        rs = RemoveCharsTransformer(self.data_spec_file['punc_list'])
        ws = WhiteSpaceTokenizer()
        self.tg = Tagger()

        self.available_pipelines = ['integertokenizer', 'word2vec']
        self.pipe = Pipeline(steps=[('remove_chars', rs),
                                    ('white_space_tokenizer', ws)])

        if self.dictionary_dir is not None:
            # NOTE: the spelling transformer 'st' is not defined in this
            # snippet; it is assumed to be built from dictionary_dir.
            self.pipe.append(('spelling', st))

        # self.pipe.append(('tagger', tg))

        if pipeline == 'integertokenizer':
            self.pipe.append(('integer_tokenizer', self.int_tokenizer))

        elif pipeline == 'word2vec':
            word2vec = Word2Vec()
            word2vec.set_model(
                KeyedVectors.load_word2vec_format(word2vec_fpath))
            self.pipe.append(('word2vec', word2vec))

        else:
            raise KeyError(
                f'Unavailable pipeline specified, please choose one of {self.available_pipelines}'
            )

    def _log(self, message):
        if self.verbose > 1:
            self.logger.info(message)
        elif self.verbose == 1:
            print(message)

    def _get_tags_from_spec(self):
        self.START_TOKEN = self.data_spec_file['tokens']['START_TOKEN']
        self.END_TOKEN = self.data_spec_file['tokens']['END_TOKEN']
        self.START_TAG = self.data_spec_file['tags'][self.START_TOKEN]
        self.END_TAG = self.data_spec_file['tags'][self.END_TOKEN]

    def predict_response_from_text(self, message):
        reverse = self.model_spec_file['tf_dataset_params']['reverse_context']
        self._log(f'Tokenizing message: {message}')
        input_vectors = np.squeeze(self.process_message([message], reverse))
        output_tokens = self._predict_response_from_tokens(input_vectors)
        response = " ".join(
            self.int_tokenizer.inverse_transform(output_tokens))
        response = np.squeeze(self.tg.inverse_transform([response]))
        return response

    def process_message(self, input_string, reverse):
        """turn string into a tokenized array"""

        # Settings
        input_as_int = self.pipe.transform(input_string)
        if reverse:
            input_as_int = input_as_int[::-1]
        return np.asarray(input_as_int)

    def strip_tags_from_text(self, message):
        tag_match = re.compile('<[a-z]{3}>')
        message = tag_match.sub('', message).strip()
        message = re.sub(' +', ' ', message)
        return message

    def _predict_response_from_tokens(self, context):

        self._log(f'Sending context: {context} to encoder')
        # Encode the context (messages up to this point)
        encoder_states = get_encoder_state(self.encoder_model, context)
        # Initial value of the target sequence is the start token
        start_token = np.array(self.START_TOKEN)
        # The encoder states seed the decoder's initial state
        if self.method == 'arg_max':
            decoded_tokens = self._argmax_loop(start_token, encoder_states)
        elif self.method == 'beam_search':
            decoded_tokens = self._beam_search_loop(start_token,
                                                    encoder_states)
        else:
            raise ValueError(f'Unknown decoding method: {self.method}')

        return decoded_tokens

    def _argmax_loop(self, target_seq, states):
        stop_condition = False
        decoded_tokens = []

        while not stop_condition:
            output_tokens, states = self._predict_next_char(target_seq, states)

            # Collapse the output tokens to a single vector
            probs = tf.squeeze(output_tokens)
            sampled_index, p = argmax_select(probs)
            sampled_word = self.int_tokenizer.inverse_transform(sampled_index)
            sampled_vector = self.pipe.transform(sampled_word)

            target_seq = tf.convert_to_tensor([[sampled_vector]])

            # Exit condition: either hit max length or find stop character.
            if (sampled_index == self.END_TOKEN
                    or len(decoded_tokens) > self.max_decoder_seq_length):
                stop_condition = True
            else:
                decoded_tokens.append(sampled_index)

        return decoded_tokens

    def _beam_search_loop(self, initial_target_seq, encoder_states):
        beam_width = self.beam_width
        stop_condition = False
        decoded_tokens = []
        vocab_length = len(self.int_tokenizer.vocab)

        # log_prob_beam_seq tracks log P(seq) for each beam and is updated at each iteration
        log_prob_beam_seq = np.log(np.ones((beam_width, 1)))
        # log_prob_char_given_prev tracks the conditional log-probability of every
        # character given the previous one for each beam (log space turns products into sums)
        log_prob_char_given_prev = np.empty((beam_width, vocab_length))

        beam_seq = np.empty((beam_width, 1), dtype=np.int32)
        beam_seq[:, 0] = [initial_target_seq] * beam_width
        beam_has_ended = [False] * beam_width
        final_tokens = []
        final_log_probs = []

        first_char = True

        while not stop_condition:

            if first_char:
                decoder_output, states_values = self._predict_next_char(
                    np.asarray([initial_target_seq]), encoder_states)
                log_prob_char_given_prev[0] = np.log(
                    tf.squeeze(decoder_output).numpy())
                beam_states = [states_values] * beam_width

            else:
                for beam in range(beam_width):
                    if not beam_has_ended[beam]:
                        # Last character of the beam sequence is the input for the decoder
                        # Convert beam_seq which has integer words into vectors
                        prev_word = self.int_tokenizer.inverse_transform(
                            beam_seq[beam][-1])
                        prev_word_as_vectors = self.pipe.transform(prev_word)

                        decoder_input = np.asarray([prev_word_as_vectors])
                        decoder_output, states_values = self._predict_next_char(
                            decoder_input, beam_states[beam])
                        log_prob_char_given_prev[beam] = np.log(
                            tf.squeeze(decoder_output).numpy())
                        beam_states[beam] = states_values
                    else:
                        log_prob_char_given_prev[beam] = np.full(
                            vocab_length, -np.inf)

            # Probability of all characters
            if first_char:
                log_prob_seq = (log_prob_char_given_prev[0] +
                                log_prob_beam_seq[0])
                log_prob_seq = log_prob_seq.reshape((1, -1))
            else:
                log_prob_seq = log_prob_char_given_prev + log_prob_beam_seq
                assert log_prob_seq.shape == (beam_width, vocab_length)

            # Carry forward the top beam_width
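            # get_top_n (as used here) appears to return the surviving
            # (beam index, token index) pairs in top_n and their log-probabilities
            # in log_p: top_n[0] selects which beams carry on, top_n[1] holds the
            # token appended to each (inferred from usage; the helper is not shown)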
            top_n, log_p = get_top_n(log_prob_seq, beam_width)
            log_prob_beam_seq = log_p.reshape((-1, 1))

            # Add top_n to the beam_seq
            beam_states = [beam_states[i] for i in top_n[0]]
            beam_seq = beam_seq[top_n[0], :]
            beam_seq = np.hstack((beam_seq, top_n[1].reshape(beam_width, 1)))
            self._log(f'Current beam sequence \n {beam_seq}')

            for beam in range(beam_width):
                if (beam_seq[beam, -1] == self.END_TOKEN
                        or len(beam_seq[beam, :]) >= self.max_decoder_seq_length):
                    beam_has_ended[beam] = True
                    self._log(f'Appending {beam_seq[beam]} to final tokens')
                    if len(beam_seq[beam]) > 3:
                        final_tokens.append(np.array(beam_seq[beam]))
                        final_log_probs.append(
                            np.array(log_prob_beam_seq[beam]))
                    if len(final_tokens) >= beam_width:
                        stop_condition = True

            first_char = False
            # End of while loop

        return final_tokens[np.argmax(final_log_probs)]

    def _predict_next_char(self, target_seq, states_value):
        """Take a single input and lstm state and predict the next item and state"""
        output_tokens, h, c = self.decoder_model.predict([target_seq] +
                                                         states_value)
        states_value = [h, c]
        return output_tokens, states_value
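
A minimal usage sketch for the class above. The file paths, model artifacts, and
load_model calls are illustrative assumptions, not part of the snippet:

from tensorflow.keras.models import load_model

# Hypothetical artifacts produced by the project's training step.
encoder = load_model('models/encoder.h5')
decoder = load_model('models/decoder.h5')

inference = SeqInference(
    vocab_filepath='data/vocab.txt',
    encoder_model=encoder,
    decoder_model=decoder,
    model_spec_file='specs/model_spec.yaml',
    data_spec_file='specs/data_spec.yaml',
    method='beam_search',
    beam_width=3,
    pipeline='word2vec',
    word2vec_fpath='data/word2vec.txt',
    verbose=1,
)
print(inference.predict_response_from_text('how are you today'))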
Code example #2
    def fit_transform(self, X: dt.Frame, y: np.array = None, append=False):
        y_ = y
        new_data = []
        if self.loaded:
            X_ = X.to_pandas()
            N_ = len(X_)
            for col in self.input_feature_names:
                if self.TextTransformer.tf_idf:
                    # train a new TfidfVectorizer in order to expand the vocabulary of the old one and adjust its idf terms
                    cv = TfidfVectorizer()
                    pre_trained = self.TextTransformer.pipes[col][0]["model"]
                    cv.set_params(**pre_trained.get_params())
                    pipe_ = copy.deepcopy(self.TextTransformer.pipes[col][0])
                    new_pipe = []
                    for step in pipe_.steps:
                        if step[0] != 'model':
                            new_pipe.append(step)
                        else:
                            new_pipe.append(('model', cv))
                            break
                    new_pipe = Pipeline(new_pipe)
                    new_pipe.fit(self.TextTransformer.stringify_col(X_[col]))

                    freq2 = self.inverse_idf(cv.idf_, N_)

                    freq = self.inverse_idf(
                        pre_trained.idf_,
                        self.TextTransformer.N_
                    )

                    # adjust vocabulary and stop word list based on the new data
                    # adjust frequency terms and idf terms
                    new_freq = []
                    remapped_freq = np.zeros(len(freq))
                    dict_ = copy.copy(pre_trained.vocabulary_)
                    stop_list = copy.copy(pre_trained.stop_words_)
                    max_val = len(dict_)

                    for k in cv.vocabulary_:
                        val = dict_.get(k, -1)
                        if val == -1:
                            dict_[k] = max_val
                            stop_list.discard(k)
                            max_val += 1
                            new_freq.append(freq2[cv.vocabulary_[k]])
                        else:
                            remapped_freq[val] = freq2[cv.vocabulary_[k]]

                    pre_trained.vocabulary_ = dict_
                    pre_trained.stop_words_ = stop_list

                    freq = freq + remapped_freq
                    freq = np.hstack([freq, new_freq])

                    self.TextTransformer.N_ = self.TextTransformer.N_ + N_
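                    # Recompute smoothed IDF over the combined corpus; this is
                    # scikit-learn's smooth_idf formula: idf = ln((1 + N) / (1 + df)) + 1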
                    freq = np.log((self.TextTransformer.N_ + 1) / (1 + freq)) + 1
                    pre_trained.idf_ = freq

                else:
                    # train a new CountVectorizer in order to expand the vocabulary of the old one
                    cv = CountVectorizer()
                    pre_trained = self.TextTransformer.pipes[col][0]["model"]
                    cv.set_params(**pre_trained.get_params())
                    pipe_ = copy.deepcopy(self.TextTransformer.pipes[col][0])
                    new_pipe = []
                    for step in pipe_.steps:
                        if step[0] != 'model':
                            new_pipe.append(step)
                        else:
                            new_pipe.append(('model', cv))
                            break
                    new_pipe = Pipeline(new_pipe)
                    new_pipe.fit(self.TextTransformer.stringify_col(X_[col]))

                    # adjust vocabulary and stop word list based on the new data
                    dict_ = copy.copy(pre_trained.vocabulary_)
                    stop_list = copy.copy(pre_trained.stop_words_)
                    max_val = len(dict_)
                    for k in cv.vocabulary_:
                        val = dict_.get(k, -1)
                        if val == -1:
                            dict_[k] = max_val
                            stop_list.discard(k)
                            max_val += 1

                    pre_trained.vocabulary_ = dict_
                    pre_trained.stop_words_ = stop_list

                # get transformed data in order to adjust SVD matrix
                svd_ = self.TextTransformer.pipes[col][1]
                if isinstance(svd_, CPUTruncatedSVD):
                    X_transformed = self.TextTransformer.pipes[col][0].transform(
                        self.TextTransformer.stringify_col(X_[col])
                    )
                    if col in self.tf_idf:
                        # combine saved matrix with the new one
                        n_new_cols = X_transformed.shape[1] - self.tf_idf[col].shape[1]
                        if n_new_cols > 0:
                            padding = np.zeros((self.tf_idf[col].shape[0], n_new_cols))
                            new_tf_idf = sc.sparse.hstack([self.tf_idf[col], padding])
                        else:
                            new_tf_idf = self.tf_idf[col]
                        new_tf_idf = sc.sparse.vstack([new_tf_idf, X_transformed])
                        self.tf_idf[col] = new_tf_idf
                        # fit SVD on combined matrix
                        new_svd = CPUTruncatedSVD()
                        new_svd.set_params(**svd_.get_params())
                        new_svd.fit(self.tf_idf[col])

                        # replace old svd matrix with new one
                        svd_.components_ = new_svd.components_

                        if append:
                            data_ = svd_.transform(self.tf_idf[col])
                            data_ = self.TextTransformer.pipes[col][2].transform(data_)
                            data_ = pd.DataFrame(data_, columns=self.TextTransformer.get_names(col, data_.shape[1]))
                            new_data.append(data_)

                    else:
                        self.tf_idf[col] = X_transformed
                        # train new SVD to get new transform matrix
                        new_svd = CPUTruncatedSVD()
                        new_svd.set_params(**svd_.get_params())
                        new_svd.fit(X_transformed)

                        # adjust old transform matrix based on new one
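                        # self.step acts like a learning rate, interpolating the old
                        # SVD components toward those fitted on the new data only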
                        grad = svd_.components_ - new_svd.components_[:, :svd_.components_.shape[1]]
                        grad = self.step * grad
                        svd_.components_ = svd_.components_ - grad
                        svd_.components_ = np.hstack([
                            svd_.components_,
                            new_svd.components_[:, svd_.components_.shape[1]:]
                        ])

            if append:
                new_data = pd.concat(new_data, axis=1)
                if self.target is not None:
                    y_ = np.hstack([self.target, y_])

                if self.save_path:
                    joblib.dump({
                        "txtTransformer": self.TextTransformer,
                        "tf_idf": self.tf_idf,
                        "target": y_,
                    },
                        self.save_path
                    )
                return new_data, y_

            result = self.TextTransformer.transform(X.to_pandas())

        else:

            self.TextTransformer.N_ = X.shape[0]
            result = self.TextTransformer.fit_transform(X.to_pandas())
            X_ = X.to_pandas()
            self.tf_idf = {}
            for col in self.input_feature_names:
                self.tf_idf[col] = self.TextTransformer.pipes[col][0].transform(
                    self.TextTransformer.stringify_col(X_[col])
                )

        if self.save_path:
            joblib.dump({
                "txtTransformer": self.TextTransformer,
                "tf_idf": self.tf_idf,
                "target": y_,
            },
                self.save_path
            )
        return result
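
The incremental refit above hinges on recovering document frequencies from a fitted
TfidfVectorizer and re-deriving the smoothed IDF after the vocabulary grows. A
standalone sketch of that idea (the toy corpora and the inverse_idf helper are
illustrative assumptions, not part of the recipe above):

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

def inverse_idf(idf, n_docs):
    # Invert sklearn's smoothed IDF, idf = ln((1 + n) / (1 + df)) + 1,
    # to recover the per-term document frequencies df.
    return (1.0 + n_docs) / np.exp(idf - 1.0) - 1.0

old_docs = ["the cat sat", "the dog barked"]
new_docs = ["the parrot talked", "the cat purred"]

old_vec = TfidfVectorizer().fit(old_docs)
new_vec = TfidfVectorizer().fit(new_docs)

df_old = inverse_idf(old_vec.idf_, len(old_docs))
df_new = inverse_idf(new_vec.idf_, len(new_docs))

# Merge vocabularies: keep the old indices, append unseen terms at the end,
# and accumulate document frequencies for terms seen in both corpora.
vocab = dict(old_vec.vocabulary_)
df = list(df_old)
for term, j in new_vec.vocabulary_.items():
    if term in vocab:
        df[vocab[term]] += df_new[j]
    else:
        vocab[term] = len(vocab)
        df.append(df_new[j])

# Recompute the smoothed IDF over the combined corpus and patch the old model.
n_total = len(old_docs) + len(new_docs)
old_vec.vocabulary_ = vocab
old_vec.idf_ = np.log((1.0 + n_total) / (1.0 + np.asarray(df))) + 1.0
print(old_vec.transform(["the cat and the parrot"]).shape)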