Example #1
class KerasTokenizer():
    def __init__(self, vocab_size=None, oov_token="<OOV>"):
        self.vocab_size = vocab_size
        self.oov_token = oov_token

    @property
    def vocab(self):
        return self.tokenizer.word_index

    def fit(self, texts):
        self.tokenizer = Tokenizer(num_words=self.vocab_size,
                                   oov_token=self.oov_token)
        self.tokenizer.fit_on_texts(texts)

    def encode(self, text):
        if type(text) == str:
            return self.tokenizer.texts_to_sequences([text])[0]
        return self.tokenizer.texts_to_sequences(text)

    def decode(self, encoded_text):
        if not encoded_text:
            return ""
        if type(encoded_text[0]) == int:
            return self.tokenizer.sequences_to_texts([encoded_text])[0]
        return self.tokenizer.sequences_to_texts(encoded_text)

    def tokenize(self, text):
        if type(text) == str:
            return text.split()
        else:
            return [t.split() for t in text]
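
A minimal usage sketch for the wrapper above (hedged: it assumes the snippet's missing import is the Keras one, e.g. from tensorflow.keras.preprocessing.text import Tokenizer, and the example ids are illustrative since they depend on word frequencies):

tk = KerasTokenizer(vocab_size=100)
tk.fit(["the cat sat on the mat", "the dog sat"])
ids = tk.encode("the cat sat")       # e.g. [2, 4, 3]
print(tk.decode(ids))                # "the cat sat"
print(tk.vocab["<OOV>"])             # 1 -- the OOV token always takes index 1
print(tk.tokenize("the cat sat"))    # ['the', 'cat', 'sat']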
Example #2
class Model:
    def __init__(self, text):
        self.sliding_token_size = 15
        self.path = None
        self.model = None
        self.tokens = text.split()
        self.tokenizer = Tokenizer(filters='')
        self.tokenizer.fit_on_texts(self.tokens)
        self.x = numpy.zeros((len(self.tokens) - self.sliding_token_size, self.sliding_token_size))
        self.y = numpy.zeros((len(self.tokens) - self.sliding_token_size, 1))
        sequences = self.tokenizer.texts_to_sequences(self.tokens)
        for token_n in range(len(self.tokens) - self.sliding_token_size):
            for sliding_token_n in range(self.sliding_token_size):
                self.x[token_n][sliding_token_n] = sequences[token_n + sliding_token_n][0]
            self.y[token_n] = sequences[token_n + self.sliding_token_size][0]

    def save(self):
        pickle.dump(self.model, open(self.path + 'model.bin', 'wb'))
        pickle.dump(self.tokens, open(self.path + 'tokens.bin', 'wb'))
        pickle.dump(self.tokenizer, open(self.path + 'tokenizer.bin', 'wb'))

    def load(self):
        self.model = pickle.load(open(self.path + 'model.bin', 'rb'))
        self.tokens = pickle.load(open(self.path + 'tokens.bin', 'rb'))
        self.tokenizer = pickle.load(open(self.path + 'tokenizer.bin', 'rb'))

    def generate(self):
        text = random.choice(self.tokens)
        for _ in range(100):
            sequences = pad_sequences([self.tokenizer.texts_to_sequences([text])[0]], self.sliding_token_size)
            text += ' ' + self.tokenizer.sequences_to_texts([self.model.predict_classes(sequences)])[0]
        return text
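
A minimal usage sketch for the Model class above (hedged: it assumes numpy, pickle, random, Keras' Tokenizer and pad_sequences are imported in the original module, that the model was trained and saved elsewhere, and that the corpus/checkpoint paths below are hypothetical):

m = Model(open('corpus.txt').read())   # hypothetical corpus with more than 15 tokens
m.path = './checkpoints/'              # save()/load() concatenate this prefix with the file names
m.load()                               # restores model.bin / tokens.bin / tokenizer.bin written by save()
print(m.generate())                    # appends 100 predicted tokens to a random seed token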
Example #3
def get_result(max_length):
    x_test = get_text("data/processed" + "/X_test.txt")
    x_train = get_text("data/processed" + "/X_train.txt")
    y_test = np.loadtxt("data/processed" + "/y_test.txt", dtype=int)
    y_train = np.loadtxt("data/processed" + "/y_train.txt", dtype=int)
    # word index from 1
    tokenizer = Tokenizer(num_words=5000)
    tokenizer.fit_on_texts(x_train)
    x_train_cut_num = tokenizer.texts_to_sequences(x_train)
    x_test_cut_num = tokenizer.texts_to_sequences(x_test)
    x_train_cut_num_pad = pad_sequences(x_train_cut_num,
                                        padding="post",
                                        maxlen=max_length,
                                        value=4)
    x_test_cut_num_pad = pad_sequences(x_test_cut_num,
                                       padding="post",
                                       maxlen=max_length,
                                       value=4)
    x_train_cut_text = tokenizer.sequences_to_texts(x_train_cut_num_pad)
    x_test_cut_text = tokenizer.sequences_to_texts(x_test_cut_num_pad)

    nb_pipeline = Pipeline([('tfidf', TfidfVectorizer()),
                            ('clf',
                             MultinomialNB(fit_prior=True, class_prior=None))])
    nb_total = 0
    nb_result_list = []
    for i in range(y_test.shape[1]):
        nb_pipeline.fit(x_train_cut_text, y_train[:, i])
        nb_predict = nb_pipeline.predict(x_test_cut_text)
        nb_result_list.append(nb_predict)
        nb_total += np.sum(
            [y_test[j, i] == nb_predict[j] for j in range(y_test.shape[0])])
    nb_result_reshape = np.array(nb_result_list).reshape(
        y_test.shape[0], y_test.shape[1])
    total_num = y_test.shape[0] * y_test.shape[1]
    print("navie bayes accuracy: ")
    print(nb_total / total_num)
    print("F1 score: ")
    print(
        precision_recall_fscore_support(y_test,
                                        nb_result_reshape,
                                        average='macro'))
    print("roc score: ")
    print(roc_auc_score(y_test, nb_result_reshape))
Example #4
class Preprocessor:
    def __init__(self, cache_path=None, stop_words=None, **extra):
        if cache_path and os.path.exists(cache_path):
            with open(cache_path, 'r') as f:
                self._tk = tokenizer_from_json(f.read())
        else:
            self._tk = Tokenizer(lower=True, **extra)

        self._cache_path = cache_path

    def fit(self, data):
        self._tk.fit_on_texts(data)

    def save(self):
        filename = self._cache_path
        with open(filename, 'w') as f:
            f.write(self._tk.to_json())

    def transform(self, data: pd.Series, truncate: Union[str, int] = 'median'):
        """Transform a list of Series of texts into a list of Series of vectors"""
        seq = self._tk.texts_to_sequences(data)

        lens = [len(vec) for vec in seq]
        logging.info(
            f'median {np.median(lens)}, mean {np.mean(lens)}, max {np.max(lens)}, min {np.min(lens)}'
        )

        if truncate == 'median':
            text_len = int(np.median(lens))
        else:
            text_len = truncate

        logging.info(f'Transforming texts into vectors with {text_len} size')

        return pd.Series(
            pad_sequences(seq, padding='post', maxlen=text_len).tolist())

    def to_text(self, data):
        """Transform a vector back to text

        Arguments:
            data {list} -- ndarray or pd.Series
        """

        return self._tk.sequences_to_texts(data)
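
A minimal usage sketch for the Preprocessor above (hedged: it assumes the snippet's imports of pandas, numpy, logging, Tokenizer, tokenizer_from_json and pad_sequences, and the cache file name is hypothetical):

prep = Preprocessor(cache_path='tokenizer.json')
corpus = pd.Series(["first document text", "second slightly longer document text"])
prep.fit(corpus)
vectors = prep.transform(corpus, truncate='median')   # Series of fixed-length id lists
print(prep.to_text([vectors.iloc[0]]))                # back to lowercased text (padding ids are dropped)
prep.save()                                           # writes the tokenizer JSON to cache_path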
Example #5
    def build_co_occurence(self, word_index, corpus, window_size):

        # Cleaning the corpus
        tk = Tokenizer()
        tk.fit_on_texts(corpus)
        corpus = tk.texts_to_sequences(corpus)
        corpus = tk.sequences_to_texts(corpus)

        vocab_size = len(word_index) + 1

        idx_to_word = {word_index[word]:word for word in word_index if word in word_index}

        # Collecting indices as a sparse matrix
        self.cooccurences = sparse.lil_matrix((vocab_size, vocab_size), dtype = np.float64)
        print(f"Shape of coocc = {self.cooccurences.shape}")
        # Get the tokenized sequence * TODO implement with tokenizer
        for i, line in enumerate(corpus):

            # TODO add progress bar
            print(f"\rForming the Co-Occurence Matrix : {(100*(i+1 )/len(corpus)):0.2f}%", end = "")
            sys.stdout.flush()

            tokens = line.strip().split()
            token_ids = [word_index[word.lower()] for word in tokens if word.lower() in word_index]

            # extracting context words to the left

            for center_i, center_id in enumerate(token_ids):
                context_ids = token_ids[max(0, center_i - window_size): center_i]
                contexts_len = len(context_ids)

                # Adding to the coocc matrix

                for left_i, left_id in enumerate(context_ids):
                    dist = contexts_len - left_i
                    inc = 1/float(dist)

                    self.cooccurences[center_id, left_id] += inc
                    self.cooccurences[left_id, center_id] += inc
        
        print()
        print(f"Generated co-occurence matrix of shape {self.cooccurences.shape}")
        return self.cooccurences
Example #6
        seed_text_rtl = " ".join(word for word in ngram[1])
        print(seed_text_ltr, "->", current_word, "->", seed_text_rtl)

        token_list = tokenizer.texts_to_sequences([seed_text_ltr])[0]
        token_list = pad_sequences([token_list],
                                   maxlen=max_sequence_len - 1,
                                   padding='pre')

        token_list_rev = tokenizer.texts_to_sequences([seed_text_rtl])[0]
        token_list_rev = pad_sequences([token_list_rev],
                                       maxlen=max_sequence_len - 1,
                                       padding='pre')

        predicted_id = np.argmax(model.predict([token_list, token_list_rev]),
                                 axis=-1)
        predicted_word = tokenizer.sequences_to_texts([predicted_id])[0]
        print(predicted_word)

        predicted_probs = model.predict([token_list, token_list_rev])
        predicted_best = np.argsort(-predicted_probs, axis=-1)[0][:4500]

        suggestions = []
        correct = None

        for prob in predicted_best:
            output_word = tokenizer.sequences_to_texts([[prob]])[0]
            ed = nltk.edit_distance(current_word, output_word)

            if ed == 0:
                print("I got this one; it seems correct -->", current_word,
                      "=", output_word)
    def map_to_string(cls, input_vector: List, tokenizer: Tokenizer) -> List:
        '''
        Map a given vector to an unpadded string.
        '''

        return tokenizer.sequences_to_texts(input_vector)
Example #8
nlp = spacy.load('en_core_web_lg')

with open('./data/great_expectation.txt', 'r', encoding='utf-8-sig') as f:
    text_chunks = f.read().replace('\n', ' ').split('.')

LINES = 1000
LINES = len(text_chunks)

for i in tqdm(range(LINES)):
    #line = tokenize_line(text_chunks[i])
    line = text_chunks[i].strip().lower()
    line = re.sub(F, ' ', line)
    if len(line) <= 0:
        continue
    line = tokenize_line(line)
    tokenizer.fit_on_texts(line)
    seq = tokenizer.texts_to_sequences(line)
    txt = tokenizer.sequences_to_texts(seq)
    toks = nlp(' '.join(txt))
    for s, t in zip(seq, toks):
        print(t, t.pos, s[0])

    exit()
with open('./tokenizer.pickle', 'wb') as f:
    pickle.dump(tok, f)

print('word count: ', len(tok.word_counts))
print(tok.word_counts)
print('count = ', count)
Example #9
                  filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n' + additional_filters,
                  lower=True,
                  split=" ",
                  char_level=False,
                  oov_token="UNK",
                  document_count=0)
token.fit_on_texts(sentence_data)

tokenizer_config = token.get_config()
print(tokenizer_config.keys())

#print(tokenizer_config["word_index"])

#print("\n\n\n\n\n\n\n")

import json
word_counts = json.loads(tokenizer_config['word_counts'])
#print(word_counts)
print(word_counts["the"])

index_word = json.loads(tokenizer_config['index_word'])
word_index=json.loads(tokenizer_config["word_index"])
#print(sentence_data)

print(sentence_data[:5])

sentence_seq = token.texts_to_sequences(sentence_data)
print(sentence_seq[0:5])

senetn = token.sequences_to_texts(sentence_seq)
print(senetn[:5])
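
Since this snippet inspects get_config(), here is a hedged companion sketch of the JSON round trip (to_json() and tokenizer_from_json are the standard Keras pair for persisting a fitted Tokenizer):

from tensorflow.keras.preprocessing.text import tokenizer_from_json

restored = tokenizer_from_json(token.to_json())
assert restored.texts_to_sequences(sentence_data[:2]) == token.texts_to_sequences(sentence_data[:2])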
Example #10
class Pungen:
    def __init__(self, **kwargs):
        self.filepath = kwargs.get('filepath')
        self.embedding_layer = None

    def _parse_corpus(self, min_seq_len, filepath):
        print('Indexing word vectors.')
        self.texts = []
        with open(filepath, encoding='utf-8') as fp:
            for line in fp:
                if line == "\n":
                    continue
                self.texts.append(line)

        self.tokenizer = Tokenizer(num_words=MAX_NUM_WORDS,
                                   filters=TOKEN_FILTER)
        self.tokenizer.fit_on_texts(self.texts)
        self.sequences = self.tokenizer.texts_to_sequences(self.texts)
        self.sequences = [x for x in self.sequences if len(x) >= min_seq_len]
        self.word_index = self.tokenizer.word_index
        print('Found %s unique tokens.' % len(self.word_index))

        print('Found %s texts.' % len(self.sequences))

    def prepare_emb(self, emb_dim, input_length):
        print('Indexing word vectors.')

        emb_name = 'glove.6B.' + str(emb_dim) + "d.txt"

        self.embeddings_index = {}
        with open(os.path.join(GLOVE_DIR, emb_name), encoding='utf-8') as f:
            for line in f:
                word, coefs = line.split(maxsplit=1)
                coefs = np.fromstring(coefs, 'f', sep=' ')
                self.embeddings_index[word] = coefs

        print('Found %s word vectors.' % len(self.embeddings_index))
        # prepare embedding matrix
        num_words = MAX_NUM_WORDS
        self.embedding_matrix = np.zeros((num_words, emb_dim))
        for word, i in self.word_index.items():
            if i >= num_words:
                continue
            embedding_vector = self.embeddings_index.get(word)
            if embedding_vector is not None:
                # words not found in embedding index will be all-zeros.
                self.embedding_matrix[i] = embedding_vector

        # load pre-trained word embeddings into an Embedding layer
        # note that we set trainable = False so as to keep the embeddings fixed
        self.embedding_layer = Embedding(num_words,
                                         emb_dim,
                                         embeddings_initializer=Constant(
                                             self.embedding_matrix),
                                         input_length=input_length,
                                         trainable=False)

    def check_generator(self):
        texts = self.tokenizer.sequences_to_texts(self.sequences)

        if len(texts) != len(self.texts):
            print("Different sizes of texts")
            return

        filter = set(TOKEN_FILTER)

        for i in range(len(texts)):
            if texts[i].lower() != self.texts[i][:-1].lower():

                if any((c in filter) for c in self.texts[i][:-1].lower()):
                    continue

                print(texts[i], self.texts[i][:-1])
                print(self.texts[i][:-1].lower())
                print("Tokenizer failed to tokenize properly!")
                return

        print("Tokenizer check was succesfull!")

    def form_pun(self, eval_path):
        retrieve = Retrieve(sentence_path=TEXT_DATA_DIR + TEXT_DATA,
                            pun_path=PUN_DATA_DIR + PUN_DATA)
        (pun, sentence, score) = retrieve.retrieve()

        if not sentence:
            print("No sentence with word {} was found. Exiting...".format(
                pun[1]))
            raise Exception()

        text = word_tokenize(sentence)
        tokenized = nltk.pos_tag(text)

        print(tokenized)
        print(sentence, pun[0], pun[1])
        pre = self.tokenizer.texts_to_sequences([sentence])
        wp = self.tokenizer.texts_to_sequences([pun[0]])
        wa = self.tokenizer.texts_to_sequences([pun[1]])

        if (not wa[0]) or (not wp[0]):
            print(
                "The pair of pun and word does not exist in the parsed corpus. Exit..."
            )
            raise Exception()

        index_wa = -1
        for seq in pre[0]:
            index_wa = index_wa + 1
            if seq == wa[0][0]:
                pre[0][index_wa] = wp[0][0]
                break

        wordsimilarity = WordSimilarity()
        wordsimilarity.word2vec()
        wordsimilarity.load()

        try_limit = 5
        try_count = 0
        index_topic = 0
        while True:
            try:
                topic_word = None
                for i in range(index_topic, len(tokenized)):
                    (word, pos) = tokenized[i]
                    if (pos == 'NNP'):
                        topic_word = "man"
                        print(word, pos)
                        index_topic = index_topic + 1
                        break

                    if (pos == 'NN') or (pos == 'PRP') or (pos == 'NNS') or (
                            pos == 'PRP$'):
                        topic_word = word
                        print(word, pos)
                        index_topic = index_topic + 1
                        break
                    index_topic = index_topic + 1

                result = wordsimilarity.getSimilar([topic_word, pun[0]],
                                                   [pun[1]], 10)
                other_result = wordsimilarity.getSimilar([pun[0]], [], 10)

                break
            except KeyError:
                print("Word {} is not in vocabulary, try with the next one".
                      format(topic_word))
                try_count = try_count + 1
                if try_limit == try_count:
                    print("Limit of trys has been reached. Exit...")
                    raise Exception()

        eval_surprisal = Evaluate()
        eval_surprisal.load_model(eval_path)

        finals = []
        mean_amalgam = 0
        for (word, prob) in result:
            swap = self.tokenizer.texts_to_sequences([word])

            context_window = 2
            surprise = eval_surprisal.compute_surpisal(
                sentence=pre[0],
                pun_word=wa[0][0],
                pun_alternative=wp[0][0],
                context_window=context_window)
            mean_amalgam = mean_amalgam + surprise
            print(surprise)

            pre[0][index_topic] = swap[0][0]

            post_simple = self.tokenizer.sequences_to_texts([pre[0]])
            print(post_simple)

            pre[0][index_topic + 1] = 0
            if index_topic >= 2:
                pre[0][index_topic - 1] = 0

            post_smoothing = self.dac.inference(pre[0])
            post_smoothing = self.tokenizer.sequences_to_texts(
                post_smoothing.tolist())
            finals.append(post_smoothing)
            print(post_smoothing)
        print(finals)
        print(mean_amalgam / 10)

        other_finals = []
        mean_similar = 0
        for (word, prob) in other_result:
            swap = self.tokenizer.texts_to_sequences([word])

            context_window = 2
            surprise = eval_surprisal.compute_surpisal(
                sentence=pre[0],
                pun_word=wa[0][0],
                pun_alternative=wp[0][0],
                context_window=context_window)
            mean_similar = mean_similar + surprise
            print(surprise)

            pre[0][index_topic] = swap[0][0]

            post_simple = self.tokenizer.sequences_to_texts([pre[0]])
            print(post_simple)

            pre[0][index_topic + 1] = 0
            if index_topic >= 2:
                pre[0][index_topic - 1] = 0

            post_smoothing = self.dac.inference(pre[0])
            post_smoothing = self.tokenizer.sequences_to_texts(
                post_smoothing.tolist())
            other_finals.append(post_smoothing)
            print(post_smoothing)
        print(other_finals)
        print(mean_similar / 10)

        finals.extend(other_finals)
        return finals

    def train_predict_model(self, model_params):
        predict_word = WordPredict(max_len=MAX_LEN,
                                   max_words=MAX_NUM_WORDS,
                                   emb_layer=self.embedding_layer)
        predict_word.build_model(**model_params)
        predict_word.compile_model(model_params)

        generator = Generator(sequences=self.sequences,
                              batch_size=PREDICT_BS,
                              max_words=MAX_NUM_WORDS,
                              max_len=MAX_LEN,
                              split=PREDICT_SPLIT)

        predict_word.train(generator, PREDICT_BS, PREDICT_SPLIT,
                           PREDICT_EPOCHS)
        return predict_word

    def load_predict_model(self, path):
        predict_word = load_model(path)
        return predict_word

    def train_dac_model(self, model_params):
        dac = DAC()
        smoother_model = dac.build_model(hidden_sizes=[64, 64],
                                         seq_len=50,
                                         no_words=40000,
                                         emb_layer=self.embedding_layer,
                                         lr=0.01)
        generator = Generator(sequences=self.sequences,
                              batch_size=SMOOTH_BS,
                              max_words=MAX_NUM_WORDS,
                              max_len=MAX_LEN,
                              split=SMOOTH_SPLIT)
        smoother_model = dac.train(generator,
                                   full_model=smoother_model,
                                   model_params=model_params,
                                   bs=SMOOTH_BS,
                                   split=SMOOTH_SPLIT,
                                   pretrain_epochs=4,
                                   epochs=SMOOTH_EPOCHS)

    def run(self, predict_path, smoother_path, eval_path):
        self._parse_corpus(MIN_SEQ_LEN, TEXT_DATA_DIR + TEXT_DATA)
        self.prepare_emb(EMBEDDING_DIM, MAX_LEN)

        predict_model = None
        if predict_path is None:
            model_params = {
                'lstm': [16],
                'merge_layer': 'concat',
                'dense': {
                    'size': [64, 32],
                    'act': 'elu',
                    'dropout': 0
                },
                'optimizer': 'adam',
                'lr': 0.0005
            }
            predict_model = self.train_predict_model(model_params)
        else:
            pass
            #predict_model = self.load_predict_model(predict_path)

        #smoother_model = None
        if smoother_path is None:
            model_params = {'size': [64, 64], 'lr': 0.01}
            #smoother_model = self.train_dac_model(model_params)
        else:
            self.dac = DAC()
            self.dac.load_model(smoother_path)

        #GENERATE PUN
        while True:
            try:
                final = self.form_pun(eval_path)
                break
            except Exception:
                pass

        print(final)
class DDTokenizer:
    def __init__(self, num_words, oov_token='<UNK>'):
        self.tokenizer = Tokenizer(num_words=num_words,
                                   oov_token=oov_token,
                                   filters='!"#$%&*+,-./:;<>?\\^_`{|}~\t\n',
                                   char_level=True,
                                   lower=False)
        self.has_trained = False

        self.pad_type = 'post'
        self.trunc_type = 'post'

        # The encoded data
        self.word_index = {}

    def fit(self, train_data):
        # Get max training sequence length
        print("Training Tokenizer...")
        self.tokenizer.fit_on_texts(train_data)
        self.has_trained = True
        print("Done training...")

        # Get our training data word index
        self.word_index = self.tokenizer.word_index

    def encode(self,
               data,
               use_padding=True,
               padding_size=None,
               normalize=False):
        # Encode training data sentences into sequences
        train_sequences = self.tokenizer.texts_to_sequences(data)

        # Get max training sequence length if there is none passed
        if padding_size is None:
            maxlen = max([len(x) for x in train_sequences])
        else:
            maxlen = padding_size

        if use_padding:
            train_sequences = pad_sequences(train_sequences,
                                            padding=self.pad_type,
                                            truncating=self.trunc_type,
                                            maxlen=maxlen)

        if normalize:
            train_sequences = np.multiply(1 / len(self.tokenizer.word_index),
                                          train_sequences)

        return train_sequences

    def pad(self, data, padding_size=None):
        # Get max training sequence length if there is none passed
        if padding_size is None:
            padding_size = max([len(x) for x in data])

        padded_sequence = pad_sequences(data,
                                        padding=self.pad_type,
                                        truncating=self.trunc_type,
                                        maxlen=padding_size)

        return padded_sequence

    def decode(self, array):
        assert self.has_trained, "Train this tokenizer before decoding a string."
        return self.tokenizer.sequences_to_texts(array)

    def test(self, string):
        encoded = list(self.encode(string)[0])
        decoded = self.decode(self.encode(string))

        print("\nEncoding:")
        print("{original} -> {encoded}".format(original=string[0],
                                               encoded=encoded))
        print("\nDecoding:")
        print("{original} -> {encoded}".format(original=encoded,
                                               encoded=decoded[0].replace(
                                                   " ", "")))

    def get_info(self):
        return self.tokenizer.index_word
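
A minimal usage sketch for DDTokenizer (hedged: it assumes the Keras Tokenizer/pad_sequences and numpy imports used above, and the example ids are illustrative since they depend on character frequencies):

dd = DDTokenizer(num_words=100)
dd.fit(["hello world", "hold the door"])
seqs = dd.encode(["hello"], use_padding=False)
print(seqs)                # e.g. [[3, 2, 4, 4, 1]] -- char-level ids
print(dd.decode(seqs))     # ['h e l l o'] -- decoded characters come back space-separated
dd.test(["door"])          # prints an encode/decode round trip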

if __name__ == '__main__':
    shakespeare_url = "https://homl.info/shakespeare"
    filepath = keras.utils.get_file("shakespeare.txt", shakespeare_url)
    with open(filepath) as f:
        shakespeare_text = f.read()

    # Then Encode char to integer
    # Use Tokenizer class: allow to vectorize text corpus by tuning each text to sequence of integer
    # or into a vector
    tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
    tokenizer.fit_on_texts([shakespeare_text])

    tokenizer.texts_to_sequences(["First"])
    tokenizer.sequences_to_texts([[20, 6, 9, 8, 3]])

    max_id = len(tokenizer.word_index)  # number of distinct chars
    # dataset_size = tokenizer.document_count  # total number of chars

    [encoded] = np.array(tokenizer.texts_to_sequences([shakespeare_text])) - 1
    dataset_size = encoded.shape[0]
    train_size = dataset_size * 90 // 100
    # Slice for get 90% to dataset
    a = encoded[:train_size]
    dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

    n_steps = 100
    window_length = n_steps + 1  # target = input shifted 1 character ahead
    # Window method create several windows with length = 101, 1st window contain 0 -> 100
    # second one contain 1-> 101, then flatten all of window
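    # A sketch of the windowing step described above, using standard tf.data calls
    # (the shuffle buffer and batch size here are illustrative assumptions, not values
    # taken from the original snippet):
    dataset = dataset.window(window_length, shift=1, drop_remainder=True)
    dataset = dataset.flat_map(lambda window: window.batch(window_length))
    dataset = dataset.shuffle(10000).batch(32)
    dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))  # inputs, targets shifted by one char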
Example #13
class CrimData:
    def __init__(self, args):
        # embeddings model
        self.w2v = args['w2v']

        # training tuples
        train = args['train']
        # test tuples
        test = args['test']
        # validation tuples
        validation = args['validation']
        # synonyms
        self.synonyms = args['synonyms']

        # if set to -1 then we will use full vector space vocab
        # otherwise use indicated size
        self.limited_vocab_n = args['limited_vocab_n']
        if self.limited_vocab_n > -1:
            print("Creating limited vocabulary of %d" % (self.limited_vocab_n))
            # collect words for exercise
            flat_synonym = [word for v in self.synonyms.values() for word in v]
            hyponyms = list(set([x for x, y in train + test + validation]))
            hypernyms = list(set([y for x, y in train + test + validation]))

            # dataset set vocab
            vocab = list(set(hyponyms + hypernyms + flat_synonym))
            vocab_len = len(vocab)
            print("Dataset vocabulary size is %d" % (vocab_len))
            model_words = list(self.w2v.vocab.keys())
            # sample words from vector space; sample more words than requested to handle collisions with dataset words
            random_words = np.random.choice(model_words,
                                            (self.limited_vocab_n + 10000),
                                            replace=False)
            vocab = vocab + [
                w for w in random_words.tolist() if w not in vocab
            ][:self.limited_vocab_n - vocab_len]
            print("Truncated vocab length is %d" % (len(vocab)))
        else:
            # choose all words in vector space
            vocab = list(self.w2v.vocab.keys())

        # create tokenizer from embeddings model
        self.tokenizer = Tokenizer(filters='', lower=False)
        # fit on vocab
        self.tokenizer.fit_on_texts(vocab)
        print("Vocab size is %d words" % (len(self.tokenizer.index_word)))
        # initialise negative word sampler
        print("Initialising negative sampler")
        self.negative_sampler = make_sampler(
            list(self.tokenizer.word_index.values()))
        print("Tokenising all dataset tuples")
        # tokenize dataset -> convert to numbers which will serve as embeddings lookup keys
        self.all_data_token = self.tokenizer.texts_to_sequences(
            [[x, y] for x, y in train + test + validation])

        # create hypernym dictionary lookup
        self.hypernym_id_lookup = defaultdict(list)
        for x, y in self.all_data_token:
            self.hypernym_id_lookup[x].append(y)
        # disable default factory
        self.hypernym_id_lookup.default_factory = None

        print("Creating embeddings matrix")
        # create embeddings matrix
        self.embeddings_matrix = np.zeros(
            (len(self.tokenizer.index_word) + 1, 300))
        for k, v in self.tokenizer.index_word.items():
            self.embeddings_matrix[k] = self.w2v[v]
            # vectors should already be normalised
            #self.embeddings_matrix[k] /= np.linalg.norm(emb_matrix[k])
        print("Done!")

    # get list of padded synonyms
    def sample_synonyms(self, word_id, sample_length):
        # convert word_id to word to look for in synyony dictionary
        word = self.tokenizer.index_word[word_id]

        if word in self.synonyms:
            _syn = self.synonyms[word]
        else:
            _syn = []

        # convert list to embeddings index array
        syn_list = np.asarray(self.tokenizer.texts_to_sequences([_syn])[0])
        result = np.asarray([])
        # if we have enough synonyms, we can randomly sample length-1 from list and add the hyponym itself to
        # the list
        if (sample_length > 1 and len(syn_list) >= (sample_length - 1)):
            result = np.random.choice(syn_list,
                                      sample_length - 1,
                                      replace=False)
            result = np.append(result, word_id)
        # otherwise, we pick all synyonyms and pad the sequences to match model fixed-input
        else:
            result = np.append(syn_list, word_id)
            result = pad_sequences([result],
                                   sample_length,
                                   padding='post',
                                   value=word_id)

        # we're expecting 1-D vector
        return result.flatten()

    def get_negative_random(self, word_id, neg_count):
        neg_samples = []
        while len(neg_samples) < neg_count:
            tmp_neg = next(self.negative_sampler)
            if tmp_neg not in self.hypernym_id_lookup[word_id]:
                neg_samples.append(tmp_neg)

        return neg_samples

    def get_augmented_batch(self, query_batch, neg_count, syn_count):

        # create synonym equivalent in ids, prepending the hyponym to the list of synonyms
        query_input = np.zeros((len(query_batch) * (neg_count + 1), 1),
                               dtype='int32')
        hyper_input = np.zeros((len(query_batch) * (neg_count + 1), 1),
                               dtype='int32')
        synonym_input = np.zeros(
            (len(query_batch) * (neg_count + 1), syn_count), dtype='int32')
        y_input = np.zeros(len(query_batch) * (neg_count + 1))

        for idx, (query, hyper) in enumerate(query_batch):
            query_input[idx * (neg_count + 1)] = np.asarray(query)
            hyper_input[idx * (neg_count + 1)] = np.asarray(hyper)
            synonym_input[idx * (neg_count + 1)] = self.sample_synonyms(
                query, syn_count)
            y_input[idx * (neg_count + 1)] = 1

            if neg_count > 0:
                negatives = self.get_negative_random(word_id=query,
                                                     neg_count=neg_count)
                for m, neg in enumerate(negatives):
                    query_input[(idx * (neg_count + 1)) +
                                (m + 1)] = np.asarray(query)
                    hyper_input[(idx * (neg_count + 1)) +
                                (m + 1)] = np.asarray(neg)
                    synonym_input[(idx * (neg_count + 1)) +
                                  (m + 1)] = self.sample_synonyms(
                                      query, syn_count)

        return query_input, hyper_input, synonym_input, y_input

    def token_to_words(self, dataset):
        _q = self.tokenizer.sequences_to_texts(dataset[:, 0].reshape(-1, 1))
        _h = self.tokenizer.sequences_to_texts(dataset[:, 1].reshape(-1, 1))

        return list(zip(_q, _h))
Example #14
    # Tokenizer
    X_tokenizer = Tokenizer(filters=args.filters,
                            lower=args.lower,
                            char_level=args.char_level,
                            oov_token='<UNK>')
    X_tokenizer.fit_on_texts(X_train)
    vocab_size = len(X_tokenizer.word_index) + 1  # +1 for padding token
    config.logger.info(f"→ vocab_size: {vocab_size}")

    # Convert texts to sequences of indices
    original_text = X_train[0]
    X_train = np.array(X_tokenizer.texts_to_sequences(X_train))
    X_val = np.array(X_tokenizer.texts_to_sequences(X_val))
    X_test = np.array(X_tokenizer.texts_to_sequences(X_test))
    preprocessed_text = X_tokenizer.sequences_to_texts([X_train[0]])[0]
    config.logger.info("→ Text to indices:\n"
                       f"  (raw) → {original_text}\n"
                       f"  (preprocessed) → {preprocessed_text}\n"
                       f"  (tokenized) → {X_train[0]}")

    # Label encoder
    y_tokenizer = LabelEncoder()
    y_tokenizer = y_tokenizer.fit(y_train)
    classes = y_tokenizer.classes_
    config.logger.info("→ classes:\n" f"  {classes}")

    # Convert labels to tokens
    class_ = y_train[0]
    y_train = y_tokenizer.transform(y_train)
    y_val = y_tokenizer.transform(y_val)
Example #15
# sequences greater than 100 in length will be truncated
MAX_SEQ_LENGTH = 100
X_padded = pad_sequences(X_encoded,
                         maxlen=MAX_SEQ_LENGTH,
                         padding="pre",
                         truncating="post")
Y_padded = pad_sequences(Y_encoded,
                         maxlen=MAX_SEQ_LENGTH,
                         padding="pre",
                         truncating="post")
# print the first sequence
print(X_padded[0], "\n" * 3)
print(Y_padded[0])
X, Y = X_padded, Y_padded

Y = to_categorical(Y)

TEST_SIZE = 0.10
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=TEST_SIZE,
                                                    random_state=4)
# Re-evaluate the model
# loss, acc = model.evaluate(X_test,Y_test, verbose=2)

input = X_padded[0].reshape((1, 100))

print(tag_tokenizer.sequences_to_texts(np.argmax(model.predict(input),
                                                 axis=2)))
print(tag_tokenizer.sequences_to_texts(Y_padded)[0])
Example #16
def vschat_service(request):
    # after receiving the text input
    if request.method == 'POST':

        # get input1, load the model, and obtain the label and query
        input1 = request.POST['input1']

        okt = Okt()

        max_len = 40

        vocab_size = 515
        tokenizer = Tokenizer()

        with open('./static/word_dict_ver03.json') as json_file:
            word_index = json.load(json_file)
            tokenizer.word_index = word_index

        # print(tokenizer.word_index)

        tokenized_sentence = []
        temp_X = okt.morphs(input1, stem=True)  # tokenization
        tokenized_sentence.append(temp_X)
        print(tokenized_sentence)

        input_data = tokenizer.texts_to_sequences(tokenized_sentence)
        print(input_data)

        input_data = pad_sequences(input_data, maxlen=max_len)  # padding

        loaded_model = load_model('./static/best_model_ver_relu_epc500.h5')
        prediction = loaded_model.predict(input_data)
        print(prediction)
        print("label: ", np.argmax(prediction[0]))

        label = str(np.argmax(prediction[0]))

        if label == '1':
            query = "select * from stepcountData where saved_time BETWEEN date('now', '-7 days', 'localtime') AND date('now', 'localtime');"
        elif label == '2':
            query = "select * from stepcountData where saved_time BETWEEN date('now', '-35 days',  'localtime') AND date('now', 'localtime');"
        elif label == '3':
            query = "select * from stepcountData where saved_time BETWEEN date('now', '-4 months','start of month', 'localtime') AND date('now', '+1 days', 'localtime');"
        else:

            with open('./static/tokenizer_for_attention.json') as f:
                data = json.load(f)
                tokenizer = tokenizer_from_json(data)

            # build the model
            model = Seq2seq(sos=tokenizer.word_index['\t'],
                            eos=tokenizer.word_index['\n'])

            model.load_weights("./static/attention_ckpt/attention_ckpt")

            # Implement algorithm test
            @tf.function
            def test_step(model, inputs):
                return model(inputs, training=False)

            tmp_seq = [" ".join(okt.morphs(input1))]
            print("tmp_seq : ", tmp_seq)

            test_data = list()
            test_data = tokenizer.texts_to_sequences(tmp_seq)
            print("tokenized data : ", test_data)

            prd_data = tf.keras.preprocessing.sequence.pad_sequences(
                test_data, value=0, padding='pre', maxlen=128)

            prd_data = tf.data.Dataset.from_tensor_slices(prd_data).batch(
                1).prefetch(1024)

            for seq in prd_data:
                prediction = test_step(model, seq)

                predicted_seq = tokenizer.sequences_to_texts(
                    prediction.numpy())
                print(predicted_seq)
                print("predict tokens : ", prediction.numpy())

            predicted_seq = str(predicted_seq[0]).replace(" _ ", "_")
            predicted_seq = predicted_seq.replace("e (", "e(")
            predicted_seq = predicted_seq.replace("' ", "'")
            predicted_seq = predicted_seq.replace(" '", "'")
            predicted_seq = predicted_seq.replace(" - ", "-")
            predicted_seq = predicted_seq.replace("+ ", "+")
            predicted_seq = predicted_seq.replace("- ", "-")
            print(predicted_seq)
            query = "select * from stepcountData where " + predicted_seq + ";"

        # if not legend_value or xValue or yValue or response:
        legend_value.clear()
        xValue.clear()
        yValue.clear()
        response.clear()
        x1.clear()
        x2.clear()
        y1.clear()
        y2.clear()

        print(legend_value, xValue, yValue, response, x1, x2, y1, y2)

        try:
            if label == "2":
                show_weeks_avg(query)
                print("주별 평균")
            elif label == "3":
                show_months_avg(query)
                print("월별 평균")
            elif label == "6":
                if check_week(query) == True:
                    show_by_week(query)
                    print('weekly comparison')
                else:
                    show_by_month(query)
                    print('monthly comparison')

            else:
                show_barchart(query)
                print('bar chart')
        except HTTPError as e:
            print("httperror")
            print("데이터를 불러올 수 없습니다. 텍스트를 다시 입력하세요")
        except IndexError as e:
            print("indexerror")
            print("데이터를 불러올 수 없습니다. 텍스트를 다시 입력하세요")
        # figure out how to display a notification message for these exception cases

        # store in a dictionary (response, query-result variables, label)
        output = dict()
        if not output:
            # output['response'] = response
            output['response'] = "그래프가 출력되었습니다"
            output['xValues'] = xValue
            output['yValues'] = yValue
            output['label'] = label
            output['legend_value'] = legend_value
            print(output)

        else:
            output.clear()
            output['response'] = response
            output['response'] = "The chart has been generated"
            output['xValues'] = xValue
            output['yValues'] = yValue
            output['label'] = label
            output['legend_value'] = legend_value
            print(output)

        print("-----------------------------------------")
        print("-----------------------------------------")

        return HttpResponse(json.dumps(output), status=200)

    else:
        return render(request, 'chat.html')
Example #17
    # Process the data with the neural network and decode the intent
    try:
        # NEURAL NETWORK PROCESSING
        # tokenize into words
        token = text_to_word_sequence(data)
        # get the sequence
        seq = tok.texts_to_sequences(token)
        # encode the sequence
        encoded = np.add.reduce(to_categorical(seq, size))
        
        pred = model.predict(np.array([encoded]))
        
        seq2 = np.argmax(pred, axis=None, out=None)
        
        intent_get = tok2.sequences_to_texts(np.array([[seq2]]))
        intent_get = intent_get[-1]

        if spread:
            print("#" * 20 + " SUMMARY " + "#" * 20)
            print("\t Context: {0}".format(context))
            print("\t Hope: {0}".format(hope))
            print("\t Data: {0}".format(data))
            print("\t Bag: {0}".format(bag))
            print("\t Intent: {0}".format(intent_get))
            print("#" * 49)
            time.sleep(5)

        # CONVERSATION CONTEXT
        if hope and context in ["conversation"] and data not in [""]:
            hope = False
class Preprocessor:
    
    def __init__(self, dataset_name):
        
        self.name = dataset_name
        self.path = os.path.dirname(__file__) + '\\prepared-datasets\\' + self.name + '.pkl'
        self.eos_token = '<eos>'
        self.sos_token = '<sos>'
        self.tokenizer = None
        if not os.path.exists(self.path):
            print('loading dataset to disk. this may take a minute or two...')
            raw_dataset = self._load_dataset_from_tensorflow()
            ready_to_save_data = self._prepare_dataset_for_saving(raw_dataset)
            self._save_prepared_data(ready_to_save_data)
            print('dataset loaded to disk.')
    
    
    def _load_dataset_from_tensorflow(self):
        
        dataset = tfds.load(self.name, as_supervised=True, split=['train', 'test', 'validation'])
        
        return dataset
    
    
    def _prepare_dataset_for_saving(self, raw_data):
        
        # raw_data is a tuple (train_data, test_data, validation_data)
        
        train_data = self._prepare_data_as_supervised(raw_data[0])
        test_data = self._prepare_data_as_supervised(raw_data[1])
        validation_data = self._prepare_data_as_supervised(raw_data[2])
        
        return train_data, test_data, validation_data
    
    
    def _prepare_data_as_supervised(self, data):
        
        # data is a list_like of (input_text, target_text) elements
        # this function returns a list of (input_text, target_text) elements
        
        to_be_returned = []
        for sample in data:
            input_word_sequence = (self.sos_token + ' ' + sample[0] + ' ' + self.eos_token).numpy().decode('ASCII', 'ignore')
            target_word_sequence = (self.sos_token + ' ' + sample[1] + ' ' + self.eos_token).numpy().decode('ASCII', 'ignore')
            to_be_returned.append((input_word_sequence, target_word_sequence))
        
        return to_be_returned
    
    
    def _save_prepared_data(self, data):
        
        data_holder = self._DataHolder(data)
        self._save(data_holder)
    
    
    def _save(self, data_holder):
        
        with open(self.path, 'wb') as f:
            pickle.dump(data_holder, f, protocol=pickle.HIGHEST_PROTOCOL)
        f.close()
        
    
    def _load(self, split):
        
        data_holder = None
        with open(self.path, 'rb') as f:
            data_holder = pickle.load(f)
        f.close()
        
        return data_holder.get_data(split)
        
    
    def load_preprocessed_data(self, split, vocab_size, max_input_len=400, max_target_len=150,
                               filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n',
                               to_lower=True, padding='post'):

        '''
        split: a string with value 'train', 'test', or 'validation'.

        vocab_size: the size of the vocabulary used for tokenization; we keep the most frequent vocab_size - 1 words.

        max_input_len: the max length of input sequences; all sequences that are
        longer will be excluded from the dataset (along with their targets!), default = 400.

        max_target_len: the max length of target sequences; all sequences that are
        longer will be excluded from the dataset (along with their inputs!), default = 150.

        filters: a string of characters to be filtered out from texts, default = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\\t\\n'.

        to_lower: whether to convert all text to lower case during tokenization, default = True.

        padding: whether to pad at the end ('post') or at the beginning of a sequence ('pre'),
        or not pad at all (None), default = 'post'.
        '''
        
        assert type(split) == str
        
        if (split == 'train'):
            self.tokenizer = Tokenizer(num_words=vocab_size, filters=filters, lower=to_lower,
                                  oov_token='<unk>', split=' ')
        
        data = self._load(split)
        raw_data = data
        
        # ._load() returns either train, test or validation data depending on split
        # we also need the raw data for experiments, comparison and visualization later
        
        data = self._to_2_lists(data)#: (List, List)
        # each data set is converted from list of 2-elements tuples to a tuple of 2 lists
        
        if (split == 'train'):
            self.tokenizer.fit_on_texts(data[0])
        
        ''' we only fit the tokenizer on training inputs;
            by 'fit' we mean constructing the vocab and initializing the
            tokenizer for later use '''
        
        assert self.tokenizer is not None
        data = (self.tokenizer.texts_to_sequences(data[0]), 
                self.tokenizer.texts_to_sequences(data[1]))#: a tuple of 2 lists of sequences
        # using the tokenizer, we convert raw text samples into integer sequences
        
        data = self._filter_by_lengths(data, max_input_len, max_target_len)#: tuple of 2 lists of sequences
        ''' we removed long inputs and targets based on:
            max_input_len, max_target_len '''
        
        if not (padding == None):
            data = self._as_supervised_pad_list_of_sequences(data, padding=padding)#: a list of 5 tensors
            
        ''' we replaced the 2 lists of inputs targets with 5 tensors:
            (x_encoder, x_decoder, y, encoder_lengths, decoder_lengths)
            where x_encoder is the input of the encoder, x_decoder is the input 
            of the decoder, y is the target. encoder_lengths and decoder_lengths
            are the lengths of non-padded input sequences for encoder and decoder. 
            we need them for training later. they are used with pack_padded_sequence()
            and pad_packed_sequence() methods in pytorch '''
        
        # finally we create data loaders for training and evaluation loops
        
        return data, raw_data
    
    
    def _to_2_lists(self, data):
        
        inputs = []
        targets = []
        
        for sample in data:
            inputs.append(sample[0])
            targets.append(sample[1])
            
        return inputs, targets
    
    
    def _filter_by_lengths(self, data, max_input_len, max_target_len):
                
        length = len(data[0])
        i = 0
        while i < length:
            if (len(data[0][i]) > max_input_len) or (len(data[1][i]) > max_target_len):
                data[0].pop(i)
                data[1].pop(i)
                length -= 1
                i -= 1
            i += 1
        
        return data
    
    
    def _as_supervised_pad_list_of_sequences(self, data, padding='post'):
        
        inputs = data[0]
        decoder_inputs = [t[:-1] for t in data[1]]
        # we excluded the end of string token from the decoder_inputs
        
        targets = [t[1:] for t in data[1]]
        
        # we excluded the start of string token from the targets
        
        encoder_lengths = torch.LongTensor(self._get_lengths_from_sequences(inputs))
        decoder_lengths = torch.LongTensor(self._get_lengths_from_sequences(targets))
        
        inputs = torch.from_numpy(pad_sequences(inputs, padding=padding)).type(torch.LongTensor)
        targets = torch.from_numpy(pad_sequences(targets, padding=padding)).type(torch.LongTensor)
        decoder_inputs = torch.from_numpy(pad_sequences(decoder_inputs, padding=padding)).type(torch.LongTensor)
        
        to_be_returned = [inputs, decoder_inputs, targets, encoder_lengths, decoder_lengths]
          
        return to_be_returned
        
    
    def _get_lengths_from_sequences(self, sequences):
        
        to_be_returned = []
        for seq in sequences:
            to_be_returned.append(len(seq))
            
        return to_be_returned
    
    
    def _create_data_loader(self, split, vocab_size, batch_size, max_input_len=400, max_target_len=150,
                            filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n', to_lower=True,
                            padding='post', shuffle=True, num_workers=6):

        data = self._SummarizationDataset(self, split, vocab_size, max_input_len=max_input_len,
                                          max_target_len=max_target_len,
                                          filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n',
                                          to_lower=True, padding=padding)
        
        data = DataLoader(data, batch_size=batch_size, pin_memory=True, 
                          shuffle=shuffle, num_workers=num_workers)
        
        return data    
    
    def create_data_loaders(self, vocab_size, batch_size, num_workers=6, max_input_len=400, 
                           max_target_len=150, filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n',
                           to_lower=True, padding='post'):
        
        start = timer()
        
        train = self._create_data_loader('train', vocab_size, batch_size, num_workers=num_workers,
                                         max_input_len=max_input_len, max_target_len=max_target_len,
                                         filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n',
                                         to_lower=True, padding=padding)
        test = self._create_data_loader('test', vocab_size, batch_size, num_workers=num_workers,
                                        max_input_len=max_input_len, max_target_len=max_target_len,
                                        filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n',
                                        to_lower=True, padding=padding)
        validation = self._create_data_loader('validation', vocab_size, batch_size, num_workers=num_workers,
                                          max_input_len=max_input_len, max_target_len=max_target_len,
                                          filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n',
                                          to_lower=True, padding=padding)
        
        print('preprocessing time: ', timer() - start)
        
        return train, test, validation
        
    
    def sequences_to_texts(self, sequences):
        return self.tokenizer.sequences_to_texts(sequences)
    
    def texts_to_sequences(self, texts):
        return self.tokenizer.texts_to_sequences(texts)
        
    def get_eos_token(self):
        return self.texts_to_sequences([[self.eos_token]])
        
    def get_sos_token(self):
        return self.texts_to_sequences([[self.sos_token]])
    
    def filters(self):
        return self.tokenizer.filters
    
    class _DataHolder:
        def __init__(self, data):
            
            self.data = data
        
        def get_data(self, split):
            
            assert type(split) == str
            
            if split == 'train':
                return self.data[0]
            
            if split == 'test':
                return self.data[1]
            
            if split == 'validation':
                return self.data[2]
            
            else:
                raise ValueError('valid values are: "train", "test", "validation".')
        
    
    class _SummarizationDataset(Dataset):
        def __init__(self, preprocessor, split, vocab_size, max_input_len=400,
                                    max_target_len=150, filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n',
                                    to_lower=True, padding='post'):
            
            self.data, self.raw_data = preprocessor.load_preprocessed_data(split, vocab_size, 
                                                max_input_len=max_input_len, max_target_len=max_target_len, 
                                                filters=filters, to_lower=to_lower, padding=padding)
            self.len = self.data[0].size(0)
            
        def __getitem__(self, index):
            
            data = (self.data[0][index], self.data[1][index], self.data[2][index],
                    self.data[3][index], self.data[4][index])
            raw_data = self.raw_data[index]
            
            return data, raw_data
        
        def __len__(self):
            
            return self.len
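
A minimal usage sketch for the summarization Preprocessor above (hedged: the dataset name is hypothetical, and it assumes the imports the class relies on, i.e. tensorflow_datasets, torch's DataLoader/Dataset, and Keras' Tokenizer/pad_sequences):

prep = Preprocessor('cnn_dailymail')     # hypothetical tfds dataset name
train_dl, test_dl, val_dl = prep.create_data_loaders(vocab_size=20000, batch_size=32, num_workers=0)
batch, raw = next(iter(train_dl))
x_encoder, x_decoder, y, encoder_lengths, decoder_lengths = batch   # the 5 tensors described above
print(prep.sequences_to_texts([x_encoder[0].tolist()]))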
Example #19
# Get the output data for training
clases = list(sorted(set(t)))
size_s = len(clases) + 1

tok2 = Tokenizer()
tok2.fit_on_texts(clases)
tokens_s = tok2.texts_to_sequences(t)

y_train = [np.add.reduce(to_categorical(Y, size_s)) for Y in tokens_s]
y_train = np.array(y_train)

# CREATE THE NEURAL NETWORK WITH KERAS
model = keras.models.load_model('model.h5')
model.summary()

data = "I need to create a new account"
data = text_to_word_sequence(data)
secuence = tok.texts_to_sequences(data)

encode = np.add.reduce(to_categorical(secuence, size))
print("secuence -> {0}".format(secuence))

print("-" * 50)
print(np.argmax(to_categorical(secuence, size), axis=1))
print("-" * 50)

a = np.rint(model.predict(np.array([encode])))
print(a)
i = np.argmax(a, axis=None, out=None)
print(tok2.sequences_to_texts(np.array([[i]])))
Example #20
plt.show()

# generate the next 10 words in a sentence
best_model = load_model(model_file)
start = 'Today as i was leaving for work'
test_seqs = tokenizer.texts_to_sequences([start])
for i in range(10):
    test_seqs_padded = pad_sequences(test_seqs, maxlen=sequence_len,
                                     padding='pre', truncating='pre')

    # use best_model to generate the next word
    # (remember to convert from categorical to ordinal)
    # TODO: Replace _ANS_ with your answers
    next_word = best_model.predict([test_seqs_padded]).argmax(axis=1)
    test_seqs[0].append(next_word[0])
    print(tokenizer.sequences_to_texts(test_seqs))

def next_char(text, temperature=1):
    x_new = preprocess([text])
    y_proba = model.predict(x_new)[0, -1:, :]
    rescaled_logit = tf.math.log(y_proba) / temperature
    char_id = tf.random.categorical(rescaled_logit, num_samples=1) + 1
    return tokenizer.sequences_to_texts(char_id.numpy())[0]
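
A small companion helper, sketched here to show how next_char above is typically driven in a loop (the helper name and defaults are illustrative, not from the original):

def complete_text(text, n_chars=50, temperature=1):
    # repeatedly sample one more character and append it, reusing next_char() above
    for _ in range(n_chars):
        text += next_char(text, temperature)
    return text

print(complete_text("t", temperature=0.2))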
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['acc'])
model.summary()

# *******
# train the model on X (values) and Y (labels)
# *******
history = model.fit(X, Y, epochs=num_epochs, verbose=0)  # use verbose= 1 or 2 for output on training


# ***************************************************************
#
# predict
#
# ***************************************************************
sentences = ["i have a cute fluffy cat", "the cat is fluffy", "i like to dance"]

sequences = tokenizer.texts_to_sequences(sentences)
decoded_sentences = tokenizer.sequences_to_texts(sequences)

padded_sequences = pad_sequences(sequences, maxlen=max_sequence_len)
predictions = model.predict(padded_sequences)

for i, prediction in enumerate(predictions):
  category = category_tags[np.argmax(prediction)]
  print(f"----- sekntence {i} -----")
  print(f"decoded text: {decoded_sentences[i]}")
  print(f"sentence: {sentences[i]} | tag: {category}  | prediction values: {prediction} | max prediction index: {np.argmax(prediction)}")
LSTM.fit(x_train,
         y_train,
         batch_size=64,
         epochs=30,
         validation_split=0.1,
         callbacks=[early_stopping_cb, reduce_learing_cb])

#prediction
string = "I really want to go to a beautiful place"
index_list = tokenizer.texts_to_sequences([string])[0]
for i in range(4):
    input_ = np.array(index_list[i:i + max_len - 1]).reshape(1, max_len - 1)
    predicted_results = LSTM.predict(input_)
    index_list.append(np.argmax(predicted_results))
word_list = tokenizer.sequences_to_texts([index_list])
output = " ".join(word_list)
print(output)

dump(tokenizer, open("tokenizer.pkl", "wb"))

#creating training samples
max_len = 40
x = []
y = []
for i in range(0, len(clean_tokens) - max_len - 1, 8):
    x_temp = clean_tokens[i:i + max_len]
    x_temp2 = [wordindex_dic.get(word, 0) for word in x_temp]
    y_temp = clean_tokens[i + 1:i + max_len + 1]
    y_temp2 = [wordindex_dic.get(word, 0) for word in y_temp]
    x.append(x_temp2)
   # generated_sequence = generated_sequence.reshape(1,308)
    for word in range(max_len):
        #ip_one_hot = one_hot(generated_sequence, num_classes)
        generated_sequence = generated_sequence.reshape(1,308)
        prediction = model.predict(
            generated_sequence[None], verbose = 0)[0]
        sampled_token = np.random.choice(
            np.arange(num_classes), p=prediction)
        #print(generated_sequence)
        generated_sequence = np.append(
            generated_sequence[0,1:],sampled_token)
        print(generated_sequence)
        #print(generated_sequence)
        #generated_sequence = generated_sequence.reshape(1,308)
        #generated_sequence = generated_sequence.astype(int)
    generated_txt = tk.sequences_to_texts([generated_sequence])[0]
    print("Sample {}: {}".format(i, generated_txt))

plt.plot(history.history['accuracy'])

plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train accuracy'], loc='upper left')
plt.show()
# summarize history for loss
plt.plot(history.history['loss'])

plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')