"""Pun generation pipeline: retrieves a seed sentence, swaps in the pun word,
replaces a topic word with a related one, and smooths the result."""

import os

import nltk
import numpy as np
from nltk.tokenize import word_tokenize

# Keras imports (assuming TensorFlow's bundled Keras; adjust if the project
# uses standalone Keras instead).
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer

# Project-local classes (Retrieve, WordSimilarity, Evaluate, WordPredict,
# Generator, DAC) and configuration constants (MAX_NUM_WORDS, TOKEN_FILTER,
# GLOVE_DIR, MAX_LEN, MIN_SEQ_LEN, EMBEDDING_DIM, TEXT_DATA_DIR, TEXT_DATA,
# PUN_DATA_DIR, PUN_DATA, PREDICT_*, SMOOTH_*) are expected to come from this
# repository's own modules; their import paths are repo-specific and are not
# spelled out here.


class Pungen:
    def __init__(self, **kwargs):
        self.filepath = kwargs.get('filepath')
        self.embedding_layer = None

    def _parse_corpus(self, min_seq_len, filepath):
        """Read the text corpus, fit the tokenizer, and keep sequences of at
        least `min_seq_len` tokens."""
        print('Parsing text corpus.')
        self.texts = []
        with open(filepath, encoding='utf-8') as fp:
            for line in fp:
                if line == "\n":
                    continue
                self.texts.append(line)

        self.tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, filters=TOKEN_FILTER)
        self.tokenizer.fit_on_texts(self.texts)
        self.sequences = self.tokenizer.texts_to_sequences(self.texts)
        self.sequences = [x for x in self.sequences if len(x) >= min_seq_len]
        self.word_index = self.tokenizer.word_index
        print('Found %s unique tokens.' % len(self.word_index))
        print('Found %s texts.' % len(self.sequences))

    def prepare_emb(self, emb_dim, input_length):
        """Load GloVe vectors and build a frozen Embedding layer."""
        print('Indexing word vectors.')
        emb_name = 'glove.6B.' + str(emb_dim) + 'd.txt'
        self.embeddings_index = {}
        with open(os.path.join(GLOVE_DIR, emb_name), encoding='utf-8') as f:
            for line in f:
                word, coefs = line.split(maxsplit=1)
                coefs = np.fromstring(coefs, dtype='f', sep=' ')
                self.embeddings_index[word] = coefs
        print('Found %s word vectors.' % len(self.embeddings_index))

        # Prepare the embedding matrix; words not found in the embedding
        # index stay all-zeros.
        num_words = MAX_NUM_WORDS
        self.embedding_matrix = np.zeros((num_words, emb_dim))
        for word, i in self.word_index.items():
            if i >= num_words:
                continue
            embedding_vector = self.embeddings_index.get(word)
            if embedding_vector is not None:
                self.embedding_matrix[i] = embedding_vector

        # Load the pre-trained embeddings into an Embedding layer.
        # trainable=False keeps the embeddings fixed during training.
        self.embedding_layer = Embedding(num_words,
                                         emb_dim,
                                         embeddings_initializer=Constant(self.embedding_matrix),
                                         input_length=input_length,
                                         trainable=False)

    def check_generator(self):
        """Sanity check: decoding the tokenized sequences should reproduce the
        original texts (up to filtered characters)."""
        texts = self.tokenizer.sequences_to_texts(self.sequences)
        if len(texts) != len(self.texts):
            print("Different sizes of texts")
            return
        filter_chars = set(TOKEN_FILTER)
        for i in range(len(texts)):
            if texts[i].lower() != self.texts[i][:-1].lower():
                # Mismatches caused by filtered characters are expected.
                if any((c in filter_chars) for c in self.texts[i][:-1].lower()):
                    continue
                print(texts[i], self.texts[i][:-1])
                print(self.texts[i][:-1].lower())
                print("Tokenizer failed to tokenize properly!")
                return
        print("Tokenizer check was successful!")

    def form_pun(self, eval_path):
        """Generate pun candidates from a retrieved sentence."""
        # Retrieve a seed sentence for the pun pair.
        retrieve = Retrieve(sentence_path=TEXT_DATA_DIR + TEXT_DATA,
                            pun_path=PUN_DATA_DIR + PUN_DATA)
        (pun, sentence, score) = retrieve.retrieve()
        if not sentence:
            print("No sentence with word {} was found. Exiting...".format(pun[1]))
            raise Exception()

        text = word_tokenize(sentence)
        tokenized = nltk.pos_tag(text)
        print(tokenized)
        print(sentence, pun[0], pun[1])

        pre = self.tokenizer.texts_to_sequences([sentence])
        wp = self.tokenizer.texts_to_sequences([pun[0]])
        wa = self.tokenizer.texts_to_sequences([pun[1]])
        if (not wa[0]) or (not wp[0]):
            print("The pun word / alternative word pair does not exist in the "
                  "parsed corpus. Exiting...")
            raise Exception()

        # Replace the first occurrence of wa's token with wp's token.
        index_wa = -1
        for seq in pre[0]:
            index_wa = index_wa + 1
            if seq == wa[0][0]:
                pre[0][index_wa] = wp[0][0]
                break

        wordsimilarity = WordSimilarity()
        wordsimilarity.word2vec()
        wordsimilarity.load()

        # Pick the first suitable topic word (noun or pronoun); proper nouns
        # are generalised to "man". Retry with the next candidate if the word
        # is out of the similarity model's vocabulary.
        try_limit = 5
        try_count = 0
        index_topic = 0
        while True:
            try:
                topic_word = None
                for i in range(index_topic, len(tokenized)):
                    (word, pos) = tokenized[i]
                    if pos == 'NNP':
                        topic_word = "man"
                        print(word, pos)
                        index_topic = index_topic + 1
                        break
                    if pos in ('NN', 'PRP', 'NNS', 'PRP$'):
                        topic_word = word
                        print(word, pos)
                        index_topic = index_topic + 1
                        break
                    index_topic = index_topic + 1
                result = wordsimilarity.getSimilar([topic_word, pun[0]], [pun[1]], 10)
                other_result = wordsimilarity.getSimilar([pun[0]], [], 10)
                break
            except KeyError:
                print("Word {} is not in the vocabulary, trying the next one".format(topic_word))
                try_count = try_count + 1
                if try_limit == try_count:
                    print("Limit of tries has been reached. Exiting...")
                    raise Exception()

        eval_surprisal = Evaluate()
        eval_surprisal.load_model(eval_path)

        # Candidates seeded with both the topic word and the pun word.
        finals = []
        mean_amalgam = 0
        for (word, prob) in result:
            swap = self.tokenizer.texts_to_sequences([word])
            context_window = 2
            surprise = eval_surprisal.compute_surpisal(sentence=pre[0],
                                                       pun_word=wa[0][0],
                                                       pun_alternative=wp[0][0],
                                                       context_window=context_window)
            mean_amalgam = mean_amalgam + surprise
            print(surprise)
            pre[0][index_topic] = swap[0][0]
            post_simple = self.tokenizer.sequences_to_texts([pre[0]])
            print(post_simple)
            # Zero out tokens around the swapped word before smoothing.
            pre[0][index_topic + 1] = 0
            if index_topic >= 2:
                pre[0][index_topic - 1] = 0
            post_smoothing = self.dac.inference(pre[0])
            post_smoothing = self.tokenizer.sequences_to_texts(post_smoothing.tolist())
            finals.append(post_smoothing)
            print(post_smoothing)
        print(finals)
        print(mean_amalgam / len(result))

        # Candidates seeded with the pun word only.
        other_finals = []
        mean_similar = 0
        for (word, prob) in other_result:
            swap = self.tokenizer.texts_to_sequences([word])
            context_window = 2
            surprise = eval_surprisal.compute_surpisal(sentence=pre[0],
                                                       pun_word=wa[0][0],
                                                       pun_alternative=wp[0][0],
                                                       context_window=context_window)
            mean_similar = mean_similar + surprise
            print(surprise)
            pre[0][index_topic] = swap[0][0]
            post_simple = self.tokenizer.sequences_to_texts([pre[0]])
            print(post_simple)
            pre[0][index_topic + 1] = 0
            if index_topic >= 2:
                pre[0][index_topic - 1] = 0
            post_smoothing = self.dac.inference(pre[0])
            post_smoothing = self.tokenizer.sequences_to_texts(post_smoothing.tolist())
            other_finals.append(post_smoothing)
            print(post_smoothing)
        print(other_finals)
        print(mean_similar / len(other_result))

        # list.extend returns None, so extend first and then return the list.
        finals.extend(other_finals)
        return finals

    def train_predict_model(self, model_params):
        """Train the word-prediction model on the parsed corpus."""
        predict_word = WordPredict(max_len=MAX_LEN,
                                   max_words=MAX_NUM_WORDS,
                                   emb_layer=self.embedding_layer)
        predict_word.build_model(**model_params)
        predict_word.compile_model(model_params)
        generator = Generator(sequences=self.sequences,
                              batch_size=PREDICT_BS,
                              max_words=MAX_NUM_WORDS,
                              max_len=MAX_LEN,
                              split=PREDICT_SPLIT)
        predict_word.train(generator, PREDICT_BS, PREDICT_SPLIT, PREDICT_EPOCHS)
        return predict_word

    def load_predict_model(self, path):
        predict_word = load_model(path)
        return predict_word

    def train_dac_model(self, model_params):
        """Train the DAC smoother model on the parsed corpus."""
        dac = DAC()
        smoother_model = dac.build_model(hidden_sizes=[64, 64],
                                         seq_len=50,
                                         no_words=40000,
                                         emb_layer=self.embedding_layer,
                                         lr=0.01)
        generator = Generator(sequences=self.sequences,
                              batch_size=SMOOTH_BS,
                              max_words=MAX_NUM_WORDS,
                              max_len=MAX_LEN,
                              split=SMOOTH_SPLIT)
        smoother_model = dac.train(generator,
                                   full_model=smoother_model,
                                   model_params=model_params,
                                   bs=SMOOTH_BS,
                                   split=SMOOTH_SPLIT,
                                   pretrain_epochs=4,
                                   epochs=SMOOTH_EPOCHS)
        return smoother_model
    def run(self, predict_path, smoother_path, eval_path):
        self._parse_corpus(MIN_SEQ_LEN, TEXT_DATA_DIR + TEXT_DATA)
        self.prepare_emb(EMBEDDING_DIM, MAX_LEN)

        # Word-prediction model: train from scratch or load a saved one.
        predict_model = None
        if predict_path is None:
            model_params = {
                'lstm': [16],
                'merge_layer': 'concat',
                'dense': {
                    'size': [64, 32],
                    'act': 'elu',
                    'dropout': 0
                },
                'optimizer': 'adam',
                'lr': 0.0005
            }
            predict_model = self.train_predict_model(model_params)
        else:
            pass
            # predict_model = self.load_predict_model(predict_path)

        # Smoother model: train from scratch or load a saved one.
        # smoother_model = None
        if smoother_path is None:
            model_params = {'size': [64, 64], 'lr': 0.01}
            # smoother_model = self.train_dac_model(model_params)
        else:
            self.dac = DAC()
            self.dac.load_model(smoother_path)

        # Generate a pun; retry until retrieval and vocabulary lookups succeed.
        while True:
            try:
                final = self.form_pun(eval_path)
                break
            except Exception:
                pass
        print(final)
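
# The original module refers to a module-level `pungen` instance, which
# suggests a small driver script accompanied this class. Below is a minimal
# sketch of such a driver; the file paths and the choice of passing None
# (train) versus a saved-model path (load) are assumptions for illustration,
# not part of the original code.
if __name__ == '__main__':
    pungen = Pungen(filepath=TEXT_DATA_DIR + TEXT_DATA)
    # predict_path=None trains the word-prediction model from scratch;
    # the smoother and evaluator are loaded from hypothetical saved models.
    pungen.run(predict_path=None,
               smoother_path='models/smoother.h5',   # hypothetical path
               eval_path='models/evaluator.h5')      # hypothetical path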