def inference(dataset, inf_enc, inf_dec, removed_stopwords=False):
    '''
    Run the trained encoder/decoder on an index-encoded input sequence
    and return the generated headline as a string.
    '''
    dict_t, rev_dict_t, vocab_size = create_dictionary()  #! Be able to change if it doesn't work
    hparams = create_hparams()
    # Decode for at most maxlen_output steps; pad the input to maxlen.
    steps = hparams['maxlen_output']
    maxlen = hparams['maxlen']
    pred = predict_sequence(inf_enc, inf_dec,
                            pad_sequences(dataset, maxlen=maxlen, padding='post'),
                            steps, removed_stopwords=removed_stopwords)
    return pred.strip()
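# --- Hedged usage sketch (not part of the original source) ---
# A minimal example of how preprocess_text() and inference() could be wired
# together. It assumes word_to_index() turns a token list into a list of
# vocabulary indices, and load_inference_models() is a hypothetical helper
# standing in for however the trained encoder/decoder are actually restored.
if __name__ == '__main__':
    raw_article = 'ข่าวตัวอย่าง ...'                # any Thai news content
    tokens = preprocess_text(raw_article)           # cleaned + tokenized, first maxlen words
    encoded = [word_to_index(tokens)]               # assumption: word_to_index -> list of indices
    inf_enc, inf_dec = load_inference_models()      # hypothetical loader for the trained models
    print(inference(encoded, inf_enc, inf_dec))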
def preprocess_text(text, removed_stopwords=False):
    '''
    Run all of the preprocessing steps above.
    Input: news content (String)
    Output: preprocessed tokenized news content (list);
            if removed_stopwords is True, stopwords are removed from the list
    '''
    hparams = create_hparams()
    text = remove_date(text)
    text = thai_digit_to_arabic_digit(text)
    text = basic_cleaner(text)
    tokenized_text = word_tokenize(text, engine='deepcut')
    tokenized_text = use_first_n_words(tokenized_text, n=hparams['maxlen'])
    if removed_stopwords:
        removed_stopwords_text = remove_stopwords(tokenized_text)
        return removed_stopwords_text
    return tokenized_text
from services.inferencer import Inferencer
from preprocessed.preprocessed_thaigov import preprocess_text, word_to_index
from tensorflow.keras.preprocessing.sequence import pad_sequences
from config.hparams import create_hparams
import pickle
import tensorflow as tf
import os
import numpy as np

hparams = create_hparams()
maxlen_input = hparams['maxlen']
maxlen_output = hparams['maxlen_output']

with open('./decode/dictionary/gru_dict.pkl', 'rb') as f:
    dict_t = pickle.load(f)
with open('./decode/dictionary/gru_rev_dict.pkl', 'rb') as f:
    rev_dict_t = pickle.load(f)

# define_n_first_words
N_FIRST_CONTENT = maxlen_input    # 50
N_FIRST_HEADLINE = maxlen_output  # 22

BATCH_SIZE = 64
embedding_dim = 256
units = 1024
vocab_inp_size = len(dict_t)
vocab_tar_size = len(dict_t)

## ! use only n first words for headline generation
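# --- Hedged sketch (not part of the original source) ---
# Illustrates how the loaded dictionary might be used to turn preprocessed
# tokens into a padded index batch for the encoder. This uses dict_t directly;
# the repo likely routes this through word_to_index instead, and the
# unknown-token key 'UNK' is an assumption.
def tokens_to_padded_indices(tokens):
    unk = dict_t.get('UNK', 0)                       # assumed unknown-token id
    idx = [dict_t.get(w, unk) for w in tokens]       # word -> index via dict_t
    return pad_sequences([idx], maxlen=maxlen_input, padding='post')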