def update_tokenizer(self, init=False, **kwargs):
    if init:
        tokenizer = Tokenizer(num_words=self.max_features)
    else:
        with open(self.tokenizer_path) as handle:
            tokenizer = tokenizer_from_json(json.load(handle))
    questions = []
    answers = []
    answer_categories = []
    for element in self.question:
        questions.append(mecab.morphs(element.question))
        if element.answer is not None:
            answers.append(mecab.morphs(element.answer))
            answer_categories.append(mecab.morphs(element.answer_category))
        else:
            # "없음" means "none"; used as a placeholder for missing answers
            answers.append("없음")
            answer_categories.append("없음")
    tokenizer.fit_on_texts(questions)
    tokenizer.fit_on_texts(answers)
    tokenizer.fit_on_texts(answer_categories)
    with open(self.tokenizer_path, "w", encoding="utf-8") as handle:
        json.dump(tokenizer.to_json(), handle)
    return tokenizer
def loader():
    model = load_model('../assets/lstm1/rnn')
    with open('../assets/lstm1/rnn_tokenizer.json') as f:
        data = json.load(f)
    tokenizer = text.tokenizer_from_json(data)
    return model, tokenizer
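# A minimal sketch of how loader() would be used downstream; the input text
# and maxlen are assumptions, not part of the original snippet, and maxlen
# must match whatever the model was trained with.
from tensorflow.keras.preprocessing.sequence import pad_sequences

model, tokenizer = loader()
seqs = tokenizer.texts_to_sequences(["an example input sentence"])
padded = pad_sequences(seqs, maxlen=100)  # illustrative maxlen
print(model.predict(padded))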
def prepare_ans_ques_ref(ans, ques, ref):
    max_length = 40
    trunc_type = 'post'
    with open(os.path.join('data', 'tokenizer.json')) as f:
        data = json.load(f)
    tokenizer = tokenizer_from_json(data)
    ans_sequences = tokenizer.texts_to_sequences([ans])
    ques_sequences = tokenizer.texts_to_sequences([ques])
    new_list = ' '.join(ref)
    ref_sequences = tokenizer.texts_to_sequences([new_list])
    ans_padded = pad_sequences(ans_sequences, maxlen=max_length, truncating=trunc_type)
    ques_padded = pad_sequences(ques_sequences, maxlen=max_length, truncating=trunc_type)
    ref_padded = pad_sequences(ref_sequences, maxlen=2 * max_length, truncating=trunc_type)
    return [
        np.expand_dims(np.concatenate((ans_padded[0], ques_padded[0]), axis=0), axis=0),
        np.expand_dims(ref_padded[0], axis=0)
    ]
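# Hypothetical call site for prepare_ans_ques_ref(); the argument strings and
# the model are illustrative. The function returns a two-element list suited
# to a model with separate answer+question and reference inputs.
inputs = prepare_ans_ques_ref("student answer text",
                              "question text",
                              ["reference", "answer", "tokens"])
# score = model.predict(inputs)  # input shapes: (1, 80) and (1, 80)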
def generate_output(next_words, token_file, model_used):
    max_sequence_len = 32
    usr_input = input("Your input: ")
    print("\n")
    sys.stdout.write("Generating Psalmist word...\n")
    print("\n")
    with open(token_file) as f:
        data = json.load(f)
    tokenizer = tokenizer_from_json(data)
    # Load the model once, outside the generation loop
    model = load_model(model_used)
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([usr_input])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        # predict_classes() was removed from Keras; take the argmax of predict() instead
        predicted = model.predict(token_list, verbose=0).argmax(axis=-1)[0]
        output_word = tokenizer.index_word.get(int(predicted), '')
        usr_input += " " + output_word
    sys.stdout.write("Genesys-Machine: " + '"' + usr_input + '"')
def __init__(self):
    self.tokenizer_path = "./tokenizer/tokenizer.json"
    self.checkpoint_path = "./model/cp.ckpt"
    self.model_path = "./model/model.h5"
    self.max_features = 1000
    self.maxlen = 100  # number of characters per sentence referenced during training
    stmt = text("select question, answer, answer_category from question")
    stmt = stmt.columns(Question.question, Question.answer, Question.answer_category)
    self.question = db_session.query(
        Question.question, Question.answer,
        Question.answer_category).from_statement(stmt).all()
    if path.exists(self.tokenizer_path):
        with open(self.tokenizer_path) as handle:
            self.tokenizer = tokenizer_from_json(json.load(handle))
    else:
        self.tokenizer = self.update_tokenizer(init=True)
    if path.exists(self.model_path):
        self.model = load_model(self.model_path)
    else:
        self.model = self.create_model()
        self.learn()
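# Usage sketch for the class above (the class name ChatModel is hypothetical;
# the constructor pulls rows from the question table, then loads or builds
# the tokenizer and model):
# bot = ChatModel()
# bot.update_tokenizer()  # re-fit the vocabulary after new rows are inserted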
def generate_sentences(model_path, seed_path, num_words):
    model = getmodel(31, 10000, 32, model_path)
    tokenizer_path = model_path + '.tokenizer.json'
    with open(tokenizer_path, 'r') as f:
        tokenizer = tokenizer_from_json(f.read())
    # seed_path is unused here; the seed is built from the 'seed3' dataset instead
    seed = create_indexes_tape('seed3', tokenizer)
    seed_seq = list(seed)
    out_seq = []
    for _ in range(num_words):
        seq_input = np.expand_dims(seed_seq[-31:], 0)
        res = model.predict([seq_input], 1)
        pred = res.squeeze().argmax()
        out_seq.append(pred)
        seed_seq.append(pred)  # grow the rolling context by the new token only
    words = [tokenizer.index_word[w] for w in out_seq if w != 0]
    sentence = ' '.join(words).replace('<eom>', '\n')
    print(sentence)
def load_tokenizer(self, filename='tokens.json'):
    # read the serialized tokenizer (a JSON string) from file
    with open(f'{self.source_path}/{filename}', 'r') as tokenfile:
        tokens_info = tokenfile.read()
    # from tensorflow.keras.preprocessing.text import tokenizer_from_json
    self.tokenizer = tokenizer_from_json(tokens_info)
    return self.tokenizer
def loadTokenizer(path_to_file):
    with open(path_to_file) as f:
        tk_json = json.load(f)
    tokenizer = tokenizer_from_json(tk_json)
    return tokenizer
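# Round-trip sketch showing how a JSON file compatible with loadTokenizer()
# is typically produced (corpus and path are illustrative). Note that
# Tokenizer.to_json() already returns a string, so json.dump() double-encodes
# it and json.load() in loadTokenizer() undoes exactly that.
import json
from tensorflow.keras.preprocessing.text import Tokenizer

corpus = ["first example document", "second example document"]
tk = Tokenizer(num_words=10000)
tk.fit_on_texts(corpus)
with open("tokenizer.json", "w") as f:
    json.dump(tk.to_json(), f)
tokenizer = loadTokenizer("tokenizer.json")
assert tokenizer.word_index == tk.word_index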
def loadModelAndTokenizer():
    document_path = os.getcwd() + "/app/models/"
    model = keras.models.load_model(document_path)
    with open(os.getcwd() + '/app/models/tokenizer.json') as f:
        data = json.load(f)
    tokenizer = tokenizer_from_json(data)
    return model, tokenizer
def load_tokenizer(self):
    try:
        with io.open(tokenizer_path) as f:
            json_string = json.load(f)
        self._tokenizer = tokenizer_from_json(json_string)
    except Exception as e:
        print("Model.load_tokenizer() failed. Error: {}".format(e))
    return self
def __init__(self, cache_path=None, stop_words=None, **extra):
    if cache_path and os.path.exists(cache_path):
        with open(cache_path, 'r') as f:
            self._tk = tokenizer_from_json(f.read())
    else:
        self._tk = Tokenizer(lower=True, **extra)
    self._cache_path = cache_path
def __init__(self, model_dir):
    model_path = os.path.join(model_dir, 'gru.h5')
    artifacts_path = os.path.join(model_dir, 'model_artifacts.pkl')
    self.model = load_model(model_path)
    with open(artifacts_path, 'rb') as f:
        self.artifacts = pickle.load(f)
    tokenizer_config = self.artifacts['tokenizer_config']
    self.tokenizer = tokenizer_from_json(json.dumps(tokenizer_config))
    self.sequence_len = self.artifacts['sequence_len']
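# Usage sketch (the enclosing class name GruClassifier and the model directory
# are hypothetical, not from the original snippet):
# clf = GruClassifier("models/gru_run_01")
# seqs = clf.tokenizer.texts_to_sequences(["some input text"])
# padded = pad_sequences(seqs, maxlen=clf.sequence_len)
# print(clf.model.predict(padded))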
def train(dataset_path, run_hash, seq_len=32, vocab_size=10000, emb_dim=32,
          batch_size=128, epochs=20, train_split=0.8, val_split=0.2):
    logs_path, ckp_path, tok_path = check_dirs(run_hash)
    ckp_cb = tf.keras.callbacks.ModelCheckpoint(
        ckp_path, 'val_accuracy', save_best_only=False, save_weights_only=True)
    lr_cb = tf.keras.callbacks.LearningRateScheduler(
        create_lr_sched(epochs / 2, epochs), True)
    tb_cb = tf.keras.callbacks.TensorBoard(
        logs_path, 10, True, True, embeddings_freq=10,
        embeddings_metadata=logs_path + '/meta.tsv')
    with open(tok_path, 'r') as f:
        tokenizer = tokenizer_from_json(f.read())
    indexes_tape = create_indexes_tape(dataset_path, tokenizer)
    train_nbatches = int((len(indexes_tape) - seq_len) * train_split / batch_size)
    val_nbatches = int((len(indexes_tape) - seq_len) * val_split / batch_size)
    train_ds, val_ds = create_datasets(
        indexes_tape, train_nbatches, val_nbatches, batch_size, seq_len, vocab_size)
    model = getmodel(seq_len - 1, vocab_size, emb_dim, ckp_path)
    embeddings = model.layers[0].weights[0].numpy()
    export_vocabulary(vocab_size, tokenizer.word_index, logs_path)
    export_embeddings(embeddings, logs_path)
    hist = model.fit(
        train_ds, batch_size=batch_size, epochs=epochs,
        steps_per_epoch=train_nbatches, validation_data=val_ds,
        callbacks=[ckp_cb, lr_cb, tb_cb])
def load_from_cache(model_cache, model_weights, tokenizer_cache, tags_cache):
    with open(model_cache, 'r') as model_f:
        model = model_from_json(json.load(model_f))
    model.load_weights(str(model_weights))
    with open(tokenizer_cache, 'r') as tokenizer_cache_f:
        tokenizer = tokenizer_from_json(json.load(tokenizer_cache_f))
    with open(tags_cache, 'r') as tags_cache_f:
        tags = json.load(tags_cache_f)
    return model, tokenizer, tags
def prepare_embeddings(gcp_bucket, num_words, w2v_model_path, embedding_dim,
                       json_tokenizer_path, output_emb_matrix_path,
                       vocabulary_size_path):
    logging.basicConfig(level=logging.INFO)
    logging.info('Starting embeddings preparation step ..')
    logging.info('Input data:')
    logging.info('gcp_bucket:{}'.format(gcp_bucket))
    logging.info('num_words:{}'.format(num_words))
    logging.info('w2v_model_path:{}'.format(w2v_model_path))
    logging.info('embedding_dim:{}'.format(embedding_dim))
    logging.info('json_tokenizer_path:{}'.format(json_tokenizer_path))
    logging.info('output_emb_matrix_path:{}'.format(output_emb_matrix_path))
    logging.info('vocabulary_size_path:{}'.format(vocabulary_size_path))
    storage_client = storage.Client()
    bucket = storage_client.bucket(gcp_bucket)
    model = Word2Vec()
    blob_w2v = bucket.get_blob(w2v_model_path)
    destination_uri = "/02_w2v_model.bin"
    blob_w2v.download_to_filename(destination_uri)
    w2v_model = model.wv.load(destination_uri)
    word_vectors = w2v_model.wv
    logging.info('STEP: PREP EMB (1/3) Word2Vec model loaded.')
    # Load JSON tokenizer
    with open(json_tokenizer_path) as f:
        json_token = json.load(f)
    tokenizer = tokenizer_from_json(json_token)
    word_index = tokenizer.word_index
    vocabulary_size = min(len(word_index) + 1, num_words)
    logging.info('STEP: PREP EMB (2/3) Tokenizer loaded.')
    # Embeddings are real-valued; an int32 matrix would truncate them to zero
    embedding_matrix = np.zeros((vocabulary_size, embedding_dim), dtype=np.float32)
    for word, i in word_index.items():
        if i >= num_words:
            continue
        try:
            embedding_matrix[i] = word_vectors[word]
        except KeyError:
            embedding_matrix[i] = np.random.normal(0, np.sqrt(0.25), embedding_dim)
    del word_vectors
    logging.info('STEP: PREP EMB (3/3) Embedding matrix generated.')
    # Save the matrix to output_emb_matrix_path
    logging.info('Writing output data.')
    if not os.path.exists(os.path.dirname(output_emb_matrix_path)):
        os.makedirs(os.path.dirname(output_emb_matrix_path))
    if not os.path.exists(os.path.dirname(vocabulary_size_path)):
        os.makedirs(os.path.dirname(vocabulary_size_path))
    embedding_matrix.tofile(output_emb_matrix_path)
    with open(vocabulary_size_path, 'w') as f:
        f.write(str(vocabulary_size))
    logging.info('Prepare embeddings step finished.')
def tokenize(text):
    max_len = 128
    # load tokenizer
    with open(BASE_DIR + "tokenizer.json") as json_file:
        tok_json = json.load(json_file)
    tok = tokenizer_from_json(tok_json)
    # tokenize and pad to a fixed length
    sequences = tok.texts_to_sequences([text])
    return sequence.pad_sequences(sequences, maxlen=max_len)
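# Hedged example: feed the padded output of tokenize() to a loaded model.
# The model path and the 0.5 threshold are assumptions, not from the original:
# model = load_model(BASE_DIR + "model.h5")
# prob = model.predict(tokenize("sample text to score"))[0][0]
# label = int(prob >= 0.5)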
def prepare_embeddings(
        gcp_bucket: str, num_words: int, w2v_model_path: str,
        embedding_dim: int, json_tokenizer_path: InputPath(str),
        num_classes: int, output_emb_matrix_path: OutputPath(str)
) -> NamedTuple('PrepareEmbOutput', [('vocabulary_size', int)]):
    from gensim.models import Word2Vec
    from google.cloud import storage
    from tensorflow.keras.preprocessing.text import tokenizer_from_json
    import os
    import json
    import numpy as np
    from collections import namedtuple

    # Storage client for loading the w2v model
    storage_client = storage.Client()
    bucket = storage_client.bucket(gcp_bucket)

    # Load w2v model
    model = Word2Vec()
    blob_w2v = bucket.get_blob(w2v_model_path)
    destination_uri = '{}/{}'.format(".", blob_w2v.name)
    if not os.path.exists(destination_uri):
        os.mkdir("/model")
    blob_w2v.download_to_filename(destination_uri)
    w2v_model = model.wv.load(destination_uri)
    word_vectors = w2v_model.wv

    # Load JSON tokenizer
    # blob_tok = bucket.get_blob(json_tokenizer_path)
    with open(json_tokenizer_path) as f:
        json_token = json.load(f)
    tokenizer = tokenizer_from_json(json_token)
    word_index = tokenizer.word_index
    vocabulary_size = min(len(word_index) + 1, num_words)

    # Embeddings are real-valued; an int32 matrix would truncate them to zero
    embedding_matrix = np.zeros((vocabulary_size, embedding_dim), dtype=np.float32)
    for word, i in word_index.items():
        if i >= num_words:
            continue
        try:
            embedding_matrix[i] = word_vectors[word]
        except KeyError:
            embedding_matrix[i] = np.random.normal(0, np.sqrt(0.25), embedding_dim)
    del word_vectors

    # Save the matrix to output_emb_matrix_path
    embedding_matrix.tofile(output_emb_matrix_path)
    PrepareEmbOutput = namedtuple('PrepareEmbOutput', ['vocabulary_size'])
    return PrepareEmbOutput(vocabulary_size)
def tokenize(corpus, ngrams=1, grams_join=" "):
    lst_corpus = ngrams_preprocess(corpus, ngrams=ngrams, grams_join=grams_join)
    with open('\\Users\\Zeden\\Desktop\\tokenizer.json') as f:
        data = json.load(f)
    tokenizer = tokenizer_from_json(data)
    lst_text2seq = tokenizer.texts_to_sequences(lst_corpus)
    X = preprocessing.sequence.pad_sequences(
        lst_text2seq, maxlen=15, padding="post", truncating="post")
    return X
def predictor(sentence):
    sentence = [sentence]
    with open('tokenizer_2.json') as f:
        data = json.load(f)
    tokenizer = tokenizer_from_json(data)
    sequence = tokenizer.texts_to_sequences(sentence)
    padded = pad_sequences(sequence, padding='post', maxlen=100, truncating='post')
    model = load_model('fake_news_predictor_2.h5')
    # predict_classes() was removed from Keras; threshold the sigmoid output instead
    prediction = (model.predict(padded) > 0.5).astype('int32')
    return prediction[0][0]
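# Example call (a sketch; the headline is illustrative and the 0/1 label
# semantics depend on how the model was trained):
# label = predictor("Example headline to check")
# print("class:", label)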
def load_tokenizer(dict_path: AnyStr) -> Tokenizer:
    """
    Load the tokenizer utility.
    :param dict_path: path to the dictionary file
    :return: tokenizer
    """
    if not os.path.exists(dict_path):
        raise FileNotFoundError("Dictionary not found, please check and try again!")
    with open(dict_path, "r", encoding="utf-8") as dict_file:
        json_string = dict_file.read().strip().strip("\n")
    tokenizer = tokenizer_from_json(json_string=json_string)
    return tokenizer
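# Usage sketch (the path is illustrative). Because this loader reads the file
# as a raw string, it pairs with a save step that writes tokenizer.to_json()
# directly, e.g.:
#     with open("data/dict.json", "w", encoding="utf-8") as f:
#         f.write(tokenizer.to_json())
tokenizer = load_tokenizer("data/dict.json")
print(tokenizer.texts_to_sequences(["example sentence"]))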
def clean_split_pad_data(sentences, config_obj):
    """ Clean data, split into train/test groups, and pad sentences for training """
    print("Cleaning data (This may take a few minutes)...")
    if config_obj.language == "english":
        stopwords_list = set(stopwords.words('english'))
        important_words_english = [
            'above', 'below', 'off', 'over', 'under', 'more', 'most', 'such',
            'no', 'nor', 'not', 'only', 'so', 'than', 'too', 'very', 'just', 'but'
        ]
        stopwords_list = stopwords_list - set(important_words_english)
        sentences_clean = [
            " ".join(clean_words_eng(sentence, stopwords_list))
            for sentence in sentences
        ]
    elif config_obj.language == "spanish":
        nlp = spacy.load('es_core_news_sm', disable=['ner', 'parser'])
        stopwords_list = set(stopwords.words('spanish'))
        important_words_spanish = [
            'encima', 'debajo', 'menos', 'abajo', 'más', 'tal', 'no', 'ni',
            'solamente', 'entonces', 'que', 'demasiado', 'muy', 'también',
            'sólo', 'pero'
        ]
        stopwords_list = stopwords_list - set(important_words_spanish)
        sentences_clean = [
            " ".join(clean_words_esp(sentence, nlp, stopwords_list))
            for sentence in sentences
        ]
    # Load tokenizer from JSON file
    tokenizer_name = "tokenizer_" + config_obj.language[:2] + ".json"
    with open(tokenizer_name) as f:
        data = json.load(f)
    tokenizer = tokenizer_from_json(data)
    # Create padded sequences from the cleaned sentences
    sequences_clean = tokenizer.texts_to_sequences(sentences_clean)
    padded = pad_sequences(sequences_clean, maxlen=config_obj.max_len,
                           padding=config_obj.pad_type,
                           truncating=config_obj.trunc_type)
    # Convert the list to a numpy array
    sentences_np = np.array(padded)
    return sentences_np
def get_run_components(run_dir):
    # Load args
    config = utils.load_json(os.path.join(run_dir, 'config.json'))
    args = Namespace(**config)

    # Load tokenizers
    with open(os.path.join(run_dir, 'X_tokenizer.json'), 'r') as fp:
        X_tokenizer = tokenizer_from_json(json.load(fp))
    y_tokenizer = LabelEncoder()
    y_tokenizer.classes_ = np.load(os.path.join(run_dir, 'y_tokenizer.npy'),
                                   allow_pickle=True)

    # Load model
    model = models.TextCNN(embedding_dim=args.embedding_dim,
                           vocab_size=len(X_tokenizer.word_index) + 1,
                           num_filters=args.num_filters,
                           filter_sizes=args.filter_sizes,
                           hidden_dim=args.hidden_dim,
                           dropout_p=args.dropout_p,
                           num_classes=len(y_tokenizer.classes_))
    model.summary(input_shape=(10,))  # build it
    model_path = os.path.join(run_dir, 'model/cp.ckpt')
    model.load_weights(model_path)

    # Conv output model
    conv_outputs_model = models.ConvOutputsModel(
        vocab_size=len(X_tokenizer.word_index) + 1,
        embedding_dim=args.embedding_dim,
        filter_sizes=args.filter_sizes,
        num_filters=args.num_filters)
    conv_outputs_model.summary(input_shape=(10,))  # build it

    # Set weights
    conv_outputs_model.layers[0].set_weights(model.layers[0].get_weights())
    conv_layer_start_num = 1
    for layer_num in range(conv_layer_start_num,
                           conv_layer_start_num + len(args.filter_sizes)):
        conv_outputs_model.layers[layer_num].set_weights(
            model.layers[layer_num].get_weights())

    return args, model, conv_outputs_model, X_tokenizer, y_tokenizer
def train_embedding():
    MAX_WORDS = 10000
    EMBEDDING_DIM = 300
    embeddings_index = load_vectors(VECTORS)
    with open(TOKENIZER) as f:
        json_obj = json.load(f)
    tokenizer = tokenizer_from_json(json_obj)
    word_index = tokenizer.word_index
    num_words = min(MAX_WORDS, len(word_index) + 1)
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
    for word, i in word_index.items():
        if i >= num_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = list(embedding_vector)
    with open(EMBEDDING_MATRIX, "wb") as f:
        pickle.dump(embedding_matrix, f)
def fit_tokenizer(load_existing=False):
    '''
    Returns the tokenizer, creating it if it doesn't exist.

    load_existing (Type: boolean): whether to load an existing tokenizer or
    create a new one; useful when the training data has been updated.
    '''
    if load_existing:
        if subject != "":
            with open(subject + 'tokenizer.json') as f:
                data = json.load(f)
            tokenizer = tokenizer_from_json(data)
            return tokenizer
    else:
        if subject != "":
            tokenizer = Tokenizer(num_words=num_words, oov_token='UNK')
            tokenizer.fit_on_texts(patterns)
            tokenizer_json = tokenizer.to_json()
            with io.open(subject + 'tokenizer.json', 'w', encoding='utf-8') as f:
                f.write(json.dumps(tokenizer_json, ensure_ascii=False))
            return tokenizer
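# Round-trip sketch (assumes the module-level `subject`, `num_words`, and
# `patterns` globals the function references):
# tokenizer = fit_tokenizer(load_existing=False)  # fit on patterns and persist
# reloaded = fit_tokenizer(load_existing=True)    # later runs reuse the JSON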
def post(self, request):
    try:
        max_len = 150
        trunc_type = 'post'
        padding = 'post'
        model = load_model('api/res/model/sentiment_version_2.h5')
        print('Model loaded successfully.')
        with open('api/res/assets/tokenizer_version_2.json') as t:
            datas = json.load(t)
        tokenizer = tokenizer_from_json(datas)
        print('Tokenizer loaded successfully.')
        cleaned_review = CleanTokenize([request.data.get('review')])
        test_sequences = tokenizer.texts_to_sequences(cleaned_review)
        test_padded = pad_sequences(test_sequences, maxlen=max_len, padding=padding)
        prob = model.predict(test_padded)
        # Map the sigmoid probability onto a 1-5 rating scale
        if prob[0][0] <= 0.2:
            result = 1
        elif prob[0][0] <= 0.4:
            result = 2
        elif prob[0][0] <= 0.6:
            result = 3
        elif prob[0][0] <= 0.8:
            result = 4
        else:
            result = 5
        # Saving data to db Model
        result_data = {
            'review': request.data.get('review'),
            'rating': result
        }
        serializer = SentimentSerializer(data=result_data)
        if serializer.is_valid():
            serializer.save()
        return Response(data=serializer.data, status=status.HTTP_201_CREATED)
    except ValueError as V:
        return Response(data=V, status=status.HTTP_400_BAD_REQUEST)
def __init__(self, json_tokenizer, VOCAB_SIZE, max_seq_length=None,
             red_seq_from_common=True):
    self.tokenizer = tokenizer_from_json(json_tokenizer)
    self.VOCAB_SIZE = VOCAB_SIZE
    self.max_seq_length = max_seq_length
    self.red_seq_from_common = red_seq_from_common
    dictionary = self.tokenizer.word_index
    self.word2idx = {}
    self.idx2word = {}
    # keep only the VOCAB_SIZE most frequent words in the index maps
    for k, v in dictionary.items():
        if v < VOCAB_SIZE:
            self.word2idx[k] = v
            self.idx2word[v] = k
    self.idx_2_keep = [
        self.word2idx[BOS.strip()],
        self.word2idx[EOS.strip()]
    ]
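# Construction sketch (the class name VocabMapper is hypothetical; assumes the
# tokenizer JSON was written with f.write(tokenizer.to_json()) and that the
# module defines BOS/EOS constants):
# with open("tokenizer.json") as f:
#     mapper = VocabMapper(f.read(), VOCAB_SIZE=10000)
# print(mapper.idx2word[1])  # most frequent word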
def sentimentclassifier(request):
    try:
        max_length = 100
        trunc_type = 'post'
        mydata = request.data
        model = load_model('./app/model/sentiment.h5')
        with open('./app/model/tokenizer.json') as f:
            data = json.load(f)
        tokenizer = tokenizer_from_json(data)
        test_sequence = tokenizer.texts_to_sequences(list(mydata.values()))
        test_padded = pad_sequences(test_sequence, maxlen=max_length,
                                    padding='post', truncating=trunc_type)
        result = model.predict(test_padded)
        result = (result >= 0.5)
        new_df = pd.DataFrame(result, columns=['Status'])
        new_df = new_df.replace({True: 'Positive', False: 'Negative'})
        return Response(new_df)
    except ValueError as e:
        return Response(e.args[0], status.HTTP_400_BAD_REQUEST)
from tensorflow.keras.models import load_model
import json
from tensorflow.keras.preprocessing.text import tokenizer_from_json
import pandas as pd
import app.algorithms.process_predictions as pp

MAXWORDS = 3000

model = load_model("./data/models/nb_stream_fasttext_10k.h5")
with open("./data/tokenizer_stream_10k.json") as f:
    json_obj = json.load(f)
tokenizer = tokenizer_from_json(json_obj)

vid = "cnpUNEWP1i8"
channel = "MentourPilot"  # For file naming only
transcript, full_text, captionCount = pp.processVideo(vid)
predictions = pp.getPredictions(model, tokenizer, full_text)
df = pd.DataFrame(predictions)
words = full_text.split(" ")
df["text"] = words + ["N/A"] * (len(predictions) - len(words))
df.to_csv(f"./examples/{channel}_{vid}.csv", index=False)
sponsorTimestamps = pp.getTimestamps(transcript, captionCount, predictions, words)
print(sponsorTimestamps)
with open(f"./examples/{channel}_{vid}.txt", 'w') as file:
    file.write("Timestamps:\n")
    for ts in sponsorTimestamps:
        file.write(f"{ts}\n")  # assumed loop body: the source truncates here
def loadTokenizer(self):
    with open(self.MODEL_DIR + 'tokenizer.json', 'r') as f:
        data = json.load(f)
    self.tokenizer = tokenizer_from_json(data)
import tensorflow as tf
from tensorflow.keras.models import load_model
import json
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_length = 100
trunc_type = 'post'

model = load_model("sentiment.h5")
with open('tokenizer.json') as f:
    data = json.load(f)
tokenizer = tokenizer_from_json(data)

sentence = input("Give your reviews: ")

def classify(sentence):
    test_sequence = tokenizer.texts_to_sequences([sentence])
    test_padded = pad_sequences(test_sequence, maxlen=max_length,
                                padding='post', truncating=trunc_type)
    result = model.predict(test_padded)
    if result[0][0] >= 0.5:
        return 1
    else:
        return 0
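# The script reads a review but never calls classify(); a plausible entry
# point (an assumption, not in the original) would be:
# print("Positive" if classify(sentence) else "Negative")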