def predict(self, sentence):
    xs = []
    # tokens = [w.lower() for w in word_tokenize(sentence)]
    tokens = [w for w in word_tokenize(sentence)]
    # map each token to its vocabulary index; tokens not in word2idx fall back to index 1
    wid = [self.word2idx[token] if token in self.word2idx else 1 for token in tokens]
    xs.append(wid)
    x = pad_sequences(xs, self.max_len)
    output = self.model.predict(x)
    return output[0]
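# Illustrative, self-contained sketch (not from the original source) of the token-to-index
# mapping and padding step that predict() performs, using a toy vocabulary. The import path
# below is an assumption matching the classic Keras API used elsewhere in this code.
from keras.preprocessing.sequence import pad_sequences

toy_word2idx = {'the': 2, 'dog': 3, 'barked': 4}            # hypothetical vocabulary
toy_tokens = ['the', 'dog', 'barked', 'loudly']             # 'loudly' is out of vocabulary
toy_wid = [toy_word2idx[t] if t in toy_word2idx else 1 for t in toy_tokens]
print(pad_sequences([toy_wid], maxlen=8))                   # -> [[0 0 0 0 2 3 4 1]]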
def fit_text(data_file_path, label_type, max_vocab_size=None):
    if max_vocab_size is None:
        max_vocab_size = 5000
    counter = collections.Counter()
    file = open(data_file_path, mode='rt', encoding='utf8')
    next(file)  # skip header
    max_len = 0
    labels = dict()
    for line in file:
        lst = line.strip().split(',')
        sentence = lst[0]
        if label_type == 'Predicate':
            label = lst[1]
        elif label_type == 'FrameNet':
            label = lst[2]
        # tokens = [x.lower() for x in word_tokenize(sentence)]
        tokens = [x for x in word_tokenize(sentence)]  # keep the original casing of words
        for token in tokens:
            counter[token] += 1
        max_len = max(max_len, len(tokens))
        if label not in labels:
            # assign each new label an index starting from 0 (this is not a per-label count)
            labels[label] = len(labels)
    file.close()
    word2idx = collections.defaultdict(int)
    for idx, word in enumerate(counter.most_common(max_vocab_size)):
        word2idx[word[0]] = idx
    idx2word = {v: k for k, v in word2idx.items()}
    vocab_size = len(word2idx) + 1
    model = dict()
    model['word2idx'] = word2idx
    model['idx2word'] = idx2word
    model['vocab_size'] = vocab_size
    model['max_len'] = max_len
    model['labels'] = labels
    return model
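# Usage sketch for fit_text() (illustrative, not from the original source): write a tiny CSV
# in the expected layout (sentence, Predicate label, FrameNet label) to a temporary file and
# inspect the returned text model. The rows below are made up purely for demonstration.
import tempfile

_demo_rows = [
    'sentence,predicate,frame',                    # header line is skipped by fit_text
    'He bought a car,buy,Commerce_buy',
    'She sold the house,sell,Commerce_sell',
]
with tempfile.NamedTemporaryFile('w', suffix='.csv', delete=False, encoding='utf8') as f:
    f.write('\n'.join(_demo_rows))
    demo_csv_path = f.name

text_model = fit_text(demo_csv_path, label_type='FrameNet')
print(text_model['vocab_size'], text_model['max_len'], text_model['labels'])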
def fit(self, text_data_model, text_label_pairs, model_dir_path,
        batch_size=None, epochs=None, test_size=None, random_state=None):
    if batch_size is None:
        batch_size = 64
    if epochs is None:
        epochs = 20
    if test_size is None:
        test_size = 0.3
    if random_state is None:
        random_state = 42

    self.config = text_data_model
    self.idx2word = self.config['idx2word']
    self.word2idx = self.config['word2idx']
    self.max_len = self.config['max_len']
    self.vocab_size = self.config['vocab_size']
    self.labels = self.config['labels']

    np.save(self.get_config_file_path(model_dir_path), self.config)

    self.create_model()
    json = self.model.to_json()
    open(self.get_architecture_file_path(model_dir_path), 'w').write(json)

    xs = []
    ys = []
    for text, label in text_label_pairs:
        # tokens = [x.lower() for x in word_tokenize(text)]
        tokens = [x for x in word_tokenize(text)]
        wid_list = list()
        for w in tokens:
            wid = 0
            if w in self.word2idx:
                wid = self.word2idx[w]
            wid_list.append(wid)
        xs.append(wid_list)
        ys.append(self.labels[str(label)])

    X = pad_sequences(xs, maxlen=self.max_len)
    Y = np_utils.to_categorical(ys, len(self.labels))

    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, stratify=Y, random_state=random_state)

    print('===========================================')
    print('Below are the shapes of the train/test datasets.')
    print('===========================================')
    print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)
    print('===========================================')

    weight_file_path = self.get_weight_file_path(model_dir_path)
    checkpoint = ModelCheckpoint(weight_file_path)

    print('===========================================')
    print('========= Now training the model... =======')
    print('===========================================')

    history = self.model.fit(x=x_train, y=y_train,
                             batch_size=batch_size, epochs=epochs,
                             validation_data=(x_test, y_test),
                             callbacks=[checkpoint],
                             verbose=1)
    self.model.save_weights(weight_file_path)

    np.save(model_dir_path + '/' + WordVecCnn.model_name + '-history.npy', history.history)

    # score = self.model.evaluate(x=x_test, y=y_test, batch_size=batch_size, verbose=1)
    # print('score: ', score[0])
    # print('accuracy: ', score[1])
    # print('f1: ', score[2])
    # print('precision: ', score[3])
    # print('recall: ', score[4])

    return history
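# Hypothetical end-to-end sketch for WordVecCnn (not from the original source): the
# WordVecCnn() constructor, the load_text_label_pairs() helper and the paths below are
# assumptions made only for illustration of how fit_text() and fit() are wired together.
# text_model = fit_text('./data/frame_data.csv', label_type='FrameNet')
# text_label_pairs = load_text_label_pairs('./data/frame_data.csv', label_type='FrameNet')
# classifier = WordVecCnn()
# history = classifier.fit(text_data_model=text_model,
#                          text_label_pairs=text_label_pairs,
#                          model_dir_path='./models',
#                          batch_size=64, epochs=20)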
def fit(self, text_data_model, text_label_pairs, model_dir_path,
        test_size=None, random_state=None, epochs=None, batch_size=None):
    if epochs is None:
        epochs = 20
    if batch_size is None:
        batch_size = 32
    if test_size is None:
        test_size = 0.3
    if random_state is None:
        random_state = 42

    self.config = text_data_model
    self.idx2word = self.config['idx2word']
    self.word2idx = self.config['word2idx']
    self.max_len = self.config['max_len']
    self.vocab_size = self.config['vocab_size']
    self.labels = self.config['labels']

    verbose = 1

    config_file_path = WordVecMultiChannelCnn.get_config_file_path(model_dir_path)
    np.save(config_file_path, text_data_model)

    max_input_tokens = len(self.word2idx)
    self.model = self.define_model(self.max_len, max_input_tokens)
    open(self.get_architecture_file_path(model_dir_path), 'wt').write(self.model.to_json())

    xs = []
    ys = []
    for text, label in text_label_pairs:
        # tokens = [x.lower() for x in word_tokenize(text)]
        tokens = [x for x in word_tokenize(text)]
        wid_list = list()
        for w in tokens:
            wid = 0
            if w in self.word2idx:
                wid = self.word2idx[w]
            wid_list.append(wid)
        xs.append(wid_list)
        ys.append(self.labels[str(label)])

    X = pad_sequences(xs, maxlen=self.max_len)
    Y = np_utils.to_categorical(ys, len(self.labels))

    weight_file_path = WordVecMultiChannelCnn.get_weight_file_path(model_dir_path)
    checkpoint = ModelCheckpoint(weight_file_path)

    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, stratify=Y, random_state=random_state)

    print('===========================================')
    print('Below are the shapes of the train/test datasets.')
    print('===========================================')
    print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)
    print('===========================================')
    print()
    print('===========================================')
    print('========= Now training the model... =======')
    print('===========================================')

    # the same padded input is fed to each of the three channels of the multi-channel CNN
    history = self.model.fit([x_train, x_train, x_train], y_train,
                             epochs=epochs, batch_size=batch_size,
                             validation_data=([x_test, x_test, x_test], y_test),
                             verbose=verbose, callbacks=[checkpoint])

    # save the model
    self.model.save(weight_file_path)

    np.save(model_dir_path + '/' + WordVecMultiChannelCnn.model_name + '-history.npy',
            history.history)

    return history
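# Sketch (not from the original source) of reading back the training history saved above;
# the './models' directory is an assumed model_dir_path used only for illustration. np.save
# stores the history dict as a pickled object, hence allow_pickle=True and .item().
# hist = np.load('./models/' + WordVecMultiChannelCnn.model_name + '-history.npy',
#                allow_pickle=True).item()
# print(hist.get('val_acc', hist.get('val_accuracy')))   # key name depends on the Keras version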