import collections

from keras_sentiment_analysis.library.utility.tokenizer_utils import word_tokenize


def fit_text(data_file_path, max_vocab_size=None):
    if max_vocab_size is None:
        max_vocab_size = 5000

    counter = collections.Counter()
    max_len = 0
    labels = dict()

    # Scan the tab-separated "label<TAB>sentence" file to count tokens,
    # track the longest sentence and collect the label set.
    with open(data_file_path, mode='rt', encoding='utf8') as file:
        for line in file:
            label, sentence = line.strip().split('\t')
            tokens = [x.lower() for x in word_tokenize(sentence)]
            for token in tokens:
                counter[token] += 1
            max_len = max(max_len, len(tokens))
            if label not in labels:
                labels[label] = len(labels)

    # Keep only the most frequent tokens; words missing from word2idx fall back to index 0.
    word2idx = collections.defaultdict(int)
    for idx, (word, _) in enumerate(counter.most_common(max_vocab_size)):
        word2idx[word] = idx
    idx2word = {v: k for k, v in word2idx.items()}

    vocab_size = len(word2idx) + 1

    model = dict()
    model['word2idx'] = word2idx
    model['idx2word'] = idx2word
    model['vocab_size'] = vocab_size
    model['max_len'] = max_len
    model['labels'] = labels
    return model
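# Illustrative usage sketch (the data file path is an assumption borrowed from the
# demo script further down): build the text-data model once, then reuse the derived
# vocabulary, max sequence length and label mapping when training or predicting.
text_data_model = fit_text('./data/umich-sentiment-train.txt')
print(text_data_model['vocab_size'], text_data_model['max_len'], text_data_model['labels'])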
def predict(self, sentence):
    xs = []
    tokens = [w.lower() for w in word_tokenize(sentence)]
    # Out-of-vocabulary tokens are mapped to the index len(word2idx).
    wid = [self.word2idx[token] if token in self.word2idx else len(self.word2idx) for token in tokens]
    xs.append(wid)
    x = pad_sequences(xs, self.max_len)
    output = self.model.predict(x)
    return output[0]
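# Optional helper (a sketch, not part of the class above): predict() returns the raw
# class-probability vector, so mapping it back to a label name needs the 'labels'
# dictionary built by fit_text(). The helper name is hypothetical.
import numpy as np


def decode_prediction(scores, labels):
    # labels maps label name -> index; invert it and pick the highest-scoring class.
    idx2label = {v: k for k, v in labels.items()}
    return idx2label[int(np.argmax(scores))]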
def fit(self, text_data_model, text_label_pairs, model_dir_path,
        test_size=None, random_state=None, epochs=None, batch_size=None):
    if epochs is None:
        epochs = 10
    if batch_size is None:
        batch_size = 16
    if test_size is None:
        test_size = 0.3
    if random_state is None:
        random_state = 42

    self.config = text_data_model
    self.idx2word = self.config['idx2word']
    self.word2idx = self.config['word2idx']
    self.max_len = self.config['max_len']
    self.vocab_size = self.config['vocab_size']
    self.labels = self.config['labels']

    verbose = 1

    # Persist the text-data model so it can be reloaded at prediction time.
    config_file_path = WordVecMultiChannelCnn.get_config_file_path(model_dir_path)
    np.save(config_file_path, text_data_model)

    max_input_tokens = len(self.word2idx)
    self.model = self.define_model(self.max_len, max_input_tokens)
    with open(self.get_architecture_file_path(model_dir_path), 'wt') as f:
        f.write(self.model.to_json())

    # Convert each (text, label) pair into a sequence of word indices and a label index;
    # out-of-vocabulary words are mapped to index 0.
    xs = []
    ys = []
    for text, label in text_label_pairs:
        tokens = [x.lower() for x in word_tokenize(text)]
        wid_list = list()
        for w in tokens:
            wid = 0
            if w in self.word2idx:
                wid = self.word2idx[w]
            wid_list.append(wid)
        xs.append(wid_list)
        ys.append(self.labels[label])

    X = pad_sequences(xs, maxlen=self.max_len)
    Y = np_utils.to_categorical(ys, len(self.labels))

    weight_file_path = WordVecMultiChannelCnn.get_weight_file_path(model_dir_path)
    checkpoint = ModelCheckpoint(weight_file_path)

    # The multi-channel CNN takes the same padded input on each of its three channels.
    history = self.model.fit([X, X, X], Y,
                             epochs=epochs, batch_size=batch_size,
                             validation_split=test_size, verbose=verbose,
                             callbacks=[checkpoint])

    # save the model
    self.model.save(weight_file_path)
    np.save(model_dir_path + '/' + WordVecMultiChannelCnn.model_name + '-history.npy', history.history)

    return history
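# A hedged end-to-end sketch wiring fit_text() and fit() together. Assumptions:
# WordVecMultiChannelCnn (the class these methods belong to) can be constructed
# without arguments, and './models' is a writable directory; both are illustrative
# rather than fixed by the code above.
from keras_sentiment_analysis.library.utility.simple_data_loader import load_text_label_pairs

data_file_path = './data/umich-sentiment-train.txt'
text_data_model = fit_text(data_file_path)
text_label_pairs = load_text_label_pairs(data_file_path)

classifier = WordVecMultiChannelCnn()
history = classifier.fit(text_data_model, text_label_pairs,
                         model_dir_path='./models', epochs=10, batch_size=16)
print(classifier.predict('I really enjoyed this movie'))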
import os
import sys
from random import shuffle

import numpy as np
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences


def main():
    random_state = 42
    np.random.seed(random_state)

    current_dir = os.path.dirname(__file__)
    sys.path.append(os.path.join(current_dir, '..'))
    current_dir = current_dir if current_dir != '' else '.'

    data_file_path = current_dir + '/data/umich-sentiment-train.txt'

    from keras_sentiment_analysis.library.utility.simple_data_loader import load_text_label_pairs
    from keras_sentiment_analysis.library.utility.tokenizer_utils import word_tokenize

    text_label_pairs = load_text_label_pairs(data_file_path)
    shuffle(text_label_pairs)

    # Reload the vocabulary and max sequence length exported alongside the frozen graph:
    # the first line stores max_len, the remaining lines store "word<TAB>index" pairs.
    config_file_path = current_dir + '/models/tf/wordvec_cnn_lstm.csv'
    first_line = True
    max_len = 0
    word2idx = dict()
    with open(config_file_path, 'rt', encoding='utf-8') as f:
        for line in f:
            if first_line:
                first_line = False
                max_len = int(line.strip())
            else:
                if line.startswith('label'):
                    pass
                else:
                    word, idx = line.strip().split('\t')
                    idx = int(idx)
                    word2idx[word] = idx

    # Load the frozen TensorFlow graph (.pb) exported from the trained Keras model.
    with tf.gfile.FastGFile(current_dir + '/models/tf/wordvec_cnn_lstm.pb', 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
        _ = tf.import_graph_def(graph_def, name='')

    with tf.Session() as sess:
        for n in sess.graph.as_graph_def().node:
            print(n.name)
        predict_op = sess.graph.get_tensor_by_name('output_node0:0')

        for i in range(20):
            sentence, label = text_label_pairs[i]
            xs = []
            tokens = [w.lower() for w in word_tokenize(sentence)]
            wid = [word2idx[token] if token in word2idx else len(word2idx) for token in tokens]
            xs.append(wid)
            x = pad_sequences(xs, max_len)
            predicted = sess.run(predict_op,
                                 feed_dict={'embedding_1_input:0': x,
                                            'spatial_dropout1d_1/keras_learning_phase:0': 0})
            print(predicted)


if __name__ == '__main__':
    main()