from operator import attrgetter

from tqdm import tqdm


def test(data, batch_size=64, filename='roc.png', **kwargs):
    """Restore a trained RNNModel from a checkpoint, score every sequence in
    `data`, and plot an ROC curve of the predictions."""
    global args
    assert args.checkpoint is not None
    model = RNNModel(feature_dims=data[0].feature_dim,
                     model_dir=args.output_dir, **kwargs)
    model.restore(args.checkpoint)
    # Drop samples without a sequence; they cannot be featurized.
    data = list(filter(lambda d: d.seq is not None, data))
    for i in tqdm(range(0, len(data), batch_size)):
        x, y, length = get_feature_label(data[i:i + batch_size],
                                         length_limit=10000)
        predictions = model.predict(x, length)
        for d, p in zip(data[i:i + batch_size], predictions):
            d.prediction = p
        # Alternative, length-gated scoring kept for reference:
        # if SimpleLengthModel.data_filter(data[i]):
        #     x, y, length = get_feature_label(data[i:i + batch_size],
        #                                      length_limit=1000)
        #     predictions = model.predict(x, length)
        #     for d, p in zip(data[i:i + batch_size], predictions):
        #         d.prediction = p
        # else:
        #     for d in data[i:i + batch_size]:
        #         d.prediction = 1 + d.length / 100000.0
    predictions = list(map(attrgetter('prediction'), data))
    labels = list(map(attrgetter('label'), data))
    plot_roc(predictions, labels, filename=filename)
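# Hedged usage sketch for test(): the function only assumes each sample has
# .feature_dim, .seq, .label, and a writable .prediction attribute, plus a
# global `args` carrying .checkpoint and .output_dir. The Sample class and
# the commented paths below are hypothetical illustrations, not part of the
# original code.
class Sample:
    def __init__(self, seq, label, feature_dim=40):
        self.seq = seq              # raw sequence consumed by get_feature_label()
        self.label = label          # ground-truth binary label for the ROC curve
        self.feature_dim = feature_dim
        self.prediction = None      # filled in by test()

# args.checkpoint = 'checkpoints/model.ckpt'   # hypothetical checkpoint path
# test([Sample(seq, label) for seq, label in raw_pairs], batch_size=64)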
import tensorflow as tf


def predicting(self):
    """Batch-predict labels for dish names read from data/predict_data
    (one tab-separated "type<TAB>dish_name" pair per line) and write the
    results to data/res."""
    vocab_to_id = get_vocab_to_id(self.train_data_path, self.vocab_file, False)
    data_helper = DataHelper(vocab_to_id)
    dishes = []
    file_data = []
    with open("data/predict_data", "r", encoding="utf-8") as reader:
        for line in reader:
            line_split = line.strip().split("\t")
            if len(line_split) != 2:
                continue  # skip malformed lines
            _, dish_name = line_split
            dishes.append(dish_name)
            file_data.append(line_split)
    batch = data_helper.create_prediction_batch(dishes)
    with tf.Session() as sess, open("data/res", "w", encoding="utf-8") as writer:
        model = RNNModel(self.rnn_size, self.embedding_size, self.class_num,
                         len(vocab_to_id), self.learning_rate, self.model_path)
        # Restore the latest checkpoint from the model directory.
        ckpt = tf.train.get_checkpoint_state(self.model_dir)
        model.saver.restore(sess, ckpt.model_checkpoint_path)
        prediction, pre_label = model.predict(sess, batch)
        pre_prob = sess.run(tf.nn.softmax(prediction))
        print(pre_label, pre_prob, prediction)
        for idx, label in enumerate(pre_label):
            writer.write("{}\t{}\t{}\n".format(
                file_data[idx][0], file_data[idx][1], label))
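# Expected layout of data/predict_data, inferred from the parsing above:
# exactly two tab-separated fields per line (lines with any other field
# count are skipped). The sample values are illustrative only.
#
#   dish<TAB>宫保鸡丁
#   dish<TAB>鱼香肉丝
#
# data/res mirrors the input with the predicted label appended as a third
# tab-separated column.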
import logging

import numpy as np


def predict_dajare(args):
    """Score a single dajare (Japanese pun) sentence with a trained RNNModel
    and return the predicted probability."""
    dajare_raw = args.dajare
    weights_path = args.weights_path
    vocab_data_path = args.vocab_data_path

    # Tokenize the raw sentence and map each token to its vocabulary id.
    tokenizer = TokenizerSpacy()
    dajare_words = tokenizer.tokenize_sentence(dajare_raw)
    logging.info(dajare_words)
    vocab = Vocab(vocab_data_path)
    dajare_labeled = vocab.convert_word2id(dajare_words)
    logging.info(dajare_labeled)

    # Hyperparameters must match those used at training time.
    batch_size = 30
    T = 25
    emb_size = 128
    hidden_size = 128
    dropout = 0.0
    lr = 1e-3
    vocab_size = vocab.vocab_num

    model = RNNModel(batch_size=batch_size, vocab_size=vocab_size,
                     emb_size=emb_size, hidden_size=hidden_size, T=T,
                     dropout=dropout, lr=lr, model_path=None)
    model.print_fn = logging.info
    model.load_weights(weights_path)
    probability = model.predict(np.array([dajare_labeled], dtype=np.float32))
    logging.info('Probability: %s', probability[0])
    return probability
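# Minimal invocation sketch for predict_dajare(), assuming an argparse CLI.
# The flag names mirror the attributes read from `args` above; the argparse
# wiring itself is an assumption, not from the original code.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--dajare', required=True)           # raw pun sentence
parser.add_argument('--weights_path', required=True)     # trained model weights
parser.add_argument('--vocab_data_path', required=True)  # vocabulary file
# predict_dajare(parser.parse_args())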
import spacy


def train(args):
    """Despite its name, this loads a trained checkpoint and runs a single
    prediction on the query sentence passed via the command line."""
    checkpoint_path = args.checkpoint_path
    dajare_sentence = args.query

    # Tokenize the query with GiNZA and keep the surface forms.
    nlp = spacy.load('ja_ginza_nopn')
    words = [w.orth_ for w in nlp(dajare_sentence)]

    # Hyperparameters must match those used at training time.
    batch_size = 32
    T = 32
    emb_size = 128
    hidden_size = 128
    dropout = 0.0
    lr = 1e-3

    data_gen = DataForGenerator(batch_size=batch_size, T=T)
    data_gen.load_vocab('./vocab.csv', vocab_size=50000)
    words_id, _ = data_gen.preprocess([words], None)
    vocab_size = len(data_gen.vocab.word2id)
    print("Vocab size:", vocab_size)

    model = RNNModel(batch_size=batch_size, vocab_size=vocab_size,
                     emb_size=emb_size, hidden_size=hidden_size, T=T,
                     dropout=dropout, lr=lr, model_path=None)
    model.load_weights(checkpoint_path)

    print(words)
    print(words_id)
    pred = model.predict(words_id[0])
    print(pred)
    print(pred.shape)
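# Hedged usage sketch: train() only reads .checkpoint_path and .query from
# its argument, so a SimpleNamespace works for ad-hoc calls without a full
# CLI. The checkpoint path and query below are hypothetical examples.
from types import SimpleNamespace

# train(SimpleNamespace(checkpoint_path='checkpoints/rnn_weights.h5',
#                       query='布団が吹っ飛んだ'))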
def predicting_2(self):
    """Stream batches from data/other_data through the restored model and
    write one predicted label per line to data/res_other."""
    vocab_to_id = get_vocab_to_id(self.train_data_path, self.vocab_file, False)
    data_helper = DataHelper(vocab_to_id)
    data_generator = SentenceGenerator("data/other_data")
    batch_manager = BatchManager(data_generator, self.batch_size, vocab_to_id)
    with tf.Session() as sess, open("data/res_other", "w", encoding="utf-8") as writer:
        model = RNNModel(self.rnn_size, self.embedding_size, self.class_num,
                         len(vocab_to_id), self.learning_rate, self.model_path)
        # Restore the latest checkpoint from the model directory.
        ckpt = tf.train.get_checkpoint_state(self.model_dir)
        model.saver.restore(sess, ckpt.model_checkpoint_path)
        for batch in batch_manager.getBatches():
            prediction, pre_label = model.predict(sess, batch)
            for label in pre_label:
                writer.write(str(label) + "\n")
def predicting_1(self):
    """Smoke-test prediction on a single hard-coded review sentence."""
    vocab_to_id = get_vocab_to_id(self.train_data_path, self.vocab_file, False)
    data_helper = DataHelper(vocab_to_id)
    # Sample review (Chinese): "A shop with a pretty good online reputation;
    # we called specially to have their barber come to our home and give our
    # little baby a haircut."
    dishes = [u"网上的口碑什么的蛮好的一家店 专门打了个电话让这边的师傅上门帮我们家的小宝宝理了一个头发"]
    batch = data_helper.create_prediction_batch(dishes)
    with tf.Session() as sess:
        model = RNNModel(self.rnn_size, self.embedding_size, self.class_num,
                         len(vocab_to_id), self.learning_rate, self.model_path)
        ckpt = tf.train.get_checkpoint_state(self.model_dir)
        model.saver.restore(sess, ckpt.model_checkpoint_path)
        prediction, pre_label = model.predict(sess, batch)
        pre_prob = sess.run(tf.nn.softmax(prediction))
        print(pre_label, pre_prob, prediction)
        for idx, label in enumerate(pre_label):
            print("{}\t{}".format(dishes[idx], data_helper.get_cats_name(label)))