class NNClassifier(Classifier):
    """Feed-forward neural-network classifier trained on doc2vec vectors.

    Documents are vectorized with ``self.doc2vec.infer_docvec`` and fed in
    mini-batches to the network built by ``NN`` from ``self.nn_des``.
    """

    def __init__(self, doc2vec):
        super(NNClassifier, self).__init__(doc2vec)
        # Layer-by-layer description handed to NN.
        # NOTE(review): 100 presumably matches the doc2vec vector size and
        # 59 the number of topics -- confirm against the corpus/model.
        self.nn_des = {
            'layer_description': [
                {
                    'name': 'input',
                    'unit_size': 100,
                },
                {
                    'name': 'hidden1',
                    'active_fun': tf.nn.relu,
                    'unit_size': 400,
                },
                {
                    'name': 'output',
                    'active_fun': None,
                    'unit_size': 59,
                },
            ],
        }
        # Number of passes over the training data performed by fit().
        self.max_pass = 5000
        # Number of examples per mini-batch.
        self.batch_size = 10000
        # Report the average loss every `step_to_report_loss` passes.
        self.step_to_report_loss = 5
        # Evaluate train/test accuracy every `step_to_eval` passes.
        self.step_to_eval = 10
        self.nn_model = NN(self.nn_des)
        self.learning_rate = 0.01

    def get_data(self, train_docs):
        """Vectorize documents.

        Args:
            train_docs: iterable of documents exposing ``words``,
                ``topic_id``, ``doc_no``, ``tags``, ``topic`` and ``url``.

        Returns:
            A tuple ``(X, y, infos)``: ``X`` is a numpy array of the inferred
            document vectors, ``y`` the list of topic ids and ``infos`` a
            list of ``(doc_no, tags, topic_id, topic, url)`` tuples.
        """
        Log.info(self, 'Get vectorized data for corpus...')
        X = []
        y = []
        infos = []
        for doc in train_docs:
            X.append(self.doc2vec.infer_docvec(doc.words))
            y.append(doc.topic_id)
            infos.append(
                (doc.doc_no, doc.tags, doc.topic_id, doc.topic, doc.url))
        return np.array(X), y, infos

    def batch_iter(self, X, y):
        """Yield shuffled mini-batches ``(batch_X, batch_y)``.

        The data is shuffled once per call; the last batch may be smaller
        than ``self.batch_size``. Nothing is yielded when ``y`` is empty.

        Args:
            X: numpy array of examples, indexed along its first axis.
            y: sequence of labels aligned with ``X``.
        """
        num_examples = len(y)
        # Ceiling division so that a trailing partial batch is included.
        max_step = (num_examples + self.batch_size - 1) // self.batch_size

        # Shuffle examples and labels with the same permutation.
        perm = np.arange(num_examples)
        np.random.shuffle(perm)
        X = X[perm]
        y = np.array(y)[perm].tolist()

        for step in range(max_step):
            start = step * self.batch_size
            end = start + self.batch_size
            yield X[start:end, :], y[start:end]

    def evaluate(self, sess, eval_op, X, Y, x_data, y_data):
        """Compute the score of ``eval_op`` over a data set.

        Args:
            sess: session used to run ``eval_op``.
            eval_op: op whose per-batch result is accumulated as the number
                of correct predictions.
            X: unused; batches are fed through ``self.X``. Kept so existing
                callers keep working.
            Y: unused; labels are fed through ``self.Y``. Kept so existing
                callers keep working.
            x_data: numpy array of examples.
            y_data: sequence of labels aligned with ``x_data``.

        Returns:
            A tuple ``(score, total)`` where ``score`` is the accumulated
            count divided by ``total`` (0.0 when the data set is empty) and
            ``total`` is the number of evaluated examples.
        """
        true_count = 0
        total = 0
        for batch_X, batch_y in self.batch_iter(x_data, y_data):
            true_count += sess.run(
                eval_op, feed_dict={
                    self.X: batch_X,
                    self.Y: batch_y
                })
            total += len(batch_y)
        if total == 0:
            # Empty data set: avoid a ZeroDivisionError.
            return 0.0, 0
        return true_count / float(total), total

    def fit(self, train_docs, test_docs):
        """Train the network for ``self.max_pass`` passes over the data.

        The average loss is printed every ``self.step_to_report_loss``
        passes and the train/test scores every ``self.step_to_eval`` passes
        (and on the last pass in both cases). Whenever the test score
        improves on the best one seen so far, the session is saved to
        ``'nn.best.model'``.

        Args:
            train_docs: documents used for training.
            test_docs: documents used for evaluation and model selection.
        """
        X_train, y_train, infos_train = self.get_data(train_docs)
        X_test, y_test, infos_test = self.get_data(test_docs)

        with tf.Graph().as_default():
            self.X = tf.placeholder(tf.float32, shape=(None, None))
            self.Y = tf.placeholder(tf.int32, shape=(None))
            self.predict_op = self.nn_model.inference(self.X)
            self.loss_op = self.nn_model.loss(self.predict_op, self.Y)
            self.train_op = self.nn_model.training(self.loss_op,
                                                   self.learning_rate)
            self.eval_op = self.nn_model.evaluation(self.predict_op, self.Y)

            # The session is kept on the instance after training.
            self.sess = tf.Session()
            saver = tf.train.Saver()
            # NOTE(review): newer TensorFlow 1.x releases provide
            # tf.global_variables_initializer() as the replacement.
            init = tf.initialize_all_variables()
            self.sess.run(init)

            best_acc = -1
            # Bounded by max_pass; the previous `while True` never returned.
            for pas in range(self.max_pass):
                is_last_pass = pas + 1 == self.max_pass
                # Constant for the whole pass, so computed once per pass.
                report_loss = (pas % self.step_to_report_loss == 0
                               or is_last_pass)

                loss_arr = []
                for batch_X, batch_y in self.batch_iter(X_train, y_train):
                    _, loss_value = self.sess.run(
                        [self.train_op, self.loss_op],
                        feed_dict={
                            self.X: batch_X,
                            self.Y: batch_y
                        })
                    if report_loss:
                        loss_arr.append(loss_value)

                if len(loss_arr) > 0:
                    print('------pas %d, average loss: %0.3f' %
                          (pas, np.mean(loss_arr)))

                if pas % self.step_to_eval == 0 or is_last_pass:
                    train_score, train_total = self.evaluate(
                        self.sess, self.eval_op, self.X, self.Y, X_train,
                        y_train)
                    test_score, test_total = self.evaluate(
                        self.sess, self.eval_op, self.X, self.Y, X_test,
                        y_test)
                    print(
                        '======train score: {}({}), test_score: {}({})'.format(
                            train_score, train_total, test_score, test_total))
                    if test_score > best_acc:
                        # Keep a checkpoint of the best model on test data.
                        saver.save(self.sess, 'nn.best.model')
                        best_acc = test_score

    def predict(self, doc_words):
        """Not implemented yet."""
        pass

    def score(self, train_docs):
        """Not implemented yet."""
        pass