コード例 #1
0
ファイル: main.py プロジェクト: VNGResearch/doc2vec
class NNClassifier(Classifier):
    """Neural-network topic classifier trained on doc2vec document vectors.

    Each document is turned into a vector with ``doc2vec.infer_docvec`` and
    fed to the network described by ``self.nn_des`` (built by the
    project-level ``NN`` helper). Training runs in mini-batches and saves a
    checkpoint whenever the test accuracy improves.
    """

    def __init__(self, doc2vec):
        """Set up the network description and the training hyper-parameters.

        Args:
            doc2vec: document-vector model handed to the ``Classifier`` base
                class; it must provide ``infer_docvec(words)``.
                NOTE(review): presumably the base class stores it as
                ``self.doc2vec`` -- confirm.
        """
        super(NNClassifier, self).__init__(doc2vec)

        # Layer sizes consumed by NN. NOTE(review): 100 presumably matches
        # the doc2vec vector size and 59 the number of topic ids -- confirm.
        self.nn_des = {
            'layer_description': [
                {
                    'name': 'input',
                    'unit_size': 100,
                },
                {
                    'name': 'hidden1',
                    'active_fun': tf.nn.relu,
                    'unit_size': 400,
                },
                {
                    'name': 'output',
                    'active_fun': None,
                    'unit_size': 59,
                },
            ],
        }
        # Number of passes over the training data performed by fit().
        self.max_pass = 5000
        self.batch_size = 10000
        # Report the average loss / evaluate every N passes.
        self.step_to_report_loss = 5
        self.step_to_eval = 10
        self.nn_model = NN(self.nn_des)
        self.learning_rate = 0.01

    def get_data(self, train_docs):
        """Vectorize a corpus.

        Args:
            train_docs: iterable of documents exposing ``words``,
                ``topic_id``, ``doc_no``, ``tags``, ``topic`` and ``url``.

        Returns:
            A tuple ``(X, y, infos)``: ``X`` is a NumPy array built from the
            inferred document vectors, ``y`` is the list of topic ids and
            ``infos`` is a list of
            ``(doc_no, tags, topic_id, topic, url)`` tuples, all in corpus
            order.
        """
        Log.info(self, 'Get vectorized data for corpus...')
        X = []
        y = []
        infos = []
        for doc in train_docs:
            X.append(self.doc2vec.infer_docvec(doc.words))
            y.append(doc.topic_id)
            infos.append(
                (doc.doc_no, doc.tags, doc.topic_id, doc.topic, doc.url))
        return np.array(X), y, infos

    def batch_iter(self, X, y):
        """Yield shuffled mini-batches of ``(X, y)``.

        The data is shuffled once per call (using NumPy's global random
        state) and then cut into consecutive slices of ``self.batch_size``
        examples; the last batch may be smaller. Nothing is yielded for an
        empty data set.

        Args:
            X: 2-D NumPy array with one row per example.
            y: sequence of labels, same length as ``X``.

        Yields:
            ``(batch_X, batch_y)`` where ``batch_X`` is a NumPy array slice
            and ``batch_y`` is a plain list.
        """
        num_examples = len(y)
        if num_examples == 0:
            return

        # Apply the same random permutation to features and labels.
        perm = np.arange(num_examples)
        np.random.shuffle(perm)
        X = X[perm]
        y = np.array(y)[perm].tolist()

        for start in range(0, num_examples, self.batch_size):
            stop = start + self.batch_size
            yield X[start:stop, :], y[start:stop]

    def evaluate(self, sess, eval_op, X, Y, x_data, y_data):
        """Compute the score of ``eval_op`` over a data set, batch by batch.

        Args:
            sess: TensorFlow session used to run ``eval_op``.
            eval_op: evaluation op; its per-batch results are summed.
                NOTE(review): presumably the number of correct predictions
                in the batch -- confirm against ``NN.evaluation``.
            X: feature placeholder to feed; ``self.X`` is used when None.
            Y: label placeholder to feed; ``self.Y`` is used when None.
            x_data: 2-D NumPy array of example vectors.
            y_data: sequence of labels matching ``x_data``.

        Returns:
            ``(score, total)`` where ``score`` is the summed ``eval_op``
            result divided by the number of examples and ``total`` is that
            number. ``(0.0, 0)`` is returned for an empty data set.
        """
        # Honour the placeholders passed by the caller; compare with None
        # explicitly because tensors must not be used in a boolean context.
        x_placeholder = self.X if X is None else X
        y_placeholder = self.Y if Y is None else Y

        true_count = 0
        total = 0
        for batch_X, batch_y in self.batch_iter(x_data, y_data):
            true_count += sess.run(eval_op,
                                   feed_dict={
                                       x_placeholder: batch_X,
                                       y_placeholder: batch_y
                                   })
            total += len(batch_y)

        if total == 0:
            # Avoid ZeroDivisionError when there is nothing to evaluate.
            return 0.0, 0
        return true_count / float(total), total

    def fit(self, train_docs, test_docs):
        """Train the network for ``self.max_pass`` passes over the corpus.

        Builds a fresh TensorFlow graph, stores the placeholders, ops and
        session on ``self`` and trains with mini-batches. Every
        ``self.step_to_report_loss`` passes (and on the last pass) the
        average batch loss is printed; every ``self.step_to_eval`` passes
        (and on the last pass) train and test scores are printed and the
        model is saved to ``'nn.best.model'`` if the test score improved.

        The session is left open in ``self.sess`` after training.

        Args:
            train_docs: training documents (see ``get_data``).
            test_docs: held-out documents used for model selection.
        """
        X_train, y_train, _ = self.get_data(train_docs)
        X_test, y_test, _ = self.get_data(test_docs)

        with tf.Graph().as_default():
            self.X = tf.placeholder(tf.float32, shape=(None, None))
            self.Y = tf.placeholder(tf.int32, shape=(None))

            self.predict_op = self.nn_model.inference(self.X)
            self.loss_op = self.nn_model.loss(self.predict_op, self.Y)
            self.train_op = self.nn_model.training(self.loss_op,
                                                   self.learning_rate)
            self.eval_op = self.nn_model.evaluation(self.predict_op, self.Y)

            self.sess = tf.Session()
            saver = tf.train.Saver()
            # initialize_all_variables is deprecated; prefer its replacement
            # when the installed TensorFlow provides it.
            if hasattr(tf, 'global_variables_initializer'):
                init = tf.global_variables_initializer()
            else:
                init = tf.initialize_all_variables()
            self.sess.run(init)

            best_acc = -1
            # Bounded by max_pass so that training terminates.
            for pas in range(self.max_pass):
                is_last_pass = pas + 1 == self.max_pass
                report_loss = (pas % self.step_to_report_loss == 0
                               or is_last_pass)

                loss_arr = []
                for batch_X, batch_y in self.batch_iter(X_train, y_train):
                    _, loss_value = self.sess.run(
                        [self.train_op, self.loss_op],
                        feed_dict={
                            self.X: batch_X,
                            self.Y: batch_y
                        })
                    if report_loss:
                        loss_arr.append(loss_value)

                if loss_arr:
                    print('------pas %d, average loss: %0.3f' %
                          (pas, np.mean(loss_arr)))

                if pas % self.step_to_eval == 0 or is_last_pass:
                    train_score, train_total = self.evaluate(
                        self.sess, self.eval_op, self.X, self.Y, X_train,
                        y_train)
                    test_score, test_total = self.evaluate(
                        self.sess, self.eval_op, self.X, self.Y, X_test,
                        y_test)
                    print(
                        '======train score: {}({}), test_score: {}({})'.format(
                            train_score, train_total, test_score, test_total))
                    # Keep only the checkpoint with the best test score.
                    if test_score > best_acc:
                        saver.save(self.sess, 'nn.best.model')
                        best_acc = test_score

    def predict(self, doc_words):
        """Not implemented yet; currently returns None."""
        pass

    def score(self, train_docs):
        """Not implemented yet; currently returns None."""
        pass