Example #1
    def __init__(self, word2idx, embedding_dim, batch_size, n_hidden, learning_rate, n_class, max_sentence_len, l2_reg, embedding, dim_z, pri_prob_y, decoder_type, grad_clip, n_hidden_ae, position_enc, bidirection_enc, position_dec, bidirection_dec, classifier_type, sharefc):
        super(SemiTABSA, self).__init__()

        self.embedding_dim = embedding_dim
        self.batch_size = batch_size
        self.n_hidden = n_hidden
        self.learning_rate = learning_rate
        self.n_class = n_class
        self.max_sentence_len = max_sentence_len
        self.l2_reg = l2_reg
        self.word2idx = word2idx
        self.dim_z = dim_z
        self.decoder_type = decoder_type
        self.classifier_type = classifier_type
        self.grad_clip = grad_clip
        self.n_hidden_ae = n_hidden_ae
        self.pri_prob_y = tf.Variable(pri_prob_y, trainable=False)
        self.position_enc = position_enc
        self.bidirection_enc = bidirection_enc
        self.position_dec = position_dec
        self.bidirection_dec = bidirection_dec
        self.sharefc = sharefc

        if embedding is None:
            logger.info('No embedding given; initializing randomly')
            wemb_init = np.random.randn(len(word2idx), embedding_dim) * 1e-2
            self.embedding = tf.get_variable('embedding', [len(word2idx), embedding_dim], initializer=tf.constant_initializer(wemb_init))
        elif isinstance(embedding, np.ndarray):
            logger.info('Numerical embedding is given with shape {}'.format(str(embedding.shape)))
            #self.embedding = tf.constant(embedding, name='embedding')
            self.embedding = tf.get_variable('embedding', [len(word2idx), embedding_dim], initializer=tf.constant_initializer(embedding), trainable=False)
        elif isinstance(embedding, tf.Tensor) or isinstance(embedding, tf.Variable):
            logger.info('Import tensor as the embedding: {}'.format(embedding.name))
            self.embedding = embedding
        else:
            raise Exception('Embedding type {} is not supported'.format(type(embedding)))

        #TODO: Take the network graph building codes to a new module. 
        #self.classifier = self.create_classifier(self.classifier_type)
        with tf.variable_scope('classifier'):
            if self.classifier_type == "TC":
                self.classifier = TCClassifier(word2idx=word2idx, 
                        embedding_dim=embedding_dim, 
                        n_hidden=n_hidden, 
                        learning_rate=learning_rate, 
                        n_class=n_class, 
                        max_sentence_len=max_sentence_len, 
                        l2_reg=l2_reg, 
                        embedding=self.embedding,
                        grad_clip=self.grad_clip,
                        )
            elif self.classifier_type == "TD":
                pass  # TODO: TD classifier not implemented
            elif self.classifier_type == "MEM":
                #TODO: Add hyper-params Config.py
                self.classifier = MEMClassifier()
        
        with tf.variable_scope('encoder'):
            self.encoder = TCEncoder(word2idx=word2idx, 
                    embedding_dim=embedding_dim, 
                    n_hidden=n_hidden_ae, 
                    learning_rate=learning_rate, 
                    n_class=n_class, 
                    max_sentence_len=max_sentence_len, 
                    l2_reg=l2_reg, 
                    embedding=self.embedding,
                    dim_z=dim_z,
                    grad_clip=self.grad_clip,
                    position=self.position_enc,
                    bidirection=self.bidirection_enc,
                    )
        
        with tf.variable_scope('decoder'):
            self.decoder = TCDecoder(word2idx=word2idx, 
                    embedding_dim=embedding_dim, 
                    n_hidden=n_hidden_ae, 
                    learning_rate=learning_rate, 
                    n_class=n_class, 
                    max_sentence_len=max_sentence_len, 
                    l2_reg=l2_reg, 
                    embedding=self.embedding,
                    dim_z=dim_z,
                    decoder_type=self.decoder_type,
                    grad_clip=self.grad_clip,
                    position=self.position_dec,
                    bidirection=self.bidirection_dec,
                    sharefc=self.sharefc,
                    )

        self.klw = tf.placeholder(tf.float32, [], 'klw')
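The klw placeholder is the weight on the KL term, which is typically annealed from 0 to 1 during VAE training. A minimal sketch of a linear warm-up schedule that could feed it; the model and feed_dict names below are hypothetical stand-ins for the training loop's objects:

def kl_weight(step, warmup_steps=1000):
    """Linear KL annealing: ramp the weight from 0 to 1 over warmup_steps."""
    return min(1.0, step / float(warmup_steps))

# Hypothetical usage inside a training loop:
#   feed_dict[model.klw] = kl_weight(global_step)
#   sess.run(model.train_op, feed_dict=feed_dict)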
Example #2
    def __init__(self, word2idx, target2idx, embedding_dim, batch_size, n_hidden, learning_rate, n_class, max_sentence_len, l2_reg, word_embedding, target_embedding, dim_z, pri_prob_y, decoder_type, grad_clip, n_hidden_ae, position_enc, bidirection_enc, position_dec, bidirection_dec, classifier_type):
        super(SemiTABSA, self).__init__()

        self.embedding_dim = embedding_dim
        self.batch_size = batch_size
        self.n_hidden = n_hidden
        self.learning_rate = learning_rate
        self.n_class = n_class
        self.max_sentence_len = max_sentence_len
        self.l2_reg = l2_reg
        self.word2idx = word2idx
        self.target2idx = target2idx
        self.dim_z = dim_z
        self.decoder_type = decoder_type
        self.classifier_type = classifier_type
        self.grad_clip = grad_clip
        self.n_hidden_ae = n_hidden_ae
        self.pri_prob_y = tf.Variable(pri_prob_y, trainable=False)
        self.position_enc = position_enc
        self.bidirection_enc = bidirection_enc
        self.position_dec = position_dec
        self.bidirection_dec = bidirection_dec
        if word_embedding is None:
            logger.info('No embedding given; initializing randomly')
            wemb_init = np.random.randn(len(word2idx), embedding_dim) * 1e-2
            self.word_embedding = tf.get_variable('word_embedding', [len(word2idx), embedding_dim], initializer=tf.constant_initializer(wemb_init))
        elif isinstance(word_embedding, np.ndarray):
            logger.info('Numerical embedding is given with shape {}'.format(str(word_embedding.shape)))
            self.word_embedding = tf.constant(word_embedding, name='embedding')
            #self.word_embedding = tf.get_variable('word_embedding', [len(word2idx), embedding_dim], initializer=tf.constant_initializer(word_embedding))
        elif isinstance(word_embedding, tf.Tensor):
            logger.info('Import tensor as the embedding: {}'.format(word_embedding.name))
            self.word_embedding = word_embedding
        else:
            raise Exception('Embedding type {} is not supported'.format(type(word_embedding)))

        if target_embedding is None:
            logger.info('No embedding given; initializing randomly')
            wemb_init = np.random.randn(len(target2idx), embedding_dim) * 1e-2
            self.target_embedding = tf.get_variable('target_embedding', [len(target2idx), embedding_dim], initializer=tf.constant_initializer(wemb_init))
        elif isinstance(target_embedding, np.ndarray):
            logger.info('Numerical embedding is given with shape {}'.format(str(target_embedding.shape)))
            self.target_embedding = tf.constant(target_embedding, name='embedding')
#            self.target_embedding = tf.get_variable('target_embedding', [len(target2idx), embedding_dim], initializer=tf.constant_initializer(target_embedding))
        elif isinstance(target_embedding, tf.Tensor):
            logger.info('Import tensor as the embedding: {}'.format(target_embedding.name))
            self.target_embedding = target_embedding
        else:
            raise Exception('Embedding type {} is not supported'.format(type(target_embedding)))

        #TODO: Take the network graph building codes to a new module. 
        #self.classifier = self.create_classifier(self.classifier_type)
        with tf.variable_scope('classifier'):
            if self.classifier_type == "TC":
                self.classifier = TCClassifier(word2idx=word2idx, 
                        embedding_dim=embedding_dim, 
                        n_hidden=n_hidden, 
                        learning_rate=learning_rate, 
                        n_class=n_class, 
                        max_sentence_len=max_sentence_len, 
                        l2_reg=l2_reg, 
                        embedding=self.word_embedding,
                        grad_clip=self.grad_clip,
                        )
            elif self.classifier_type == "TD":
                pass
            elif self.classifier_type == "MEM":
                #TODO: Add hyper-params Config.py
                word_embedding = np.vstack((word_embedding, np.zeros([1, self.embedding_dim])))

                self.classifier = MEMClassifier(nwords=len(word2idx)+1,
                        word2idx=word2idx,
                        target2idx=target2idx,
                        init_hid=0.1,
                        init_std=0.01,
                        init_lr=0.01,
                        batch_size=self.batch_size,
                        nhop=3,
                        edim=self.embedding_dim,
                        mem_size=79,
                        lindim=300,
                        max_grad_norm=100,
                        pad_idx=len(word2idx),
                        pre_trained_context_wt=word_embedding,
                        pre_trained_target_wt=target_embedding)

            elif self.classifier_type == "IAN":
                 self.classifier = IANClassifier(word2idx=word2idx, 
                     embedding_dim=self.embedding_dim, 
                     n_hidden=self.n_hidden, 
                     learning_rate=self.learning_rate, 
                     n_class=self.n_class, 
                     max_sentence_len=self.max_sentence_len, 
                     l2_reg=self.l2_reg, 
                     embedding=word_embedding,
                     grad_clip=self.grad_clip)
                 

        with tf.variable_scope('encoder'):
            self.encoder = TCEncoder(word2idx=word2idx, 
                    embedding_dim=embedding_dim, 
                    n_hidden=n_hidden_ae, 
                    learning_rate=learning_rate, 
                    n_class=n_class, 
                    max_sentence_len=max_sentence_len, 
                    l2_reg=l2_reg, 
                    embedding=self.word_embedding,
                    dim_z=dim_z,
                    grad_clip=self.grad_clip,
                    position=self.position_enc,
                    bidirection=self.bidirection_enc,
                    )
        with tf.variable_scope('decoder'):
            self.decoder = TCDecoder(word2idx=word2idx, 
                    embedding_dim=embedding_dim, 
                    n_hidden=n_hidden_ae, 
                    learning_rate=learning_rate, 
                    n_class=n_class, 
                    max_sentence_len=max_sentence_len, 
                    l2_reg=l2_reg, 
                    embedding=self.word_embedding,
                    dim_z=dim_z,
                    decoder_type=self.decoder_type,
                    grad_clip=self.grad_clip,
                    position=self.position_dec,
                    bidirection=self.bidirection_dec,
                    )

        self.klw = tf.placeholder(tf.float32, [], 'klw')
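Because the classifier, encoder, and decoder are built under separate variable scopes, each component's parameters can be retrieved independently, e.g. to clip or optimize them separately. A small sketch using standard TF1 collections; the split into classifier vs. autoencoder variables is illustrative, not from the source:

import tensorflow as tf

# Collect trainable variables by the scopes defined in __init__ above.
cls_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='classifier')
ae_vars = (tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='encoder')
           + tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='decoder'))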
def main(_):
    FLAGS = tf.app.flags.FLAGS

    import time, datetime
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    save_dir = FLAGS.save_dir + '/selftraining/'

    train = pkl.load(open(FLAGS.train_file_path, 'rb'), encoding='latin')
    unlabel = pkl.load(open(FLAGS.unlabel_file_path, 'rb'), encoding='latin')[:FLAGS.n_unlabel]
    test = pkl.load(open(FLAGS.test_file_path, 'rb'), encoding='latin')
    val = pkl.load(open(FLAGS.validate_file_path, 'rb'), encoding='latin')

    fns = [FLAGS.train_file_path, FLAGS.test_file_path, FLAGS.unlabel_file_path]
    #data_dir = 'classifier/data/rest/bilstmattg-cbow/'
    data_dir = 'classifier/data/rest/tclstm/'
    emb_file = "../../../data/glove.6B/glove.6B.300d.txt"
    #emb_file = "../../../data/glove.840B/glove.840B.300d.txt"
    #emb_file = "../../../data/se2014task06//tabsa-rest/cbow.unlabel.300d.txt"
    word2idx, word_embedding = preprocess_data(fns, emb_file, data_dir)

    configproto = tf.ConfigProto()
    configproto.gpu_options.allow_growth = True
    configproto.allow_soft_placement = True
    with tf.Session(config=configproto) as sess:
        
        """
        if word_embedding is None:
            logger.info('No embedding is given, initialized randomly')
            wemb_init = np.random.randn([len(word2idx), embedding_dim]) * 1e-2
            word_embedding = tf.get_variable('word_embedding', [len(word2idx), embedding_dim], initializer=tf.constant_initializer(wemb_init))
        elif isinstance(word_embedding, np.ndarray):
            logger.info('Numerical embedding is given with shape {}'.format(str(word_embedding.shape)))
            word_embedding = tf.constant(word_embedding, name='embedding')
            #self.word_embedding = tf.get_variable('word_embedding', [len(word2idx), embedding_dim], initializer=tf.constant_initializer(word_embedding))
        elif isinstance(word_embedding, tf.Tensor):
            logger.info('Import tensor as the embedding: '.format(word_embedding.name))
            word_embedding = word_embedding
        else:
            raise Exception('Embedding type {} is not supported'.format(type(word_embedding)))

        if target_embedding is None:
            logger.info('No embedding is given, initialized randomly')
            wemb_init = np.random.randn([len(target2idx), embedding_dim]) * 1e-2
            target_embedding = tf.get_variable('target_embedding', [len(target2idx), embedding_dim], initializer=tf.constant_initializer(wemb_init))
        elif isinstance(target_embedding, np.ndarray):
            logger.info('Numerical embedding is given with shape {}'.format(str(target_embedding.shape)))
            target_embedding = tf.constant(target_embedding, name='embedding')
#            self.target_embedding = tf.get_variable('target_embedding', [len(target2idx), embedding_dim], initializer=tf.constant_initializer(target_embedding))
        elif isinstance(target_embedding, tf.Tensor):
            logger.info('Import tensor as the embedding: '.format(target_embedding.name))
            target_embedding = target_embedding
        else:
            raise Exception('Embedding type {} is not supported'.format(type(embedding)))
        """

        #TODO: Take the network graph building codes to a new module. 
        #self.classifier = self.create_classifier(self.classifier_type)

        classifier = TCClassifier(word2idx=word2idx, 
                     embedding_dim=FLAGS.embedding_dim, 
                     n_hidden=FLAGS.n_hidden, 
                     learning_rate=FLAGS.learning_rate, 
                     n_class=FLAGS.n_class, 
                     max_sentence_len=FLAGS.max_sentence_len, 
                     l2_reg=FLAGS.l2_reg, 
                     embedding=word_embedding,
                     grad_clip=FLAGS.grad_clip)

        # Initialize variables only after the classifier graph has been built.
        tf.global_variables_initializer().run()

        selftraining(sess, classifier, train, unlabel, test, FLAGS)

    return
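
main(_) follows the TF1 tf.app.flags / tf.app.run convention; a typical entry point for such a script (assuming the flags are defined at module level, which this snippet does not show) would be:

if __name__ == '__main__':
    tf.app.run()  # parses command-line flags, then invokes main(argv)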