def _build_graph(self):
    """Run every graph-construction stage in order and log the outcome.

    The stages are invoked one after another on ``self``; afterwards the
    elapsed wall-clock time and the value returned by ``total_params`` for
    ``tf.trainable_variables()`` are written to ``self.logger``.
    """
    began_at = time.time()
    stage_names = (
        '_setup_placeholders',
        '_embed',
        '_encode',
        '_fuse',
        '_decode',
        '_compute_loss',
        '_create_train_op',
    )
    for stage_name in stage_names:
        # Resolve each stage only when it is about to run, exactly as a
        # sequence of direct method calls would.
        getattr(self, stage_name)()
    elapsed = time.time() - began_at
    self.logger.info('Time to build graph: {} s'.format(elapsed))
    n_params = total_params(tf.trainable_variables())
    self.logger.info('There are {} parameters in the model'.format(n_params))
def __init__(self, config, batch, word_mat=None, char_mat=None, trainable=True, opt=True, demo=False, graph=None):
    """Create the model inputs, embedding tables, length masks and train op.

    Everything is built inside ``self.graph`` (the ``graph`` argument, or a
    fresh ``tf.Graph`` when it is None).

    Args:
        config: settings object; this method reads ``batch_size``,
            ``char_limit``, ``para_limit``, ``ques_limit``,
            ``test_para_limit``, ``test_ques_limit``, ``learning_rate``
            and ``grad_clip`` from it.
        batch: object whose ``get_next()`` yields the seven input tensors;
            only used when ``demo`` is false.
        word_mat: initial value of the non-trainable ``word_mat`` variable.
        char_mat: initial value of the ``char_mat`` variable.
        trainable: when true, also build the learning-rate schedule, the
            Adam optimizer and the gradient-clipped ``train_op``.
        opt: when true, slice every input down to the longest sequence in
            the batch; otherwise use the configured limits.
        demo: when true, inputs are placeholders and slicing uses a batch
            size of 1.
        graph: optional existing ``tf.Graph`` to build into.
    """
    self.config = config
    self.demo = demo
    if graph is None:
        graph = tf.Graph()
    self.graph = graph

    with self.graph.as_default():
        self.global_step = tf.get_variable(
            'global_step',
            shape=[],
            dtype=tf.int32,
            initializer=tf.constant_initializer(0),
            trainable=False,
        )
        self.dropout = tf.placeholder_with_default(0.0, (), name="dropout")

        if not self.demo:
            inputs = batch.get_next()
            (self.c, self.q, self.ch, self.qh,
             self.y1, self.y2, self.qa_id) = inputs
        else:
            self.c = tf.placeholder(
                tf.int32, [None, config.test_para_limit], "context")
            self.q = tf.placeholder(
                tf.int32, [None, config.test_ques_limit], "question")
            self.ch = tf.placeholder(
                tf.int32,
                [None, config.test_para_limit, config.char_limit],
                "context_char")
            self.qh = tf.placeholder(
                tf.int32,
                [None, config.test_ques_limit, config.char_limit],
                "question_char")
            self.y1 = tf.placeholder(
                tf.int32, [None, config.test_para_limit], "answer_index1")
            self.y2 = tf.placeholder(
                tf.int32, [None, config.test_para_limit], "answer_index2")

        # Embedding tables: word vectors stay fixed, char vectors are
        # created with get_variable's default trainability.
        word_init = tf.constant(word_mat, dtype=tf.float32)
        self.word_mat = tf.get_variable(
            "word_mat", initializer=word_init, trainable=False)
        char_init = tf.constant(char_mat, dtype=tf.float32)
        self.char_mat = tf.get_variable("char_mat", initializer=char_init)

        # Non-zero ids mark real tokens; summing the mask gives lengths.
        self.c_mask = tf.cast(self.c, tf.bool)
        self.q_mask = tf.cast(self.q, tf.bool)
        c_mask_int = tf.cast(self.c_mask, tf.int32)
        self.c_len = tf.reduce_sum(c_mask_int, axis=1)
        q_mask_int = tf.cast(self.q_mask, tf.int32)
        self.q_len = tf.reduce_sum(q_mask_int, axis=1)

        if opt:
            rows = 1 if self.demo else config.batch_size
            chars = config.char_limit
            self.c_maxlen = tf.reduce_max(self.c_len)
            self.q_maxlen = tf.reduce_max(self.q_len)

            origin_2d = [0, 0]
            origin_3d = [0, 0, 0]
            self.c = tf.slice(self.c, origin_2d, [rows, self.c_maxlen])
            self.q = tf.slice(self.q, origin_2d, [rows, self.q_maxlen])
            self.c_mask = tf.slice(
                self.c_mask, origin_2d, [rows, self.c_maxlen])
            self.q_mask = tf.slice(
                self.q_mask, origin_2d, [rows, self.q_maxlen])
            self.ch = tf.slice(
                self.ch, origin_3d, [rows, self.c_maxlen, chars])
            self.qh = tf.slice(
                self.qh, origin_3d, [rows, self.q_maxlen, chars])
            self.y1 = tf.slice(self.y1, origin_2d, [rows, self.c_maxlen])
            self.y2 = tf.slice(self.y2, origin_2d, [rows, self.c_maxlen])
        else:
            self.c_maxlen = config.para_limit
            self.q_maxlen = config.ques_limit

        # Per-word character counts, flattened to one dimension.
        ch_nonzero = tf.cast(tf.cast(self.ch, tf.bool), tf.int32)
        self.ch_len = tf.reshape(tf.reduce_sum(ch_nonzero, axis=2), [-1])
        qh_nonzero = tf.cast(tf.cast(self.qh, tf.bool), tf.int32)
        self.qh_len = tf.reshape(tf.reduce_sum(qh_nonzero, axis=2), [-1])

        self.forward()
        total_params()

        if trainable:
            # Logarithmic ramp in the global step, capped by the configured
            # learning rate.
            ramp_scale = 0.001 / tf.log(999.)
            step_f = tf.cast(self.global_step, tf.float32)
            self.lr = tf.minimum(
                config.learning_rate, ramp_scale * tf.log(step_f + 1))
            self.opt = tf.train.AdamOptimizer(
                learning_rate=self.lr, beta1=0.8, beta2=0.999, epsilon=1e-7)

            grads_and_vars = self.opt.compute_gradients(self.loss)
            raw_grads, train_vars = zip(*grads_and_vars)
            clipped, _ = tf.clip_by_global_norm(raw_grads, config.grad_clip)
            self.train_op = self.opt.apply_gradients(
                zip(clipped, train_vars), global_step=self.global_step)
def __init__(self, config, word_mat=None, char_mat_trainable=None, char_mat_fix=None, test=False):
    """Read hyper-parameters, declare all inputs, then build and compile.

    Args:
        config: mapping indexed by string keys (hyper-parameters and the
            ELMo / CoVe resource paths).
        word_mat: initial value of the non-trainable ``word_mat`` variable.
        char_mat_trainable: first part of the character matrix.
        char_mat_fix: second part of the character matrix; the two parts
            are concatenated along axis 0 into one trainable variable.
        test: when true, the context / question limits are fixed at
            1000 / 50 and the corresponding config entries are not read.
    """
    # hyper-parameter
    self.char_dim = config['char_dim']
    if test:
        self.cont_limit = 1000
        self.ques_limit = 50
    else:
        self.cont_limit = config['cont_limit']
        self.ques_limit = config['ques_limit']
    copied_keys = (
        'char_limit', 'ans_limit', 'filters', 'char_filters', 'batch_size',
        'l2_norm', 'decay', 'learning_rate', 'grad_clip', 'init_lambda',
        'gamma_b', 'gamma_c', 'use_elmo', 'use_cove', 'use_feat',
        'use_rlloss',
    )
    for key in copied_keys:
        setattr(self, key, config[key])

    # Scalar knobs that default to "off" unless fed at run time.
    for knob in ('dropout', 'dropout_rnn', 'dropout_emb', 'dropout_att'):
        setattr(self, knob, tf.placeholder_with_default(0.0, (), name=knob))
    self.un_size = tf.placeholder_with_default(
        self.batch_size, (), name="un_size")
    self.rlw = tf.placeholder_with_default(0.0, (), name="rlloss_weights")

    # embedding layer
    word_init = tf.constant(word_mat, dtype=tf.float32)
    self.word_mat = tf.get_variable(
        "word_mat", initializer=word_init, trainable=False)
    with tf.variable_scope("Input_Embedding_Mat"):
        stacked_chars = np.concatenate(
            [char_mat_trainable, char_mat_fix], axis=0)
        self.char_mat = tf.get_variable(
            "char_mat", initializer=stacked_chars, trainable=True)

    # input tensor
    word_shape = [None, None]
    char_shape = [None, None, self.char_limit]
    elmo_id_shape = [None, None, 50]
    self.contw_input = tf.placeholder(tf.int32, word_shape, "context_word")
    self.quesw_input = tf.placeholder(tf.int32, word_shape, "question_word")
    self.contc_input = tf.placeholder(tf.int32, char_shape, "context_char")
    self.quesc_input = tf.placeholder(tf.int32, char_shape, "question_char")
    self.y_start = tf.placeholder(
        tf.int32, word_shape, "answer_start_index")
    self.y_end = tf.placeholder(tf.int32, word_shape, "answer_end_index")
    self.contw_elmo_id = tf.placeholder(
        tf.int32, elmo_id_shape, 'contw_elmo_id')
    self.quesw_elmo_id = tf.placeholder(
        tf.int32, elmo_id_shape, 'quesw_elmo_id')
    if self.use_feat:
        feat_shape = [None, None, 73]
        self.cont_feat = tf.placeholder(tf.float32, feat_shape, "cont_feat")
        self.ques_feat = tf.placeholder(tf.float32, feat_shape, "ques_feat")

    # Feeding old_char_mat and running assign_char_mat overwrites char_mat.
    self.old_char_mat = tf.placeholder(
        tf.float32, [None, None], "old_char_mat")
    self.assign_char_mat = tf.assign(self.char_mat, self.old_char_mat)

    # get mask & length for words & chars
    self.c_mask = tf.cast(self.contw_input, tf.bool)
    self.q_mask = tf.cast(self.quesw_input, tf.bool)
    c_mask_int = tf.cast(self.c_mask, tf.int32)
    self.cont_len = tf.reduce_sum(c_mask_int, axis=1)
    q_mask_int = tf.cast(self.q_mask, tf.int32)
    self.ques_len = tf.reduce_sum(q_mask_int, axis=1)

    # longest sequence in each batch
    self.c_maxlen = tf.reduce_max(self.cont_len)
    self.q_maxlen = tf.reduce_max(self.ques_len)

    # elmo features: mode 2 computes them in-graph, mode 1 expects them fed
    if self.use_elmo == 2:
        options_file = config['elmo_options_path']
        weight_file = config['elmo_weights_path']
        bilm = BidirectionalLanguageModel(options_file, weight_file)
        cont_layers = all_layers(bilm(self.contw_elmo_id))  # [bs, 3, len, 1024]
        self.elmo_cont = cont_layers
        self.elmo_cont = tf.transpose(
            self.elmo_cont, [0, 2, 1, 3])  # [bs, len, 3, 1024]
        ques_layers = all_layers(bilm(self.quesw_elmo_id))
        self.elmo_ques = ques_layers
        self.elmo_ques = tf.transpose(self.elmo_ques, [0, 2, 1, 3])
    elif self.use_elmo == 1:
        elmo_shape = [None, None, 3, 1024]
        self.elmo_cont = tf.placeholder(tf.float32, elmo_shape, 'elmo_cont')
        self.elmo_ques = tf.placeholder(tf.float32, elmo_shape, 'elmo_ques')

    # cove features: mode 2 loads a model, mode 1 expects them fed
    if self.use_cove == 2:
        with tf.variable_scope('Cove_Layer'):
            self.cove_model = load_model(config['cove_path'])
    elif self.use_cove == 1:
        cove_shape = [None, None, 2, 600]
        self.cove_cont = tf.placeholder(tf.float32, cove_shape, 'cove_cont')
        self.cove_ques = tf.placeholder(tf.float32, cove_shape, 'cove_ques')

    # lr schedule: the rate is a feedable placeholder used as-is
    self.global_step = tf.get_variable(
        'global_step',
        shape=[],
        dtype=tf.int32,
        initializer=tf.constant_initializer(0),
        trainable=False,
    )
    self.learning_rate = tf.placeholder_with_default(
        config['learning_rate'], (), name="learning_rate")
    self.lr = self.learning_rate

    # build the model, report parameters, compile
    self.build_model()
    total_params()
    self.complie()
def __init__(self, config, batch, word_mat=None, char_mat=None, trainable=True, opt=True, demo=False, graph=None):
    """Set up inputs, embeddings, masks, the forward pass and training ops.

    All ops are created under ``self.graph`` (``graph`` if given, else a
    new ``tf.Graph``).

    Args:
        config: settings object supplying the length limits, batch size,
            learning rate and gradient-clipping norm read below.
        batch: source of input tensors via ``get_next()`` (non-demo only).
        word_mat: initial value of the non-trainable ``word_mat`` variable.
        char_mat: initial value of the ``char_mat`` variable.
        trainable: build the optimizer and ``train_op`` when true.
        opt: trim inputs to the longest sequence of the batch when true.
        demo: use placeholders and a slicing batch size of 1 when true.
        graph: optional ``tf.Graph`` to build into.
    """

    def char_counts(char_ids):
        # Number of non-zero character ids per word, flattened.
        present = tf.cast(tf.cast(char_ids, tf.bool), tf.int32)
        return tf.reshape(tf.reduce_sum(present, axis=2), [-1])

    self.config = config
    self.demo = demo
    self.graph = tf.Graph() if graph is None else graph

    with self.graph.as_default():
        self.global_step = tf.get_variable(
            'global_step', shape=[], dtype=tf.int32,
            initializer=tf.constant_initializer(0), trainable=False)
        self.dropout = tf.placeholder_with_default(0.0, (), name="dropout")

        if self.demo:
            self.c = tf.placeholder(
                tf.int32, [None, config.test_para_limit], "context")
            self.q = tf.placeholder(
                tf.int32, [None, config.test_ques_limit], "question")
            self.ch = tf.placeholder(
                tf.int32,
                [None, config.test_para_limit, config.char_limit],
                "context_char")
            self.qh = tf.placeholder(
                tf.int32,
                [None, config.test_ques_limit, config.char_limit],
                "question_char")
            self.y1 = tf.placeholder(
                tf.int32, [None, config.test_para_limit], "answer_index1")
            self.y2 = tf.placeholder(
                tf.int32, [None, config.test_para_limit], "answer_index2")
        else:
            (self.c, self.q, self.ch, self.qh,
             self.y1, self.y2, self.qa_id) = batch.get_next()

        self.word_mat = tf.get_variable(
            "word_mat",
            initializer=tf.constant(word_mat, dtype=tf.float32),
            trainable=False)
        self.char_mat = tf.get_variable(
            "char_mat",
            initializer=tf.constant(char_mat, dtype=tf.float32))

        # Masks are true wherever the word id is non-zero.
        self.c_mask = tf.cast(self.c, tf.bool)
        self.q_mask = tf.cast(self.q, tf.bool)
        self.c_len = tf.reduce_sum(tf.cast(self.c_mask, tf.int32), axis=1)
        self.q_len = tf.reduce_sum(tf.cast(self.q_mask, tf.int32), axis=1)

        if opt:
            n_rows = 1 if self.demo else config.batch_size
            n_chars = config.char_limit
            self.c_maxlen = tf.reduce_max(self.c_len)
            self.q_maxlen = tf.reduce_max(self.q_len)

            # Word-level tensors: keep the first *_maxlen columns.
            for attr, width in (("c", self.c_maxlen),
                                ("q", self.q_maxlen),
                                ("c_mask", self.c_maxlen),
                                ("q_mask", self.q_maxlen)):
                trimmed = tf.slice(
                    getattr(self, attr), [0, 0], [n_rows, width])
                setattr(self, attr, trimmed)

            # Char-level tensors: same trim on the word axis.
            for attr, width in (("ch", self.c_maxlen),
                                ("qh", self.q_maxlen)):
                trimmed = tf.slice(
                    getattr(self, attr), [0, 0, 0], [n_rows, width, n_chars])
                setattr(self, attr, trimmed)

            # Answer indices follow the context length.
            for attr in ("y1", "y2"):
                trimmed = tf.slice(
                    getattr(self, attr), [0, 0], [n_rows, self.c_maxlen])
                setattr(self, attr, trimmed)
        else:
            self.c_maxlen = config.para_limit
            self.q_maxlen = config.ques_limit

        self.ch_len = char_counts(self.ch)
        self.qh_len = char_counts(self.qh)

        self.forward()
        total_params()

        if trainable:
            # Learning rate grows with log(step + 1) and is capped by the
            # configured value.
            log_ramp = 0.001 / tf.log(999.)
            float_step = tf.cast(self.global_step, tf.float32)
            self.lr = tf.minimum(
                config.learning_rate, log_ramp * tf.log(float_step + 1))
            self.opt = tf.train.AdamOptimizer(
                learning_rate=self.lr, beta1=0.8, beta2=0.999, epsilon=1e-7)

            pairs = self.opt.compute_gradients(self.loss)
            grad_list, var_list = zip(*pairs)
            clipped_grads, _ = tf.clip_by_global_norm(
                grad_list, config.grad_clip)
            self.train_op = self.opt.apply_gradients(
                zip(clipped_grads, var_list), global_step=self.global_step)
def __init__(self, config, batch, word_mat=None, char_mat=None, trainable=True, opt=True, demo=False, graph=None):
    """Build inputs, masks, the forward pass and (optionally) the train op.

    The model takes a context ``c``, a question ``q`` and a list of
    candidates ``x``, each as word ids, char ids and ELMo char ids. All
    ops are created inside ``self.graph``.

    Args:
        config: settings object; reads ``batch_size``, ``char_limit``,
            ``para_limit``, ``ques_limit``, ``cand_limit``,
            ``test_para_limit``, ``test_ques_limit``, ``max_margin``,
            ``cand_condense_vector``, ``learning_rate`` and ``grad_clip``.
        batch: dataset iterator; ``get_next()`` is called once and must
            yield fourteen tensors in the order unpacked below
            (non-demo only).
        word_mat: initial value of the non-trainable ``word_mat`` variable.
        char_mat: initial value of the ``char_mat`` variable.
        trainable: build the optimizer and ``train_op`` when true.
        opt: slice all inputs to the batch's longest sequences when true.
        demo: use placeholders and a slicing batch size of 1 when true.
        graph: optional ``tf.Graph`` to build into; a new one is created
            when None.
    """
    self.config = config
    self.demo = demo
    self.debug_ops = []
    self.graph = graph if graph is not None else tf.Graph()

    with self.graph.as_default():
        self.global_step = tf.get_variable(
            'global_step', shape=[], dtype=tf.int32,
            initializer=tf.constant_initializer(0), trainable=False)
        self.dropout = tf.placeholder_with_default(0.0, (), name="dropout")

        if self.demo:
            # NOTE(review): this branch never defines self.yx or self.xp,
            # yet both are read further down (slicing and debug_ops), so
            # demo mode cannot currently finish building - needs a
            # decision on the placeholders to add.
            self.c_elmo = tf.placeholder(
                tf.int32, [None, config.test_para_limit + 2, 50],
                "context_elmo_idxs")
            self.q_elmo = tf.placeholder(
                tf.int32, [None, config.test_ques_limit + 2, 50],
                "question_elmo_idxs")
            # Same layout as c_elmo / q_elmo: two extra positions on the
            # token axis and 50 ids per token. This matches the
            # [N, x_maxlen + 2, 50] slice taken below (the "+ 2" used to
            # sit on the last axis).
            self.x_elmo = tf.placeholder(
                tf.int32, [None, config.cand_limit + 2, 50],
                "candidates_elmo_idxs")
            self.c = tf.placeholder(
                tf.int32, [None, config.test_para_limit], "context")
            self.q = tf.placeholder(
                tf.int32, [None, config.test_ques_limit], "question")
            self.x = tf.placeholder(
                tf.int32, [None, config.cand_limit], "candidates")
            self.ch = tf.placeholder(
                tf.int32,
                [None, config.test_para_limit, config.char_limit],
                "context_char")
            self.qh = tf.placeholder(
                tf.int32,
                [None, config.test_ques_limit, config.char_limit],
                "question_char")
            self.xh = tf.placeholder(
                tf.int32,
                [None, config.cand_limit, config.char_limit],
                "candidate_char")
            self.y1 = tf.placeholder(
                tf.int32, [None, config.test_para_limit], "answer_index1")
            self.y2 = tf.placeholder(
                tf.int32, [None, config.test_para_limit], "answer_index2")
        else:
            # batch is the dataset iterator. get_next() returns a nested
            # structure of tensors for the next element; in graph mode it
            # is called once and its result is wired into the rest of the
            # computation.
            (self.c_elmo, self.q_elmo, self.x_elmo,
             self.c, self.q, self.x,
             self.ch, self.qh, self.xh,
             self.yx, self.xp, self.y1, self.y2,
             self.qa_id) = batch.get_next()
            if self.config.max_margin:
                self.yx_inv = 1 - self.yx

        # TODO (carried over from the original, no description given)
        self.word_mat = tf.get_variable(
            "word_mat",
            initializer=tf.constant(word_mat, dtype=tf.float32),
            trainable=False)
        self.char_mat = tf.get_variable(
            "char_mat",
            initializer=tf.constant(char_mat, dtype=tf.float32))

        # Inputs are zero-padded to the maximum length, so a non-zero id
        # marks a real token and the mask sum is the true length.
        self.c_mask = tf.cast(self.c, tf.bool)
        self.q_mask = tf.cast(self.q, tf.bool)
        self.x_mask = tf.cast(self.x, tf.bool)
        self.c_len = tf.reduce_sum(tf.cast(self.c_mask, tf.int32), axis=1)
        self.q_len = tf.reduce_sum(tf.cast(self.q_mask, tf.int32), axis=1)
        self.x_len = tf.reduce_sum(tf.cast(self.x_mask, tf.int32), axis=1)

        # Memory optimization: tf.slice(input_, begin, size) keeps size[i]
        # elements of dimension i starting at begin[i]; here every tensor
        # is cut back to the longest sequence present in the batch.
        if opt:
            N = config.batch_size if not self.demo else 1
            CL = config.char_limit
            self.c_maxlen = tf.reduce_max(self.c_len)
            self.q_maxlen = tf.reduce_max(self.q_len)
            self.x_maxlen = tf.reduce_max(self.x_len)

            self.c_elmo = tf.slice(
                self.c_elmo, [0, 0, 0], [N, self.c_maxlen + 2, 50])
            self.q_elmo = tf.slice(
                self.q_elmo, [0, 0, 0], [N, self.q_maxlen + 2, 50])
            self.x_elmo = tf.slice(
                self.x_elmo, [0, 0, 0], [N, self.x_maxlen + 2, 50])

            self.c = tf.slice(self.c, [0, 0], [N, self.c_maxlen])
            self.q = tf.slice(self.q, [0, 0], [N, self.q_maxlen])
            self.x = tf.slice(self.x, [0, 0], [N, self.x_maxlen])

            self.c_mask = tf.slice(self.c_mask, [0, 0], [N, self.c_maxlen])
            self.q_mask = tf.slice(self.q_mask, [0, 0], [N, self.q_maxlen])
            self.x_mask = tf.slice(self.x_mask, [0, 0], [N, self.x_maxlen])

            self.ch = tf.slice(self.ch, [0, 0, 0], [N, self.c_maxlen, CL])
            self.qh = tf.slice(self.qh, [0, 0, 0], [N, self.q_maxlen, CL])
            self.xh = tf.slice(self.xh, [0, 0, 0], [N, self.x_maxlen, CL])

            self.y1 = tf.slice(self.y1, [0, 0], [N, self.c_maxlen])
            self.y2 = tf.slice(self.y2, [0, 0], [N, self.c_maxlen])
            self.yx = tf.slice(self.yx, [0, 0], [N, self.x_maxlen])

            if self.config.cand_condense_vector:
                # xp is sliced to (N, x_maxlen, c_maxlen).
                self.xp = tf.slice(
                    self.xp, [0, 0, 0], [N, self.x_maxlen, self.c_maxlen])
            if self.config.max_margin:
                self.yx_inv = tf.slice(
                    self.yx_inv, [0, 0], [N, self.x_maxlen])
        else:
            self.c_maxlen, self.q_maxlen, self.x_maxlen = (
                config.para_limit, config.ques_limit, config.cand_limit)

        # DEBUG: tensors exposed for inspection.
        self.debug_ops.extend([self.xp, self.yx, self.y1])

        # Per-word character counts, flattened to one dimension.
        self.ch_len = tf.reshape(
            tf.reduce_sum(
                tf.cast(tf.cast(self.ch, tf.bool), tf.int32), axis=2),
            [-1])
        self.qh_len = tf.reshape(
            tf.reduce_sum(
                tf.cast(tf.cast(self.qh, tf.bool), tf.int32), axis=2),
            [-1])

        self.forward()
        total_params()

        if trainable:
            # Learning rate grows with log(step + 1) and is capped by
            # config.learning_rate.
            self.lr = tf.minimum(
                config.learning_rate,
                0.001 / tf.log(999.)
                * tf.log(tf.cast(self.global_step, tf.float32) + 1))
            self.opt = tf.train.AdamOptimizer(
                learning_rate=self.lr, beta1=0.8, beta2=0.999, epsilon=1e-7)
            grads = self.opt.compute_gradients(self.loss)
            gradients, variables = zip(*grads)
            # Rescale all gradients together so their global norm does not
            # exceed config.grad_clip.
            capped_grads, _ = tf.clip_by_global_norm(
                gradients, config.grad_clip)
            self.train_op = self.opt.apply_gradients(
                zip(capped_grads, variables), global_step=self.global_step)
def __init__(self, config, batch, word_mat=None, char_mat=None, trainable=True, opt=True, demo=False, graph=None):
    """Build a three-alternative model: inputs, masks, loss and train op.

    Each example has a context ``c``, a question ``q`` and three
    alternatives (``alter`` word ids, ``alterh`` char ids) that are split
    into ``alter1..3`` / ``alter1h..3h``.

    Args:
        config: settings object; reads ``batch_size``, ``char_limit``,
            ``para_limit``, ``ques_limit``, ``test_para_limit``,
            ``test_ques_limit``, ``alternatives_limit``, ``decay``,
            ``init_lr`` and ``grad_clip``.
        batch: iterator; ``get_next()`` yields eight tensors when
            ``trainable`` (including the labels ``y1``) and seven
            otherwise.
        word_mat: initial value of the trainable ``word_mat`` variable.
        char_mat: initial value of the trainable ``char_mat`` variable.
        trainable: build the loss, optional EMA, optimizer and
            ``train_op`` when true.
        opt: trim inputs to the longest sequence of the batch when true;
            otherwise keep the full tensors and use the configured limits.
        demo: when true, slicing uses a batch size of 1.
        graph: stored on ``self.graph`` (a new ``tf.Graph`` when None).
    """
    self.config = config
    self.demo = demo
    # NOTE(review): self.graph is stored but never entered here, so the
    # ops below are created in whatever graph is current - confirm that
    # is intended.
    self.graph = graph if graph is not None else tf.Graph()
    self.trainable = trainable

    # Use plain truthiness so this branch agrees with the `if trainable:`
    # block below that consumes self.y1.
    if trainable:
        (self.c, self.q, self.ch, self.qh, self.alter, self.alterh,
         self.y1, self.qa_id) = batch.get_next()
    else:
        (self.c, self.q, self.ch, self.qh, self.alter, self.alterh,
         self.qa_id) = batch.get_next()

    self.global_step = tf.get_variable(
        'global_step', shape=[], dtype=tf.int32,
        initializer=tf.constant_initializer(0), trainable=False)
    self.dropout = tf.placeholder_with_default(0.5, (), name="dropout")

    self.word_mat = tf.get_variable(
        "word_mat",
        initializer=tf.constant(word_mat, dtype=tf.float32),
        trainable=True)
    self.char_mat = tf.get_variable(
        "char_mat",
        initializer=tf.constant(char_mat, dtype=tf.float32),
        trainable=True)

    # Inputs are zero-padded to a common length, so non-zero ids mark
    # real tokens.
    self.c_mask = tf.cast(self.c, tf.bool)
    self.q_mask = tf.cast(self.q, tf.bool)
    self.alter1_mask = tf.cast(self.alter[:, 0, :], tf.bool)
    self.alter2_mask = tf.cast(self.alter[:, 1, :], tf.bool)
    self.alter3_mask = tf.cast(self.alter[:, 2, :], tf.bool)

    # Actual (unpadded) length of each sequence.
    self.c_len = tf.reduce_sum(tf.cast(self.c_mask, tf.int32), axis=1)
    self.q_len = tf.reduce_sum(tf.cast(self.q_mask, tf.int32), axis=1)
    self.alter1_len = tf.reduce_sum(
        tf.cast(self.alter1_mask, tf.int32), axis=1)
    self.alter2_len = tf.reduce_sum(
        tf.cast(self.alter2_mask, tf.int32), axis=1)
    self.alter3_len = tf.reduce_sum(
        tf.cast(self.alter3_mask, tf.int32), axis=1)

    if opt:
        # Trim the padded inputs back to the longest sequence in the batch.
        N, CL = config.batch_size if not self.demo else 1, config.char_limit
        self.c_maxlen = tf.reduce_max(self.c_len)
        self.q_maxlen = tf.reduce_max(self.q_len)
        # The "aletr" spelling is kept: these are attribute names that
        # other code may read.
        self.aletr1_maxlen = tf.reduce_max(self.alter1_len)
        self.aletr2_maxlen = tf.reduce_max(self.alter2_len)
        self.aletr3_maxlen = tf.reduce_max(self.alter3_len)

        self.c = tf.slice(self.c, [0, 0], [N, self.c_maxlen])
        self.q = tf.slice(self.q, [0, 0], [N, self.q_maxlen])
        self.alter1 = tf.slice(
            self.alter[:, 0, :], [0, 0], [N, self.aletr1_maxlen])
        self.alter2 = tf.slice(
            self.alter[:, 1, :], [0, 0], [N, self.aletr2_maxlen])
        self.alter3 = tf.slice(
            self.alter[:, 2, :], [0, 0], [N, self.aletr3_maxlen])

        self.c_mask = tf.slice(self.c_mask, [0, 0], [N, self.c_maxlen])
        self.q_mask = tf.slice(self.q_mask, [0, 0], [N, self.q_maxlen])
        self.alter1_mask = tf.slice(
            self.alter1_mask, [0, 0], [N, self.aletr1_maxlen])
        self.alter2_mask = tf.slice(
            self.alter2_mask, [0, 0], [N, self.aletr2_maxlen])
        self.alter3_mask = tf.slice(
            self.alter3_mask, [0, 0], [N, self.aletr3_maxlen])

        self.ch = tf.slice(self.ch, [0, 0, 0], [N, self.c_maxlen, CL])
        self.qh = tf.slice(self.qh, [0, 0, 0], [N, self.q_maxlen, CL])
        self.alter1h = tf.slice(
            self.alterh[:, 0, :, :], [0, 0, 0],
            [N, self.aletr1_maxlen, CL])
        self.alter2h = tf.slice(
            self.alterh[:, 1, :, :], [0, 0, 0],
            [N, self.aletr2_maxlen, CL])
        self.alter3h = tf.slice(
            self.alterh[:, 2, :, :], [0, 0, 0],
            [N, self.aletr3_maxlen, CL])
    else:
        if trainable:
            self.c_maxlen, self.q_maxlen, self.alter_maxlen = (
                config.para_limit, config.ques_limit,
                config.alternatives_limit)
        else:
            self.c_maxlen, self.q_maxlen, self.alter_maxlen = (
                config.test_para_limit, config.test_ques_limit,
                config.alternatives_limit)
        # Without trimming, the per-alternative tensors are the untouched
        # slices of the inputs. They must still exist because the length
        # computations below read alter1h..alter3h.
        self.aletr1_maxlen = self.alter_maxlen
        self.aletr2_maxlen = self.alter_maxlen
        self.aletr3_maxlen = self.alter_maxlen
        self.alter1 = self.alter[:, 0, :]
        self.alter2 = self.alter[:, 1, :]
        self.alter3 = self.alter[:, 2, :]
        self.alter1h = self.alterh[:, 0, :, :]
        self.alter2h = self.alterh[:, 1, :, :]
        self.alter3h = self.alterh[:, 2, :, :]

    # Per-word character counts, flattened to one dimension.
    self.ch_len = tf.reshape(
        tf.reduce_sum(tf.cast(tf.cast(self.ch, tf.bool), tf.int32), axis=2),
        [-1])
    self.qh_len = tf.reshape(
        tf.reduce_sum(tf.cast(tf.cast(self.qh, tf.bool), tf.int32), axis=2),
        [-1])
    self.alterh1_len = tf.reshape(
        tf.reduce_sum(
            tf.cast(tf.cast(self.alter1h, tf.bool), tf.int32), axis=2),
        [-1])
    self.alterh2_len = tf.reshape(
        tf.reduce_sum(
            tf.cast(tf.cast(self.alter2h, tf.bool), tf.int32), axis=2),
        [-1])
    self.alterh3_len = tf.reshape(
        tf.reduce_sum(
            tf.cast(tf.cast(self.alter3h, tf.bool), tf.int32), axis=2),
        [-1])

    self.forward()
    total_params()

    if trainable:
        losses = tf.nn.softmax_cross_entropy_with_logits(
            logits=self.logits1, labels=self.y1)
        self.loss = tf.reduce_mean(losses)

        if config.decay is not None:
            self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
            ema_op = self.var_ema.apply(tf.trainable_variables())
            with tf.control_dependencies([ema_op]):
                # Evaluating the loss now also triggers the EMA update.
                self.loss = tf.identity(self.loss)

                # Ops that copy each averaged value back onto its variable.
                self.assign_vars = []
                for var in tf.global_variables():
                    v = self.var_ema.average(var)
                    # average() returns None for untracked variables;
                    # compare against None rather than relying on the
                    # truth value of a variable object.
                    if v is not None:
                        self.assign_vars.append(tf.assign(var, v))

        # Learning rate grows with log(step + 1), capped by config.init_lr.
        self.lr = tf.minimum(
            config.init_lr,
            0.001 / tf.log(999.)
            * tf.log(tf.cast(self.global_step, tf.float32) + 1))
        self.opt = tf.train.AdamOptimizer(
            learning_rate=self.lr, beta1=0.8, beta2=0.999, epsilon=1e-7)
        grads = self.opt.compute_gradients(self.loss)
        gradients, variables = zip(*grads)
        capped_grads, _ = tf.clip_by_global_norm(gradients, config.grad_clip)
        self.train_op = self.opt.apply_gradients(
            zip(capped_grads, variables), global_step=self.global_step)

    self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=3)
def __init__(self, config, batch, word_mat=None, char_mat=None, filter_sizes=None, embedding_size=None, num_filters=None, trainable=True, opt=True, demo=False, graph=None):
    """Prepare inputs, embeddings and masks, then run ``forward``.

    Args:
        config: settings object; reads ``batch_size``, ``char_limit``,
            ``para_limit``, ``ques_limit``, ``test_para_limit`` and
            ``test_ques_limit``.
        batch: iterator; ``get_next()`` yields seven tensors when
            ``trainable == True`` and five otherwise.
        word_mat: initial value of the trainable ``word_mat`` variable.
        char_mat: initial value of the trainable ``char_mat`` variable.
        filter_sizes: accepted but not read in this method.
        embedding_size: accepted but not read in this method.
        num_filters: accepted but not read in this method.
        trainable: selects the input layout and is passed to ``forward``.
        opt: trim inputs to the longest sequence of the batch when true.
        demo: when true, slicing uses a batch size of 1.
        graph: stored on ``self.graph`` (a new ``tf.Graph`` when None).
    """
    self.config = config
    self.demo = demo
    if graph is None:
        graph = tf.Graph()
    self.graph = graph
    self.trainable = trainable
    self.l2_loss = tf.constant(0.0)
    self.l2_reg_lambda = 0.7

    fields = batch.get_next()
    if trainable == True:
        (self.c, self.q, self.ch, self.qh, self.input_y, self.qa_id,
         self.alternatives_tokens) = fields
    else:
        (self.c, self.q, self.ch, self.qh,
         self.alternatives_tokens) = fields

    self.global_step = tf.get_variable(
        'global_step',
        shape=[],
        dtype=tf.int32,
        initializer=tf.constant_initializer(0),
        trainable=False,
    )
    self.dropout = tf.placeholder_with_default(0.5, (), name="dropout")

    word_init = tf.constant(word_mat, dtype=tf.float32)
    self.word_mat = tf.get_variable(
        "word_mat", initializer=word_init, trainable=True)
    char_init = tf.constant(char_mat, dtype=tf.float32)
    self.char_mat = tf.get_variable(
        "char_mat", initializer=char_init, trainable=True)

    # Inputs are zero-padded to a common length; non-zero ids are tokens.
    self.c_mask = tf.cast(self.c, tf.bool)
    self.q_mask = tf.cast(self.q, tf.bool)
    # Actual (unpadded) length of each sequence.
    c_mask_int = tf.cast(self.c_mask, tf.int32)
    self.c_len = tf.reduce_sum(c_mask_int, axis=1)
    q_mask_int = tf.cast(self.q_mask, tf.int32)
    self.q_len = tf.reduce_sum(q_mask_int, axis=1)

    if opt:
        # Trim the padded inputs back to the longest sequence in the batch.
        rows = 1 if self.demo else config.batch_size
        chars = config.char_limit
        self.c_maxlen = tf.reduce_max(self.c_len)
        self.q_maxlen = tf.reduce_max(self.q_len)

        c_size = [rows, self.c_maxlen]
        q_size = [rows, self.q_maxlen]
        self.c = tf.slice(self.c, [0, 0], c_size)
        self.q = tf.slice(self.q, [0, 0], q_size)
        self.c_mask = tf.slice(self.c_mask, [0, 0], c_size)
        self.q_mask = tf.slice(self.q_mask, [0, 0], q_size)
        self.ch = tf.slice(
            self.ch, [0, 0, 0], [rows, self.c_maxlen, chars])
        self.qh = tf.slice(
            self.qh, [0, 0, 0], [rows, self.q_maxlen, chars])
    elif trainable:
        self.c_maxlen = config.para_limit
        self.q_maxlen = config.ques_limit
    else:
        self.c_maxlen = config.test_para_limit
        self.q_maxlen = config.test_ques_limit

    # Per-word character counts, flattened to one dimension.
    ch_present = tf.cast(tf.cast(self.ch, tf.bool), tf.int32)
    self.ch_len = tf.reshape(tf.reduce_sum(ch_present, axis=2), [-1])
    qh_present = tf.cast(tf.cast(self.qh, tf.bool), tf.int32)
    self.qh_len = tf.reshape(tf.reduce_sum(qh_present, axis=2), [-1])

    self.forward(trainable)
    total_params()
def __init__(self, config, word_mat=None, char_mat=None, test=False, use_elmo=False, use_cove=False):
    """Read hyper-parameters, declare inputs, trim them, build and compile.

    Args:
        config: mapping indexed by string keys holding hyper-parameters.
        word_mat: initial value of the non-trainable ``word_mat`` variable.
        char_mat: initial value of the trainable ``char_mat`` variable.
        test: when true, the context / question limits are fixed at
            1000 / 50 and the corresponding config entries are not read.
        use_elmo: when truthy, add ELMo embeddings from the TF-Hub module
            for the context and question token strings.
        use_cove: stored on ``self.use_cove``; not otherwise used here.
    """
    # hyper-parameter
    self.char_dim = config['char_dim']
    if test:
        self.cont_limit = 1000
        self.ques_limit = 50
    else:
        self.cont_limit = config['cont_limit']
        self.ques_limit = config['ques_limit']
    for key in ('char_limit', 'ans_limit', 'filters', 'num_heads',
                'batch_size', 'l2_norm', 'decay', 'learning_rate',
                'grad_clip'):
        setattr(self, key, config[key])
    self.use_elmo = use_elmo
    self.use_cove = use_cove
    self.dropout = tf.placeholder_with_default(0.0, (), name="dropout")

    # embedding layer
    word_init = tf.constant(word_mat, dtype=tf.float32)
    self.word_mat = tf.get_variable(
        "word_mat", initializer=word_init, trainable=False)
    char_init = tf.constant(char_mat, dtype=tf.float32)
    self.char_mat = tf.get_variable(
        "char_mat", initializer=char_init, trainable=True)

    # input tensor (full-width; trimmed copies are made further down)
    cont_shape = [None, self.cont_limit]
    ques_shape = [None, self.ques_limit]
    self.contw_input_ = tf.placeholder(tf.int32, cont_shape, "context_word")
    self.quesw_input_ = tf.placeholder(tf.int32, ques_shape, "question_word")
    self.contc_input_ = tf.placeholder(
        tf.int32, [None, self.cont_limit, self.char_limit], "context_char")
    self.quesc_input_ = tf.placeholder(
        tf.int32, [None, self.ques_limit, self.char_limit], "question_char")
    answer_shape = [None, self.cont_limit + 1]
    self.y_start_ = tf.placeholder(
        tf.int32, answer_shape, "answer_start_index")
    self.y_end_ = tf.placeholder(tf.int32, answer_shape, "answer_end_index")
    self.contw_strings = tf.placeholder(
        tf.string, cont_shape, 'contw_strings')
    self.quesw_strings = tf.placeholder(
        tf.string, ques_shape, 'quesw_strings')

    # Non-zero word ids mark real tokens; mask sums give the lengths.
    self.c_mask = tf.cast(self.contw_input_, tf.bool)
    self.q_mask = tf.cast(self.quesw_input_, tf.bool)
    c_mask_int = tf.cast(self.c_mask, tf.int32)
    self.cont_len = tf.reduce_sum(c_mask_int, axis=1)
    q_mask_int = tf.cast(self.q_mask, tf.int32)
    self.ques_len = tf.reduce_sum(q_mask_int, axis=1)

    if self.use_elmo:
        elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)
        cont_feed = {
            "tokens": self.contw_strings,
            "sequence_len": self.cont_len,
        }
        cont_out = elmo(inputs=cont_feed, signature="tokens", as_dict=True)
        self.cont_elmo = cont_out["elmo"]
        ques_feed = {
            "tokens": self.quesw_strings,
            "sequence_len": self.ques_len,
        }
        ques_out = elmo(inputs=ques_feed, signature="tokens", as_dict=True)
        self.ques_elmo = ques_out["elmo"]

    # slice for maxlen in each batch (-1 keeps the whole batch axis)
    self.c_maxlen = tf.reduce_max(self.cont_len)
    self.q_maxlen = tf.reduce_max(self.ques_len)
    start_2d = [0, 0]
    start_3d = [0, 0, 0]
    self.contw_input = tf.slice(
        self.contw_input_, start_2d, [-1, self.c_maxlen])
    self.quesw_input = tf.slice(
        self.quesw_input_, start_2d, [-1, self.q_maxlen])
    self.c_mask = tf.slice(self.c_mask, start_2d, [-1, self.c_maxlen])
    self.q_mask = tf.slice(self.q_mask, start_2d, [-1, self.q_maxlen])
    self.contc_input = tf.slice(
        self.contc_input_, start_3d, [-1, self.c_maxlen, self.char_limit])
    self.quesc_input = tf.slice(
        self.quesc_input_, start_3d, [-1, self.q_maxlen, self.char_limit])
    self.y_start = tf.slice(
        self.y_start_, start_2d, [-1, self.c_maxlen + 1])
    self.y_end = tf.slice(self.y_end_, start_2d, [-1, self.c_maxlen + 1])
    if self.use_elmo:
        self.cont_elmo = tf.slice(
            self.cont_elmo, start_3d, [-1, self.c_maxlen, 1024])
        self.ques_elmo = tf.slice(
            self.ques_elmo, start_3d, [-1, self.q_maxlen, 1024])

    # build the model, report parameters, compile
    self.build_model()
    total_params()
    self.complie()
def __init__(self, config, iterator, emb_mat, trainable=True, opt=True, demo=False):
    """Build inputs, the model, and (optionally) the regularized train op.

    Args:
        config: settings object; reads ``test_x_limit``, ``test_q_limit``,
            ``x_limit``, ``q_limit``, ``cudnn``, ``l2_norm``,
            ``grad_clip_flag``, ``grad_clip`` and ``decay``.
        iterator: ``get_next()`` yields ``(c, q, y)`` (non-demo only).
        emb_mat: stored on ``self.emb_mat`` for later use.
        trainable: when true, add L2 regularization (if configured), the
            Adadelta optimizer, optional gradient clipping and optional
            exponential moving averages.
        opt: trim ``c`` / ``q`` and their masks to the longest sequence of
            the batch when true; otherwise use the configured limits.
        demo: use placeholders (including ``batch_size``) instead of the
            iterator when true.
    """
    self.config = config
    self.emb_mat = emb_mat
    self.demo = demo
    self.global_step = tf.get_variable(
        'global_step', shape=[], dtype=tf.int32,
        initializer=tf.constant_initializer(0), trainable=False)
    self.dropout = tf.placeholder_with_default(0.0, (), name="dropout")

    if self.demo:
        self.c = tf.placeholder(
            tf.int32, [None, config.test_x_limit], "context")
        self.q = tf.placeholder(
            tf.int32, [None, config.test_q_limit], "query")
        self.y = tf.placeholder(tf.float32, [None], "y")
        self.batch_size = tf.placeholder(tf.int32, None, "batch_size")
    else:
        self.c, self.q, self.y = iterator.get_next()

    # NOTE(review): the "opt" scope is taken to cover only the mask /
    # length / trimming ops, with _build_model() outside it - confirm
    # against saved checkpoints, since the scope prefixes variable names.
    with tf.variable_scope("opt"):
        # Non-zero ids mark real tokens; mask sums give the lengths.
        self.c_mask = tf.cast(self.c, tf.bool)
        self.q_mask = tf.cast(self.q, tf.bool)
        self.c_len = tf.reduce_sum(tf.cast(self.c_mask, tf.int32), axis=1)
        self.q_len = tf.reduce_sum(tf.cast(self.q_mask, tf.int32), axis=1)
        if opt:
            self.c_maxlen = tf.reduce_max(self.c_len)
            self.q_maxlen = tf.reduce_max(self.q_len)
            # -1 keeps the whole batch axis.
            self.c = tf.slice(self.c, [0, 0], [-1, self.c_maxlen])
            self.q = tf.slice(self.q, [0, 0], [-1, self.q_maxlen])
            self.c_mask = tf.slice(
                self.c_mask, [0, 0], [-1, self.c_maxlen])
            self.q_mask = tf.slice(
                self.q_mask, [0, 0], [-1, self.q_maxlen])
        else:
            self.c_maxlen, self.q_maxlen = config.x_limit, config.q_limit

    self._build_model()
    if not config.cudnn:
        total_params()

    if trainable:
        if config.l2_norm:
            regularizer = tf.contrib.layers.l2_regularizer(config.l2_norm)
            # Biases are left out of the L2 penalty.
            variables = [v for v in tf.trainable_variables()
                         if "bias" not in v.name]
            self.l2_loss = tf.contrib.layers.apply_regularization(
                regularizer, variables)
            self.loss += self.l2_loss

        # optimizer
        self.lr = tf.placeholder_with_default(0.001, (), name="lr")
        self.opt = tf.train.AdadeltaOptimizer(learning_rate=self.lr)
        if config.grad_clip_flag:
            grads = self.opt.compute_gradients(self.loss)
            gradients, variables = zip(*grads)
            capped_grads, _ = tf.clip_by_global_norm(
                gradients, config.grad_clip)
            self.train_op = self.opt.apply_gradients(
                zip(capped_grads, variables), global_step=self.global_step)
        else:
            self.train_op = self.opt.minimize(
                self.loss, global_step=self.global_step)

        # ema
        if config.decay:
            self.ema = tf.train.ExponentialMovingAverage(config.decay)
            # Control dependencies only attach to ops created inside the
            # context, so the averaging op must be built here to be
            # ordered after the optimizer step. Building it outside and
            # grouping it afterwards leaves the two updates unordered.
            with tf.control_dependencies([self.train_op]):
                self.train_op = self.ema.apply(tf.trainable_variables())
def __init__(self, config, batch, word_mat=None, char_mat=None, trainable=True, opt=False, demo=False, graph=None):
    """Build inputs, masks, the forward pass and (optionally) the train op.

    Besides context / question word and char ids, each example carries an
    answer ``ans``, a set of candidates ``cans`` and labels ``y_true``.
    All ops are created inside ``self.graph``.

    Args:
        config: settings object; reads ``batch_size``, ``char_limit``,
            ``num_cans``, ``para_limit``, ``ques_limit``,
            ``test_para_limit``, ``test_ques_limit``, ``learning_rate``
            and ``grad_clip``.
        batch: iterator; ``get_next()`` yields the seven input tensors
            (non-demo only).
        word_mat: initial value of the non-trainable ``word_mat`` variable.
        char_mat: initial value of the ``char_mat`` variable.
        trainable: build the optimizer and ``train_op`` when true.
        opt: trim inputs to the longest sequence of the batch when true
            (off by default here).
        demo: use placeholders and a slicing batch size of 1 when true.
        graph: optional ``tf.Graph`` to build into; a new one is created
            when None.
    """
    self.config = config
    self.demo = demo
    self.graph = graph if graph is not None else tf.Graph()

    with self.graph.as_default():
        self.global_step = tf.get_variable(
            name='global_step', shape=[], dtype=tf.int32,
            initializer=tf.constant_initializer(0), trainable=False)
        self.dropout = tf.placeholder_with_default(
            input=0.0, shape=[], name='dropout')

        # Model input
        if self.demo:
            self.c = tf.placeholder(
                shape=[None, config.test_para_limit],
                name='context', dtype=tf.int32)
            self.q = tf.placeholder(
                shape=[None, config.test_ques_limit],
                name='question', dtype=tf.int32)
            self.ch = tf.placeholder(
                shape=[None, config.test_para_limit, config.char_limit],
                name='context_char', dtype=tf.int32)
            self.qh = tf.placeholder(
                shape=[None, config.test_ques_limit, config.char_limit],
                name='question_char', dtype=tf.int32)
            self.ans = tf.placeholder(
                shape=[None, config.test_para_limit],
                name='answer', dtype=tf.int32)
            self.cans = tf.placeholder(
                shape=[None, config.num_cans, config.test_para_limit],
                name='candidates', dtype=tf.int32)
            self.y_true = tf.placeholder(
                shape=[None, config.num_cans],
                name='y_true', dtype=tf.int32)
        else:
            (self.c, self.q, self.ch, self.qh,
             self.ans, self.cans, self.y_true) = batch.get_next()

        # Pre-trained word embeddings stay fixed.
        self.word_mat = tf.get_variable(
            name='word_mat',
            initializer=tf.constant(word_mat, dtype=tf.float32),
            trainable=False)
        # Char embeddings use get_variable's default trainability.
        self.char_mat = tf.get_variable(
            name='char_mat',
            initializer=tf.constant(char_mat, dtype=tf.float32))

        # Non-zero ids mark real tokens; mask sums give the lengths.
        self.c_mask = tf.cast(self.c, tf.bool)
        self.q_mask = tf.cast(self.q, tf.bool)
        self.c_len = tf.reduce_sum(tf.cast(self.c_mask, tf.int32), axis=1)
        self.q_len = tf.reduce_sum(tf.cast(self.q_mask, tf.int32), axis=1)

        if opt:
            N, CL = config.batch_size if not self.demo else 1, config.char_limit
            self.c_maxlen = tf.reduce_max(self.c_len)
            self.q_maxlen = tf.reduce_max(self.q_len)
            self.c = tf.slice(self.c, [0, 0], [N, self.c_maxlen])
            self.q = tf.slice(self.q, [0, 0], [N, self.q_maxlen])
            self.c_mask = tf.slice(self.c_mask, [0, 0], [N, self.c_maxlen])
            self.q_mask = tf.slice(self.q_mask, [0, 0], [N, self.q_maxlen])
            self.ch = tf.slice(self.ch, [0, 0, 0], [N, self.c_maxlen, CL])
            self.qh = tf.slice(self.qh, [0, 0, 0], [N, self.q_maxlen, CL])
            self.ans = tf.slice(self.ans, [0, 0], [N, self.c_maxlen])
            self.cans = tf.slice(
                self.cans, [0, 0, 0], [N, config.num_cans, self.c_maxlen])
            # not needed (original author's remark on this slice)
            self.y_true = tf.slice(
                self.y_true, [0, 0], [N, config.num_cans])
        else:
            self.c_maxlen, self.q_maxlen = config.para_limit, config.ques_limit

        # Per-word character counts, flattened to one dimension.
        self.ch_len = tf.reshape(
            tf.reduce_sum(
                tf.cast(tf.cast(self.ch, tf.bool), tf.int32), axis=2),
            shape=[-1])
        self.qh_len = tf.reshape(
            tf.reduce_sum(
                tf.cast(tf.cast(self.qh, tf.bool), tf.int32), axis=2),
            shape=[-1])
        # Counterpart of ch_len for the question. The historical name
        # qn_len is kept as an alias of the same tensor so existing
        # readers of either spelling keep working.
        self.qn_len = self.qh_len

        self.forward()
        total_params()

        if trainable:
            # Learning rate grows with log(step + 1) and is capped by
            # config.learning_rate.
            self.lr = tf.minimum(
                config.learning_rate,
                0.001 / tf.log(999.0)
                * tf.log(tf.cast(self.global_step, tf.float32) + 1))
            self.opt = tf.train.AdamOptimizer(
                learning_rate=self.lr, beta1=0.8, beta2=0.999, epsilon=1e-7)
            grads = self.opt.compute_gradients(self.loss)
            gradients, variables = zip(*grads)
            capped_grads, _ = tf.clip_by_global_norm(
                gradients, config.grad_clip)
            self.train_op = self.opt.apply_gradients(
                zip(capped_grads, variables), global_step=self.global_step)