def filter_data(self, only_sent=False, fine_grained=False):
    _logger.add()
    _logger.add('filtering data for %s, only sentence: %s' % (self.data_type, only_sent))
    if only_sent:
        counter = 0
        new_nn_data = []
        for trees in self.nn_data:
            new_trees = []
            for tree in trees:
                if tree['is_sent']:
                    new_trees.append(tree)
                    counter += 1
            new_nn_data.append(new_trees)
        self.nn_data = new_nn_data
        self.sample_num = counter

    if not fine_grained:
        # delete the neutral samples
        counter = 0
        new_nn_data = []
        for trees in self.nn_data:
            new_trees = []
            for tree in trees:
                sent_label = tree['root_node']['sentiment_label']
                if sent_label <= 0.4 or sent_label > 0.6:
                    counter += 1
                    new_trees.append(tree)
            new_nn_data.append(new_trees)
        self.nn_data = new_nn_data
        self.sample_num = counter
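# Illustrative note (not part of the original code): the sentiment labels
# compared above are real values, as in the SST annotation scheme where each
# node carries a score in [0, 1]. Under the binary (not fine-grained) setting,
# the condition keeps a sub-tree whose root label is e.g. 0.3 or 0.8 and drops
# one labeled 0.5, i.e. the band (0.4, 0.6] is treated as neutral. The mapping
# to the SST convention is an assumption inferred from the thresholds.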
def gene_sub_trees_and_shift_reduce_info(self):
    _logger.add()
    _logger.add('generating sub-trees and shift reduce info: %s...' % self.data_type)
    counter = 0
    new_data_list = []
    for tree in self.digitized_data_list:
        sub_trees = []
        idx_to_node_dict = dict(
            (tree_node['node_index'], tree_node) for tree_node in tree)
        for tree_node in tree:
            # collect all nodes belonging to the sub-tree rooted at tree_node
            if tree_node['is_leaf']:
                new_sub_tree = [tree_node]
            else:
                new_sub_tree = []
                new_sub_tree_leaves = [
                    idx_to_node_dict[node_index]
                    for node_index in tree_node['leaf_node_index_seq']
                ]
                new_sub_tree += new_sub_tree_leaves
                for leaf_node in new_sub_tree_leaves:
                    pre_node = leaf_node
                    while pre_node['parent_index'] > 0 and pre_node != tree_node:  # fixme
                        cur_node = idx_to_node_dict[pre_node['parent_index']]
                        if cur_node not in new_sub_tree:
                            new_sub_tree.append(cur_node)
                        pre_node = cur_node

            # get shift-reduce info for the sub-tree
            child_node_indices = [
                new_tree_node['node_index'] for new_tree_node in new_sub_tree
            ]
            parent_node_indices = [
                new_tree_node['parent_index']
                if new_tree_node['parent_index'] in child_node_indices else 0
                for new_tree_node in new_sub_tree
            ]
            sr_result = shift_reduce_constituency_forest(
                list(zip(child_node_indices, parent_node_indices)))
            operation_list, node_list_in_stack, reduce_mat = zip(*sr_result)
            shift_reduce_info = {
                'op_list': operation_list,
                'reduce_mat': reduce_mat,
                'node_list_in_stack': node_list_in_stack
            }
            sub_tree = {
                'tree_nodes': new_sub_tree,
                'shift_reduce_info': shift_reduce_info,
                'root_node': tree_node,
                'is_sent': tree_node['parent_index'] == 0
            }
            sub_trees.append(sub_tree)
            counter += 1
        new_data_list.append(sub_trees)
    return new_data_list, counter
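# Illustrative sketch (not part of the original code): the call above passes
# (node_index, parent_index) pairs, where a parent index of 0 marks a root of
# the forest, and unpacks the per-step result tuples into three parallel
# sequences. The exact shape of each tuple is inferred only from how the result
# is consumed here, not from the implementation of
# shift_reduce_constituency_forest; the toy indices below are hypothetical.
#
#   pairs = [(1, 3), (2, 3), (3, 0)]   # two leaves (1, 2) under one root (3)
#   sr_result = shift_reduce_constituency_forest(pairs)
#   op_list, node_list_in_stack, reduce_mat = zip(*sr_result)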
def __init__(self, data_list, data_type, dicts=None):
    self.data_type = data_type
    _logger.add('building data set object for %s' % data_type)
    assert data_type in ['train', 'dev', 'test']
    # check: dev/test sets must reuse the dictionaries built from the training set
    if data_type in ['dev', 'test']:
        assert dicts is not None

    processed_data_list = self.process_raw_data(data_list, data_type)
    if data_type == 'train':
        self.dicts, self.max_lens = self.count_data_and_build_dict(processed_data_list)
    else:
        _, self.max_lens = self.count_data_and_build_dict(processed_data_list, False)
        self.dicts = dicts

    self.digitized_data_list = self.digitize_dataset(
        processed_data_list, self.dicts, data_type)
    self.nn_data, self.sample_num = self.gene_sub_trees_and_shift_reduce_info()

    self.emb_mat_token, self.emb_mat_glove = None, None
    if data_type == 'train':
        self.emb_mat_token, self.emb_mat_glove = self.generate_index2vec_matrix()
def __init__(self, train_file_path, dev_file_path=None, test_file_path=None):
    _logger.add('building data set object')
    train_data_list = self.load_data(train_file_path, 'train')
    dev_data_list = self.load_data(dev_file_path, 'dev')
    if test_file_path is not None:
        test_data_list = self.load_data(test_file_path, 'test')

    # build the dictionaries from all available splits
    data_list = []
    data_list.extend(train_data_list)
    data_list.extend(dev_data_list)
    if test_file_path is not None:
        data_list.extend(test_data_list)
    self.dicts, self.max_lens = self.count_data_and_build_dict(data_list)

    self.digitized_train_data_list = self.digitize_dataset(train_data_list, self.dicts)
    self.digitized_dev_data_list = self.digitize_dataset(dev_data_list, self.dicts)
    if test_file_path is not None:
        self.digitized_test_data_list = self.digitize_dataset(test_data_list, self.dicts)

    self.emb_mat_token, self.emb_mat_glove = self.generate_index2vec_matrix()
def digitize_dataset(dataset, dicts, data_type):
    token2index = dict([
        (token, idx)
        for idx, token in enumerate(dicts['token'] + dicts['glove'])
    ])

    def digitize_token(token):
        token = token if not cfg.lower_word else token.lower()
        try:
            return token2index[token]
        except KeyError:
            return 1  # index 1 is the unknown-token placeholder

    _logger.add()
    _logger.add('digitizing data: %s...' % data_type)
    for topic in tqdm(dataset):
        for paragraph in topic['paragraphs']:
            paragraph['context_token_digital'] = [
                [digitize_token(token) for token in sent]
                for sent in paragraph['context_token']
            ]
            for qa in paragraph['qas']:
                qa['question_token_digital'] = [
                    digitize_token(token) for token in qa['question_token']
                ]
    _logger.done()
    return dataset
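# Illustrative sketch (not part of the original code): count_data_and_build_dict
# inserts '@@@empty' at index 0 and '@@@unk' at index 1 of dicts['token'], and
# token2index is built over dicts['token'] followed by dicts['glove'], which is
# why out-of-vocabulary tokens fall back to index 1 above. A minimal lookup,
# assuming a toy dictionary (the concrete tokens are hypothetical):
#
#   dicts = {'token': ['@@@empty', '@@@unk', 'good'], 'glove': ['movie']}
#   token2index = {t: i for i, t in enumerate(dicts['token'] + dicts['glove'])}
#   token2index['good']    # -> 2
#   token2index['movie']   # -> 3 (GloVe-only tokens follow the in-corpus ones)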
def update_tensor_add_ema_and_opt(self):
    self.logits, (self.s1_act, self.s1_logpa), (self.s2_act, self.s2_logpa), \
        (self.s1_percentage, self.s2_percentage) = self.build_network()
    self.loss_sl, self.loss_rl = self.build_loss()
    self.accuracy = self.build_accuracy()

    # ------------ ema -------------
    if True:
        self.var_ema = tf.train.ExponentialMovingAverage(cfg.var_decay)
        self.build_var_ema()
    if cfg.mode == 'train':
        self.ema = tf.train.ExponentialMovingAverage(cfg.decay)
        self.build_ema()
    self.summary = tf.summary.merge_all()

    # ---------- optimization ----------
    if cfg.optimizer.lower() == 'adadelta':
        assert 0.1 < cfg.learning_rate < 1.
        self.opt_sl = tf.train.AdadeltaOptimizer(cfg.learning_rate)
        self.opt_rl = tf.train.AdadeltaOptimizer(cfg.learning_rate)
    elif cfg.optimizer.lower() == 'adam':
        assert cfg.learning_rate < 0.1
        self.opt_sl = tf.train.AdamOptimizer(cfg.learning_rate)
        self.opt_rl = tf.train.AdamOptimizer(cfg.learning_rate)
    elif cfg.optimizer.lower() == 'rmsprop':
        assert cfg.learning_rate < 0.1
        self.opt_sl = tf.train.RMSPropOptimizer(cfg.learning_rate)
        self.opt_rl = tf.train.RMSPropOptimizer(cfg.learning_rate)
    else:
        raise AttributeError('no optimizer named as \'%s\'' % cfg.optimizer)

    trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)

    # count trainable parameters (embedding matrices excluded)
    all_params_num = 0
    for elem in trainable_vars:
        var_name = elem.name.split(':')[0]
        if var_name.endswith('emb_mat'):
            continue
        params_num = 1
        for l in elem.get_shape().as_list():
            params_num *= l
        all_params_num += params_num
    _logger.add('Trainable Parameters Number: %d' % all_params_num)

    # supervised update: every trainable variable outside the hard (RL) network
    sl_vars = [var for var in trainable_vars
               if not var.op.name.startswith(self.scope + '/hard_network')]
    self.train_op_sl = self.opt_sl.minimize(
        self.loss_sl, self.global_step, var_list=sl_vars)

    # reinforcement-learning update: only the hard-network variables
    rl_vars = [var for var in trainable_vars
               if var.op.name.startswith(self.scope + '/hard_network')]
    if len(rl_vars) > 0:
        self.train_op_rl = self.opt_rl.minimize(self.loss_rl, var_list=rl_vars)
    else:
        self.train_op_rl = None
def digitize_data(self, data_list, dicts, dataset_type):
    token2index = dict([
        (token, idx)
        for idx, token in enumerate(dicts['token'] + dicts['glove'])
    ])
    char2index = dict([(char, idx) for idx, char in enumerate(dicts['char'])])

    def digitize_token(token):
        token = token if not cfg.lower_word else token.lower()
        try:
            return token2index[token]
        except KeyError:
            return 1

    def digitize_char(char):
        try:
            return char2index[char]
        except KeyError:
            return 1

    _logger.add()
    _logger.add('digitizing data: %s...' % dataset_type)
    for sample in data_list:
        sample['token_digital'] = [
            digitize_token(token) for token in sample['token']
        ]
        sample['char_digital'] = [
            [digitize_char(char) for char in list(token)]
            for token in sample['token']
        ]
    _logger.done()
    return data_list
def process_raw_dataset(raw_data, data_type):
    _logger.add()
    _logger.add('processing raw data: %s...' % data_type)
    for topic in tqdm(raw_data):
        for paragraph in topic['paragraphs']:
            # context
            paragraph['context'] = paragraph['context'].replace(
                "''", '" ').replace("``", '" ')
            paragraph['context_token'] = [
                [token.replace("''", '"').replace("``", '"')
                 for token in nltk.word_tokenize(sent)]
                for sent in nltk.sent_tokenize(paragraph['context'])
            ]
            paragraph['context_token'] = [
                Dataset.further_tokenize(sent)
                for sent in paragraph['context_token']
            ]
            # qas
            for qa in paragraph['qas']:
                qa['question'] = qa['question'].replace("''", '" ').replace("``", '" ')
                qa['question_token'] = Dataset.further_tokenize([
                    token.replace("''", '"').replace("``", '"')
                    for token in nltk.word_tokenize(qa['question'])
                ])
                # tag generation
                for answer in qa['answers']:
                    answer['sent_label'] = Dataset.sentence_label_generation(
                        paragraph['context'], paragraph['context_token'],
                        answer['text'], answer['answer_start'])
    _logger.done()
    return raw_data
def build_network(self):
    _logger.add()
    _logger.add('building %s neural network structure...' % cfg.network_type)
    tds, cds = self.tds, self.cds
    tl = self.tl
    tel, cel, cos, ocd, fh = self.tel, self.cel, self.cos, self.ocd, self.fh
    hn = self.hn
    bs, sl, ol, mc = self.bs, self.sl, self.ol, self.mc

    with tf.variable_scope('emb'):
        token_emb_mat = generate_embedding_mat(
            tds, tel, init_mat=self.token_emb_mat,
            extra_mat=self.glove_emb_mat, extra_trainable=self.finetune_emb,
            scope='gene_token_emb_mat')
        emb = tf.nn.embedding_lookup(token_emb_mat, self.token_seq)  # bs,sl,tel
        self.tensor_dict['emb'] = emb

    rep = disan(
        emb, self.token_mask, 'DiSAN', cfg.dropout, self.is_train, cfg.wd,
        'relu', tensor_dict=self.tensor_dict, name='')

    with tf.variable_scope('output'):
        pre_logits = tf.nn.relu(linear(
            [rep], hn, True, scope='pre_logits_linear', wd=cfg.wd,
            input_keep_prob=cfg.dropout, is_train=self.is_train))  # bs, hn
        logits = linear(
            [pre_logits], self.output_class, False, scope='get_output',
            wd=cfg.wd, input_keep_prob=cfg.dropout,
            is_train=self.is_train)  # bs, 5
    _logger.done()
    return logits
def generate_tree_shift_reduce_info(self, dataset, data_type):
    _logger.add()
    _logger.add('generating tree shift reduce for %s' % data_type)
    for sample in dataset:
        # sentence 1
        s1_child_parent_node_indices = [
            (new_tree_node.node_index, new_tree_node.parent_index)
            for new_tree_node in sample['sentence1_binary_parse_node_list']
        ]
        s1_sr = shift_reduce_constituency_forest(s1_child_parent_node_indices)
        s1_op_list, s1_node_list_in_stack, s1_reduce_mat = zip(*s1_sr)
        sample['s1_sr_info'] = {
            'op_list': s1_op_list,
            'reduce_mat': s1_reduce_mat,
            'node_list_in_stack': s1_node_list_in_stack
        }
        # tree tag generation for sentence 1 (disabled)
        # s1_tree_tag = []
        # for node_idx in s1_node_list_in_stack:
        #     # find tree node
        #     tree_node_found = None
        #     for tree_node in sample['sentence1_parse_node_list']:
        #         if tree_node.node_index == node_idx:
        #             tree_node_found = tree_node
        #             break
        #     assert tree_node_found is not None
        #     s1_tree_tag.append(tree_node_found.tag)
        # sample['s1_tree_tag'] = s1_tree_tag

        # sentence 2
        s2_child_parent_node_indices = [
            (new_tree_node.node_index, new_tree_node.parent_index)
            for new_tree_node in sample['sentence2_binary_parse_node_list']
        ]
        s2_sr = shift_reduce_constituency_forest(s2_child_parent_node_indices)
        s2_op_list, s2_node_list_in_stack, s2_reduce_mat = zip(*s2_sr)
        sample['s2_sr_info'] = {
            'op_list': s2_op_list,
            'reduce_mat': s2_reduce_mat,
            'node_list_in_stack': s2_node_list_in_stack
        }
        # tree tag generation for sentence 2 (disabled)
        # s2_tree_tag = []
        # for node_idx in s2_node_list_in_stack:
        #     # find tree node
        #     tree_node_found = None
        #     for tree_node in sample['sentence2_parse_node_list']:
        #         if tree_node.node_index == node_idx:
        #             tree_node_found = tree_node
        #             break
        #     assert tree_node_found is not None
        #     s2_tree_tag.append(tree_node_found.tag)
        # sample['s2_tree_tag'] = s2_tree_tag
    return dataset
def __init__(self, data_file_path, data_type, dicts=None):
    self.data_type = data_type
    _logger.add('building data set object for %s' % data_type)
    assert data_type in ['train', 'dev', 'test']
    # check: dev/test sets must reuse the dictionaries built from the training set
    if data_type in ['dev', 'test']:
        assert dicts is not None

    # temporary params
    self.only_bi_tree = True

    raw_data = self.load_snli_data(data_file_path, data_type)
    data_with_tree = self.transform_str_to_tree(raw_data, data_type)
    # data_with_tree = self.generate_tree_shift_reduce_info(data_with_tree, data_type)
    processed_data_list = self.process_raw_data(data_with_tree, data_type)

    if data_type == 'train':
        self.dicts, self.max_lens = self.count_data_and_build_dict(processed_data_list)
    else:
        _, self.max_lens = self.count_data_and_build_dict(processed_data_list, False)
        self.dicts = dicts

    digital_data = self.digitize_data(processed_data_list, self.dicts, data_type)
    self.nn_data = self.clip_filter_data(digital_data, cfg.data_clip_method, data_type)
    self.sample_num = len(self.nn_data)
    if data_type == 'train':
        self.emb_mat_token, self.emb_mat_glove = self.generate_index2vec_matrix()
def build_network(self):
    _logger.add()
    _logger.add('building %s neural network structure...' % cfg.network_type)
    tds, cds = self.tds, self.cds
    tl = self.tl
    tel, cel, cos, ocd, fh = self.tel, self.cel, self.cos, self.ocd, self.fh
    hn = self.hn
    bs = self.bs

    with tf.variable_scope('emb'):
        token_emb_mat = generate_embedding_mat(
            tds, tel, init_mat=self.token_emb_mat,
            extra_mat=self.glove_emb_mat, extra_trainable=self.finetune_emb,
            scope='gene_token_emb_mat')
        emb = tf.nn.embedding_lookup(token_emb_mat, self.token_seq)  # bs,sl1,tel

    with tf.variable_scope('sent_encoding'):
        rep = sentence_encoding_models(
            emb, self.token_mask, cfg.context_fusion_method, 'relu',
            'ct_based_sent2vec', cfg.wd, self.is_train, cfg.dropout,
            block_len=cfg.block_len)

    with tf.variable_scope('output'):
        pre_logits = tf.nn.relu(linear(
            [rep], hn, True, scope='pre_logits_linear', wd=cfg.wd,
            input_keep_prob=cfg.dropout, is_train=self.is_train))  # bs, hn
        logits = linear(
            [pre_logits], self.output_class, False, scope='get_output',
            wd=cfg.wd, input_keep_prob=cfg.dropout,
            is_train=self.is_train)  # bs, 5
    _logger.done()
    return logits
def build_loss(self):
    # weight decay
    with tf.name_scope("weight_decay"):
        for var in set(tf.get_collection('reg_vars', self.scope)):
            weight_decay = tf.multiply(
                tf.nn.l2_loss(var), cfg.wd,
                name="{}-wd".format('-'.join(str(var.op.name).split('/'))))
            tf.add_to_collection('losses', weight_decay)
    reg_vars = tf.get_collection('losses', self.scope)
    trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
    _logger.add('regularization var num: %d' % len(reg_vars))
    _logger.add('trainable var num: %d' % len(trainable_vars))

    # KL divergence between the target distribution and the predicted one
    target_dist = tf.clip_by_value(self.target_distribution, 1e-10, 1.)
    predicted_dist = tf.clip_by_value(tf.nn.softmax(self.logits), 1e-10, 1.)
    kl_batch = tf.reduce_sum(
        target_dist * tf.log(target_dist / predicted_dist), -1)
    # kl_batch = tf.reduce_sum((target_dist - predicted_dist) ** 2, -1)
    tf.add_to_collection(
        'losses', tf.reduce_mean(kl_batch, name='kl_divergence_mean'))

    loss = tf.add_n(tf.get_collection('losses', self.scope), name='loss')
    tf.summary.scalar(loss.op.name, loss)
    tf.add_to_collection('ema/scalar', loss)
    return loss
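# Illustrative note (not part of the original code): per sample, the term added
# to 'losses' above is the Kullback-Leibler divergence between the clipped
# target distribution p and the clipped softmax prediction q over the output
# classes,
#
#     KL(p || q) = sum_c p_c * log(p_c / q_c),
#
# averaged over the batch and then summed with the weight-decay terms collected
# in the same 'losses' collection. The clipping to [1e-10, 1] only guards
# against log(0).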
def process_raw_data(self, dataset, data_type):
    def further_tokenize(temp_tokens):
        tokens = []
        for token in temp_tokens:
            l = ("-", "\u2212", "\u2014", "\u2013", "/", "~", '"', "'",
                 "\u201C", "\u2019", "\u201D", "\u2018", "\u00B0")
            tokens.extend(re.split("([{}])".format("".join(l)), token))
        return tokens

    _logger.add()
    _logger.add('processing raw data for %s' % data_type)
    for sample in tqdm(dataset):
        sample['sentence1_token'] = [node.token for node in sample['sentence1_binary_parse_node_list'] if node.is_leaf]
        sample['sentence1_tag'] = [node.tag for node in sample['sentence1_binary_parse_node_list'] if node.is_leaf]
        sample['sentence2_token'] = [node.token for node in sample['sentence2_binary_parse_node_list'] if node.is_leaf]
        sample['sentence2_tag'] = [node.tag for node in sample['sentence2_binary_parse_node_list'] if node.is_leaf]
        if cfg.data_clip_method == 'no_tree':
            sample['sentence1_token'] = further_tokenize(sample['sentence1_token'])
            sample['sentence2_token'] = further_tokenize(sample['sentence2_token'])
    _logger.done()
    return dataset
def load_data_pickle(self, data_file_path, data_type):
    _logger.add()
    _logger.add('load file for %s' % data_type)
    dataset = None
    # pickle files must be opened in binary mode, which does not accept an encoding
    with open(data_file_path, 'rb') as file:
        dataset = pickle.load(file)
    _logger.done()
    return dataset
def load_data(self, data_file_path, data_type):
    _logger.add()
    _logger.add('load file for %s' % data_type)
    dataset = []
    with open(data_file_path, 'r', encoding='utf-8') as f:
        dataset = json.load(f)
    _logger.done()
    return dataset
def transform_str_to_tree(self, dataset, data_type):
    _logger.add()
    _logger.add('transforming str format tree into real tree for %s' % data_type)
    for sample in tqdm(dataset):
        sample['sentence1_binary_parse_tree'] = recursive_build_binary(
            tokenize_str_format_tree(sample['sentence1_binary_parse']))
        sample['sentence2_binary_parse_tree'] = recursive_build_binary(
            tokenize_str_format_tree(sample['sentence2_binary_parse']))
        # sample['sentence1_parse_tree'] = recursive_build_penn_format(
        #     tokenize_str_format_tree(sample['sentence1_parse']))
        # sample['sentence2_parse_tree'] = recursive_build_penn_format(
        #     tokenize_str_format_tree(sample['sentence2_parse']))

        # to node list
        sample['sentence1_binary_parse_tree'], sample['sentence1_binary_parse_node_list'] = \
            transform_tree_to_parent_index(sample['sentence1_binary_parse_tree'])
        sample['sentence2_binary_parse_tree'], sample['sentence2_binary_parse_node_list'] = \
            transform_tree_to_parent_index(sample['sentence2_binary_parse_tree'])
        # sample['sentence1_parse_tree'], sample['sentence1_parse_node_list'] = \
        #     transform_tree_to_parent_index(sample['sentence1_parse_tree'])
        # sample['sentence2_parse_tree'], sample['sentence2_parse_node_list'] = \
        #     transform_tree_to_parent_index(sample['sentence2_parse_tree'])

        # shift-reduce info
        # sentence 1
        s1_child_parent_node_indices = [
            (new_tree_node.node_index, new_tree_node.parent_index)
            for new_tree_node in sample['sentence1_binary_parse_node_list']
        ]
        s1_sr = shift_reduce_constituency_forest(s1_child_parent_node_indices)
        s1_op_list, s1_node_list_in_stack, s1_reduce_mat = zip(*s1_sr)
        sample['s1_sr_info'] = {
            'op_list': s1_op_list,
            'reduce_mat': s1_reduce_mat,
            'node_list_in_stack': s1_node_list_in_stack
        }
        # sentence 2
        s2_child_parent_node_indices = [
            (new_tree_node.node_index, new_tree_node.parent_index)
            for new_tree_node in sample['sentence2_binary_parse_node_list']
        ]
        s2_sr = shift_reduce_constituency_forest(s2_child_parent_node_indices)
        s2_op_list, s2_node_list_in_stack, s2_reduce_mat = zip(*s2_sr)
        sample['s2_sr_info'] = {
            'op_list': s2_op_list,
            'reduce_mat': s2_reduce_mat,
            'node_list_in_stack': s2_node_list_in_stack
        }
    _logger.done()
    return dataset
def get_evaluation(self, sess, dataset_obj, global_step=None):
    _logger.add()
    _logger.add('getting evaluation result for %s' % dataset_obj.data_type)
    logits_list, loss_list, accu_list = [], [], []
    is_sent_list = []
    for sample_batch, _, _, _ in dataset_obj.generate_batch_sample_iter():
        feed_dict = self.model.get_feed_dict(sample_batch, 'dev')
        logits, loss, accu = sess.run(
            [self.model.logits, self.model.loss, self.model.accuracy],
            feed_dict)
        logits_list.append(np.argmax(logits, -1))
        loss_list.append(loss)
        accu_list.append(accu)
        is_sent_list += [sample['is_sent'] for sample in sample_batch]

    logits_array = np.concatenate(logits_list, 0)
    loss_value = np.mean(loss_list)
    accu_array = np.concatenate(accu_list, 0)
    accu_value = np.mean(accu_array)

    # accuracy restricted to samples marked as full sentences
    sent_accu_list = []
    for idx, is_sent in enumerate(is_sent_list):
        if is_sent:
            sent_accu_list.append(accu_array[idx])
    sent_accu_value = np.mean(sent_accu_list)

    # analysis
    # analysis_save_dir = cfg.mkdir(cfg.answer_dir, 'gs_%s' % global_step or 'test')
    # OutputAnalysis.do_analysis(dataset_obj, logits_array, accu_array, analysis_save_dir,
    #                            cfg.fine_grained)

    # add summary
    if global_step is not None:
        if dataset_obj.data_type == 'train':
            summary_feed_dict = {
                self.train_loss: loss_value,
                self.train_accuracy: accu_value,
                self.train_sent_accuracy: sent_accu_value,
            }
            summary = sess.run(self.train_summaries, summary_feed_dict)
            self.writer.add_summary(summary, global_step)
        elif dataset_obj.data_type == 'dev':
            summary_feed_dict = {
                self.dev_loss: loss_value,
                self.dev_accuracy: accu_value,
                self.dev_sent_accuracy: sent_accu_value,
            }
            summary = sess.run(self.dev_summaries, summary_feed_dict)
            self.writer.add_summary(summary, global_step)
        else:
            summary_feed_dict = {
                self.test_loss: loss_value,
                self.test_accuracy: accu_value,
                self.test_sent_accuracy: sent_accu_value,
            }
            summary = sess.run(self.test_summaries, summary_feed_dict)
            self.writer.add_summary(summary, global_step)
    return loss_value, accu_value, sent_accu_value
def get_evaluation(self, sess, dataset_obj, global_step=None, time_counter=None):
    _logger.add()
    _logger.add('getting evaluation result for %s' % dataset_obj.data_type)
    logits_list, loss_list, accu_list = [], [], []
    for sample_batch, _, _, _ in dataset_obj.generate_batch_sample_iter():
        feed_dict = self.model.get_feed_dict(sample_batch, 'dev')
        if time_counter is not None:
            time_counter.add_start()
        logits, loss, accu = sess.run(
            [self.model.logits, self.model.xentropy_loss, self.model.accuracy],
            feed_dict)
        if time_counter is not None:
            time_counter.add_stop()
        logits_list.append(np.argmax(logits, -1))
        loss_list.append(loss)
        accu_list.append(accu)

    logits_array = np.concatenate(logits_list, 0)
    loss_value = np.mean(loss_list)
    accu_array = np.concatenate(accu_list, 0)
    accu_value = np.mean(accu_array)

    # todo: analysis
    # analysis_save_dir = cfg.mkdir(cfg.answer_dir, 'gs_%d' % global_step or 0)
    # OutputAnalysis.do_analysis(dataset_obj, logits_array, accu_array, analysis_save_dir,
    #                            cfg.fine_grained)

    if global_step is not None:
        if dataset_obj.data_type == 'train':
            summary_feed_dict = {
                self.train_loss: loss_value,
                self.train_accuracy: accu_value,
            }
            summary = sess.run(self.train_summaries, summary_feed_dict)
            self.writer.add_summary(summary, global_step)
        elif dataset_obj.data_type == 'dev':
            summary_feed_dict = {
                self.dev_loss: loss_value,
                self.dev_accuracy: accu_value,
            }
            summary = sess.run(self.dev_summaries, summary_feed_dict)
            self.writer.add_summary(summary, global_step)
        else:
            summary_feed_dict = {
                self.test_loss: loss_value,
                self.test_accuracy: accu_value,
            }
            summary = sess.run(self.test_summaries, summary_feed_dict)
            self.writer.add_summary(summary, global_step)
    return loss_value, accu_value
def count_data_and_build_dict(self, data_list, gene_dicts=True):
    def add_ept_and_unk(a_list):
        # index 0: empty/padding placeholder, index 1: unknown-token placeholder
        a_list.insert(0, '@@@empty')
        a_list.insert(1, '@@@unk')
        return a_list

    _logger.add()
    _logger.add('counting and building dictionaries')

    token_collection = []
    char_collection = []
    sent_len_collection = []
    token_len_collection = []
    for sample in data_list:
        token_collection += sample['sentence_token']
        sent_len_collection += [len(sample['sentence_token'])]
        for token in sample['sentence_token']:
            char_collection += list(token)
            token_len_collection.append(len(token))

    max_sent_len = max(sent_len_collection)
    max_token_len = max(token_len_collection)
    token_set = list(set(token_collection))
    char_set = list(set(char_collection))

    glove_token_set = []  # ensure the 'glove' key exists even when gene_dicts is False
    if gene_dicts:
        if cfg.use_glove_unk_token:
            glove_data = load_glove(cfg.word_embedding_length)
            glove_token_set = list(glove_data.keys())
            if cfg.lower_word:
                token_set = list(token.lower() for token in token_set)
                glove_token_set = list(set(token.lower() for token in glove_token_set))
            # remove tokens from glove_token_set that already appear in token_set
            for token in token_set:
                try:
                    glove_token_set.remove(token)
                except ValueError:
                    pass
        else:
            if cfg.lower_word:
                token_set = list(token.lower() for token in token_set)
            glove_token_set = []
        token_set = add_ept_and_unk(token_set)
        char_set = add_ept_and_unk(char_set)

    dicts = {
        'token': token_set,
        'char': char_set,
        'glove': glove_token_set
    }
    return dicts, {'sent': max_sent_len, 'token': max_token_len}
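# Illustrative sketch (not part of the original code): with gene_dicts=True, the
# returned structures look roughly like the following. GloVe-only tokens are
# kept separate from the in-corpus vocabulary and de-duplicated against it, and
# the digitizing functions later concatenate the two lists. The concrete tokens
# below are hypothetical.
#
#   dicts = {
#       'token': ['@@@empty', '@@@unk', 'good', 'movie'],  # in-corpus vocabulary
#       'char':  ['@@@empty', '@@@unk', 'g', 'o', 'd'],    # character vocabulary
#       'glove': ['excellent'],  # GloVe tokens not seen in the corpus
#   }
#   max_lens = {'sent': max_sent_len, 'token': max_token_len}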
def load_snli_data(self, data_path, data_type):
    _logger.add()
    _logger.add('load file for %s' % data_type)
    dataset = []
    with open(data_path, 'r', encoding='utf-8') as file:
        for line in file:
            dataset.append(json.loads(line))
    _logger.done()
    return dataset
def process_raw_data(self, data_list, data_type):
    _logger.add()
    _logger.add('processing raw data: %s...' % data_type)
    for sample in data_list:
        for tree_node in sample:
            # each node already carries: node_index, parent_index, token_seq,
            # leaf_node_index_seq, is_leaf, token, sentiment_label;
            # add the character sequence for every token
            tree_node['char_seq'] = [list(token) for token in tree_node['token_seq']]
    _logger.done()
    return data_list
def count_data_and_build_dict(self, data_list, gene_dicts=True):
    def add_ept_and_unk(a_list):
        a_list.insert(0, '@@@empty')
        a_list.insert(1, '@@@unk')
        return a_list

    _logger.add()
    _logger.add('counting and building dictionaries')

    token_collection = []
    char_collection = []
    sent_len_collection = []
    token_len_collection = []
    for sample in data_list:
        for tree_node in sample:
            token_collection += tree_node['token_seq']
            sent_len_collection.append(len(tree_node['token_seq']))
            for char_seq in tree_node['char_seq']:
                char_collection += char_seq
                token_len_collection.append(len(char_seq))

    max_sent_len = dynamic_length(sent_len_collection, 1, security=False)[0]
    max_token_len = dynamic_length(token_len_collection, 0.99, security=False)[0]

    if gene_dicts:
        # token & char vocabularies
        tokenSet = dynamic_keep(token_collection, 1)
        charSet = dynamic_keep(char_collection, 1)
        if cfg.use_glove_unk_token:
            gloveData = load_glove(cfg.word_embedding_length)
            gloveTokenSet = list(gloveData.keys())
            if cfg.lower_word:
                tokenSet = list(set([token.lower() for token in tokenSet]))
                gloveTokenSet = list(set([token.lower() for token in gloveTokenSet]))
            # remove tokens from gloveTokenSet that already appear in tokenSet
            for token in tokenSet:
                try:
                    gloveTokenSet.remove(token)
                except ValueError:
                    pass
        else:
            if cfg.lower_word:
                tokenSet = list(set([token.lower() for token in tokenSet]))
            gloveTokenSet = []
        tokenSet = add_ept_and_unk(tokenSet)
        charSet = add_ept_and_unk(charSet)
        dicts = {'token': tokenSet, 'char': charSet, 'glove': gloveTokenSet}
    else:
        dicts = {}

    _logger.done()
    return dicts, {'sent': max_sent_len, 'token': max_token_len}
def get_evaluation(self, sess, dataset_obj, global_step=None, time_counter=None):
    _logger.add()
    _logger.add('getting evaluation result for %s' % dataset_obj.data_type)
    logits_list, loss_list, accu_list = [], [], []
    for sample_batch, _, _, _ in dataset_obj.generate_batch_sample_iter():
        feed_dict = self.model.get_feed_dict(sample_batch, 'dev')
        if time_counter is not None:
            time_counter.add_start()
        logits, loss, accu = sess.run(
            [self.model.logits, self.model.loss, self.model.accuracy],
            feed_dict)
        if time_counter is not None:
            time_counter.add_stop()
        logits_list.append(np.argmax(logits, -1))
        loss_list.append(loss)
        accu_list.append(accu)

    logits_array = np.concatenate(logits_list, 0)
    loss_value = np.mean(loss_list)
    # accu_array = np.concatenate(accu_list, 0)
    # accu_value = np.mean(accu_array)

    # calculate accuracy: a prediction counts as correct if it matches the
    # sentence label of any reference answer
    correct_list = []
    for qa, predicted_label in zip(dataset_obj.nn_data, list(logits_array)):
        correct = 0.
        for ans in qa['answers']:
            if int(predicted_label) == int(ans['sent_label']):
                correct = 1.
        correct_list.append(correct)
    accu_value = np.mean(correct_list)

    if global_step is not None:
        if dataset_obj.data_type == 'train':
            summary_feed_dict = {
                self.train_loss: loss_value,
                self.train_accuracy: accu_value,
            }
            summary = sess.run(self.train_summaries, summary_feed_dict)
            self.writer.add_summary(summary, global_step)
        elif dataset_obj.data_type == 'dev':
            summary_feed_dict = {
                self.dev_loss: loss_value,
                self.dev_accuracy: accu_value,
            }
            summary = sess.run(self.dev_summaries, summary_feed_dict)
            self.writer.add_summary(summary, global_step)
    return loss_value, accu_value
def update_tensor_add_ema_and_opt(self):
    self.logits = self.build_network()
    self.loss = self.build_loss()
    self.accuracy = self.build_accuracy()

    # ------------ ema -------------
    if True:
        self.var_ema = tf.train.ExponentialMovingAverage(cfg.var_decay)
        self.build_var_ema()
    if cfg.mode == 'train':
        self.ema = tf.train.ExponentialMovingAverage(cfg.decay)
        self.build_ema()
    self.summary = tf.summary.merge_all()

    # ---------- optimization ----------
    if cfg.optimizer.lower() == 'adadelta':
        assert 0.1 < cfg.learning_rate <= 1.
        self.opt = tf.train.AdadeltaOptimizer(cfg.learning_rate)
    elif cfg.optimizer.lower() == 'adam':
        assert cfg.learning_rate < 0.1
        self.opt = tf.train.AdamOptimizer(cfg.learning_rate)
    elif cfg.optimizer.lower() == 'rmsprop':
        assert cfg.learning_rate < 0.1
        self.opt = tf.train.RMSPropOptimizer(cfg.learning_rate)
    elif cfg.optimizer.lower() == 'test':
        self.opt = tf.train.RMSPropOptimizer(0.001, 0.75)
        # self.opt = tf.contrib.keras.optimizers.Nadam()
    else:
        raise AttributeError('no optimizer named as \'%s\'' % cfg.optimizer)

    trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)

    # count trainable parameters (embedding matrices excluded)
    all_params_num = 0
    for elem in trainable_vars:
        var_name = elem.name.split(':')[0]
        if var_name.endswith('emb_mat'):
            continue
        params_num = 1
        for l in elem.get_shape().as_list():
            params_num *= l
        all_params_num += params_num
    _logger.add('Trainable Parameters Number: %d' % all_params_num)

    self.train_op = self.opt.minimize(
        self.loss, self.global_step, var_list=trainable_vars)
def build_loss(self):
    _logger.add('regularization var num: %d' %
                len(set(tf.get_collection('reg_vars', self.scope))))
    _logger.add('trainable var num: %d' %
                len(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)))

    # weight decay: regularizers are routed to the supervised (sl) or
    # reinforcement (rl) loss depending on whether they belong to the hard network
    with tf.name_scope("weight_decay"):
        for var in set(tf.get_collection('reg_vars', self.scope)):
            tensor_name = var.op.name
            weight_decay = tf.multiply(
                tf.nn.l2_loss(var), cfg.wd,
                name="{}-wd".format('-'.join(str(var.op.name).split('/'))))
            if not tensor_name.startswith(self.scope + '/hard_network'):
                tf.add_to_collection('losses_sl', weight_decay)
            if tensor_name.startswith(self.scope + '/hard_network'):
                tf.add_to_collection('losses_rl', weight_decay)

    cost_batch = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=self.gold_label, logits=self.logits)  # [bs]

    # @ 1. supervised loss
    cost_sl = tf.reduce_mean(cost_batch, name='cost_sl')
    tf.add_to_collection('losses_sl', cost_sl)
    loss_sl = tf.add_n(tf.get_collection('losses_sl', self.scope), name='loss_sl')
    tf.summary.scalar(loss_sl.op.name, loss_sl)
    tf.add_to_collection('ema/scalar', loss_sl)

    # @ 2. reinforcement-learning loss
    self.choose_percentage = tf.reduce_mean(
        tf.stack([self.s1_percentage, self.s2_percentage]),
        name='choose_percentage')
    tf.summary.scalar(self.choose_percentage.op.name, self.choose_percentage)
    tf.add_to_collection('ema/scalar', self.choose_percentage)

    # reward: negative cross-entropy penalized by the sparsity-weighted selection percentage
    s1_rewards_raw = -(cost_batch + cfg.rl_sparsity * self.s1_percentage)
    s2_rewards_raw = -(cost_batch + cfg.rl_sparsity * self.s2_percentage)
    self.reward_mean = tf.reduce_mean(
        tf.stack([s1_rewards_raw, s2_rewards_raw]), name='reward_mean')
    tf.summary.scalar(self.reward_mean.op.name, self.reward_mean)
    tf.add_to_collection('ema/scalar', self.reward_mean)

    cost_rl = -tf.reduce_mean(
        s1_rewards_raw * tf.reduce_sum(self.s1_logpa, 1) +
        s2_rewards_raw * tf.reduce_sum(self.s2_logpa, 1),
        name='cost_rl')
    tf.add_to_collection('losses_rl', cost_rl)
    loss_rl = tf.add_n(tf.get_collection('losses_rl', self.scope), name='loss_rl')
    tf.summary.scalar(loss_rl.op.name, loss_rl)
    tf.add_to_collection('ema/scalar', loss_rl)

    return loss_sl, loss_rl
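# Illustrative note (not part of the original code): the RL term above follows a
# REINFORCE-style surrogate objective. With a per-sample reward
# r = -(cross_entropy + rl_sparsity * selection_percentage) for each sentence,
# the cost minimized is
#
#     cost_rl = - mean( r_s1 * sum_t log p(a_t | s1) + r_s2 * sum_t log p(a_t | s2) ),
#
# so gradient descent on cost_rl raises the log-probability of action sequences
# that obtained a higher reward. Reading s1_percentage / s2_percentage as the
# fraction of selected items is an assumption based on the variable names.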
def load_question_classification_data(self, data_file_path, data_type):
    _logger.add()
    _logger.add('load file for %s' % data_type)
    dataset = []
    with open(data_file_path, 'r', encoding='latin-1') as file:
        for line in file:
            line_split = line.strip().split(' ')
            # the first field is a label of the form 'coarse:fine';
            # keep the coarse class and the full sub-class separately
            cls = line_split[0].split(':')[0]
            sub_cls = line_split[0]
            token = line_split[1:]
            dataset.append({'token': token, 'cls': cls, 'sub_cls': sub_cls})
    _logger.done()
    return dataset
def output_model_params():
    _logger.add()
    _logger.add('==>model_title: ' + cfg.model_name[1:])
    _logger.add()
    for key, value in cfg.args.__dict__.items():
        if key not in ['test', 'shuffle']:
            _logger.add('%s: %s' % (key, value))
def clip_filter_data(self, data_list, data_clip_method, data_type):
    _logger.add()
    _logger.add('%s clipping data for %s...' % (data_clip_method, data_type))
    for sample in data_list:
        if data_clip_method == 'no_tree':
            sample.pop('sentence1_parse')
            sample.pop('sentence2_parse')
            # sample.pop('sentence1_parse_tree')
            # sample.pop('sentence2_parse_tree')
            # sample.pop('sentence1_parse_node_list')
            # sample.pop('sentence2_parse_node_list')
            sample.pop('sentence1_binary_parse')
            sample.pop('sentence2_binary_parse')
            sample.pop('sentence1_binary_parse_tree')
            sample.pop('sentence2_binary_parse_tree')
            sample.pop('sentence1_binary_parse_node_list')
            sample.pop('sentence2_binary_parse_node_list')
            sample.pop('s1_sr_info')
            sample.pop('s2_sr_info')
            # sample.pop('s1_tree_tag')
            # sample.pop('s2_tree_tag')
        elif data_clip_method == 'no_redundancy':
            sample.pop('sentence1_parse')
            sample.pop('sentence2_parse')
            # sample.pop('sentence1_parse_tree')
            # sample.pop('sentence2_parse_tree')
            # sample.pop('sentence1_parse_node_list')
            # sample.pop('sentence2_parse_node_list')
            sample.pop('sentence1_binary_parse')
            sample.pop('sentence2_binary_parse')
            sample.pop('sentence1_binary_parse_tree')
            sample.pop('sentence2_binary_parse_tree')
            # strip redundant node fields from both sentences
            for node in sample['sentence1_binary_parse_node_list']:
                node.children_nodes = None
                node.leaf_node_index_seq = None
            for node in sample['sentence2_binary_parse_node_list']:
                node.children_nodes = None
                node.leaf_node_index_seq = None
        else:
            raise AttributeError('no data clip method named as %s' % data_clip_method)
    _logger.done()
    return data_list
def restore(self, sess):
    _logger.add()
    if cfg.load_path is not None:
        _logger.add('trying to restore from ckpt file %s' % cfg.load_path)
        try:
            self.saver.restore(sess, cfg.load_path)
            _logger.add('success to restore')
        except tf.errors.NotFoundError:
            _logger.add('failure to restore')
            if cfg.mode != 'train':
                raise FileNotFoundError('cannot find model file')
    else:
        _logger.add('No check point file')
        if cfg.mode != 'train':
            raise FileNotFoundError('cannot find model file')
    _logger.done()