def digitize_dataset(self, data_list, dicts, data_type):
    token2index = dict([(token, idx) for idx, token in
                        enumerate(dicts['token'] + dicts['glove'])])
    char2index = dict([(token, idx) for idx, token in enumerate(dicts['char'])])

    def digitize_token(token):
        token = token if not cfg.lower_word else token.lower()
        try:
            return token2index[token]
        except KeyError:
            return 1  # index 1 is the '@@@unk' token added in count_data_and_build_dict

    def digitize_char(char):
        try:
            return char2index[char]
        except KeyError:
            return 1

    _logger.add()
    _logger.add('digitizing data: %s...' % data_type)
    for sample in data_list:
        for tree_node in sample:
            tree_node['token_seq_digital'] = [
                digitize_token(token) for token in tree_node['token_seq']
            ]
            tree_node['char_seq_digital'] = [
                [digitize_char(char) for char in char_seq]
                for char_seq in tree_node['char_seq']
            ]
    _logger.done()
    return data_list
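# Illustrative sketch (not part of the original code): how the token lookup
# above behaves. count_data_and_build_dict reserves index 0 for '@@@empty'
# (padding) and index 1 for '@@@unk', so the KeyError fallback maps any
# out-of-vocabulary token to index 1. The toy vocabulary below is made up;
# dict.get(token, 1) mirrors the try/except fallback.
toy_vocab = ['@@@empty', '@@@unk', 'the', 'movie', 'is', 'great']
toy_token2index = {token: idx for idx, token in enumerate(toy_vocab)}
tokens = ['the', 'movie', 'is', 'stupendous']
digitized = [toy_token2index.get(token, 1) for token in tokens]
print(digitized)  # [2, 3, 4, 1] -- 'stupendous' falls back to '@@@unk'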
def filter_data(self, only_sent=False, fine_grained=False):
    _logger.add()
    _logger.add('filtering data for %s, only sentence: %s' %
                (self.data_type, only_sent))
    if only_sent:
        counter = 0
        new_nn_data = []
        for trees in self.nn_data:
            new_trees = []
            for tree in trees:
                if tree['is_sent']:
                    new_trees.append(tree)
                    counter += 1
            new_nn_data.append(new_trees)
        self.nn_data = new_nn_data
        self.sample_num = counter

    if not fine_grained:
        # delete the neutral samples
        counter = 0
        new_nn_data = []
        for trees in self.nn_data:
            new_trees = []
            for tree in trees:
                sent_label = tree['root_node']['sentiment_label']
                if sent_label <= 0.4 or sent_label > 0.6:
                    counter += 1
                    new_trees.append(tree)
            new_nn_data.append(new_trees)
        self.nn_data = new_nn_data
        self.sample_num = counter
def gene_sub_trees_and_shift_reduce_info(self):
    _logger.add()
    _logger.add('generating sub-trees and shift reduce info: %s...' %
                self.data_type)
    counter = 0
    new_data_list = []
    for tree in self.digitized_data_list:
        sub_trees = []
        idx_to_node_dict = dict(
            (tree_node['node_index'], tree_node) for tree_node in tree)
        for tree_node in tree:
            # collect all nodes belonging to the sub-tree rooted at tree_node
            if tree_node['is_leaf']:
                new_sub_tree = [tree_node]
            else:
                new_sub_tree = []
                new_sub_tree_leaves = [
                    idx_to_node_dict[node_index]
                    for node_index in tree_node['leaf_node_index_seq']
                ]
                new_sub_tree += new_sub_tree_leaves
                for leaf_node in new_sub_tree_leaves:
                    pre_node = leaf_node
                    while pre_node['parent_index'] > 0 and pre_node != tree_node:  # fixme
                        cur_node = idx_to_node_dict[pre_node['parent_index']]
                        if cur_node not in new_sub_tree:
                            new_sub_tree.append(cur_node)
                        pre_node = cur_node
            # get shift-reduce info
            child_node_indices = [
                new_tree_node['node_index'] for new_tree_node in new_sub_tree
            ]
            parent_node_indices = [
                new_tree_node['parent_index']
                if new_tree_node['parent_index'] in child_node_indices else 0
                for new_tree_node in new_sub_tree
            ]
            sr_result = shift_reduce_constituency_forest(
                list(zip(child_node_indices, parent_node_indices)))
            operation_list, node_list_in_stack, reduce_mat = zip(*sr_result)
            shift_reduce_info = {
                'op_list': operation_list,
                'reduce_mat': reduce_mat,
                'node_list_in_stack': node_list_in_stack
            }
            sub_tree = {
                'tree_nodes': new_sub_tree,
                'shift_reduce_info': shift_reduce_info,
                'root_node': tree_node,
                'is_sent': tree_node['parent_index'] == 0
            }
            sub_trees.append(sub_tree)
            counter += 1
        new_data_list.append(sub_trees)
    return new_data_list, counter
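# Illustrative sketch (not from the original code): how parent indices are
# remapped before calling shift_reduce_constituency_forest. Only parents that
# are themselves inside the sub-tree keep their index; the parent of the
# sub-tree root (which lies outside the sub-tree) is replaced by 0, marking
# that node as the local root. The indices below are made up.
child_node_indices = [3, 4, 7]    # nodes belonging to one sub-tree
raw_parent_indices = [7, 7, 12]   # node 12 lies outside the sub-tree
parent_node_indices = [
    parent if parent in child_node_indices else 0
    for parent in raw_parent_indices
]
print(parent_node_indices)  # [7, 7, 0] -- node 7 becomes the sub-tree root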
def __init__(self, data_list, data_type, dicts=None):
    self.data_type = data_type
    _logger.add('building data set object for %s' % data_type)
    assert data_type in ['train', 'dev', 'test']
    # dev/test sets must reuse the dictionaries built from the training set
    if data_type in ['dev', 'test']:
        assert dicts is not None

    processed_data_list = self.process_raw_data(data_list, data_type)
    if data_type == 'train':
        self.dicts, self.max_lens = self.count_data_and_build_dict(
            processed_data_list)
    else:
        _, self.max_lens = self.count_data_and_build_dict(
            processed_data_list, False)
        self.dicts = dicts
    self.digitized_data_list = self.digitize_dataset(
        processed_data_list, self.dicts, data_type)
    self.nn_data, self.sample_num = self.gene_sub_trees_and_shift_reduce_info()

    self.emb_mat_token, self.emb_mat_glove = None, None
    if data_type == 'train':
        self.emb_mat_token, self.emb_mat_glove = self.generate_index2vec_matrix()
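# Illustrative usage sketch (the class name `Dataset` and the *_list variables
# are hypothetical): the train set builds the dictionaries and embedding
# matrices, so dev/test objects must be constructed afterwards and reuse
# train_data_obj.dicts, matching the assert above.
train_data_obj = Dataset(train_list, 'train')
dev_data_obj = Dataset(dev_list, 'dev', dicts=train_data_obj.dicts)
test_data_obj = Dataset(test_list, 'test', dicts=train_data_obj.dicts)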
def build_loss(self):
    """
    Build the loss function.
    :return: loss tensor
    """
    # weight decay
    with tf.name_scope("weight_decay"):
        for var in set(tf.get_collection('reg_vars', self.scope)):
            weight_decay = tf.multiply(
                tf.nn.l2_loss(var), cfg.wd,
                name="{}-wd".format('-'.join(str(var.op.name).split('/'))))
            tf.add_to_collection('losses', weight_decay)
    reg_vars = tf.get_collection('losses', self.scope)
    trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       self.scope)
    _logger.add('regularization var num: %d' % len(reg_vars))
    _logger.add('trainable var num: %d' % len(trainable_vars))

    losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=self.batch_output_labels, logits=self.logits)
    tf.add_to_collection('losses',
                         tf.reduce_mean(losses, name='xentropy_loss_mean'))
    loss = tf.add_n(tf.get_collection('losses', self.scope), name='loss')
    tf.summary.scalar(loss.op.name, loss)
    tf.add_to_collection('ema/scalar', loss)
    return loss
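# Illustrative sketch (assumption, not part of the original code): the loss
# assembled above is the mean softmax cross-entropy plus cfg.wd times
# tf.nn.l2_loss (i.e. sum(w**2) / 2) of every variable in the 'reg_vars'
# collection. A NumPy version of the same composition:
import numpy as np

def total_loss(xent_per_sample, reg_var_values, wd):
    """xent_per_sample: 1-D array; reg_var_values: list of weight arrays."""
    data_loss = np.mean(xent_per_sample)
    weight_decay = sum(wd * 0.5 * np.sum(w ** 2) for w in reg_var_values)
    return data_loss + weight_decay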
def build_network(self):
    _logger.add()
    _logger.add('building %s neural network structure...' % cfg.network_type)

    with tf.variable_scope('emb'):
        # get the embedding matrix used by tf.nn.embedding_lookup
        token_emb_mat = generate_embedding_mat(
            self.token_set_len,
            self.word_embedding_len,
            init_mat=self.token_emb_mat,
            extra_mat=self.glove_emb_mat,
            extra_trainable=self.finetune_emb,  # is False
            scope='gene_token_emb_mat')
        # look up token ids to embeddings
        emb = tf.nn.embedding_lookup(
            token_emb_mat,
            self.token_seq)  # batch_size, max_sequence_length, word_embedding_len
        # in theory, emb can be replaced with any other representation here
        self.tensor_dict['emb'] = emb

    rep = disan(emb,
                self.token_mask,
                'DiSAN',
                cfg.dropout,
                self.is_train,
                cfg.wd,
                'relu',
                tensor_dict=self.tensor_dict,
                name='')

    with tf.variable_scope('output'):
        pre_logits = tf.nn.relu(
            linear([rep],
                   self.hidden_units_no,
                   bias=True,
                   scope='pre_logits_linear',
                   wd=cfg.wd,
                   input_keep_prob=cfg.dropout,
                   is_train=self.is_train))  # batch_size, hidden_units
        logits = linear([pre_logits],
                        self.output_class_count,
                        bias=False,
                        scope='get_output',
                        wd=cfg.wd,
                        input_keep_prob=cfg.dropout,
                        is_train=self.is_train)  # batch_size, 5 (output_classes)
    _logger.done()
    return logits
def output_model_params():
    _logger.add()
    _logger.add('==>model_title: ' + cfg.model_name[1:])
    _logger.add()
    for key, value in cfg.args.__dict__.items():
        if key not in ['test', 'shuffle']:
            _logger.add('%s: %s' % (key, value))
def build_network(self):
    """
    Build the ADiSAN + fully-connected NN architecture.
    :return: reference to the FCNN output (logits).
    """
    _logger.add()
    _logger.add('building %s neural network structure...' % cfg.network_type)

    with tf.variable_scope('emb'):
        # the embedding sequence is fed in directly; in theory, emb can be
        # replaced with any other representation here
        emb = self.batch_embedding_sequence
        self.tensor_dict['emb'] = emb

    rep = disan(emb,
                self.batch_access_mask,
                'DiSAN',
                cfg.dropout,
                self.is_train,
                cfg.wd,
                'relu',
                tensor_dict=self.tensor_dict,
                name='')

    with tf.variable_scope('output'):
        pre_logits = tf.nn.relu(
            linear([rep],
                   self.hidden_units_no,
                   bias=True,
                   scope='pre_logits_linear',
                   wd=cfg.wd,
                   input_keep_prob=cfg.dropout,
                   is_train=self.is_train))  # batch_size, hidden_units
        logits = linear([pre_logits],
                        self.output_class_count,
                        bias=False,
                        scope='get_output',
                        wd=cfg.wd,
                        input_keep_prob=cfg.dropout,
                        is_train=self.is_train)  # batch_size, output_class_count
    _logger.done()
    return logits
def load_nn_model(modelFilePath):
    _logger.add()
    _logger.add('try to load model file %s' % modelFilePath)
    allParamValues = None
    epoch = 1
    isLoaded = False
    if os.path.isfile(modelFilePath):
        _logger.add('Have found model file, loading...')
        with open(modelFilePath, 'rb') as f:
            data = pickle.load(f)
            allParamValues = data[0]
            epoch = data[1]
            isLoaded = True
    else:
        _logger.add('Have not found model file')
    _logger.add('Done')
    return isLoaded, allParamValues, epoch
def process_raw_data(self, data_list, data_type):
    """
    Internal use.
    :param data_list:
    :param data_type:
    :return:
    """
    _logger.add()
    _logger.add('processing raw data: %s...' % data_type)
    for sample in data_list:
        for tree_node in sample:
            # each tree_node carries: node_index, parent_index, token_seq,
            # leaf_node_index_seq, is_leaf, token, sentiment_label
            # add char_seq: the character list of every token
            tree_node['char_seq'] = [
                list(token) for token in tree_node['token_seq']
            ]
    _logger.done()
    return data_list
def load_glove(dim):
    _logger.add()
    _logger.add('loading glove from pre-trained file...')
    # if dim not in [50, 100, 200, 300]:
    #     raise ValueError('glove dim must be in [50, 100, 200, 300]')
    word2vec = {}
    with open(os.path.join(cfg.glove_dir,
                           "glove.%s.%sd.txt" % (cfg.glove_corpus, str(dim))),
              encoding='utf-8') as f:
        for line in f:
            l = None
            try:
                l = line.strip(os.linesep).split(' ')
                vector = np.array(list(map(float, l[1:])), dtype=cfg.floatX)
                word2vec[l[0]] = vector
                assert vector.shape[0] == dim
                # print('right:', l)
            except ValueError:
                print('1.token_error-line:', line[:-1])
                print('2.token_error-split:', l)
                anchor = 0
            except AssertionError:
                print('1.vec_error-line:', line[:-1])
                print('2.vec_error-split:', l)
    _logger.add('Done')
    return word2vec
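# Illustrative sketch (assumption about the file layout): each line of a GloVe
# text file is a token followed by `dim` space-separated floats, which is what
# the parser above relies on. A minimal stand-alone version of the same parse,
# using a toy 3-dimensional entry:
import numpy as np

line = 'the 0.418 0.24968 -0.41242'
parts = line.rstrip('\n').split(' ')
token = parts[0]
vector = np.array(list(map(float, parts[1:])), dtype='float32')
assert vector.shape[0] == 3
print(token, vector)  # the [ 0.418    0.24968 -0.41242]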
def save_nn_model(modelFilePath, allParams, epoch):
    _logger.add()
    _logger.add('saving model file to %s' % modelFilePath)
    with open(modelFilePath, 'wb') as f:
        pickle.dump(obj=[[param.get_value() for param in allParams], epoch],
                    file=f)
    _logger.add('Done')
def generate_index2vec_matrix(self):
    _logger.add()
    _logger.add('generate index to vector numpy matrix')
    token2vec = load_glove(cfg.word_embedding_length)
    if cfg.lower_word:
        newToken2vec = {}
        for token, vec in token2vec.items():
            newToken2vec[token.lower()] = vec
        token2vec = newToken2vec

    # prepare data from trainDataset and devDataset
    mat_token = np.random.normal(
        0, 1,
        size=(len(self.dicts['token']),
              cfg.word_embedding_length)).astype(cfg.floatX)
    mat_glove = np.zeros((len(self.dicts['glove']), cfg.word_embedding_length),
                         dtype=cfg.floatX)
    for idx, token in enumerate(self.dicts['token']):
        try:
            mat_token[idx] = token2vec[token]
        except KeyError:
            pass
    # row 0 is the '@@@empty' padding token: force it to a zero vector
    mat_token[0] = np.zeros(shape=(cfg.word_embedding_length, ),
                            dtype=cfg.floatX)

    for idx, token in enumerate(self.dicts['glove']):
        mat_glove[idx] = token2vec[token]
    _logger.add('Done')
    return mat_token, mat_glove
def load_file(filePath, dataName='data', mode='pickle'):
    _logger.add()
    _logger.add('Trying to load %s from %s' % (dataName, filePath))
    data = None
    ifLoad = False
    if os.path.isfile(filePath):
        _logger.add('Have found the file, loading...')
        if mode == 'pickle':
            with open(filePath, 'rb') as f:
                data = pickle.load(f)
                ifLoad = True
        elif mode == 'json':
            with open(filePath, 'r', encoding='utf-8') as f:
                data = json.load(f)
                ifLoad = True
        else:
            raise ValueError('Function load_file does not have mode %s' % mode)
    else:
        _logger.add('Have not found the file')
    _logger.add('Done')
    return ifLoad, data
def save_file(data, filePath, dataName='data', mode='pickle'):
    _logger.add()
    _logger.add('Saving %s to %s' % (dataName, filePath))
    if mode == 'pickle':
        with open(filePath, 'wb') as f:
            pickle.dump(obj=data, file=f)
    elif mode == 'json':
        with open(filePath, 'w', encoding='utf-8') as f:
            json.dump(obj=data, fp=f)
    else:
        raise ValueError('Function save_file does not have mode %s' % mode)
    _logger.add('Done')
def get_evaluation_file_output(self, sess, dataset_obj, global_step,
                               deleted_step):
    _logger.add()
    _logger.add('get evaluation file output for %s' % dataset_obj.data_type)
    # delete the output belonging to the removed checkpoint
    if deleted_step is not None:
        delete_name = 'gs_%d' % deleted_step
        delete_path = os.path.join(self.answer_dir, delete_name)
        if os.path.exists(delete_path):
            shutil.rmtree(delete_path)

    _logger.add()
    _logger.add('getting evaluation result for %s' % dataset_obj.data_type)
    logits_list, loss_list, accu_list = [], [], []
    is_sent_list = []
    for sample_batch, _, _, _ in dataset_obj.generate_batch_sample_iter():
        feed_dict = self.model.get_feed_dict(sample_batch, 'dev')
        logits, loss, accu = sess.run(
            [self.model.logits, self.model.loss, self.model.accuracy],
            feed_dict)
        logits_list.append(np.argmax(logits, -1))
        loss_list.append(loss)
        accu_list.append(accu)
        is_sent_list += [sample['is_sent'] for sample in sample_batch]
    logits_array = np.concatenate(logits_list, 0)
    loss_value = np.mean(loss_list)
    accu_array = np.concatenate(accu_list, 0)
    accu_value = np.mean(accu_array)
    sent_accu_list = []
    for idx, is_sent in enumerate(is_sent_list):
        if is_sent:
            sent_accu_list.append(accu_array[idx])
    sent_accu_value = np.mean(sent_accu_list)

    # analysis
    analysis_save_dir = self.mkdir(self.answer_dir,
                                   'gs_%s' % (global_step or 'test'))
    OutputAnalysis.do_analysis(dataset_obj, logits_array, accu_array,
                               analysis_save_dir, self.fine_grained)
def trigger_evaluation(model, train_data_obj, test_data_obj, dev_data_obj):
    """
    Legacy code to trigger evaluation of the given model.
    :param model: ModelDiSAN instance
    :param dev_data_obj: object with data_type attr and generate_batch_sample_iter() method
    :param test_data_obj: object with data_type attr and generate_batch_sample_iter() method
    :param train_data_obj: object with data_type attr and generate_batch_sample_iter() method
    :return:
    """
    graphHandler = GraphHandler(model)
    evaluator = Evaluator(model)

    if cfg.gpu_mem < 1.:
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=cfg.gpu_mem, allow_growth=True)
    else:
        gpu_options = tf.GPUOptions()
    graph_config = tf.ConfigProto(gpu_options=gpu_options,
                                  allow_soft_placement=True)
    # graph_config.gpu_options.allow_growth = True
    sess = tf.Session(config=graph_config)
    graphHandler.initialize(sess)

    # ---- dev ----
    dev_loss, dev_accu, dev_sent_accu = evaluator.get_evaluation(
        sess, dev_data_obj, 1)
    _logger.add(
        '==> for dev, loss: %.4f, accuracy: %.4f, sentence accuracy: %.4f' %
        (dev_loss, dev_accu, dev_sent_accu))
    # ---- test ----
    test_loss, test_accu, test_sent_accu = evaluator.get_evaluation(
        sess, test_data_obj, 1)
    _logger.add(
        '~~> for test, loss: %.4f, accuracy: %.4f, sentence accuracy: %.4f' %
        (test_loss, test_accu, test_sent_accu))
    # ---- train ----
    train_loss, train_accu, train_sent_accu = evaluator.get_evaluation(
        sess, train_data_obj, 1)
    _logger.add(
        '--> for train, loss: %.4f, accuracy: %.4f, sentence accuracy: %.4f' %
        (train_loss, train_accu, train_sent_accu))
def trigger_training(model, train_data_obj, test_data_obj, dev_data_obj):
    """
    Legacy code to trigger the training on the given model.
    :param model: ModelDiSAN instance
    :param dev_data_obj: object with data_type attr and generate_batch_sample_iter() method
    :param test_data_obj: object with data_type attr and generate_batch_sample_iter() method
    :param train_data_obj: object with sample_num attr and generate_batch_sample_iter(num_steps) method
    :return:
    """
    graphHandler = GraphHandler(model)
    evaluator = Evaluator(model)
    performRecoder = PerformRecoder(3)

    if cfg.gpu_mem < 1.:
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=cfg.gpu_mem, allow_growth=True)
    else:
        gpu_options = tf.GPUOptions()
    graph_config = tf.ConfigProto(gpu_options=gpu_options,
                                  allow_soft_placement=True)
    sess = tf.Session(config=graph_config)
    graphHandler.initialize(sess)

    # begin training
    steps_per_epoch = int(
        math.ceil(1.0 * train_data_obj.sample_num / cfg.train_batch_size))
    num_steps = cfg.num_steps or steps_per_epoch * cfg.max_epoch
    global_step = 0

    # todo: replace following method call
    for sample_batch, batch_num, data_round, idx_b in \
            train_data_obj.generate_batch_sample_iter(num_steps):
        global_step = sess.run(model.global_step) + 1
        if_get_summary = global_step % (cfg.log_period or steps_per_epoch) == 0
        loss, summary, train_op = model.step(sess,
                                             sample_batch,
                                             get_summary=if_get_summary)
        if global_step % 100 == 0:
            _logger.add('data round: %d: %d/%d, global step:%d -- loss: %.4f' %
                        (data_round, idx_b, batch_num, global_step, loss))

        if if_get_summary:
            graphHandler.add_summary(summary, global_step)

        # occasional evaluation
        if global_step % (cfg.eval_period or steps_per_epoch) == 0:
            # ---- dev ----
            dev_loss, dev_accu, dev_sent_accu = evaluator.get_evaluation(
                sess, dev_data_obj, global_step)
            _logger.add(
                '==> for dev, loss: %.4f, accuracy: %.4f, sentence accuracy: %.4f' %
                (dev_loss, dev_accu, dev_sent_accu))
            # ---- test ----
            test_loss, test_accu, test_sent_accu = evaluator.get_evaluation(
                sess, test_data_obj, global_step)
            _logger.add(
                '~~> for test, loss: %.4f, accuracy: %.4f, sentence accuracy: %.4f' %
                (test_loss, test_accu, test_sent_accu))

            is_in_top, deleted_step = performRecoder.update_top_list(
                global_step, dev_accu, sess)
            if is_in_top and global_step > 30000:  # todo-ed: delete me to run normally
                # evaluator.get_evaluation_file_output(sess, dev_data_obj, global_step, deleted_step)
                evaluator.get_evaluation_file_output(sess, test_data_obj,
                                                     global_step, deleted_step)

        this_epoch_time, mean_epoch_time = cfg.time_counter.update_data_round(
            data_round)
        if this_epoch_time is not None and mean_epoch_time is not None:
            _logger.add('##> this epoch time: %f, mean epoch time: %f' %
                        (this_epoch_time, mean_epoch_time))
def restore(self, sess):
    _logger.add()
    # print(self.ckpt_dir)
    if self.load_step is None:
        if self.load_path is None:
            _logger.add('trying to restore from dir %s' % self.ckpt_dir)
            latest_checkpoint_path = tf.train.latest_checkpoint(self.ckpt_dir)
        else:
            latest_checkpoint_path = self.load_path
    else:
        latest_checkpoint_path = self.ckpt_path + '-' + str(self.load_step)

    if latest_checkpoint_path is not None:
        _logger.add('trying to restore from ckpt file %s' %
                    latest_checkpoint_path)
        try:
            self.saver.restore(sess, latest_checkpoint_path)
            _logger.add('success to restore')
        except tf.errors.NotFoundError:
            _logger.add('failure to restore')
            if self.mode != 'train':
                raise FileNotFoundError('cannot find model file')
    else:
        _logger.add('No check point file in dir %s ' % self.ckpt_dir)
        if self.mode != 'train':
            raise FileNotFoundError('cannot find model file')
    _logger.done()
def save(self, sess, global_step=None):
    _logger.add()
    _logger.add('saving model to %s' % self.ckpt_path)
    self.saver.save(sess, self.ckpt_path, global_step)
    _logger.done()
def add_summary(self, summary, global_step):
    _logger.add()
    _logger.add('saving summary...')
    self.writer.add_summary(summary, global_step)
    _logger.done()
def count_data_and_build_dict(self, data_list, gene_dicts=True):
    def add_ept_and_unk(a_list):
        # index 0: '@@@empty' (padding), index 1: '@@@unk' (out-of-vocabulary)
        a_list.insert(0, '@@@empty')
        a_list.insert(1, '@@@unk')
        return a_list

    _logger.add()
    _logger.add('counting data and building dictionaries')

    token_collection = []
    char_collection = []
    sent_len_collection = []
    token_len_collection = []
    for sample in data_list:
        for tree_node in sample:
            token_collection += tree_node['token_seq']
            sent_len_collection.append(len(tree_node['token_seq']))
            for char_seq in tree_node['char_seq']:
                char_collection += char_seq
                token_len_collection.append(len(char_seq))

    max_sent_len = dynamic_length(sent_len_collection, 1, security=False)[0]
    max_token_len = dynamic_length(token_len_collection, 0.99,
                                   security=False)[0]

    if gene_dicts:
        tokenSet = dynamic_keep(token_collection, 1)
        charSet = dynamic_keep(char_collection, 1)
        if cfg.use_glove_unk_token:
            gloveData = load_glove(cfg.word_embedding_length)
            gloveTokenSet = list(gloveData.keys())
            if cfg.lower_word:
                tokenSet = list(set([token.lower() for token in tokenSet]))  ##!!!
                gloveTokenSet = list(
                    set([token.lower() for token in gloveTokenSet]))  ##!!!
            # delete tokens from gloveTokenSet which appear in tokenSet
            for token in tokenSet:
                try:
                    gloveTokenSet.remove(token)
                except ValueError:
                    pass
        else:
            if cfg.lower_word:
                tokenSet = list(set([token.lower() for token in tokenSet]))
            gloveTokenSet = []
        tokenSet = add_ept_and_unk(tokenSet)
        charSet = add_ept_and_unk(charSet)
        dicts = {'token': tokenSet, 'char': charSet, 'glove': gloveTokenSet}
    else:
        dicts = {}

    _logger.done()
    return dicts, {'sent': max_sent_len, 'token': max_token_len}
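# Illustrative sketch (hypothetical values): the shape of the objects returned
# by count_data_and_build_dict. 'token' and 'char' start with the special
# '@@@empty' (padding) and '@@@unk' (OOV) entries; 'glove' keeps only the
# GloVe tokens not already present in the corpus vocabulary, and is appended
# after 'token' when token2index is built in digitize_dataset.
example_dicts = {
    'token': ['@@@empty', '@@@unk', 'the', 'movie', 'is', 'great'],
    'char': ['@@@empty', '@@@unk', 't', 'h', 'e', 'm', 'o', 'v', 'i'],
    'glove': ['a', 'an', 'of'],
}
example_max_lens = {'sent': 56, 'token': 14}  # made-up maxima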
def get_evaluation(self, sess, dataset_obj: DataManager, global_step=None):
    """
    todo: make dataset_obj a DataManager
    :param sess:
    :param dataset_obj:
    :param global_step:
    :return:
    """
    _logger.add()
    _logger.add('getting evaluation result for %s' % dataset_obj.data_type)
    logits_list, loss_list, accu_list = [], [], []
    is_sent_list = []
    for sample_batch, _, _, _ in dataset_obj.generate_batch_sample_iter():
        feed_dict = self.model.get_feed_dict(sample_batch, 'dev')
        logits, loss, accu = sess.run(
            [self.model.logits, self.model.loss, self.model.accuracy],
            feed_dict)
        logits_list.append(np.argmax(logits, -1))
        loss_list.append(loss)
        accu_list.append(accu)
        is_sent_list += [sample['is_sent'] for sample in sample_batch]
    logits_array = np.concatenate(logits_list, 0)
    loss_value = np.mean(loss_list)
    accu_array = np.concatenate(accu_list, 0)
    accu_value = np.mean(accu_array)
    sent_accu_list = []
    for idx, is_sent in enumerate(is_sent_list):
        if is_sent:
            sent_accu_list.append(accu_array[idx])
    sent_accu_value = np.mean(sent_accu_list)

    # analysis
    # analysis_save_dir = self.mkdir(self.answer_dir, 'gs_%s' % (global_step or 'test'))
    # OutputAnalysis.do_analysis(dataset_obj, logits_array, accu_array,
    #                            analysis_save_dir, self.fine_grained)

    # add summary
    if global_step is not None:
        if dataset_obj.data_type == 'train':
            summary_feed_dict = {
                self.train_loss: loss_value,
                self.train_accuracy: accu_value,
                self.train_sent_accuracy: sent_accu_value,
            }
            summary = sess.run(self.train_summaries, summary_feed_dict)
            self.writer.add_summary(summary, global_step)
        elif dataset_obj.data_type == 'dev':
            summary_feed_dict = {
                self.dev_loss: loss_value,
                self.dev_accuracy: accu_value,
                self.dev_sent_accuracy: sent_accu_value,
            }
            summary = sess.run(self.dev_summaries, summary_feed_dict)
            self.writer.add_summary(summary, global_step)
        else:
            summary_feed_dict = {
                self.test_loss: loss_value,
                self.test_accuracy: accu_value,
                self.test_sent_accuracy: sent_accu_value,
            }
            summary = sess.run(self.test_summaries, summary_feed_dict)
            self.writer.add_summary(summary, global_step)

    return loss_value, accu_value, sent_accu_value