def _build_data_pairs(fname, with_label=False):
    data_pairs = []
    vp = preprocessing.VocabularyProcessor.restore('../data4/vocab_processor.bin')
    with codecs.open(fname, 'r', 'utf-8') as rfd:
        for line in rfd:
            # expect lines of the form: data_id \t token1|token2|... \t domain
            if len(line.strip('\n').split('\t')) != 3:
                continue
            data_id, tokens, domain = line.strip('\n').split('\t')
            token_list = tokens.split('|')
            data = [x.lower() for x in token_list if len(x.strip())]
            if len(data) == 0:
                continue
            sequence = np.array(list(vp.transform(['|'.join(data)]))[0])
            if with_label:
                label = int(str(domain))
                #label = dic[str(domain)]
            else:
                label = 100
            pair = [data_id, label] + list(sequence)  # [id, label, sequence]
            data_pairs.append(pair)
    g_log_inst.get().info(
        '_build_data_pairs() success, fname=%s, len(data_pairs)=%s, with_label=%s'
        % (fname, len(data_pairs), with_label))
    return data_pairs
def encode(self, seq_list):
    try:
        _s_embedded, _s_lengths = Helper.get_batch(seq_list)
        feed_dict = {
            self.s_embedded: _s_embedded,
            self.s_lengths: _s_lengths,
        }
        s_embeddings = self.sess.run(self.s_embeddings, feed_dict=feed_dict)
        return s_embeddings
    except Exception as e:
        logger.get().debug('seq_length=%s, errmsg=%s', len(seq_list), e)
def conv_fenc_u8_to_gbk(cls, in_fpath, out_fpath):
    try:
        with codecs.open(in_fpath, 'r', 'utf-8') as rfd, \
                codecs.open(out_fpath, 'w', 'gbk') as wfd:
            # read utf-8, write gbk
            for line in rfd:
                line = cls.remove_illegal_gbk_char(line)
                wfd.write(line)
    except Exception as e:
        logger.get().warn('errmsg=%s' % (e))
def init(cls):
    """Prepare required data"""
    myself = sys._getframe().f_code.co_name
    try:
        # load word2vec
        w2v_path = config.word2vec_path
        cls._word2vec = cls.get_word2vec(w2v_path)
        return True
    except Exception as e:
        logger.get().warn('%s failed, errmsg=%s', myself, e)
        return False
def __init__(self):
    try:
        self.sess = tf.Session()
        # load the SavedModel exported under the 'infersent_model' tag
        meta_graph_def = tf.saved_model.loader.load(
            self.sess, ['infersent_model'], config.saved_model_path)
        signature = meta_graph_def.signature_def
        signature_def_key = 'encoder'
        s_embedded_name = signature[signature_def_key].inputs['s_embedded'].name
        s_lengths_name = signature[signature_def_key].inputs['s_lengths'].name
        s_embeddings_name = signature[signature_def_key].outputs['s_embeddings'].name
        # resolve the input/output tensors of the 'encoder' signature
        self.s_embedded = self.sess.graph.get_tensor_by_name(s_embedded_name)
        self.s_lengths = self.sess.graph.get_tensor_by_name(s_lengths_name)
        self.s_embeddings = self.sess.graph.get_tensor_by_name(s_embeddings_name)
        logger.get().info('init sentence encoder success')
    except Exception as e:
        logger.get().warn('init sentence encoder failed, errmsg=%s', e)
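# A minimal usage sketch (added for illustration, not part of the original
# source). It assumes the __init__ and encode methods above belong to a single
# wrapper class, hypothetically named SentenceEncoder, and that
# config.saved_model_path points at an exported SavedModel.
if __name__ == '__main__':
    encoder = SentenceEncoder()  # hypothetical class name
    seq_list = [['play', 'some', 'music'], ['what', 'is', 'the', 'weather']]
    embeddings = encoder.encode(seq_list)
    print(embeddings.shape)  # expected: (len(seq_list), embedding_dim)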
def _normalize_token(cls, token):
    token = token.lower()
    try:
        # a length of 11 usually means a phone number, keep those as-is;
        # other pure-digit tokens are mapped to the placeholder 'int_t'
        if len(token) != 11 and token.isdigit():
            token = 'int_t'
        # map tokens matching a configured pattern to the pattern's key
        for k, v in cls._replace_pattern_cfg.items():
            if v.match(token):
                token = k
                break
        if '{[' not in token:
            return token
        # strip word-segmentation markup such as '{[...]}'
        for item in cls._wordseg_pattern_cfg:
            token = item.sub('', token)
        return token
    except Exception as e:
        logger.get().warn('token=%s, errmsg=%s' % (token, e))
        return token
def stat_token_freq(cls, in_fpath, out_fpath):
    stop_words = conf.g_stop_words_cfg
    try:
        word_counter = Counter()
        with codecs.open(in_fpath, 'r', 'utf-8') as rfd:
            for line in rfd:
                raw_str, word_seg = line.strip('\n').split('\t')
                tokens = word_seg.split()
                tokens = filter(lambda x: x not in stop_words, tokens)
                tokens = map(cls._normalize_token, tokens)
                for t in tokens:
                    if ('{[' not in t) and len(t) <= cls._valid_token_len:
                        word_counter[t] += 1
                    else:
                        logger.get().warn('invalid token, token=%s' % (t))
                        # tokenize via jieba
                        for n_t in jieba.cut(t):
                            word_counter[n_t] += 1
                            logger.get().debug('jieba cut, token=%s' % (n_t))
        # dump word_counter, most frequent tokens first
        sorted_words = sorted(word_counter.keys(),
                              key=lambda k: word_counter[k], reverse=True)
        with codecs.open(out_fpath, 'w', 'utf-8') as wfd:
            for word in sorted_words:
                tmp = '%s\t%s\n' % (word, word_counter[word])
                wfd.write(tmp)
    except Exception as e:
        logger.get().warn('errmsg=%s' % (e))
def save_chat(who_send, msg):
    g_log_inst.get().debug("{}:{}".format(who_send, msg.encode("utf-8")))
def get_word2vec(cls, w2v_fpath):
    wv = models.KeyedVectors.load_word2vec_format(w2v_fpath, binary=False)
    logger.get().info('load word2vec success, vector length: %s',
                      (wv['<s>'].shape[0]))
    return wv
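# A short usage sketch (added for illustration, not part of the original source).
# It assumes `models` is gensim.models, that 'word2vec.txt' is a hypothetical
# text-format vector file, and that the '<s>' token used above exists in it.
from gensim import models

wv = models.KeyedVectors.load_word2vec_format('word2vec.txt', binary=False)
print(wv['<s>'].shape[0])                # embedding dimension
print(wv.most_similar('music', topn=3))  # nearest neighbours, if 'music' is in the vocabulary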
def main(_):
    if not FLAGS.data_path:
        g_log_inst.get().error('Must set --data_path to training data directory')
        return
    ckpt_dir = '../model/training-ckpt/' + FLAGS.model
    if not os.path.exists(ckpt_dir):
        os.makedirs(ckpt_dir)
    config = get_config()
    eval_config = get_config()
    eval_config.batch_size = 1
    [train_data, valid_data, test_data], id2word_dict = reader.load_train_data(FLAGS.data_path)

    g_log_inst.get().info('bilstm-attention training begin')
    with tf.Graph().as_default(), tf.Session() as session:
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        with tf.variable_scope(FLAGS.model, reuse=None, initializer=initializer):
            m = BiLSTM_Attention_Model(is_training=True, config=config)
        with tf.variable_scope(FLAGS.model, reuse=True, initializer=initializer):
            mtest = BiLSTM_Attention_Model(is_training=False, config=eval_config)
        tf.initialize_all_variables().run()

        # add ops to save and restore all the variables
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(ckpt_dir)
        if ckpt and ckpt.model_checkpoint_path:
            # a checkpoint already exists: restore it, re-save, and skip training
            saver.restore(session, ckpt.model_checkpoint_path)
            g_log_inst.get().info('[model] restore success, ckpt_path=%s'
                                  % (ckpt.model_checkpoint_path))
            save_path = saver.save(session, '%s/model.ckpt' % (ckpt_dir))
        else:
            pre_valid_perplexity = float('inf')
            learning_rate = config.learning_rate
            start_decay = False
            for i in range(config.max_epoch):
                if start_decay:
                    learning_rate *= config.lr_decay
                m.assign_lr(session, learning_rate)
                g_log_inst.get().info('Epoch: %d Learning rate: %.3f'
                                      % (i + 1, session.run(m.lr)))
                # shuffle the data before mini-batch training
                random.shuffle(train_data)
                # train
                train_perplexity, accuracy, _ = run_epoch(
                    session, m, train_data, m.train_op, verbose=True)
                g_log_inst.get().info('Epoch: %d Train Perplexity: %.3f accuracy: %s'
                                      % (i + 1, train_perplexity, accuracy))
                # valid
                valid_perplexity, valid_accuracy, _ = run_epoch(
                    session, mtest, valid_data, tf.no_op())
                g_log_inst.get().info('Epoch: %d Valid Perplexity: %.3f accuracy: %s'
                                      % (i + 1, valid_perplexity, valid_accuracy))
                # if the valid perplexity improves too little, start lr decay
                if (pre_valid_perplexity - valid_perplexity < config.perplexity_thres
                        and not start_decay):
                    start_decay = True
                    g_log_inst.get().info('Valid Perplexity improves too little, start lr decay')
                if pre_valid_perplexity < valid_perplexity:
                    # current epoch is rejected: roll back to the last saved checkpoint
                    ckpt = tf.train.get_checkpoint_state(ckpt_dir)
                    if ckpt and ckpt.model_checkpoint_path:
                        saver.restore(session, ckpt.model_checkpoint_path)
                        g_log_inst.get().info('[model] restore success, ckpt_path=%s'
                                              % (ckpt.model_checkpoint_path))
                    if learning_rate == config.learning_rate:
                        # lr has not decayed yet, give a second chance
                        continue
                    else:
                        # stop training
                        g_log_inst.get().info('Valid Perplexity does not improve, stop training')
                        break
                pre_valid_perplexity = valid_perplexity
                # save the variables to disk
                save_path = saver.save(session, '%s/model.ckpt' % (ckpt_dir))
                g_log_inst.get().info('[model] save success, ckpt_path=%s' % (save_path))

        # test the accuracy
        test_perplexity, accuracy, domain_accuracy = run_epoch(
            session, mtest, test_data, tf.no_op(), debug=True, verbose=True,
            id2word_dict=id2word_dict, dsl_converter=config.converter)
        g_log_inst.get().info('Test: perplexity=%.3f, accuracy=%s'
                              % (test_perplexity, accuracy))
        # per-domain precision/recall report
        '''
        for idx, domain_accu in enumerate(domain_accuracy):
            g_log_inst.get().info('Domain: %s, precision: %.3f, recall: %.3f' % (
                config.converter.label2domain[idx],
                domain_accuracy[idx][0] / float(domain_accuracy[idx][1]),
                domain_accuracy[idx][2] / float(domain_accuracy[idx][3])))
        '''
        g_log_inst.get().info('bilstm_attention training finished')
def run_epoch(session, model, data, eval_op, debug=False, verbose=False,
              id2word_dict=None, dsl_converter=None):
    '''Runs the model on the given data.'''
    epoch_size = len(data) // model.batch_size
    # statistics
    start_time = time.time()
    costs = 0.0
    iters = 0
    accuracy_sum = 0.0
    # per-domain counters: [precision numerator, precision denominator,
    # recall numerator, recall denominator]
    domain_accuracy = [[0, 0, 0, 0] for i in range(model.num_labels)]
    state = session.run(model.initial_state)
    if debug:
        wrong_predict_out = codecs.open('../log/wrong_pred.txt', 'w', 'utf-8')
        y_label = []
        y_pred = []
    for step, (data_ids, sequences, labels, seq_lens) in enumerate(
            reader.pairs_iterator(data, model.batch_size, model.num_steps)):
        feed_dict = {}
        feed_dict[model.input_data] = sequences
        feed_dict[model.targets] = labels
        feed_dict[model.sequence_length] = seq_lens
        if debug:
            fetches = [model.cost, model.predicts, eval_op]
            cost, predicts, _ = session.run(fetches, feed_dict)
            x_str = '_'.join(map(lambda x: str(x), sequences[0]))
            domain_predict = predicts[-1]
            domain_label = labels[0][0]
            y_label.append(dsl_converter.label2domain[int(domain_label)])
            y_pred.append(dsl_converter.label2domain[int(domain_predict)])
            domain_accuracy[domain_predict][1] += 1  # denominator of precision
            domain_accuracy[int(domain_label)][3] += 1  # denominator of recall
            accuracy = 1 if str(domain_predict) == str(domain_label) else 0
            if accuracy == 1:
                # correct prediction: update precision and recall numerators
                domain_accuracy[int(domain_label)][0] += 1
                domain_accuracy[domain_predict][2] += 1
            else:
                raw_str = '|'.join([str(i) for i in sequences[0][0:seq_lens[0]]])
                #wrong_predict_out.write('data_id=%s, label=%s, predict=%s, raw_str=%s\n' % (
                #    data_ids, dsl_converter.label2domain[int(domain_label)],
                #    dsl_converter.label2domain[domain_predict], raw_str))
                wrong_predict_out.write('%s\t%s\t%s\n' % (
                    str(data_ids).replace("'", '').replace('[', '').replace(']', ''),
                    str(domain_label), str(domain_predict)))
            g_log_inst.get().debug('step=%s, cost=%s, x_str=%s, predict=%s, label_idx=%s'
                                   % (step, cost, x_str, domain_predict, domain_label))
            g_log_inst.get().debug('predicts=%s' % (predicts))
        else:
            fetches = [model.cost, model.accuracy, eval_op]
            cost, accuracy, _ = session.run(fetches, feed_dict)
        costs += cost
        iters += model.num_steps
        accuracy_sum += accuracy
        avg_accuracy = accuracy_sum / (step + 1)
        if verbose and step % (epoch_size // 10) == 10:
            g_log_inst.get().info('%.3f perplexity: %.3f speed: %.0f wps, accuracy=%.03f'
                                  % (step * 1.0 / epoch_size, np.exp(costs / iters),
                                     iters * model.batch_size / (time.time() - start_time),
                                     avg_accuracy))
    if debug:
        y_true = y_label
        classify_report = metrics.classification_report(y_true, y_pred)
        confusion_matrix = metrics.confusion_matrix(y_true, y_pred)
        overall_accuracy = metrics.accuracy_score(y_true, y_pred)
        acc_for_each_class = metrics.precision_score(y_true, y_pred, average=None)
        average_accuracy = np.mean(acc_for_each_class)
        score = metrics.accuracy_score(y_true, y_pred)
        print('classify_report : \n', classify_report)
        print('average_accuracy: {0:f}'.format(average_accuracy))
        print('overall_accuracy: {0:f}'.format(overall_accuracy))
        print('score: {0:f}'.format(score))
        wrong_predict_out.close()
    perplexity = np.exp(costs / iters)
    return (perplexity, avg_accuracy, domain_accuracy)
def build_tree(ruleset, ruleset_text):
    # basic metrics
    max_depth = 0
    max_leaf_depth = 0
    total_leaf_number = 0
    total_leaf_depth = 0
    total_mem_size = 0
    # node format: [tree-depth, parent-bit-array, ruleset, msg]
    node_stack = []
    node_stack.append([0, [], ruleset, ''])  # root node
    while len(node_stack):
        curr_depth, parent_bit_array, curr_ruleset, curr_msg = node_stack.pop()
        if curr_depth > max_depth:
            max_depth = curr_depth
        avaliable_bit_array = list(set(range(BIT_LENGTH)) - set(parent_bit_array))
        if curr_depth == 0:
            verbose = True
        else:
            verbose = False
        bit_array, further_separable, split_info = bit_select(
            curr_ruleset, avaliable_bit_array, verbose=verbose)
        # if the current non-leaf node cannot be split further, turn it into a
        # leaf node
        if not len(bit_array):
            g_log_inst.get().debug("change current node to leaf node")
            #for j, r in enumerate(curr_ruleset):
            #    if j == 0:
            #        result_file.write('\t' * level + new_msg + str(i) + ': ' + ruleset_text[r[DIM_MAX][0]][:-1] + '\n')
            #    else:
            #        result_file.write('\t' * level + len(new_msg + str(i) + ': ') * ' ' + ruleset_text[r[DIM_MAX][0]][:-1] + '\n')
            total_leaf_number += 1
            total_leaf_depth += curr_depth + len(curr_ruleset)
            if max_leaf_depth < curr_depth + len(curr_ruleset):
                max_leaf_depth = curr_depth + len(curr_ruleset)
            # append memory cost for storing the rules
            total_mem_size += len(curr_ruleset) * LINEAR_BUCKET_SIZE
            continue
        buckets, max_bucket_size, max_bucket_num, bucket_percentage_stat = split_info
        if curr_depth == 0:
            #result_file.write("Basic bit array: %r\n\n" % bit_array)
            new_msg = curr_msg
        else:
            new_msg = curr_msg + str(bit_array) + '-'
        bit_array = bit_array + parent_bit_array
        g_log_inst.get().debug("Current length %d bit_array: %r"
                               % (len(bit_array), bit_array))
        # If the rule count of every bucket is no more than BINTH, every bucket
        # will become a leaf node, so this level is regarded as a bottom level
        if max_bucket_size <= BINTH:
            bottom_level = True
        else:
            bottom_level = False
        # next level
        for idx, subset in enumerate(buckets):
            # non-leaf node
            if len(subset) > BINTH and further_separable == True:
                total_mem_size += NON_LEAF_BUCKET_STRUCTURE_SIZE
                #result_file.write('\t' * curr_depth + new_msg + str(idx) + ': \n')
                node_stack.append([curr_depth + 1, bit_array, subset, new_msg])
            # leaf node
            else:
                #if subset:
                #    for j, r in enumerate(subset):
                #        if j == 0:
                #            result_file.write('\t' * level + new_msg + str(i) + ': ' + ruleset_text[r[DIM_MAX][0]][:-1] + '\n')
                #        else:
                #            result_file.write('\t' * level + len(new_msg + str(i) + ': ') * ' ' + ruleset_text[r[DIM_MAX][0]][:-1] + '\n')
                total_leaf_number += 1
                total_leaf_depth += curr_depth + 2 + len(subset)
                if max_leaf_depth < curr_depth + 2 + len(subset):
                    max_leaf_depth = curr_depth + 2 + len(subset)
                if bottom_level == True:
                    total_mem_size += LEAF_BUCKET_STRUCTURE_SIZE \
                        + len(subset) * LINEAR_BUCKET_SIZE
                else:
                    total_mem_size += NON_LEAF_BUCKET_STRUCTURE_SIZE \
                        + len(subset) * LINEAR_BUCKET_SIZE
        g_log_inst.get().debug("current node split finished (depth: %d)" % curr_depth)
    return max_depth, max_leaf_depth, total_leaf_number, total_leaf_depth, \
        total_mem_size
def bit_select(ruleset, avaliable_bit_array, max_bit_array_length=float('inf'),
               use_spfac=True, verbose=False):
    # format: {bit: pair_dict}. Here pair_dict is the dictionary of rule
    # pairs; all the pairs this bit can separate are set to 1
    bit_pair_dict = {}
    bit_pair_size = {}  # format: {bit: pair_size}
    pair_num = 0
    # get pair info
    for bit in avaliable_bit_array:
        bit_pair_dict[bit] = bit_separation_info(ruleset, bit)
        bit_pair_size[bit] = pair_count(bit_pair_dict[bit])
        if verbose:
            g_log_inst.get().debug("bit %d : %d" % (bit, bit_pair_size[bit]))
        pair_num += bit_pair_size[bit]
    origin_rule_num = len(ruleset)
    max_bucket_size = origin_rule_num
    max_bucket_num = 1
    if pair_num == 0:
        g_log_inst.get().debug("No single bit can be selected to split")
        return [], False, []
    # select cutting bits
    bit_array = []
    further_separable = True
    while max_bucket_size > 1:
        # select the best bit in terms of "separability"
        sorted_bit_pair_size = sorted(bit_pair_size.items(),
                                      key=lambda x: x[1], reverse=True)
        # prevent getting stuck when no remaining bit separates any pair
        if sorted_bit_pair_size[0][1] == 0:
            g_log_inst.get().debug("Cannot continue to split by single bit")
            further_separable = False
            break
        bit_selected = sorted_bit_pair_size[0][0]
        bit_array.append(bit_selected)
        # update the pair-dict: drop pairs already separated by the selected bit
        for bit, bit_pair in bit_pair_dict.items():
            if bit != bit_selected:
                pair_dict_sub(bit_pair_dict[bit], bit_pair_dict[bit_selected])
                bit_pair_size[bit] = pair_count(bit_pair_dict[bit])
        del bit_pair_dict[bit_selected]
        del bit_pair_size[bit_selected]
        buckets, max_bucket_size, max_bucket_num, bucket_percentage_stat = \
            split_ruleset(ruleset, bit_array)
        g_log_inst.get().debug("add bit %d" % bit_selected)
        g_log_inst.get().debug("max_bucket_size %d, max_bucket_num %d"
                               % (max_bucket_size, max_bucket_num))
        # Spfac calculation
        children_rule_num = 0
        children_node_num = 2 ** len(bit_array)
        for (k, v) in bucket_percentage_stat.items():
            children_rule_num += k * v * children_node_num
        Spfac = (children_rule_num + children_node_num) / float(origin_rule_num)
        # stopping criteria
        if len(bit_array) >= max_bit_array_length:
            break
        if use_spfac and Spfac > SPFAC:
            break
    split_info = (buckets, max_bucket_size, max_bucket_num, bucket_percentage_stat)
    return bit_array, further_separable, split_info
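# A small worked example of the Spfac stopping criterion used above (added for
# illustration, not part of the original source). It assumes 3 selected bits
# (8 child buckets), 100 rules in the parent node, and a hypothetical
# bucket_percentage_stat of {20: 0.25, 5: 0.75}, i.e. 25% of the buckets hold
# 20 rules and 75% hold 5 rules.
origin_rule_num = 100
bit_array = [3, 17, 42]  # hypothetical selected bits
bucket_percentage_stat = {20: 0.25, 5: 0.75}

children_node_num = 2 ** len(bit_array)  # 8 child buckets
children_rule_num = sum(k * v * children_node_num  # 20*0.25*8 + 5*0.75*8 = 70
                        for k, v in bucket_percentage_stat.items())
Spfac = (children_rule_num + children_node_num) / float(origin_rule_num)
print(Spfac)  # 0.78; bit selection stops once this exceeds the SPFAC threshold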