Example No. 1
def _build_data_pairs(fname, with_label=False):
    data_pairs = []
    vp = preprocessing.VocabularyProcessor.restore(
        '../data4/vocab_processor.bin')
    with codecs.open(fname, 'r', 'utf-8') as rfd:
        for line in rfd:
            fields = line.strip('\n').split('\t')
            if len(fields) != 3:
                continue
            data_id, tokens, domain = fields
            token_list = tokens.split('|')
            data = [x.lower() for x in token_list if len(x.strip())]
            if len(data) == 0:
                continue
            sequence = np.array(list(vp.transform([('|'.join(data))]))[0])
            if with_label:
                label = int(domain)
            else:
                label = 100
            pair = [data_id, label] + list(sequence)  # [id, label, sequence]
            data_pairs.append(pair)
    g_log_inst.get().info(
        '_build_data_pairs() success, fname=%s, len(data_pairs)=%s, with_label=%s'
        % (fname, len(data_pairs), with_label))
    return data_pairs
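For reference, a minimal sketch of the record layout this function expects and of the pair it appends (the ids below are made up; the real sequence comes from the restored VocabularyProcessor):

# Hypothetical input line (tab-separated: id, '|'-joined tokens, domain label):
#   "42\tPlay|Music\t3"
# After lower-casing and vocabulary lookup, the resulting pair looks like:
data_id, label = '42', 3
sequence = [17, 254, 0, 0]                 # made-up ids, padded to the processor's max length
pair = [data_id, label] + list(sequence)   # -> ['42', 3, 17, 254, 0, 0]
print(pair)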
Example No. 2
	def encode(self, seq_list):
		try:
			_s_embedded, _s_lengths = Helper.get_batch(seq_list)		
			feed_dict = {
				self.s_embedded: _s_embedded,
				self.s_lengths: _s_lengths}
			s_embeddings = self.sess.run(self.s_embeddings, feed_dict = feed_dict)
			return s_embeddings
		except Exception as e:
			logger.get().debug('seq_length=%s, errmsg=%s', len(seq_list), e)
 def conv_fenc_u8_to_gbk(cls, in_fpath, out_fpath):
     try:
         with codecs.open(in_fpath, 'r', 'utf-8') as rfd, \
             codecs.open(out_fpath, 'w', 'gbk') as wfd:
             # read utf8, write gbk
             for line in rfd:
                 line = cls.remove_illegal_gbk_char(line)
                 wfd.write(line)
     except Exception as e:
         logger.get().warn('errmsg=%s' % (e))
 def init(cls):
     """Prepare required data"""
     myself = sys._getframe().f_code.co_name
     try:
         # Load word2vec
         w2v_path = config.word2vec_path
         cls._word2vec = cls.get_word2vec(w2v_path)
         return True
     except Exception as e:
         logger.get().warn('%s failed, errmsg=%s', myself, e)
         return False
Example No. 5
	def __init__(self):
		try:
			self.sess = tf.Session()
			meta_graph_def = tf.saved_model.loader.load(self.sess, 
				['infersent_model'], config.saved_model_path)
			signature = meta_graph_def.signature_def
			signature_def_key = 'encoder'
			s_embedded_name = signature[signature_def_key].inputs['s_embedded'].name
			s_lengths_name = signature[signature_def_key].inputs['s_lengths'].name
			s_embeddings_name = signature[signature_def_key].outputs['s_embeddings'].name
			self.s_embedded = self.sess.graph.get_tensor_by_name(s_embedded_name)
			self.s_lengths = self.sess.graph.get_tensor_by_name(s_lengths_name)
			self.s_embeddings = self.sess.graph.get_tensor_by_name(s_embeddings_name)
			logger.get().info('init sentence encoder success')
		except Exception as e:
			logger.get().warn('init sentence encoder failed, errmsg=%s', e)
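A hedged usage sketch, assuming the __init__ and encode methods above belong to the same sentence-encoder class (the name SentenceEncoder below is hypothetical) and that a SavedModel tagged 'infersent_model' exists at config.saved_model_path:

encoder = SentenceEncoder()                    # loads the SavedModel and resolves the tensors
vectors = encoder.encode(['play some music',
                          'what is the weather today'])
print(vectors.shape)                           # e.g. (2, embedding_dim)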
 def _normalize_token(cls, token):
     token = token.lower()
     try:
         # pure-digit tokens are collapsed to a placeholder, except 11-digit
         # strings, which usually are phone numbers
         if len(token) != 11 and token.isdigit():
             token = 'int_t'
         for k, v in cls._replace_pattern_cfg.items():
             if v.match(token):
                 token = k
                 break
         if '{[' not in token:
             return token
         for item in cls._wordseg_pattern_cfg:
             token = item.sub('', token)
         return token
     except Exception as e:
         logger.get().warn('token=%s, errmsg=%s' % (token, e))
         return token
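The class-level pattern configuration used above is loaded elsewhere; a self-contained sketch of the same normalization idea with made-up patterns might look like this:

import re

# Hypothetical stand-ins for cls._replace_pattern_cfg and cls._wordseg_pattern_cfg
replace_pattern_cfg = {'date_t': re.compile(r'^\d{4}-\d{2}-\d{2}$')}
wordseg_pattern_cfg = [re.compile(r'\{\[.*?\]\}')]

def normalize_token(token):
    token = token.lower()
    # pure-digit tokens collapse to a placeholder, except 11-digit phone numbers
    if token.isdigit() and len(token) != 11:
        return 'int_t'
    for name, pattern in replace_pattern_cfg.items():
        if pattern.match(token):
            return name
    # strip word-segmentation markup such as '{[...]}'
    for pattern in wordseg_pattern_cfg:
        token = pattern.sub('', token)
    return token

print(normalize_token('2024-05-01'))  # -> 'date_t'
print(normalize_token('12345'))       # -> 'int_t'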
 def stat_token_freq(cls, in_fpath, out_fpath):
     stop_words = conf.g_stop_words_cfg
     try:
         word_counter = Counter()
         with codecs.open(in_fpath, 'r', 'utf-8') as rfd:
             for line in rfd:
                 raw_str, word_seg = line.strip('\n').split('\t')
                 tokens = word_seg.split()
                 tokens = filter(lambda x: x not in stop_words, tokens)
                 tokens = map(cls._normalize_token, tokens)
                 for t in tokens:
                     if ('{[' not in t) and len(t) <= cls._valid_token_len:
                         word_counter[t] += 1
                     else:
                         logger.get().warn('invalid token, token=%s' % (t))
                         # tokenize via jieba
                         for n_t in jieba.cut(t):
                             word_counter[n_t] += 1
                             logger.get().debug('jieba cut, token=%s' %
                                                (n_t))
         # dump word_counter
         sorted_words = sorted(word_counter.keys(),
                               key=lambda k: word_counter[k],
                               reverse=True)
         with codecs.open(out_fpath, 'w', 'utf-8') as wfd:
             for word in sorted_words:
                 tmp = '%s\t%s\n' % (word, word_counter[word])
                 wfd.write(tmp)
     except Exception as e:
         logger.get().warn('errmsg=%s' % (e))
Example No. 8
def save_chat(who_send, msg):
    g_log_inst.get().debug("{}:{}".format(who_send, msg.encode("utf-8")))
 def get_word2vec(cls, w2v_fpath):
     wv = models.KeyedVectors.load_word2vec_format(w2v_fpath, binary=False)
     logger.get().info('load word2vec success, vector length: %s',
                       wv['<s>'].shape[0])
     return wv
Example No. 10
def main(_):
    if not FLAGS.data_path:
        g_log_inst.get().error(
            'Must set --data_path to training data directory')
        return

    ckpt_dir = '../model/training-ckpt/' + FLAGS.model
    if not os.path.exists(ckpt_dir):
        os.makedirs(ckpt_dir)

    config = get_config()
    eval_config = get_config()
    eval_config.batch_size = 1

    [train_data, valid_data,
     test_data], id2word_dict = reader.load_train_data(FLAGS.data_path)
    g_log_inst.get().info('bilstm-attention training begin')

    with tf.Graph().as_default(), tf.Session() as session:
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        with tf.variable_scope(FLAGS.model,
                               reuse=None,
                               initializer=initializer):
            m = BiLSTM_Attention_Model(is_training=True, config=config)
        with tf.variable_scope(FLAGS.model,
                               reuse=True,
                               initializer=initializer):
            mtest = BiLSTM_Attention_Model(is_training=False,
                                           config=eval_config)

        tf.global_variables_initializer().run()

        # add ops to save and restore all the variables.
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(ckpt_dir)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(session, ckpt.model_checkpoint_path)
            g_log_inst.get().info('[model] restore success, ckpt_path=%s' %
                                  (ckpt.model_checkpoint_path))
            save_path = saver.save(session, '%s/model.ckpt' % (ckpt_dir))
        else:
            pre_valid_perplexity = float("inf")
            learning_rate = config.learning_rate
            start_decay = False
            for i in range(config.max_epoch):
                if start_decay:
                    learning_rate *= config.lr_decay
                m.assign_lr(session, learning_rate)
                g_log_inst.get().info('Epoch: %d Learning rate: %.3f' %
                                      (i + 1, session.run(m.lr)))
                # shuffle the data before mini-batch training
                random.shuffle(train_data)

                # train
                train_perplexity, accuracy, _ = run_epoch(session,
                                                          m,
                                                          train_data,
                                                          m.train_op,
                                                          verbose=True)
                g_log_inst.get().info(
                    'Epoch: %d Train Perplexity: %.3f accuracy: %s' %
                    (i + 1, train_perplexity, accuracy))

                # valid
                valid_perplexity, valid_accuracy, _ = run_epoch(
                    session, mtest, valid_data, tf.no_op())

                g_log_inst.get().info(
                    'Epoch: %d Valid Perplexity: %.3f accuracy: %s' %
                    (i + 1, valid_perplexity, valid_accuracy))

                # if valid perplexity improves too little, start lr decay
                if pre_valid_perplexity - valid_perplexity < config.perplexity_thres and not start_decay:
                    start_decay = True
                    g_log_inst.get().info(
                        'Valid Perplexity improves too little, start lr decay')
                if pre_valid_perplexity < valid_perplexity:  # current epoch is rejected
                    ckpt = tf.train.get_checkpoint_state(ckpt_dir)
                    if ckpt and ckpt.model_checkpoint_path:
                        saver.restore(session, ckpt.model_checkpoint_path)
                        g_log_inst.get().info(
                            '[model] restore success, ckpt_path=%s' %
                            (ckpt.model_checkpoint_path))
                    if learning_rate == config.learning_rate:  # if lr has not decayed yet, give a second chance
                        continue
                    else:  # stop training
                        g_log_inst.get().info(
                            'Valid Perplexity does not improve, stop training'
                        )
                        break

                pre_valid_perplexity = valid_perplexity
                # save the variables to disk.
                save_path = saver.save(session, '%s/model.ckpt' % (ckpt_dir))
                g_log_inst.get().info('[model] save success, ckpt_path=%s' %
                                      (save_path))

        # test the accuracy
        test_perplexity, accuracy, domain_accuracy = run_epoch(
            session,
            mtest,
            test_data,
            tf.no_op(),
            debug=True,
            verbose=True,
            id2word_dict=id2word_dict,
            dsl_converter=config.converter)
        g_log_inst.get().info('Test: perplexity=%.3f, accuracy=%s' %
                              (test_perplexity, accuracy))

        # acc compute
        '''
        for idx, domain_accu in enumerate(domain_accuracy):
            g_log_inst.get().info('Domain: %s, precision: %.3f, recall: %.3f' % (
            config.converter.label2domain[idx], domain_accuracy[idx][0] / float(domain_accuracy[idx][1]),
            domain_accuracy[idx][2] / float(domain_accuracy[idx][3])))
        '''
    g_log_inst.get().info('bilstm_attention training finished')
Example No. 11
def run_epoch(session,
              model,
              data,
              eval_op,
              debug=False,
              verbose=False,
              id2word_dict=None,
              dsl_converter=None):
    '''Runs the model on the given data.'''
    epoch_size = len(data) // model.batch_size
    # statistics
    start_time = time.time()
    costs = 0.0
    iters = 0
    accuracy_sum = 0.0
    domain_accuracy = [
        [0, 0, 0, 0] for i in range(model.num_labels)
    ]  # [prec_num, prec_den, rec_num, rec_den]: per-domain precision and recall counters

    state = session.run(model.initial_state)
    if debug:
        wrong_predict_out = codecs.open('../log/wrong_pred.txt', 'w', 'utf-8')
    y_label = []
    y_pred = []
    for step, (data_ids, sequences, labels, seq_lens) in enumerate(
            reader.pairs_iterator(data, model.batch_size, model.num_steps)):
        #print sequences
        feed_dict = {}
        feed_dict[model.input_data] = sequences
        feed_dict[model.targets] = labels
        feed_dict[model.sequence_length] = seq_lens

        if debug:
            fetches = [model.cost, model.predicts, eval_op]
            cost, predicts, _ = session.run(fetches, feed_dict)
            x_str = '_'.join(map(lambda x: str(x), sequences[0]))
            domain_predict = predicts[-1]
            domain_label = labels[0][0]
            y_label.append(dsl_converter.label2domain[int(domain_label)])
            y_pred.append(dsl_converter.label2domain[int(domain_predict)])
            domain_accuracy[domain_predict][1] += 1  # denominator of precision
            domain_accuracy[int(domain_label)][3] += 1  # denominator of recall
            accuracy = 1 if str(domain_predict) == str(domain_label) else 0
            # if the prediction is right, update both numerators
            if accuracy == 1:
                domain_accuracy[int(
                    domain_label)][0] += 1  # numerator of precision
                domain_accuracy[domain_predict][2] += 1  # numerator of recall
            else:
                raw_str = '|'.join(
                    [str(i) for i in sequences[0][0:seq_lens[0]]])
                #wrong_predict_out.write('data_id=%s, label=%s, predict=%s, raw_str=%s\n' % (
                #data_ids, dsl_converter.label2domain[int(domain_label)], dsl_converter.label2domain[domain_predict],
                #raw_str))
            wrong_predict_out.write(
                '%s\t%s\t%s\n' %
                (str(data_ids).replace("'", '').replace('[', '').replace(
                    ']', ''), str(domain_label), str(domain_predict)))
            g_log_inst.get().debug(
                'step=%s, cost=%s, x_str=%s, predict=%s, label_idx=%s' %
                (step, cost, x_str, domain_predict, domain_label))
            g_log_inst.get().debug('predicts=%s' % (predicts))

        else:
            fetches = [model.cost, model.accuracy, eval_op]
            cost, accuracy, _ = session.run(fetches, feed_dict)
        costs += cost
        iters += model.num_steps

        accuracy_sum += accuracy
        avg_accuracy = accuracy_sum / (step + 1)

        if verbose and step % (epoch_size // 10) == 10:
            g_log_inst.get().info(
                '%.3f perplexity: %.3f speed: %.0f wps, accuracy=%.03f' %
                (step * 1.0 / epoch_size, np.exp(costs / iters), iters *
                 model.batch_size / (time.time() - start_time), avg_accuracy))

    if debug:
        y_true = y_label
        classify_report = metrics.classification_report(y_true, y_pred)
        confusion_matrix = metrics.confusion_matrix(y_true, y_pred)
        overall_accuracy = metrics.accuracy_score(y_true, y_pred)
        acc_for_each_class = metrics.precision_score(y_true,
                                                     y_pred,
                                                     average=None)
        average_accuracy = np.mean(acc_for_each_class)
        score = metrics.accuracy_score(y_true, y_pred)
        print('classify_report : \n', classify_report)
        print('average_accuracy: {0:f}'.format(average_accuracy))
        print('overall_accuracy: {0:f}'.format(overall_accuracy))
        print('score: {0:f}'.format(score))

    if debug:
        wrong_predict_out.close()
    perplexity = np.exp(costs / iters)
    return (perplexity, avg_accuracy, domain_accuracy)
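The domain_accuracy counters returned above can be turned into per-domain precision and recall. A small helper sketch (hypothetical; the field order [prec_num, prec_den, rec_num, rec_den] matches the increments in the loop, and label2domain is assumed to map a label index to its domain name):

def per_domain_metrics(domain_accuracy, label2domain):
    # Hypothetical helper: convert the raw counters into precision/recall per domain.
    results = {}
    for idx, (prec_num, prec_den, rec_num, rec_den) in enumerate(domain_accuracy):
        precision = prec_num / float(prec_den) if prec_den else 0.0
        recall = rec_num / float(rec_den) if rec_den else 0.0
        results[label2domain[idx]] = (precision, recall)
    return results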
Example No. 12
def build_tree(ruleset, ruleset_text):
    # basic metrics
    max_depth = 0
    max_leaf_depth = 0
    total_leaf_number = 0
    total_leaf_depth = 0
    total_mem_size = 0
    # node format: [tree-depth, parent-bit-array, ruleset, msg]
    node_stack = []
    node_stack.append([0, [], ruleset, ''])  # root node

    while len(node_stack):
        curr_depth, parent_bit_array, curr_ruleset, curr_msg = node_stack.pop()

        if curr_depth > max_depth:
            max_depth = curr_depth
        avaliable_bit_array = list(
            set(range(BIT_LENGTH)) - set(parent_bit_array))

        verbose = (curr_depth == 0)
        bit_array, further_separable, split_info = bit_select(
            curr_ruleset, avaliable_bit_array, verbose=verbose)

        # if the current non-leaf node cannot be further split, turn it into a
        # leaf node
        if not len(bit_array):
            g_log_inst.get().debug("change current node to leaf node")
            #for j, r in enumerate(curr_ruleset):
            #    if j == 0:
            #        result_file.write('\t' * level + new_msg + str(i) + ': ' + ruleset_text[r[DIM_MAX][0]][:-1] + '\n')
            #    else:
            #        result_file.write('\t' * level + len(new_msg + str(i) + ': ') * ' ' + ruleset_text[r[DIM_MAX][0]][:-1] + '\n')
            total_leaf_number += 1
            total_leaf_depth += curr_depth + len(curr_ruleset)
            if max_leaf_depth < curr_depth + len(curr_ruleset):
                max_leaf_depth = curr_depth + len(curr_ruleset)
            # append memory cost for storing the rules
            total_mem_size += len(curr_ruleset) * LINEAR_BUCKET_SIZE
            continue

        buckets, max_bucket_size, max_bucket_num, bucket_percentage_stat = \
            split_info
        if curr_depth == 0:
            #result_file.write("Basic bit array: %r\n\n" % bit_array)
            new_msg = curr_msg
        else:
            new_msg = curr_msg + str(bit_array) + '-'
        bit_array = bit_array + parent_bit_array
        g_log_inst.get().debug("Current length %d bit_array: %r" %
                               (len(bit_array), bit_array))

        # If the rule count of every bucket is no more than BINTH, every
        # bucket will become a leaf node, so this level is regarded as the
        # bottom level
        if max_bucket_size <= BINTH:
            bottom_level = True
        else:
            bottom_level = False

        # next level
        for idx, subset in enumerate(buckets):
            # Non-leaf node
            if len(subset) > BINTH and further_separable:
                total_mem_size += NON_LEAF_BUCKET_STRUCTURE_SIZE
                #result_file.write('\t' * curr_depth + new_msg + str(idx)
                #    + ': \n')
                node_stack.append([curr_depth + 1, bit_array, subset, new_msg])
            # Leaf node
            else:
                #if subset:
                #    for j, r in enumerate(subset):
                #        if j == 0:
                #            result_file.write('\t' * level + new_msg + str(i) + ': ' + ruleset_text[r[DIM_MAX][0]][:-1] + '\n')
                #        else:
                #            result_file.write('\t' * level + len(new_msg + str(i) + ': ') * ' ' + ruleset_text[r[DIM_MAX][0]][:-1] + '\n')
                total_leaf_number += 1
                total_leaf_depth += curr_depth + 2 + len(subset)
                if max_leaf_depth < curr_depth + 2 + len(subset):
                    max_leaf_depth = curr_depth + 2 + len(subset)
                if bottom_level:
                    total_mem_size += LEAF_BUCKET_STRUCTURE_SIZE + len(subset) \
                        * LINEAR_BUCKET_SIZE
                else:
                    total_mem_size += NON_LEAF_BUCKET_STRUCTURE_SIZE + \
                        len(subset) * LINEAR_BUCKET_SIZE

        g_log_inst.get().debug("current node split finished (depth: %d)" %
                               curr_depth)

    return max_depth, max_leaf_depth, total_leaf_number, total_leaf_depth, \
        total_mem_size
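A short sketch of how the tuple returned by build_tree() might be summarized (the pretty-printer below is hypothetical; average leaf depth is the accumulated leaf depth divided by the leaf count):

def summarize_tree_stats(stats):
    # Hypothetical pretty-printer for the build_tree() return value.
    max_depth, max_leaf_depth, leaf_num, leaf_depth_sum, mem_size = stats
    avg_leaf_depth = leaf_depth_sum / float(leaf_num) if leaf_num else 0.0
    print('max depth: %d, max leaf depth: %d' % (max_depth, max_leaf_depth))
    print('leaves: %d, average leaf depth: %.2f' % (leaf_num, avg_leaf_depth))
    print('estimated memory size: %d' % mem_size)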
Example No. 13
def bit_select(ruleset,
               avaliable_bit_array,
               max_bit_array_length=float('inf'),
               use_spfac=True,
               verbose=False):
    # format: {bit: pair_dict}. Here pair_dict is the dictionary of rule
    # pairs. All the pairs this bit can separate are set to 1
    bit_pair_dict = {}
    bit_pair_size = {}  # format: {bit: pair_size}
    pair_num = 0

    # get pair info
    for bit in avaliable_bit_array:
        bit_pair_dict[bit] = bit_separation_info(ruleset, bit)
        bit_pair_size[bit] = pair_count(bit_pair_dict[bit])
        if verbose:
            g_log_inst.get().debug("bit %d : %d" % (bit, bit_pair_size[bit]))
        pair_num += bit_pair_size[bit]

    origin_rule_num = len(ruleset)
    max_bucket_size = origin_rule_num
    max_bucket_num = 1
    if pair_num == 0:
        g_log_inst.get().debug("No single bit can be selected to split")
        return [], False, []

    # select cutting bits
    bit_array = []
    further_separable = True
    while max_bucket_size > 1:
        # select the best bit in terms of "separability":
        sorted_bit_pair_size = sorted(bit_pair_size.items(),
                                      key=lambda x: x[1],
                                      reverse=True)
        # prevent getting stuck when no remaining bit separates any pair
        if sorted_bit_pair_size[0][1] == 0:
            g_log_inst.get().debug("Cannot continue to split by single bit")
            further_separable = False
            break
        bit_selected = sorted_bit_pair_size[0][0]
        bit_array.append(bit_selected)

        # update the pair-dict
        for bit, bit_pair in bit_pair_dict.items():
            if bit != bit_selected:
                pair_dict_sub(bit_pair_dict[bit], bit_pair_dict[bit_selected])
                bit_pair_size[bit] = pair_count(bit_pair_dict[bit])
        del bit_pair_dict[bit_selected]
        del bit_pair_size[bit_selected]
        buckets, max_bucket_size, max_bucket_num, bucket_percentage_stat = \
            split_ruleset(ruleset, bit_array)
        g_log_inst.get().debug("add bit %d" % bit_selected)
        g_log_inst.get().debug("max_bucket_size %d, max_bucket_num %d" %
                               (max_bucket_size, max_bucket_num))

        # Spfac calculate
        children_rule_num = 0
        children_node_num = 2**len(bit_array)
        for (k, v) in bucket_percentage_stat.items():
            children_rule_num += k * v * children_node_num
        Spfac = (children_rule_num +
                 children_node_num) / float(origin_rule_num)
        # Stopping criteria
        if len(bit_array) >= max_bit_array_length:
            break
        if use_spfac and Spfac > SPFAC:
            break

    split_info = (buckets, max_bucket_size, max_bucket_num,
                  bucket_percentage_stat)
    return bit_array, further_separable, split_info
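The helpers bit_separation_info, pair_count, pair_dict_sub and split_ruleset are defined elsewhere. A minimal sketch of the pair bookkeeping pair_count and pair_dict_sub are expected to perform, assuming a pair_dict maps each rule pair a bit can separate to 1 (as the comment at the top of bit_select describes):

def pair_count(pair_dict):
    # number of rule pairs this bit can still separate
    return len(pair_dict)

def pair_dict_sub(pair_dict, selected_pair_dict):
    # drop the pairs already separated by the selected bit
    for pair in selected_pair_dict:
        pair_dict.pop(pair, None)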