Example #1
    def build_network(self):
        self.word2idx, word_embedding = data_helper.load_embedding(self.FLAGS.embedding_path)

        self.inputs, self.labels, self.train_num = utils.get_batch_data(self.FLAGS.train_data_path,
                                                        self.FLAGS.batch_size, self.FLAGS.seq_length, self.word2idx)
        self.labels = tf.one_hot(self.labels, depth=self.FLAGS.class_num)

        # embedding layer
        with tf.variable_scope('embedding'):
            embedding = tf.get_variable('embedding', shape=word_embedding.shape, dtype=tf.float32,
                                        initializer=tf.constant_initializer(word_embedding), trainable=False)
            # [batch, 30, 300]
            inputs = tf.nn.embedding_lookup(embedding, self.inputs)

            self.origin = tf.reduce_mean(inputs, axis=1)

            inputs = tf.expand_dims(inputs, -1)

        with tf.variable_scope('conv1_layer'):
            # conv output: [batch, seq_length - filter_size + 1, 1, filter_num]
            input_conv = tf.layers.conv2d(inputs, self.FLAGS.filter_num,
                                          [self.FLAGS.filter_size, self.FLAGS.embedding_size])

            # Primary Capsules layer
            with tf.variable_scope('PrimaryCaps_layer'):
                primaryCaps = CapsLayer(num_outputs=30, vec_len=10, with_routing=False, layer_type='CONV')

                caps1 = primaryCaps(input_conv, kernel_size=3, stride=1)


        # DigitCaps layer, returns [batch_size, class_num, vec_len, 1]
        with tf.variable_scope('DigitCaps_layer'):
            digitCaps = CapsLayer(num_outputs=self.FLAGS.class_num, vec_len=32, with_routing=True, layer_type='FC')
            # [batch_size, class_num, 32, 1]
            self.caps2 = digitCaps(caps1)

        # Decoder structure
        # 1. Masking: pick out the capsule vector to reconstruct from
        with tf.variable_scope('Masking'):
            # [batch_size, class_num, vec_len, 1] => [batch_size, class_num, 1, 1]
            self.v_length = tf.sqrt(tf.reduce_sum(tf.square(self.caps2),
                                               axis=2, keep_dims=True) + epsilon)
            # [batch_size, class_num, 1, 1]
            self.softmax_v = tf.nn.softmax(self.v_length, dim=1)


            # [batch_size, class_num, 1, 1] => [batch_size] (index)
            self.argmax_idx = tf.to_int32(tf.argmax(self.softmax_v, axis=1))
            self.argmax_idx = tf.reshape(self.argmax_idx, shape=[-1])

            # Method 1.
            if not mask_with_y:
                # indexing

                # [batch_size, class_num, 1, 1]
                one_hot_idx = tf.expand_dims(tf.expand_dims(tf.one_hot(self.argmax_idx, self.FLAGS.class_num), -1), -1)
                # [batch_size, vec_len, 1]
                self.masked_v = tf.reduce_sum(self.caps2 * one_hot_idx, 1)

            # Method 2. masking with true label, default mode
            else:

                self.masked_v = tf.multiply(tf.squeeze(self.caps2, 3),
                                            tf.reshape(self.labels, (-1, self.FLAGS.class_num, 1)))
                # [batch_size, class_num, 1]
                self.v_length = tf.sqrt(tf.reduce_sum(tf.square(self.caps2), axis=2) + epsilon)

        # 2. Reconstruct the input representation with 3 FC layers
        # [batch_size, class_num, vec_len, 1] => [batch_size, 256] => [batch_size, 512] => [batch_size, 300]
        with tf.variable_scope('Decoder'):
            vector_j = tf.reshape(self.masked_v, shape=(-1, self.FLAGS.class_num * 32))
            fc1 = tf.contrib.layers.fully_connected(vector_j, num_outputs=256)

            fc2 = tf.contrib.layers.fully_connected(fc1, num_outputs=512)

            self.decoded = tf.contrib.layers.fully_connected(fc2, num_outputs=300, activation_fn=tf.tanh)
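
The data_helper.load_embedding helper used above (and in several snippets below) is not included in these examples. A minimal sketch of what such a loader might look like, assuming a plain-text embedding file with one word and its space-separated vector per line; the <PAD> entry, the 300-dimension default, and the file format are assumptions, not taken from the source:

import numpy as np

def load_embedding(embedding_path, dim=300):
    """Illustrative sketch only -- not the project's actual data_helper.load_embedding.

    Builds a word -> row-index map and the matching embedding matrix from a
    plain-text file where each line is '<word> <v1> ... <vdim>'. Row 0 is
    reserved for padding / unknown words (an assumption).
    """
    word2idx = {"<PAD>": 0}
    vectors = [np.zeros(dim, dtype=np.float32)]  # padding vector
    with open(embedding_path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.rstrip().split()
            if len(parts) != dim + 1:
                continue  # skip header or malformed lines
            word2idx[parts[0]] = len(vectors)
            vectors.append(np.asarray(parts[1:], dtype=np.float32))
    return word2idx, np.stack(vectors)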
Example #2

logger.setLevel(logging.INFO)

fh = logging.FileHandler("./run.log" + suffix, mode="w")
fh.setLevel(logging.INFO)

fmt = "%(asctime)-15s %(levelname)s %(filename)s %(lineno)d %(process)d %(message)s"
datefmt = "%a %d %b %Y %H:%M:%S"
formatter = logging.Formatter(fmt, datefmt)

fh.setFormatter(formatter)
logger.addHandler(fh)

#----------------------------- define a logger end ----------------------------------

#------------------------------------load data -------------------------------
embedding, word2idx, idx2word = load_embedding(FLAGS.embedding_file,
                                               FLAGS.embedding_size)

ori_quests, cand_quests, neg_quests, cat_ids = load_train_data(
    FLAGS.train_file, word2idx, FLAGS.num_unroll_steps)

test_ori_quests, test_cand_quests, labels, results, test_cat_ids = load_test_data(
    FLAGS.test_file, word2idx, FLAGS.num_unroll_steps)

for_test_ori_quests, for_test_cand_quests, for_labels, for_results, for_test_cat_ids = load_test_data(
    FLAGS.train_LONG, word2idx, FLAGS.num_unroll_steps)
#test_like_train_ori_quests, test_like_train_cand_quests,test_like_train_neg_quests,test_like_train_cat_ids = load_train_data(FLAGS.test_TRAIN, word2idx, FLAGS.num_unroll_steps)

#----------------------------------- load data end ----------------------


def onehot_encoder(cat_ids_batch):
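The body of onehot_encoder is cut off in this snippet. A minimal sketch of what such an encoder typically does, mapping a batch of integer category ids to one-hot rows; the helper name and the num_categories parameter below are illustrative, not from the source:

import numpy as np

def onehot_encoder_sketch(cat_ids_batch, num_categories=None):
    """Illustrative stand-in for the truncated onehot_encoder above."""
    cat_ids = np.asarray(cat_ids_batch, dtype=np.int64)
    if num_categories is None:
        # Assumption: infer the category count from the batch; the original
        # most likely reads it from a FLAGS value or the label vocabulary.
        num_categories = int(cat_ids.max()) + 1
    onehot = np.zeros((cat_ids.shape[0], num_categories), dtype=np.float32)
    onehot[np.arange(cat_ids.shape[0]), cat_ids] = 1.0
    return onehot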
Example #3
        total_busi_correct_num += busi_correct_num
        total_other_num += other_num
        total_other_correct_num += other_correct_num

    accuracy = float(total_correct_num) / total_num
    busi_acc = float(total_busi_correct_num) / total_busi_num
    other_acc = float(total_other_correct_num) / total_other_num
    logger.info("validation success")

    return accuracy, busi_acc, other_acc
#------------------------------ evaluate model end -------------------------------------

#------------------------------------load data -------------------------------
label2id, id2label = load_label(FLAGS.label_file)
id2word, word2id = build_vocab(FLAGS.word_file)
embeddings = load_embedding(FLAGS.embedding_file)
logger.info("load label, word, embedding finished")
train_valid_x, train_valid_y, class_weight = load_data(FLAGS.train_file, word2id, label2id, FLAGS.num_unroll_steps)

# calculate label weights (inverse class frequency)
class_weight_mean = np.mean(list(class_weight.values()))  # list() keeps this working under Python 3
label_weight = {}
for label, weight in class_weight.items():
    label_weight[label2id.get(label)] = 1. / (weight / class_weight_mean)
label_weight = np.array([label_weight[ix] for ix in sorted(label_weight.keys())], dtype=np.float32)

# split data
train_data, valid_data = split_train_by_ratio(list(zip(train_valid_x, train_valid_y)), 0.01)  # materialize zip for Python 3
train_x, train_y = zip(*train_data)
valid_x, valid_y = zip(*valid_data)
logger.info("load train data finish")
Example #4
        builder.add_meta_graph_and_variables(sess, ['training_model'])
        builder.save()


if __name__ == '__main__':
    #
    train_data_file = sys.argv[1]

    # load data
    print("Loading data...")
    # TODO: read the file location from outside (config / command line) instead of hard-coding it
    filename = os.path.join(cf.data_path, train_data_file)

    X, Y, sentiments, patterns = data_helper.map_file_to_ids(filename=filename)
    vocab, ivocab, label_vocab, label_ivocab = data_helper.load_vocab(filename)
    embeddings = data_helper.load_embedding(cf.word2vec_path, vocab)
    num_classes = len(label_vocab)
    vocab_size = len(vocab)
    x_fold = cf.num_fold

    # no cross-validation
    if x_fold == 0:
        data_size = len(Y)
        size_per_fold = int(data_size / 10)
        dev_start = 0 * size_per_fold
        dev_end = (0 + 1) * size_per_fold
        x_train, y_train = X[dev_end:], Y[dev_end:]
        sents_train = sentiments[dev_end:]
        patt_train = patterns[dev_end:]
        # dev set
        x_dev = X[dev_start:dev_end]
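        # (sketch) the snippet is cut off here; by symmetry with the training
        # slices above, the dev split presumably continues along these lines
        # (variable names are assumed, not from the source):
        y_dev = Y[dev_start:dev_end]
        sents_dev = sentiments[dev_start:dev_end]
        patt_dev = patterns[dev_start:dev_end]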
Example #5
filter_num = 300
batch_size = 64
epochs_num = 32
drop_out_rate = 0.5
regularizer_rate = 0.004

tracks = ['AR-AR', 'AR-EN', 'SP-SP', 'SP-EN', 'SP-EN-WMT', 'EN-EN', 'EN-TR']


print("loading data...")
train_sources, train_targets, train_scores = data_helper.load_cross_lang_sentence_data(train_path, True)
dev_sources, dev_targets, dev_scores = data_helper.load_cross_lang_sentence_data(dev_path, True)
test_sources, test_targets, test_scores = data_helper.load_cross_lang_sentence_data(test_path, False)


word2idx, word_embeddings = data_helper.load_embedding(embedding_path, True)


# word to id
train_sources, train_sources_length = utils.word2id(train_sources, word2idx, seq_length)
train_targets, train_targets_length = utils.word2id(train_targets, word2idx, seq_length)


dev_sources, dev_sources_length = utils.word2id(dev_sources, word2idx, seq_length)
dev_targets, dev_targets_length = utils.word2id(dev_targets, word2idx, seq_length)

test_sources, test_sources_length = utils.word2id(test_sources, word2idx, seq_length)
test_targets, test_targets_length = utils.word2id(test_targets, word2idx, seq_length)


train_score_probs = utils.build_porbs(train_scores, class_num)
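
utils.build_porbs is not defined in this snippet. For STS-style regression-as-classification, a real-valued similarity score is often converted into a probability vector whose mass is split between the two nearest integer classes (as in Tai et al.'s Tree-LSTM similarity model); a sketch of that conversion, under the assumption that this is roughly what the helper does (the name below is illustrative):

import numpy as np

def build_probs_sketch(scores, class_num):
    """Hypothetical stand-in for utils.build_porbs.

    Maps a score in [0, class_num - 1] to a sparse class distribution,
    e.g. score 3.4 with class_num 6 -> approx [0, 0, 0, 0.6, 0.4, 0].
    """
    probs = np.zeros((len(scores), class_num), dtype=np.float32)
    for i, score in enumerate(scores):
        low = int(np.floor(score))
        frac = score - low
        probs[i, low] = 1.0 - frac
        if low + 1 < class_num:
            probs[i, low + 1] = frac
    return probs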
Example #6
FLAGS = tf.flags.FLAGS
filter_size = [int(each) for each in FLAGS.filter_size.split(",")]
#----------------------------- define parameter end ----------------------------------

#----------------------------- define a logger -------------------------------
logger = logging.getLogger("execute")
logger.setLevel(logging.INFO)

fh = logging.FileHandler("./run.log", mode="w")
fh.setLevel(logging.INFO)
logger.addHandler(fh)
#----------------------------- define a logger end ----------------------------------

#------------------------------------load data -------------------------------
embedding, word2idx, idx2word = load_embedding(FLAGS.embedding_file,
                                               FLAGS.embedding_size)
char_embedding, char2idx, idx2char = load_embedding(FLAGS.char_embedding_file,
                                                    FLAGS.embedding_size)
ori_quests, cand_quests, labels, ori_quests_char, cand_quests_char = load_train_data(
    FLAGS.train_file, char2idx, FLAGS.char_len)
total_sents = load_log(FLAGS.log_file)
logger.info("load log data success")
unknown_id = word2idx.get("UNKNOWN", 0)
ori_sents, cand_sents, features = cal_basic_feature(
    ori_quests,
    cand_quests,
    total_sents,
    embedding,
    word2idx,
    saveTfidf=True,
    tfidfFile=FLAGS.tfidf_file,
            logging.info("%s, the valid accuracy is %f"%(timestr, valid_accuracy))

        global_steps += 1

    return global_steps
#---------------------------- run epoch end -------------------------------------


#------------------------------------load data -------------------------------
word2idx, idx2word = build_vocab(FLAGS.word_file)
label2idx, idx2label = load_label(FLAGS.label_file)
train_x, train_y, train_mask = load_data(FLAGS.train_file, word2idx, label2idx, FLAGS.sequence_len)
logging.info("load train data finish")
train_data, valid_data = create_valid(list(zip(train_x, train_y, train_mask)))  # materialize zip for Python 3
num_classes = len(label2idx)
embedding = load_embedding(FLAGS.embedding_size, filename=FLAGS.embedding_file)
test_x, test_y, test_mask = load_data(FLAGS.test_file, word2idx, label2idx, FLAGS.sequence_len)
logging.info("load test data finish")
#----------------------------------- load data end ----------------------

#----------------------------------- execute train ---------------------------------------
with tf.Graph().as_default():
    with tf.device("/cpu:0"):
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=FLAGS.gpu_options)
        session_conf = tf.ConfigProto(allow_soft_placement=FLAGS.allow_soft_placement,
                                      log_device_placement=FLAGS.log_device_placement,
                                      gpu_options=gpu_options)
        with tf.Session(config=session_conf).as_default() as sess:
            initializer = tf.random_uniform_initializer(-FLAGS.init_scale, FLAGS.init_scale)
            with tf.variable_scope("model", reuse=None, initializer=initializer):
                model = LSTM(FLAGS.batch_size, FLAGS.sequence_len, embedding, FLAGS.embedding_size,
                             FLAGS.attention_dim, FLAGS.rnn_size, FLAGS.num_rnn_layers, num_classes,
                             FLAGS.max_grad_norm, dropout=FLAGS.dropout, is_training=True)

            with tf.variable_scope("model", reuse = True, initializer = initializer):