Example #1
    def __init__(self, args):
        self.num_updates = 0
        self.args = args
        self.word2vec = load_word2vec(args.word2vec_path)
        self._build_loader()
        self._build_model()
        self._build_optimizer()
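None of the examples on this page include load_word2vec itself, and its signature differs between snippets. For the single-path variant used above, a minimal sketch, assuming gensim is available and the vectors are stored in the standard word2vec binary format (an illustration, not the helper these authors actually used):

from gensim.models import KeyedVectors


def load_word2vec(word2vec_path, binary=True):
    # Returns a KeyedVectors object; it supports `word in vectors` and
    # `vectors[word]`, which matches how the snippets below index the result.
    return KeyedVectors.load_word2vec_format(word2vec_path, binary=binary)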
Example #2
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.step = tf.Variable(0, trainable=False)
        self.epoch = tf.Variable(0, trainable=False)

        embeds_path = os.path.join("data", "twitter_mf.clean.npy")
        if not os.path.isfile(embeds_path):
            word2vec = utils.load_word2vec()
            embedding_matrix = np.random.uniform(low=-1.0,
                                                 high=1.0,
                                                 size=(self.hparams.vocab_size,
                                                       300))
            with open(os.path.join("data", "twitter_mf.clean.vocab")) as f:
                for i, word in enumerate(f):
                    word = word.strip()
                    if word in word2vec:
                        embedding_matrix[i] = word2vec[word]
            np.save(embeds_path, embedding_matrix)
            del word2vec

        else:
            embedding_matrix = np.load(embeds_path)

        self.embed = tfkl.Embedding(
            self.hparams.vocab_size,
            300,
            embeddings_initializer=tf.initializers.constant(embedding_matrix),
        )
        self.embed.trainable = self.hparams.fine_tune_embeds

        self.encoder = self.make_encoder()
        self.encoder.add(tfkl.Dense(6, activation=tf.math.tanh))
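Setting .trainable after construction works, but the flag can also be passed straight to the layer constructor. A minimal equivalent sketch, assuming tfkl is tf.keras.layers as in the snippet above (an alternative, not the original code):

        # Equivalent construction with `trainable` passed as a layer kwarg.
        self.embed = tfkl.Embedding(
            self.hparams.vocab_size,
            300,
            embeddings_initializer=tf.initializers.constant(embedding_matrix),
            trainable=self.hparams.fine_tune_embeds,
        )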
Example #3
    def __init__(self,
                 tokens,
                 bos='__begin__',
                 eos='__end__',
                 unk='__unknown__',
                 use_w2v=True,
                 w2v_dim=300):
        self.token_to_idx = {token: i for i, token in enumerate(tokens)}
        self.use_w2v = use_w2v
        self.bos = bos
        self.eos = eos
        if unk is not None:
            self.unk = unk
            self.unk_idx = self.token_to_idx[unk]
        self.bos_idx = self.token_to_idx[bos]
        self.eos_idx = self.token_to_idx[eos]

        self.idx_to_token = {i: token for i, token in enumerate(tokens)}
        if use_w2v:
            word2vec = utils.load_word2vec(
                'ruwikiruscorpora_upos_skipgram_300_2_2018.vec')
            self.word2vec_tokens = np.zeros((len(tokens), w2v_dim))
            self.w2v_dim = w2v_dim
            for token, idx in self.token_to_idx.items():
                if token in word2vec:
                    self.word2vec_tokens[idx] = word2vec[token]
                else:
                    self.word2vec_tokens[idx] = np.random.normal(size=w2v_dim)
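The enclosing class is not named in this snippet. Assuming it is called something like Vocab, construction would look roughly as follows; the class name and token list are hypothetical:

# Hypothetical usage; `Vocab` is an assumed name for the class whose __init__
# is shown above, and the token list must already contain the special markers.
tokens = ['__begin__', '__end__', '__unknown__', 'мама_NOUN', 'мыла_VERB', 'раму_NOUN']
vocab = Vocab(tokens, use_w2v=False)
print(vocab.bos_idx, vocab.eos_idx, vocab.unk_idx)  # 0 1 2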
Example #4
    def __init__(self, args, model):
        super(Model, self).__init__()
        self.embeddings_matrix = load_word2vec(args)
        self.embeddings = tf.keras.layers.Embedding(
            args.vocab_size,
            args.embedding_dim,
            weights=[self.embeddings_matrix],
            trainable=False)
        self.encoder = model
        self.dense = tf.keras.layers.Dense(
            units=args.vocab_size,
            activation='softmax',
            # input_shape=(args.max_seq_len, args.embedding_dim)
        )
        self.dense2 = tf.keras.layers.Dense(
            args.hidden_size,
            kernel_initializer=tf.keras.initializers.TruncatedNormal(
                stddev=0.02))
        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12)
        self.act = tf.keras.layers.Activation(gelu)
        self.bias = self.add_weight(shape=(args.vocab_size,),
                                    initializer="zeros",
                                    trainable=True,
                                    name="bias")
        self.word_embeddings = self.add_weight(
            "weight",
            shape=[args.vocab_size, args.hidden_size],
            initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
        )
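The pieces built here (dense2, layer_norm, act, word_embeddings, bias) resemble a BERT-style masked-LM head with a tied output projection. The original call() is not part of the snippet; a hypothetical forward pass wiring the pieces together might look like this (an assumption, not the author's method):

    def call(self, inputs, training=False):
        # Hypothetical forward pass; the real call() is not shown above.
        x = self.embeddings(inputs)             # (batch, seq, embedding_dim)
        x = self.encoder(x, training=training)  # (batch, seq, hidden_size)
        x = self.act(self.dense2(x))            # transform + GELU
        x = self.layer_norm(x)
        # Tied output projection against the word-embedding weights, plus bias.
        logits = tf.einsum('bsh,vh->bsv', x, self.word_embeddings) + self.bias
        return logits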
def save_or_load_embeds(embeds_path, vocab_path, vocab_size):
    """Load or build and embedding matrix from a TSV file."""

    should_save = False

    if not os.path.isfile(embeds_path):
        embedding_matrix = np.random.uniform(low=-1.0,
                                             high=1.0,
                                             size=(vocab_size, 300))
        should_save = True
    else:
        embedding_matrix = np.load(embeds_path)
        # Check whether the vocab sizes match. If the saved matrix is missing rows,
        # grow it with random rows and rebuild the file.
        if len(embedding_matrix) < vocab_size:
            extra_rows = np.random.uniform(low=-1.0,
                                           high=1.0,
                                           size=(vocab_size - len(embedding_matrix), 300))
            embedding_matrix = np.concatenate([embedding_matrix, extra_rows])
            should_save = True

    if should_save:
        w2v = utils.load_word2vec()
        with open(vocab_path) as f:
            for i, word in enumerate(f):
                word = word.strip()
                if word in w2v:
                    embedding_matrix[i] = w2v[word]

        np.save(embeds_path, embedding_matrix)

    return embedding_matrix[:vocab_size]
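A hypothetical call, reusing the file names from Example #2; the paths and vocab size below are placeholders:

# Hypothetical usage of save_or_load_embeds; paths and vocab size are placeholders.
embedding_matrix = save_or_load_embeds(
    embeds_path=os.path.join("data", "twitter_mf.clean.npy"),
    vocab_path=os.path.join("data", "twitter_mf.clean.vocab"),
    vocab_size=20000,
)
print(embedding_matrix.shape)  # (20000, 300)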
Example #6
def load_data(arg):
    if arg.glove:
        utils.log('Loading glove..', arg.id)
        word_embedding = utils.load_glove_vector()
    else:
        utils.log('Loading word2vec..', arg.id)
        word_embedding = utils.load_word2vec()
    return word_embedding
Example #7
    def __init__(self, args, model_roberta):
        super(Model_Roberta, self).__init__()
        self.encoder = model_roberta
        self.args = args
        self.embeddings_matrix = load_word2vec(args)
        self.embeddings = tf.keras.layers.Embedding(
            args.vocab.word_size(),
            args.embedding_dim,
            weights=[self.embeddings_matrix],
            trainable=False)
Example #8
def train(ex_id, restore=False):
    global dataset
    output_path = ex_id + '/'
    print(output_path)
    utils.create_folder(output_path)
    train_data = utils.load_conll_data(dataset['train'])
    train_range = len(train_data)
    print('Train dataset: %d' % (train_range))

    word2vec_model = utils.load_word2vec()
    global FLAGS, tf_config
    with tf.Graph().as_default(), tf.Session() as session:
        # with tf.Graph().as_default(), tf.Session(config=tf_config) as session:
        initializer = tf.random_uniform_initializer(-0.1, 0.1)
        with tf.variable_scope("RNN", reuse=None, initializer=initializer):
            utils.log("Building model.. ", ex_id)
            i_train = BiEncoderDecoderModel(is_training=True, FLAGS=FLAGS)
            start_epoch = 0
            if restore:
                start_epoch = max(int(i) for i in os.listdir(ex_id)) - 1
                if start_epoch > 0:
                    print('Restoring model: %s...' % (start_epoch))
                    model_file_path = os.path.join(ex_id, str(start_epoch),
                                                   'model.ckpt')
                    i_train.saver.restore(session, model_file_path)
                else:
                    utils.log('No saved model, initialize all variables...',
                              ex_id)
                    tf.global_variables_initializer().run()
            else:
                tf.global_variables_initializer().run()
            for epoch in range(start_epoch + 1, 150):
                print("Epoch: %d" % (epoch), ex_id)
                epoch_output_path = os.path.join(output_path, str(epoch))
                utils.create_folder(epoch_output_path)

                train_cost = 0.0
                per = np.random.permutation(train_range)
                for i, index in enumerate(per):
                    inputs, labels, name, sentence_len = get_sample(
                        word2vec_model, train_data, index)
                    start = time.time()
                    cost, predicts, feature = i_train.train(
                        session, inputs, labels, sentence_len)
                    train_cost += cost
                    if i % 100 == 0:
                        print('Time: %f\r' % ((time.time() - start) / 100),
                              end='')
                        utils.update_epoch(epoch, i, ex_id)
                print('Train: ' + str(train_cost), ex_id)
                model_file_path = os.path.join(epoch_output_path, 'model.ckpt')
                i_train.save_model(session, model_file_path)
Example #9
def read_word_embed(vocab):
    """Google word2vec 300 dim"""
    existing_embed = utils.load_word2vec(WORD2VEC_FILE, vocab)
    word_embed = [None] * len(vocab)
    for word in vocab:
        index = vocab[word]
        try:
            embed = np.array(existing_embed[word], dtype=np.float32)
        except KeyError:
            embed = np.random.uniform(low=-0.25, high=0.25,
                                      size=[300]).tolist()
        word_embed[index] = embed
    return np.array(word_embed)
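A hypothetical call: vocab maps tokens to row indices, and any token missing from the pre-trained vectors falls back to a uniform random 300-dim row (the vocabulary contents below are placeholders):

# Hypothetical usage; the vocabulary contents are placeholders.
vocab = {'<pad>': 0, 'ridge': 1, 'surge': 2}
word_embed = read_word_embed(vocab)
print(word_embed.shape)  # (3, 300)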
Example #10
    def train(self):
        steps_per_epoch = self.train_batcher.num_batches

        with tf.Session(config=tf_config) as sess:
            ### 1. create model and load parameters
            self.model = Model(self.config)
            ckpt = tf.train.get_checkpoint_state(FLAGS.ckpt_path)
            if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
                logger.info("Reading model parameters from %s" %
                            ckpt.model_checkpoint_path)
                self.model.saver.restore(sess, ckpt.model_checkpoint_path)
            else:
                logger.info("Created model with fresh parameters.")
                sess.run(tf.global_variables_initializer())
                if self.config["pre_emb"]:  # load pre-trained word vec params
                    # load word vec
                    embed_weights = sess.run(
                        self.model.char_lookup.read_value())
                    embed_weights = utils.load_word2vec(
                        FLAGS.emb_path, self.id_2_ch, FLAGS.char_dim,
                        embed_weights)
                    sess.run(self.model.char_lookup.assign(embed_weights))
                    logger.info("Loaded pre-trained embedding.")

            ### 2. training
            logger.info(" => Start training...")
            loss = []
            with tf.device("/gpu:0"):
                for i in range(FLAGS.epochs):
                    for batch in self.train_batcher.iter_batch(shuffle=False):
                        step, batch_loss = self.model.run_step()
                        loss.append(batch_loss)
                        if step % FLAGS.steps_check == 0:
                            iteration = step // steps_per_epoch + 1
                            logger.info(
                                ">>> Epoch:{}, iteration:{}, step:{}/{}, Batch mean loss:{:>9.6f}"
                                .format(i + 1, iteration,
                                        step % steps_per_epoch,
                                        steps_per_epoch, np.mean(loss)))
                            loss = []
                    # evaluate on the dev set every epoch
                    self.eval(sess, "dev", self.dev_batcher)
                    # save model
                    if i % 8 == 0:
                        self.model.save_model(sess,
                                              FLAGS.ckpt_path,
                                              name="train_ner.ckpt")
                        logger.info("=> Model saved. ")
                # evaluate at test set
                self.eval(sess, "test", self.test_batcher)
Example #11
def make_feature(ex_id, epoch):
    global dataset
    train_data = utils.load_conll_data_as_dict(dataset['train'])
    dev_data = utils.load_conll_data_as_dict(dataset['dev'])
    test_data = utils.load_conll_data_as_dict(dataset['test'])
    word2vec_model = utils.load_word2vec()

    global FLAGS, tf_config
    with tf.Graph().as_default(), tf.Session(config=tf_config) as session:
        initializer = tf.random_uniform_initializer(-0.1, 0.1)
        with tf.variable_scope("RNN", reuse=False, initializer=initializer):
            print('Building model..')
            i_test = BiEncoderDecoderModel(is_training=False)

            epoch_output_path = os.path.join(ex_id, epoch)
            print('Restoring model: %s...' % (epoch))
            model_file_path = os.path.join(epoch_output_path, 'model.ckpt')
            i_test.saver.restore(session, model_file_path)

            # Test model
            test_cost = 0.0
            print('Starting test..')
            start = time.time()

            for idx, data in enumerate([train_data, dev_data, test_data]):
                output = {}
                for name, sample in data.items():
                    inputs, labels, _, sentence_len = get_sample(
                        word2vec_model, [sample], 0)
                    cost, predicts, feature = i_test.test(
                        session, inputs, labels, sentence_len)
                    test_cost += cost
                    output[name] = predicts
                if idx == 0:
                    print('Saving train feature')
                    file_path = os.path.join(epoch_output_path,
                                             'train.feature')
                elif idx == 1:
                    print('Saving dev feature')
                    file_path = os.path.join(epoch_output_path, 'dev.feature')
                else:
                    print('Saving test feature')
                    file_path = os.path.join(epoch_output_path, 'test.feature')
                save_output(file_path, output)

    print('DONE!')
Example #12
    def get_embedding(self, inputs, id_to_word):
        # embedding layer for input projection
        with tf.variable_scope("Embedding"), tf.device('/cpu:0'):
            if not self.params.pre_emb:
                embedding = tf.get_variable(
                    "word_emb", [self.num_words, self.params.word_dim],
                    initializer=init_ops.uniform_unit_scaling_initializer())
            else:
                print("load word2vec")
                embedding = tf.get_variable(
                    "word_emb",
                    dtype=tf.float32,
                    initializer=np.asarray(load_word2vec(
                        self.params.pre_emb, id_to_word),
                                           dtype=np.float32))

        x = tf.nn.embedding_lookup(embedding, inputs)
        return x
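For intuition, tf.nn.embedding_lookup(embedding, inputs) simply gathers rows of the embedding matrix by id. A small standalone illustration in TF 2 eager style, independent of the TF 1 variable-scope code above:

import numpy as np
import tensorflow as tf

embedding = tf.constant(np.arange(12, dtype=np.float32).reshape(4, 3))  # 4 words, dim 3
inputs = tf.constant([[1, 3], [0, 2]])                                  # batch of id sequences
x = tf.nn.embedding_lookup(embedding, inputs)                           # shape (2, 2, 3)
print(x.shape)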
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.step = tf.Variable(0, trainable=False)
        self.epoch = tf.Variable(0, trainable=False)

        # TODO: get rid of this, use `TextVectorization`.
        embeds_path = os.path.join("data", "twitter_mf.clean.npy")
        if not os.path.isfile(embeds_path):
            word2vec = utils.load_word2vec()
            embedding_matrix = np.random.uniform(low=-1.0,
                                                 high=1.0,
                                                 size=(self.hparams.vocab_size,
                                                       300))
            with open(os.path.join("data", "twitter_mf.clean.vocab")) as f:
                for i, word in enumerate(f):
                    word = word.strip()
                    if word in word2vec:
                        embedding_matrix[i] = word2vec[word]
            np.save(embeds_path, embedding_matrix)
            del word2vec

        else:
            embedding_matrix = np.load(embeds_path)

        # TODO: get rid of this, use `TextVectorization` and figure out how to
        # equivalently incorporate `fine_tune_embeds`.
        self.embed = tfkl.Embedding(
            self.hparams.vocab_size,
            300,
            embeddings_initializer=tf.initializers.constant(embedding_matrix),
        )
        self.embed.trainable = self.hparams.fine_tune_embeds

        self.encoder = self.make_encoder()

        # The "non-moral" axis is actually between 0 and 1, and only 1 when the rest of
        # the components are 0.
        if self.hparams.normalize_nonmoral:
            self.encoder.add(tfkl.Dense(5, activation=tf.math.tanh))
            self.encoder.add(tfkl.Lambda(half_sphere))
        else:
            self.encoder.add(tfkl.Dense(6, activation=tf.math.tanh))
def train():
    with tf.device('/cpu:0'):
        x_text, y, pos1, pos2 = data_helpers.load_data(FLAGS.train_path)

    # Build the word index (tokenizer)
    # Example: x_text[k] = 'the e11 factory e12 products have included flower pots finnish rooster'
    # =>[1  2  4  3  5  6  7  8  9 10 11]
    # =>[1 2 4 3 5 6 7 8 9 10 11 0  0 ... 0 0]

    text_tokenizer = keras.preprocessing.text.Tokenizer()
    text_tokenizer.fit_on_texts(x_text)
    x_text = text_tokenizer.texts_to_sequences(x_text)
    x = keras.preprocessing.sequence.pad_sequences(x_text,
                                                   FLAGS.max_sentence_length,
                                                   padding='post')

    text_vocab_size = len(text_tokenizer.word_index)
    print("Text vocabulary size:{}".format(text_vocab_size))
    print("x shape={0}".format(x.shape))
    print("y shape={0}".format(y.shape))
    print("")

    # Build the position index sequences
    # pos1[k] = ['32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55']
    # => [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
    #    0  0  0  0  0  0  0  0  0  0  0]
    pos_tokenizer = keras.preprocessing.text.Tokenizer()
    pos_tokenizer.fit_on_texts(pos1 + pos2)
    p1 = pos_tokenizer.texts_to_sequences(pos1)
    p2 = pos_tokenizer.texts_to_sequences(pos2)
    p1 = keras.preprocessing.sequence.pad_sequences(p1,
                                                    FLAGS.max_sentence_length,
                                                    padding='post')
    p2 = keras.preprocessing.sequence.pad_sequences(p2,
                                                    FLAGS.max_sentence_length,
                                                    padding='post')

    pos_vocab_size = len(pos_tokenizer.word_index)
    print("Position vocabulary size:{}".format(pos_vocab_size))
    print("pos_1 shape={0}".format(p1.shape))
    print("pos_2 shape={0}".format(p2.shape))
    print("")

    # Randomly shuffle the data, then split it into train and dev sets
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]
    p1_shuffled = p1[shuffle_indices]
    p2_shuffled = p2[shuffle_indices]
    dev_sample_index = -1 * int(float(len(y)) * FLAGS.dev_sample_percentage)
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[
        dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[
        dev_sample_index:]
    p1_train, p1_dev = p1_shuffled[:dev_sample_index], p1_shuffled[
        dev_sample_index:]
    p2_train, p2_dev = p2_shuffled[:dev_sample_index], p2_shuffled[
        dev_sample_index:]
    print("Train/Dev split:{0}/{1}".format(len(y_train), len(y_dev)))
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        session_conf.gpu_options.allow_growth = FLAGS.gpu_allow_growth
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn = TextCNN(x.shape[1], y.shape[1], text_vocab_size + 1,
                          pos_vocab_size + 1, FLAGS.text_embedding_dim,
                          FLAGS.pos_embedding_dim,
                          list(map(int, FLAGS.filter_sizes.split(","))),
                          FLAGS.num_filters, FLAGS.l2_reg_lambda)

            # Define the training procedure
            global_step = tf.Variable(0, trainable=False, name='global_step')
            optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate)
            # optimizer = tf.train.GradientDescentOptimizer(FLAGS.learning_rate)
            # First half of optimizer.minimize(), split out so the gradients can be clipped
            gvs = optimizer.compute_gradients(cnn.loss)
            # Clip the gradients
            capped_gvs = [(tf.clip_by_value(grad, -1.0, 1.0), var)
                          for grad, var in gvs]
            train_op = optimizer.apply_gradients(capped_gvs,
                                                 global_step=global_step)

            # Output directory
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(
                os.path.join(os.path.curdir, "runs", timestamp))
            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", cnn.loss)
            accuracy_summary = tf.summary.scalar("accuracy", cnn.accuracy)
            # Train summaries
            train_summary_op = tf.summary.merge(
                [loss_summary, accuracy_summary])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)
            # Dev summaries
            dev_summary_op = tf.summary.merge([loss_summary, accuracy_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir,
                                                       sess.graph)
            # Checkpoint output
            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=FLAGS.num_checkpoints)

            # Save the text and position tokenizer mappings
            with open(os.path.join(out_dir, 'text_tokenizer.json'), 'w') as js:
                json.dump(text_tokenizer.word_index, js)
            with open(os.path.join(out_dir, 'pos_tokenizer.json'), 'w') as js:
                json.dump(pos_tokenizer.word_index, js)

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            # Pre-trained embeddings
            if FLAGS.embedding_path:
                Pretrained_W = utils.load_word2vec(FLAGS.embedding_path,
                                                   FLAGS.text_embedding_dim,
                                                   text_tokenizer)
                sess.run(cnn.W_text.assign(Pretrained_W))
                print("Loaded the pre-trained embeddings successfully!")
            # Generate batches of training data
            data = list(zip(x_train, p1_train, p2_train, y_train))
            batches = data_helpers.batch_iter(data, FLAGS.batch_size,
                                              FLAGS.num_epochs, True)
            best_f1 = 0.0
            cnt_epoch = 0
            cnt_batch = 0
            for batch_and_per_batches in batches:
                batches_per_epoch = batch_and_per_batches[1]
                batch = batch_and_per_batches[0]
                cnt_batch = cnt_batch + 1
                x_batch, p1_batch, p2_batch, y_batch = zip(*batch)
                feed_dic = {
                    cnn.input_text: x_batch,
                    cnn.input_p1: p1_batch,
                    cnn.input_p2: p2_batch,
                    cnn.input_y: y_batch,
                    cnn.drop_keep_prob: FLAGS.dropout_keep_prob
                }
                _, step, summaries, loss, accuracy = sess.run(
                    [
                        train_op, global_step, train_summary_op, cnn.loss,
                        cnn.accuracy
                    ],
                    feed_dict=feed_dic)
                train_summary_writer.add_summary(summaries, step)
                if cnt_batch == batches_per_epoch:
                    cnt_epoch = cnt_epoch + 1
                    feed_dict = {
                        cnn.input_text: x_dev,
                        cnn.input_p1: p1_dev,
                        cnn.input_p2: p2_dev,
                        cnn.input_y: y_dev,
                        cnn.drop_keep_prob: 1.0
                    }
                    summaries, loss, accuracy, predictions = sess.run([
                        dev_summary_op, cnn.loss, cnn.accuracy, cnn.predictions
                    ], feed_dict)
                    dev_summary_writer.add_summary(summaries, step)

                    f1 = f1_score(np.argmax(y_dev, 1),
                                  predictions,
                                  labels=np.array(range(1, 19)),
                                  average='macro')
                    print("epoch {0} --- loss: {1}  acc:{2}  f1:{3}".format(
                        cnt_epoch, loss, accuracy, f1))
                    if best_f1 < f1:
                        best_f1 = f1
                        path = saver.save(sess,
                                          checkpoint_prefix +
                                          "-{:.3f}".format(best_f1),
                                          global_step=step)
                        print("Model saved to {}".format(path))
                    cnt_batch = 0
Example #15
def main():
    parser = argparse.ArgumentParser(description="-----[Reinforced Visual Semantic Embedding ]-----")
    parser.add_argument('--dataset', default='digit', help='Dataset. (vse | mr | digit)')
    root_args = parser.parse_args(sys.argv[1:3])
    dataset = root_args.dataset

    parser = argparse.ArgumentParser(description="-----[Reinforced Visual Semantic Embedding ]-----")
    if dataset == 'vse':
        # Common params, repeated under each dataset branch so that the default values can differ
        parser.add_argument("--hidden_size",        default=512,     type=int,  help="Size of hidden layer in deep RL")
        parser.add_argument("--episodes",           default=10000,  type=int,   help="number of episodes")
        parser.add_argument("--learning_rate_rl",   default=0.1,   type=float,  help="learning rate")
        parser.add_argument('--margin',             default=0.2,    type=float, help='Rank loss margin.')
        parser.add_argument('--num_epochs',         default=20,     type=int,   help='Number of reward calculation epochs.')
        parser.add_argument('--full_epochs',        default=30,     type=int,   help='Number of training epochs.')
        parser.add_argument('--init_samples',       default=224,    type=int,  help='Number of random initial training samples')
        parser.add_argument('--batch_size',         default=128,    type=int,   help='Size of a training mini-batch.')
        parser.add_argument('--budget',             default=1120,   type=int,   help='Our labeling budget')
        parser.add_argument('--selection_radius',   default=32,     type=int,   help='Selection radius')
        parser.add_argument("--reward_threshold",   default=0,      type=float, help="Reward threshold")
        parser.add_argument('--scorefn',            default='intra',type=str,   help='Score FN for traditional active learning')
        parser.add_argument('--w2v',                action='store_true',        help='Use w2v embeddings')

        # VSE specific params
        parser.add_argument('--embed_size',         default=1024,   type=int,   help='Dimensionality of the joint embedding.')
        parser.add_argument('--word_dim',           default=300,    type=int,   help='Dimensionality of the word embedding.')
        parser.add_argument('--num_layers',         default=1,      type=int,   help='Number of GRU layers.')
        parser.add_argument('--grad_clip',          default=2.,     type=float, help='Gradient clipping threshold.')
        parser.add_argument('--crop_size',          default=224,    type=int,   help='Size of an image crop as the CNN input.')
        parser.add_argument('--learning_rate_vse',  default=.0002,  type=float, help='Initial learning rate.')
        parser.add_argument('--lr_update',          default=10,     type=int,   help='Number of epochs to update the learning rate.')
        parser.add_argument('--workers',            default=10,     type=int,   help='Number of data loader workers.')
        parser.add_argument('--log_step',           default=10,     type=int,   help='Number of steps to print and record the log.')
        parser.add_argument('--val_step',           default=500,    type=int,   help='Number of steps to run validation.')
        parser.add_argument('--img_dim',            default=4096,   type=int,   help='Dimensionality of the image embedding.')
        parser.add_argument('--cnn_type',           default='vgg19',type=str,   help="""The CNN used for image encoder(e.g. vgg19, resnet152)""")
        parser.add_argument('--topk',               default=10,     type=int,   help='Topk similarity to use for state')
        parser.add_argument('--topk_image',         default=0,      type=int,   help='Topk similarity images to use for state')
        parser.add_argument('--data_name',          default='f8k_precomp',      help='{coco,f8k,f30k,10crop}_precomp|coco|f8k|f30k')
        parser.add_argument('--measure',            default='cosine',           help='Similarity measure used (cosine|order)')
        parser.add_argument('--intra_caption',      action='store_true',        help='Include closest captions intra distance in state')
        parser.add_argument('--max_violation',      action='store_true',        help='Use max instead of sum in the rank loss.')
        parser.add_argument('--image_distance',     action='store_true',        help='Include image distance in the state ')
        parser.add_argument('--use_abs',            action='store_true',        help='Take the absolute value of embedding vectors.')
        parser.add_argument('--no_imgnorm',         action='store_true',        help='Do not normalize the image embeddings.')
        parser.add_argument('--finetune',           action='store_true',        help='Fine-tune the image encoder.')
        parser.add_argument('--use_restval',        action='store_true',        help='Use the restval data for training on MSCOCO.')
        # parser.add_argument('--resume',             default='',    type=str, metavar='PATH', help='path to latest checkpoint (default: none)')

    elif dataset == 'mr':
        parser.add_argument('--hidden_size',        default=320,    type=int,   help='Size of hidden layer in deep RL')
        parser.add_argument('--episodes',           default=10000,  type=int,   help='Number of episodes')
        parser.add_argument('--learning_rate_rl',   default=0.1,    type=float, help='learning rate')
        parser.add_argument('--margin',             default=0.2,    type=float, help='Rank loss margin.')
        parser.add_argument('--num_epochs',         default=15,     type=int,   help='Number of training epochs.')
        parser.add_argument('--full_epochs',        default=15,     type=int,   help='Number of training epochs.')
        parser.add_argument('--init_samples',       default=480,    type=int,   help='Number of random initial training samples')
        parser.add_argument('--batch_size',         default=128,    type=int,   help='Size of a training mini-batch.')
        parser.add_argument('--budget',             default=5,      type=int,   help='Our labeling budget')
        parser.add_argument('--selection_radius',   default=32,     type=int,   help='Selection radius')
        parser.add_argument('--reward_threshold',   default=0,      type=float, help='Reward threshold')
        parser.add_argument('--w2v',                action='store_true',        help='Use w2v embeddings')

    elif dataset == 'digit':
        parser.add_argument('--hidden_size',        default=10,     type=int,   help='Size of hidden layer in deep RL')
        parser.add_argument('--episodes',           default=10000,  type=int,   help='Number of episodes')
        parser.add_argument('--learning_rate_rl',   default=0.1,    type=float, help='learning rate')
        parser.add_argument('--margin',             default=0.2,    type=float, help='Rank loss margin.')
        parser.add_argument('--num_epochs',         default=50,    type=int,    help='Number of training epochs.')
        parser.add_argument('--full_epochs',        default=50,    type=int,    help='Number of training epochs.')
        parser.add_argument('--init_samples',       default=5,      type=int,   help='Number of random initial training samples')
        parser.add_argument('--batch_size',         default=128,    type=int,   help='Size of a training mini-batch.')
        parser.add_argument('--budget',             default=30,     type=int,   help='Our labeling budget')
        parser.add_argument('--selection_radius',   default=1,      type=int,   help='Selection radius')
        parser.add_argument("--reward_threshold",   default=0,      type=float, help="Reward threshold")
        parser.add_argument('--w2v',                action='store_true',        help='Use w2v embeddings')

    elif dataset == 'mnist':
        parser.add_argument('--hidden_size',        default=320,    type=int,   help='Size of hidden layer in deep RL')
        parser.add_argument('--episodes',           default=10000,  type=int,   help='Number of episodes')
        parser.add_argument('--learning_rate_rl',   default=0.1,    type=float, help='Learning rate')
        parser.add_argument('--margin',             default=0.2,    type=float, help='Rank loss margin.')
        parser.add_argument('--num_epochs',         default=15,     type=int,   help='Number of training epochs.')
        parser.add_argument('--full_epochs',        default=15,     type=int,   help='Number of training epochs.')
        parser.add_argument('--init_samples',       default=480,    type=int,   help='Number of random initial training samples')
        parser.add_argument('--batch_size',         default=128,    type=int,   help='Size of a training mini-batch.')
        parser.add_argument('--budget',             default=224,    type=int,   help='Our labeling budget')
        parser.add_argument('--selection_radius',   default=32,     type=int,   help='Selection radius')
        parser.add_argument('--reward_threshold',   default=0,      type=float, help='Reward threshold')
        parser.add_argument('--w2v',                action='store_true',        help='Use w2v embeddings')

    elif dataset == 'test':
        parser.add_argument("--hidden_size",        default=4,      type=int,   help="Size of hidden layer in deep RL")
        parser.add_argument("--episodes",           default=10000,  type=int,   help="number of episodes")
        parser.add_argument("--learning_rate_rl",   default=0.1,    type=float, help="learning rate")
        parser.add_argument('--budget',             default=50,     type=int,   help='Our labeling budget')
        parser.add_argument('--init_samples',       default=0,      type=int,   help='Number of random initial training samples')
        parser.add_argument('--num_epochs',         default=0,      type=int,   help='Number of training epochs.')
        parser.add_argument('--full_epochs',        default=0,      type=int,   help='Number of training epochs.')
        parser.add_argument("--reward_threshold",   default=0.6,    type=float, help="Reward threshold")
        parser.add_argument('--w2v',                action='store_true',        help='Use w2v embeddings')

    # Global params all datasets use
    parser.add_argument('--data_path',      default='/data/stud/jorgebjorn/data',       type=str,   help='Dir path to datasets')
    parser.add_argument('--vocab_path',     default='/data/stud/jorgebjorn/data/vocab/',type=str,   help='Dir path to saved vocabulary pickle files.')
    parser.add_argument('--batch_size_rl',  default=32,                                 type=int,   help='Size of a training mini-batch.')
    parser.add_argument('--device',         default=0,                                  type=int,   help='Which gpu to use')
    parser.add_argument('--log',            default='no',                               type=str,   help='Choose between: no, external, local, visdom')
    parser.add_argument('--agent',          default='dqn',                              type=str,   help='Type of reinforcement agent. (dqn | policy, actor_critic)')
    parser.add_argument('--c',              default='',                                 type=str,   help='Comment in logfile')
    parser.add_argument('--gamma',          default=0,                                  type=float, help='Discount factor')
    parser.add_argument('--load_model_name',default='',                                 type=str,   help='Path to existing RL model')

    parser.add_argument('--reset_train',    action='store_true', help='Ensure the training is always done in train mode (Not recommended).')
    parser.add_argument('--no_cuda',        action='store_true', help='Disable cuda')
    parser.add_argument('--reward_clip',    action='store_true', help='Give positive actions +1 and negative actions -1 reward')
    parser.add_argument('--train_shuffle',  action='store_true', help='Shuffle active train set every time')

    params = parser.parse_args(sys.argv[3:])
    params.actions = 2
    params.dataset = dataset
    params.logger_name = '{}_{}_{}_{}_{}_{}'.format(getpass.getuser(), datetime.datetime.now().strftime("%d-%m-%y_%H:%M"), dataset, params.agent, params.c, str(uuid.uuid4())[:4])
    params.external_log_url = 'http://logserver.duckdns.org:5000'

    if torch.cuda.is_available():
        torch.cuda.set_device(params.device)
    params.cuda = (not params.no_cuda) and torch.cuda.is_available()
    params.pid = os.getpid()

    for arg in vars(params):
        opt[arg] = vars(params)[arg]

    # sending tensorboard logs to external server
    if params.log == "external":
        global_logger["lg"] = external_logger()

    # saving tensorboard logs local
    elif params.log == "local":
        global_logger["lg"] = local_logger()

    elif params.log == 'visdom':
        global_logger["lg"] = visdom_logger()
        global_logger["lg"].parameters_summary()

    # no logging at all, for testing purposes.
    else:
        global_logger["lg"] = no_logger()

    container = importlib.import_module('datasets.{}'.format(dataset))
    model = container.model
    load_data = container.load_data
    train_data, dev_data, test_data = load_data()
    data["train"] = train_data
    data["dev"] = dev_data
    data["test"] = test_data

    if params.w2v:
        load_word2vec()

    from train import train
    train(model)
def train():
    with tf.device('/cpu:0'):
        train_text, train_y, train_e1, train_e2, train_pos1, train_pos2, train_rw, train_rw_pos, train_rw_cate = data_helpers.load_data_and_labels(
            FLAGS.train_path)
    with tf.device('/cpu:0'):
        test_text, test_y, test_e1, test_e2, test_pos1, test_pos2, test_rw, test_rw_pos, test_rw_cate = data_helpers.load_data_and_labels(
            FLAGS.test_path)

    #words = data_helpers.relation_words([train_between_e, test_between_e])
    #train_relation_words_between_entity = data_helpers.relation_words_between_entity(train_between_e, words)
    #test_relation_words_between_entity = data_helpers.relation_words_between_entity(test_between_e, words)

    # Build vocabulary
    # Example: x_text[3] = "A misty <e1>ridge</e1> uprises from the <e2>surge</e2>."
    # ['a misty ridge uprises from the surge <UNK> <UNK> ... <UNK>']
    # =>
    # [27 39 40 41 42  1 43  0  0 ... 0]
    # dimension = MAX_SENTENCE_LENGTH
    vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
        FLAGS.max_sentence_length)
    vocab_processor.fit(train_text + test_text)
    train_x = np.array(list(vocab_processor.transform(train_text)))
    test_x = np.array(list(vocab_processor.transform(test_text)))
    train_text = np.array(train_text)
    test_text = np.array(test_text)
    print("\nText Vocabulary Size: {:d}".format(
        len(vocab_processor.vocabulary_)))
    print("train_x = {0}".format(train_x.shape))
    print("train_y = {0}".format(train_y.shape))
    print("test_x = {0}".format(test_x.shape))
    print("test_y = {0}".format(test_y.shape))

    vocab_processor2 = tf.contrib.learn.preprocessing.VocabularyProcessor(6)
    vocab_processor2.fit(train_rw + test_rw)
    train_rw_x = np.array(list(vocab_processor2.transform(train_rw)))
    test_rw_x = np.array(list(vocab_processor2.transform(test_rw)))
    train_rw_text = np.array(train_rw)
    test_rw_text = np.array(test_rw)

    vocab_processor2pos = tf.contrib.learn.preprocessing.VocabularyProcessor(6)
    vocab_processor2pos.fit(train_rw_pos + test_rw_pos)
    train_rw_pos_x = np.array(list(
        vocab_processor2pos.transform(train_rw_pos)))
    test_rw_pos_x = np.array(list(vocab_processor2pos.transform(test_rw_pos)))
    train_rw_pos_text = np.array(train_rw_pos)
    test_rw_pos_text = np.array(test_rw_pos)

    # Example: pos1[3] = [-2 -1  0  1  2   3   4 999 999 999 ... 999]
    # [95 96 97 98 99 100 101 999 999 999 ... 999]
    # =>
    # [11 12 13 14 15  16  21  17  17  17 ...  17]
    # dimension = MAX_SENTENCE_LENGTH
    pos_vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
        FLAGS.max_sentence_length)
    pos_vocab_processor.fit(train_pos1 + train_pos2 + test_pos1 + test_pos2)
    train_p1 = np.array(list(pos_vocab_processor.transform(train_pos1)))
    train_p2 = np.array(list(pos_vocab_processor.transform(train_pos2)))
    test_p1 = np.array(list(pos_vocab_processor.transform(test_pos1)))
    test_p2 = np.array(list(pos_vocab_processor.transform(test_pos2)))
    print("\nPosition Vocabulary Size: {:d}".format(
        len(pos_vocab_processor.vocabulary_)))
    print("train_p1 = {0}".format(train_p1.shape))
    print("test_p1 = {0}".format(test_p1.shape))
    print("")

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        session_conf.gpu_options.allow_growth = FLAGS.gpu_allow_growth
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            model = EntityAttentionLSTM(
                sequence_length=train_x.shape[1],
                rw_length=6,
                num_classes=train_y.shape[1],
                vocab_size=len(vocab_processor.vocabulary_),
                rw_vocab_size=len(vocab_processor2.vocabulary_),
                rw_pos_vocab_size=len(vocab_processor2pos.vocabulary_),
                embedding_size=FLAGS.embedding_size,
                pos_vocab_size=len(pos_vocab_processor.vocabulary_),
                pos_embedding_size=FLAGS.pos_embedding_size,
                hidden_size=FLAGS.hidden_size,
                num_heads=FLAGS.num_heads,
                attention_size=FLAGS.attention_size,
                use_elmo=(FLAGS.embeddings == 'elmo'),
                l2_reg_lambda=FLAGS.l2_reg_lambda)

            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdadeltaOptimizer(FLAGS.learning_rate,
                                                   FLAGS.decay_rate, 1e-6)
            gvs = optimizer.compute_gradients(model.loss)
            capped_gvs = [(tf.clip_by_value(grad, -1.0, 1.0), var)
                          for grad, var in gvs]
            train_op = optimizer.apply_gradients(capped_gvs,
                                                 global_step=global_step)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(
                os.path.join(os.path.curdir, "runs", timestamp))
            print("\nWriting to {}\n".format(out_dir))

            # Logger
            logger = Logger(out_dir)

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", model.loss)
            acc_summary = tf.summary.scalar("accuracy", model.accuracy)

            # Train Summaries
            train_summary_op = tf.summary.merge([loss_summary, acc_summary])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)

            # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=FLAGS.num_checkpoints)

            # Write vocabulary
            vocab_processor.save(os.path.join(out_dir, "vocab"))
            vocab_processor2.save(os.path.join(out_dir, "rw_vocab"))
            vocab_processor2pos.save(os.path.join(out_dir, "rw_pos_vocab"))
            pos_vocab_processor.save(os.path.join(out_dir, "pos_vocab"))

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            if FLAGS.embeddings == "word2vec":
                pretrain_W = utils.load_word2vec(
                    'resource/GoogleNews-vectors-negative300.bin',
                    FLAGS.embedding_size, vocab_processor)
                sess.run(model.W_text.assign(pretrain_W))
                print("Successfully loaded the pre-trained word2vec model!\n")
            elif FLAGS.embeddings == "glove100":
                pretrain_W = utils.load_glove('resource/glove.6B.100d.txt',
                                              FLAGS.embedding_size,
                                              vocab_processor)
                sess.run(model.W_text.assign(pretrain_W))
                print("Successfully loaded the pre-trained glove100 model!\n")
            elif FLAGS.embeddings == "glove300":
                pretrain_W = utils.load_glove('resource/glove.840B.300d.txt',
                                              FLAGS.embedding_size,
                                              vocab_processor)
                pretrain_rw_W = utils.load_glove(
                    'resource/glove.840B.300d.txt', FLAGS.embedding_size,
                    vocab_processor2)
                sess.run(model.W_text.assign(pretrain_W))
                sess.run(model.W_rw_text.assign(pretrain_rw_W))
                print("Successfully loaded the pre-trained glove300 model!\n")

            # Generate batches
            train_batches = data_helpers.batch_iter(
                list(
                    zip(train_x, train_y, train_text, train_e1, train_e2,
                        train_p1, train_p2, train_rw_x, train_rw_text,
                        train_rw_pos_x, train_rw_pos_text, train_rw_cate)),
                FLAGS.batch_size, FLAGS.num_epochs)
            # Training loop. For each batch...
            best_f1 = 0.0  # Best F1 so far; used to decide when to save a checkpoint
            for train_batch in train_batches:
                train_bx, train_by, train_btxt, train_be1, train_be2, train_bp1, train_bp2, train_brw_x, train_brw_text, train_brw_pos_x, train_brw_pos_text, train_brw_cate = zip(
                    *train_batch)
                feed_dict = {
                    model.input_x: train_bx,
                    model.input_y: train_by,
                    model.input_text: train_btxt,
                    model.input_e1: train_be1,
                    model.input_e2: train_be2,
                    model.input_p1: train_bp1,
                    model.input_p2: train_bp2,
                    model.input_rw_x: train_brw_x,  ########
                    model.input_rw_text: train_brw_text,  ##########
                    model.input_rw_pos_x: train_brw_pos_x,  #######
                    model.input_rw_pos_text: train_brw_pos_text,  #######
                    model.input_rw_cate: train_brw_cate,  ###########
                    model.emb_dropout_keep_prob: FLAGS.emb_dropout_keep_prob,
                    model.rnn_dropout_keep_prob: FLAGS.rnn_dropout_keep_prob,
                    model.dropout_keep_prob: FLAGS.dropout_keep_prob
                }
                _, step, summaries, loss, accuracy = sess.run([
                    train_op, global_step, train_summary_op, model.loss,
                    model.accuracy
                ], feed_dict)
                train_summary_writer.add_summary(summaries, step)

                # Training log display
                if step % FLAGS.display_every == 0:
                    logger.logging_train(step, loss, accuracy)

                # Evaluation
                if step % FLAGS.evaluate_every == 0:
                    print("\nEvaluation:")
                    # Generate batches
                    test_batches = data_helpers.batch_iter(list(
                        zip(test_x, test_y, test_text, test_e1, test_e2,
                            test_p1, test_p2, test_rw_x, test_rw_text,
                            test_rw_pos_x, test_rw_pos_text, test_rw_cate)),
                                                           FLAGS.batch_size,
                                                           1,
                                                           shuffle=False)
                    # Training loop. For each batch...
                    losses = 0.0
                    accuracy = 0.0
                    predictions = []
                    iter_cnt = 0
                    for test_batch in test_batches:
                        test_bx, test_by, test_btxt, test_be1, test_be2, test_bp1, test_bp2, test_brw_x, test_brw_text, test_brw_pos_x, test_brw_pos_text, test_brw_cate = zip(
                            *test_batch)
                        feed_dict = {
                            model.input_x: test_bx,
                            model.input_y: test_by,
                            model.input_text: test_btxt,
                            model.input_e1: test_be1,
                            model.input_e2: test_be2,
                            model.input_p1: test_bp1,
                            model.input_p2: test_bp2,
                            model.input_rw_x: test_brw_x,  ########
                            model.input_rw_text: test_brw_text,  ##########
                            model.input_rw_pos_x: test_brw_pos_x,  #######
                            model.input_rw_pos_text:
                            test_brw_pos_text,  #######
                            model.input_rw_cate: test_brw_cate,  #########
                            model.emb_dropout_keep_prob: 1.0,
                            model.rnn_dropout_keep_prob: 1.0,
                            model.dropout_keep_prob: 1.0
                        }
                        loss, acc, pred = sess.run(
                            [model.loss, model.accuracy, model.predictions],
                            feed_dict)
                        losses += loss
                        accuracy += acc
                        predictions += pred.tolist()
                        iter_cnt += 1
                    losses /= iter_cnt
                    accuracy /= iter_cnt
                    predictions = np.array(predictions, dtype='int')

                    logger.logging_eval(step, loss, accuracy, predictions)

                    # Model checkpoint
                    if best_f1 < logger.best_f1:
                        best_f1 = logger.best_f1
                        path = saver.save(sess,
                                          checkpoint_prefix +
                                          "-{:.3g}".format(best_f1),
                                          global_step=step)
                        print("Saved model checkpoint to {}\n".format(path))
Example #17
    test = utils.padding(test, token2idx, tag2idx, maxlen)

    train = utils._to_tensor(train, tf.int32)
    dev = utils._to_tensor(dev, tf.int32)
    test = utils._to_tensor(test, tf.int32)

    # to batch
    train_ds = tf.data.Dataset.from_tensor_slices(train).shuffle(10000).batch(
        batch_size)
    dev_ds = tf.data.Dataset.from_tensor_slices(dev).shuffle(2000).batch(
        batch_size * 2)
    test_ds = tf.data.Dataset.from_tensor_slices(test).shuffle(2000).batch(
        batch_size * 2)

    embedding_pretrained = utils.load_word2vec(
        'data/embeddings/wiki_100.utf8', token2idx, embed_dim,
        'data/embeddings/embed_mat.npy')

    model = LSTM_CRF(len(token2idx), embed_dim, maxlen, len(tag2idx),
                     rnn_hiden_size, embedding_pretrained)
    optimizer = tf.keras.optimizers.Adam(lr=0.003)

    run.training(model, train_ds, dev_ds, epochs, optimizer)
    run.evaluate(model, test_ds, data_name="test set")
    # # # save model
    # # print("\nsave model...")
    # # model.save_weights('model saved/')
    #
    # # load model
    # print("load model...")
    # model.load_weights('model saved/')
            label = [label0, label1][label]
            text = " ".join(tokens)
            if len(text) >= 125:
                text = text[:125] + "..."
            meta_file.write(f"{label}\t{text}\n")
            embedding_matrix.append(avg_embed(acc))

    return list(map(avg_embed, vals))


def norm_dist(u, v):
    return np.linalg.norm(u / np.linalg.norm(u) - v / np.linalg.norm(v))


if __name__ == "__main__":
    w2v = utils.load_word2vec()
    avg_embed = lambda vs: sum(vs) / len(vs)

    if not os.path.exists("projector"):
        os.makedirs("projector")

    stormfront_vals = [[], []]
    twitter_vals = [[], []]
    with open(os.path.join("projector", "metadata.tsv"), "w") as f:
        f.write("label\tcontent\n")

        # for tokens, label in utils.stormfront_gen():
        #     acc = []
        #     for token in tokens:
        #         if token in w2v:
        #             stormfront_vals[label].append(w2v[token])
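A note on norm_dist above: it is the Euclidean distance between unit-normalized vectors, so it is monotonically related to cosine similarity via norm_dist(u, v)**2 == 2 * (1 - cos(u, v)). A quick check, assuming only numpy and the function as defined above:

u = np.array([1.0, 0.0])
v = np.array([1.0, 1.0])
print(norm_dist(u, v))                    # ~0.7654
print(np.sqrt(2 * (1 - 1 / np.sqrt(2))))  # same value, via the cosine identity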
Example #19
def train():
    with tf.device('/cpu:0'):
        x_text, y, pos1, pos2, x_text_clean, sentence_len = data_helpers.load_data_and_labels(
            FLAGS.train_path)

    # Build vocabulary
    # Example: x_text[3] = "A misty <e1>ridge</e1> uprises from the <e2>surge</e2>."
    # ['a misty ridge uprises from the surge <UNK> <UNK> ... <UNK>']
    # =>
    # [27 39 40 41 42  1 43  0  0 ... 0]
    # dimension = FLAGS.max_sentence_length
    # print("text:",x_text)
    text_vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
        FLAGS.max_sentence_length)
    x = np.array(list(text_vocab_processor.fit_transform(x_text)))  #token
    # pretrain_W = utils.load_word2vec(FLAGS.embedding_path, FLAGS.text_embedding_dim, text_vocab_processor)
    # print("pretrain_w:",pretrain_W)
    # print(pretrain_W.shape) #(19151,300)
    print("Text Vocabulary Size: {:d}".format(
        len(text_vocab_processor.vocabulary_)))
    # print("vocabulary:", text_vocab_processor.vocabulary_._reverse_mapping)
    # with open("vocabulary.txt","w",encoding="utf-8") as f:
    #     f.write(str(x))
    print("x = {0}".format(x.shape))  #(8000,90)
    print("y = {0}".format(y.shape))  #(8000,19)
    print("")

    # Example: pos1[3] = [-2 -1  0  1  2   3   4 999 999 999 ... 999]
    # [95 96 97 98 99 100 101 999 999 999 ... 999]
    # =>
    # [11 12 13 14 15  16  21  17  17  17 ...  17]
    # dimension = MAX_SENTENCE_LENGTH
    pos_vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
        FLAGS.max_sentence_length)
    pos_vocab_processor.fit(pos1 + pos2)  #fit
    # print("pos vocab position:", pos_vocab_processor)
    p1 = np.array(list(pos_vocab_processor.transform(pos1)))  #tokens
    # print("p1:", p1)
    p2 = np.array(list(pos_vocab_processor.transform(pos2)))
    print("Position Vocabulary Size: {:d}".format(
        len(pos_vocab_processor.vocabulary_)))
    # with open("position.txt", "w", encoding="utf-8") as f:
    #         f.write(str(x))
    print("position_1 = {0}".format(p1.shape))  #(8000,90)
    print("position_2 = {0}".format(p2.shape))  #(8000,90)
    print("")

    # Randomly shuffle data to split into train and test(dev)
    np.random.seed(10)

    shuffle_indices = np.random.permutation(np.arange(len(y)))  #len(y)=8000
    x_shuffled = x[shuffle_indices]
    p1_shuffled = p1[shuffle_indices]
    p2_shuffled = p2[shuffle_indices]
    y_shuffled = y[shuffle_indices]
    # print(x_shuffled, p1_shuffled,p2_shuffled,y_shuffled)

    # Split train/test set
    # TODO: This is very crude, should use cross-validation
    dev_sample_index = -1 * int(
        FLAGS.dev_sample_percentage * float(len(y)))  #x_train=7200, x_dev =800
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[
        dev_sample_index:]
    p1_train, p1_dev = p1_shuffled[:dev_sample_index], p1_shuffled[
        dev_sample_index:]
    p2_train, p2_dev = p2_shuffled[:dev_sample_index], p2_shuffled[
        dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[
        dev_sample_index:]
    print("Train/Dev split: {:d}/{:d}\n".format(len(y_train), len(y_dev)))
    # print(x_train)
    # print(np.array(x_train))
    # print(x_dev)
    # print(np.array(x_dev))

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        session_conf.gpu_options.allow_growth = FLAGS.gpu_allow_growth
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn = TextCNN(
                sequence_length=x_train.shape[1],  #90
                num_classes=y_train.shape[1],  #19
                text_vocab_size=len(text_vocab_processor.vocabulary_),  #19151
                text_embedding_size=FLAGS.text_embedding_size,  #300
                pos_vocab_size=len(pos_vocab_processor.vocabulary_),  #162
                pos_embedding_size=FLAGS.pos_embedding_dim,  #50
                filter_sizes=list(map(
                    int, FLAGS.filter_sizes.split(","))),  #2,3,4,5
                num_filters=FLAGS.num_filters,  #128
                l2_reg_lambda=FLAGS.l2_reg_lambda)  #1e-5

            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdadeltaOptimizer(FLAGS.learning_rate,
                                                   FLAGS.decay_rate, 1e-6)
            gvs = optimizer.compute_gradients(cnn.loss)
            capped_gvs = [(tf.clip_by_value(grad, -1.0, 1.0), var)
                          for grad, var in gvs]
            train_op = optimizer.apply_gradients(capped_gvs,
                                                 global_step=global_step)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(
                os.path.join(os.path.curdir, "runs", timestamp))
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", cnn.loss)
            acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

            # Train Summaries
            train_summary_op = tf.summary.merge([loss_summary, acc_summary])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)

            # Dev summaries
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir,
                                                       sess.graph)

            # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=FLAGS.num_checkpoints)

            # Write vocabulary
            text_vocab_processor.save(os.path.join(out_dir, "text_vocab"))
            pos_vocab_processor.save(os.path.join(out_dir, "pos_vocab"))

            # Initialize all variables
            sess.run(tf.global_variables_initializer())
            # FLAGS._sess =sess
            print("shape:", x_train.shape)
            print(y_train.shape)
            # Pre-trained word2vec
            if FLAGS.embedding_path:
                pretrain_W = utils.load_word2vec(FLAGS.embedding_path,
                                                 FLAGS.text_embedding_size,
                                                 text_vocab_processor)
                sess.run(cnn.W_text.assign(pretrain_W))
                print("Success to load pre-trained word2vec model!\n")

            # Generate batches
            batches = data_helpers.batch_iter(
                list(zip(x_train, p1_train, p2_train, y_train)),
                FLAGS.batch_size, FLAGS.num_epochs)
            print(batches)
            # Training loop. For each batch...
            best_f1 = 0.0  # best dev F1 so far, used to decide when to save a checkpoint
            for batch in batches:
                x_batch, p1_batch, p2_batch, y_batch = zip(*batch)
                # Train
                feed_dict = {
                    cnn.input_text: x_batch,
                    cnn.input_p1: p1_batch,
                    cnn.input_p2: p2_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
                }
                # print(len(x_batch))
                # print(len(y_batch))
                _, step, summaries, loss, accuracy = sess.run([
                    train_op, global_step, train_summary_op, cnn.loss,
                    cnn.accuracy
                ], feed_dict)
                train_summary_writer.add_summary(summaries, step)

                # Training log display
                if step % FLAGS.display_every == 0:
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}".format(
                        time_str, step, loss, accuracy))

                # Evaluation
                if step % FLAGS.evaluate_every == 0:
                    print("\nEvaluation:")
                    feed_dict = {
                        cnn.input_text: x_dev,
                        cnn.input_p1: p1_dev,
                        cnn.input_p2: p2_dev,
                        cnn.input_y: y_dev,
                        cnn.dropout_keep_prob: 1.0
                    }
                    summaries, loss, accuracy, predictions, text_expand_shape, pos_shape, embedding_size_shape, embedding_shape, text_shape = sess.run(
                        [
                            dev_summary_op, cnn.loss, cnn.accuracy,
                            cnn.predictions, cnn.text_expand_shape,
                            cnn.pos_expand_shape, cnn.embedding_size_shape,
                            cnn.embedd_shape, cnn.text_shape
                        ], feed_dict)
                    dev_summary_writer.add_summary(summaries, step)

                    time_str = datetime.datetime.now().isoformat()
                    f1 = f1_score(np.argmax(y_dev, axis=1),
                                  predictions,
                                  labels=np.array(range(1, 19)),
                                  average="macro")
                    precision = tf.metrics.precision(np.argmax(y_dev, axis=1),
                                                     predictions,
                                                     weights=1)
                    recall = tf.metrics.recall(np.argmax(y_dev, axis=1),
                                               predictions,
                                               weights=1)

                    print("{}: step {}, loss {:g}, acc {:g}".format(
                        time_str, step, loss, accuracy))
                    print(
                        "[UNOFFICIAL] (2*9+1)-Way Macro-Average F1 Score (excluding Other): {:g}\n"
                        .format(f1))
                    # print("text_embedded_shape:", text_shape)
                    # print("text_embedd_extend:", text_expand_shape)
                    # print("pos-embedd_extend:", pos_shape)
                    # print("embedding_size:", embedding_shape)
                    # print("embedding_size_shape", embedding_size_shape)
                    print("predit:", predictions)
                    print(predictions.shape)
                    print("y_dev:", y_dev)
                    print(y_dev.shape)

                    # Model checkpoint
                    if best_f1 < f1:
                        best_f1 = f1
                        path = saver.save(sess,
                                          checkpoint_prefix +
                                          "-{:.3g}".format(best_f1),
                                          global_step=step)
                        print("Saved model checkpoint to {}\n".format(path))
Beispiel #20
0
def main(gpu, path_corpus, path_config, path_word2vec):
    MAX_EPOCH = 50
    EVAL = 200
    MAX_LENGTH = 70
    COUNTS_CACHE = "./cache/counts.pkl"
    
    config = utils.Config(path_config)
    word_dim = config.getint("word_dim") 
    state_dim = config.getint("state_dim")
    grad_clip = config.getfloat("grad_clip")
    weight_decay = config.getfloat("weight_decay")
    batch_size = config.getint("batch_size")
    sample_size = config.getint("sample_size")
    
    print "[info] CORPUS: %s" % path_corpus
    print "[info] CONFIG: %s" % path_config
    print "[info] PRE-TRAINED WORD EMBEDDINGS: %s" % path_word2vec
    print "[info] WORD DIM: %d" % word_dim
    print "[info] STATE DIM: %d" % state_dim
    print "[info] GRADIENT CLIPPING: %f" % grad_clip
    print "[info] WEIGHT DECAY: %f" % weight_decay
    print "[info] BATCH SIZE: %d" % batch_size

    path_save_head = os.path.join(config.getpath("snapshot"),
            "rnnlm.%s.%s" % (
                os.path.basename(path_corpus),
                os.path.splitext(os.path.basename(path_config))[0]))
    print "[info] SNAPSHOT: %s" % path_save_head
    
    sents_train, sents_val, vocab, ivocab = \
            utils.load_corpus(path_corpus=path_corpus, max_length=MAX_LENGTH)

    #counts = None

    #print("[info] Load word counter")
    #if os.path.exists(COUNTS_CACHE):
    #    print("[info] Found cache of counter")
    #    counts = pickle.load(open(COUNTS_CACHE, "rb"))

    #    if len(counts) != len(vocab):
    #        counts = None

    #if counts is None:
    #    counts = Counter()

    #    for sent in list(sents_train) + list(sents_val):
    #        counts += Counter(sent)

    #    pickle.dump(counts, open(COUNTS_CACHE, "wb"))

    #cs = [counts[w] for w in range(len(counts))]

    if path_word2vec is not None:
        word2vec = utils.load_word2vec(path_word2vec, word_dim)
        initialW = utils.create_word_embeddings(vocab, word2vec, dim=word_dim, scale=0.001)
    else:
        initialW = None

    cuda.get_device(gpu).use()

    model = models.CXT_BLSTM(
            vocab_size=len(vocab),
            word_dim=word_dim,
            state_dim=state_dim,
            initialW=initialW,
            EOS_ID=vocab["<EOS>"])

    model.to_gpu(gpu)

    opt = optimizers.SMORMS3()
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(grad_clip))
    opt.add_hook(chainer.optimizer.WeightDecay(weight_decay))

    # sampler = utils.RandomSampler(cs, sample_size)

    #print "[info] Evaluating on the validation sentences ..."
    #loss_data = evaluate(model, sents_val, ivocab, word_dim, sampler)
    #print "[validation] iter=0, epoch=0, loss=%f" \
    #    % (loss_data)
    
    it = 0
    n_train = len(sents_train)
    vocab_size = model.vocab_size

    for epoch in xrange(1, MAX_EPOCH+1):
        perm = np.random.permutation(n_train)
        for data_i in xrange(0, n_train, batch_size):
            if data_i + batch_size > n_train:
                break
            words = sents_train[perm[data_i:data_i+batch_size]]
            xs, ms = utils.make_batch(words, train=True, tail=False, mask=True)

            ys = model.forward(xs=xs, ms=ms, train=True)
            
            words_without_edge = [w[1:-1] for w in words]
            xs_without_edge, ms_without_edge = utils.make_batch(words_without_edge, train=True, tail=False, mask=True)

            masked_ys = []
            for y, m in zip(ys, ms_without_edge):
                m_ext = F.broadcast_to(F.reshape(m, (batch_size, 1)), (batch_size, vocab_size))
                masked_ys.append(y*m_ext)

            #ts = model.embed_words(xs_without_edge, ms_without_edge, train=True) # excluding BOS and EOS

            #  T : maximum sentence length within the batch
            #  N : batch size
            # |D|: word_dim
            ys = F.concat(masked_ys, axis=0) # (TN, |V|)
            ts = F.concat(xs_without_edge, axis=0) # (TN,) target word ids

            ys = F.reshape(ys, (-1, vocab_size)) # (TN, |V|)
            ts = F.reshape(ts, (-1,)) # (TN,)

            loss = F.softmax_cross_entropy(ys, ts)
            acc = F.accuracy(ys, ts, ignore_label=-1)
        
            model.zerograds()
            loss.backward()
            loss.unchain_backward()
            opt.update()
            it += 1

            loss_data = float(cuda.to_cpu(loss.data))
            perp = math.exp(loss_data)
            acc_data = float(cuda.to_cpu(acc.data))

            print "[training] iter=%d, epoch=%d (%d/%d=%.03f%%), perplexity=%f, accuracy=%.2f%%" \
                    % (it, epoch, data_i+batch_size, n_train,
                        float(data_i+batch_size)/n_train*100,
                        perp, acc_data*100)

            if it % EVAL == 0:
                print "[info] Evaluating on the validation sentences ..."
                loss_data, acc_data = evaluate(model, sents_val, ivocab, word_dim)
                perp = math.exp(loss_data)
                print "[validation] iter=%d, epoch=%d, perplexity=%f, accuracy=%.2f%%" \
                        % (it, epoch, perp, acc_data*100)

                serializers.save_npz(path_save_head + ".iter_%d.epoch_%d.model" % (it, epoch),
                        model)
                # utils.save_word2vec(path_save_head + ".iter_%d.epoch_%d.vectors.txt" % (it, epoch),
                #         utils.extract_word2vec(model, vocab))
                print "[info] Saved."

    print "[info] Done."
Beispiel #21
0
        return support_set, support_label, query_set, query_label

    def next_batch(self, B, N, K, Q):
        support_set = []
        support_label = []
        query_set = []
        label = []

        for one in range(B):
            cur_support, cur_support_label, cur_query, cur_label = self.next_one(
                N, K, Q)
            support_set.append(cur_support)
            support_label.append(cur_support_label)
            query_set.append(cur_query)
            label.append(cur_label)

        support = np.stack(support_set, 0)
        support_label = np.stack(support_label, 0)
        query = np.stack(query_set, 0)
        label = np.stack(label, 0)

        return support, support_label, query, label


if __name__ == "__main__":
    import utils
    vocab, embedding = utils.load_word2vec('data/tencent_embedding.txt')
    data_loader = DataLoader('data/sample_data.json', vocab)

    data_loader.next_batch(4, 5, 5, 5)
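Here utils.load_word2vec is expected to return a (vocab, embedding) pair; a minimal sketch, under the assumption that data/tencent_embedding.txt is in the usual word2vec text format (one "word v1 ... vd" line per entry, optionally preceded by a "count dim" header):

import numpy as np


def load_word2vec(path):
    # Returns (vocab, embedding): vocab maps word -> row index in embedding.
    vocab, vectors = {}, []
    with open(path, encoding="utf-8") as f:
        for line in f:
            parts = line.rstrip().split()
            if len(parts) <= 2:  # skip a possible "<count> <dim>" header line
                continue
            vocab[parts[0]] = len(vectors)
            vectors.append(np.asarray(parts[1:], dtype=np.float32))
    return vocab, np.stack(vectors)
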
Beispiel #22
0
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('--epochs', dest='epochs', type=int, default=10)
    parser.add_argument('--output', dest='output', type=str, default='./nn_result.csv')
    parser.add_argument('--log', dest='log', type=str)
    parser.add_argument('--w2v', dest='w2v', type=str)
    parser.add_argument('--freeze', dest='freeze', action='store_true', default=False)
    parser.add_argument('--model', dest='model', type=str, choices=['CNN', 'RNN'], default='CNN')
    args = parser.parse_args()
    
    if args.log is not None:
        handler = logging.FileHandler(args.log)
        handler.setLevel(logging.INFO)
        handler.setFormatter(formatter)
        logger.addHandler(handler)
    logger.info(args)
    logger.info('device = {}'.format(device))

    tsv_train = pd.read_csv('./data/train.tsv', sep='\t')
    num = len(tsv_train)
    num_val = int(num * 0.1)
    num_train = num - num_val
    
    tsv_val = tsv_train[num_train:]
    tsv_train = tsv_train[:num_train]
    
    x_train = tsv_train['Phrase'].values
    y_train = tsv_train['Sentiment'].values
    x_val = tsv_val['Phrase'].values
    y_val = tsv_val['Sentiment'].values

    if args.w2v is None:
        tokenizer = Tokenizer(text=x_train)
    else:
        from utils import load_word2vec
        w2v = load_word2vec(args.w2v)
        words = list(w2v.keys())
        tokenizer = Tokenizer(words=words)
        vecs = list(w2v.values())
        vecs.insert(0, [.0] * embedding_dim)
        vecs.insert(1, [.0] * embedding_dim)
    
    vocab_size = len(tokenizer.vocabulary_)
    logger.info('vocab_size = {}'.format(vocab_size))
    
    train_dataset = MyDataset(x_train, y_train, seq_length, tokenizer)
    train_dl = DataLoader(train_dataset, shuffle=True, batch_size=128)
    
    val_dataset = MyDataset(x_val, y_val, seq_length, tokenizer)
    val_dl = DataLoader(val_dataset, shuffle=False, batch_size=128)
    
    net = TextCNN(seq_length, vocab_size, embedding_dim, num_classes) if args.model == 'CNN' \
        else TextRNN(seq_length, vocab_size, embedding_dim, num_classes)

    if args.w2v is not None:
        # from_pretrained is a classmethod returning a new Embedding, so the
        # result must be assigned back to the model
        net.embed = net.embed.from_pretrained(torch.tensor(vecs).float(),
                                              freeze=args.freeze)
    net.to(device)
    
    train(net, train_dl, val_dl, args.epochs, device)
    
    tsv_test = pd.read_csv('./data/test.tsv', sep='\t')
    x_test = tsv_test['Phrase'].values
    test_dataset = MyDataset(x_test, seq_length=seq_length, tokenizer=tokenizer)
    test_dl = DataLoader(test_dataset, shuffle=False, batch_size=128)
    y_test = []
    net.eval()  # switch off dropout for inference
    with torch.no_grad():
        for data in test_dl:
            x = data.to(device)
            out = net(x)
            y_test += out.argmax(1).cpu().tolist()
        
    tsv_test['Sentiment'] = y_test
    tsv_test[['PhraseId', 'Sentiment']].to_csv(args.output, index=False)
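As a side note on the embedding initialization above: nn.Embedding.from_pretrained returns a new module, so its result has to be kept. A minimal, self-contained illustration (dummy weights, unrelated to the custom Tokenizer in this example):

import torch
import torch.nn as nn

weights = torch.zeros(10, 4)  # 10 rows of 4-dim vectors (dummy values)
embed = nn.Embedding.from_pretrained(weights, freeze=False)
print(embed(torch.tensor([1, 2, 3])).shape)  # torch.Size([3, 4])
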
Beispiel #23
0
def main(gpu, path_corpus, path_config, path_word2vec):
    MAX_EPOCH = 50
    EVAL = 200
    MAX_LENGTH = 70

    config = utils.Config(path_config)
    model_name = config.getstr("model")
    word_dim = config.getint("word_dim")
    state_dim = config.getint("state_dim")
    grad_clip = config.getfloat("grad_clip")
    weight_decay = config.getfloat("weight_decay")
    batch_size = config.getint("batch_size")

    print "[info] CORPUS: %s" % path_corpus
    print "[info] CONFIG: %s" % path_config
    print "[info] PRE-TRAINED WORD EMBEDDINGS: %s" % path_word2vec
    print "[info] MODEL: %s" % model_name
    print "[info] WORD DIM: %d" % word_dim
    print "[info] STATE DIM: %d" % state_dim
    print "[info] GRADIENT CLIPPING: %f" % grad_clip
    print "[info] WEIGHT DECAY: %f" % weight_decay
    print "[info] BATCH SIZE: %d" % batch_size

    path_save_head = os.path.join(
        config.getpath("snapshot"),
        "rnnlm.%s.%s" % (os.path.basename(path_corpus),
                         os.path.splitext(os.path.basename(path_config))[0]))
    print "[info] SNAPSHOT: %s" % path_save_head

    sents_train, sents_val, vocab, ivocab = \
            utils.load_corpus(path_corpus=path_corpus, max_length=MAX_LENGTH)

    if path_word2vec is not None:
        word2vec = utils.load_word2vec(path_word2vec, word_dim)
        initialW = utils.create_word_embeddings(vocab,
                                                word2vec,
                                                dim=word_dim,
                                                scale=0.001)
    else:
        initialW = None

    cuda.get_device(gpu).use()
    if model_name == "rnn":
        model = models.RNN(vocab_size=len(vocab),
                           word_dim=word_dim,
                           state_dim=state_dim,
                           initialW=initialW,
                           EOS_ID=vocab["<EOS>"])
    elif model_name == "lstm":
        model = models.LSTM(vocab_size=len(vocab),
                            word_dim=word_dim,
                            state_dim=state_dim,
                            initialW=initialW,
                            EOS_ID=vocab["<EOS>"])
    elif model_name == "gru":
        model = models.GRU(vocab_size=len(vocab),
                           word_dim=word_dim,
                           state_dim=state_dim,
                           initialW=initialW,
                           EOS_ID=vocab["<EOS>"])
    elif model_name == "bd_lstm":
        model = models.BD_LSTM(vocab_size=len(vocab),
                               word_dim=word_dim,
                               state_dim=state_dim,
                               initialW=initialW,
                               EOS_ID=vocab["<EOS>"])
    else:
        print "[error] Unknown model name: %s" % model_name
        sys.exit(-1)
    model.to_gpu(gpu)

    opt = optimizers.SMORMS3()
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(grad_clip))
    opt.add_hook(chainer.optimizer.WeightDecay(weight_decay))

    print "[info] Evaluating on the validation sentences ..."
    loss_data, acc_data = evaluate(model, model_name, sents_val, ivocab)
    perp = math.exp(loss_data)
    print "[validation] iter=0, epoch=0, perplexity=%f, accuracy=%.2f%%" \
        % (perp, acc_data*100)

    it = 0
    n_train = len(sents_train)
    vocab_size = model.vocab_size
    for epoch in xrange(1, MAX_EPOCH + 1):
        perm = np.random.permutation(n_train)
        for data_i in xrange(0, n_train, batch_size):
            if data_i + batch_size > n_train:
                break
            words = sents_train[perm[data_i:data_i + batch_size]]

            if model_name == "bd_lstm":
                xs, ms = utils.make_batch(words,
                                          train=True,
                                          tail=False,
                                          mask=True)
                ys = model.forward(xs=xs, ms=ms, train=True)
            else:
                xs = utils.make_batch(words, train=True, tail=False)
                ys = model.forward(ts=xs, train=True)

            ys = F.concat(ys, axis=0)
            ts = F.concat(xs, axis=0)
            ys = F.reshape(ys, (-1, vocab_size))  # (TN, |V|)
            ts = F.reshape(ts, (-1, ))  # (TN,)

            loss = F.softmax_cross_entropy(ys, ts)
            acc = F.accuracy(ys, ts, ignore_label=-1)

            model.zerograds()
            loss.backward()
            loss.unchain_backward()
            opt.update()
            it += 1

            loss_data = float(cuda.to_cpu(loss.data))
            perp = math.exp(loss_data)
            acc_data = float(cuda.to_cpu(acc.data))
            print "[training] iter=%d, epoch=%d (%d/%d=%.03f%%), perplexity=%f, accuracy=%.2f%%" \
                    % (it, epoch, data_i+batch_size, n_train,
                        float(data_i+batch_size)/n_train*100,
                        perp, acc_data*100)

            if it % EVAL == 0:
                print "[info] Evaluating on the validation sentences ..."
                loss_data, acc_data = evaluate(model, model_name, sents_val,
                                               ivocab)
                perp = math.exp(loss_data)
                print "[validation] iter=%d, epoch=%d, perplexity=%f, accuracy=%.2f%%" \
                        % (it, epoch, perp, acc_data*100)

                serializers.save_npz(
                    path_save_head + ".iter_%d.epoch_%d.model" % (it, epoch),
                    model)
                utils.save_word2vec(
                    path_save_head + ".iter_%d.epoch_%d.vectors.txt" %
                    (it, epoch), utils.extract_word2vec(model, vocab))
                print "[info] Saved."

    print "[info] Done."
Beispiel #24
0
def main():
    parser = argparse.ArgumentParser(description="-----[CNN-classifier]-----")
    parser.add_argument("--similarity",
                        default=0.0,
                        type=float,
                        help="similarity threshold")
    parser.add_argument(
        "--similarity_representation",
        default="W2V",
        help=
        "similarity representation. Available methods: CNN, AUTOENCODER, W2V")
    parser.add_argument(
        "--mode",
        default="train",
        help="train: train (with test) a model / test: test saved models")
    parser.add_argument(
        "--model",
        default="cnn",
        help="Type of model to use. Default: CNN. Available models: CNN, RNN")
    parser.add_argument("--embedding",
                        default="w2v",
                        help="available embedings: random, w2v")
    parser.add_argument("--dataset",
                        default="MR",
                        help="available datasets: MR, TREC")
    parser.add_argument("--encoder",
                        default=None,
                        help="Path to encoder model file")
    parser.add_argument("--decoder",
                        default=None,
                        help="Path to decoder model file")
    parser.add_argument('--batch-size',
                        type=int,
                        default=32,
                        help='batch size for training [default: 32]')
    parser.add_argument(
        '--selection-size',
        type=int,
        default=32,
        help='selection size for selection function [default: 32]')
    parser.add_argument("--save_model",
                        default="F",
                        help="whether saving model or not (T/F)")
    parser.add_argument("--early_stopping",
                        default="F",
                        help="whether to apply early stopping(T/F)")
    parser.add_argument("--epoch",
                        default=100,
                        type=int,
                        help="number of max epoch")
    parser.add_argument("--learning_rate",
                        default=0.1,
                        type=float,
                        help="learning rate")
    parser.add_argument("--dropout_embed",
                        default=0.2,
                        type=float,
                        help="Dropout embed probability. Default: 0.2")
    parser.add_argument("--dropout_model",
                        default=0.4,
                        type=float,
                        help="Dropout model probability. Default: 0.4")
    parser.add_argument('--device',
                        type=int,
                        default=0,
                        help='Cuda device to run on')
    parser.add_argument('--no-cuda',
                        action='store_true',
                        default=False,
                        help='disable the gpu')
    parser.add_argument(
        "--scorefn",
        default="entropy",
        help="available scoring functions: entropy, random, egl")
    parser.add_argument('--average',
                        type=int,
                        default=1,
                        help='Number of runs to average [default: 1]')
    parser.add_argument('--hnodes',
                        type=int,
                        default=256,
                        help='Number of nodes in the hidden layer(s)')
    parser.add_argument('--hlayers',
                        type=int,
                        default=1,
                        help='Number of hidden layers')
    parser.add_argument('--weight_decay',
                        type=float,
                        default=1e-5,
                        help='Value of weight_decay')
    parser.add_argument('--no-log',
                        action='store_true',
                        default=False,
                        help='Disable logging')
    parser.add_argument('--data_path',
                        default='/data/stud/jorgebjorn/data',
                        type=str,
                        help='Dir path to datasets')
    parser.add_argument('--c', default='', type=str, help='Comment for this run')

    options = parser.parse_args()

    params["DATA_PATH"] = options.data_path  #TODO rewrite?

    getattr(utils, "read_{}".format(options.dataset))()

    data["vocab"] = sorted(
        list(
            set([
                w for sent in data["train_x"] + data["dev_x"] + data["test_x"]
                for w in sent
            ])))
    data["classes"] = sorted(list(set(data["train_y"])))
    data["word_to_idx"] = {w: i for i, w in enumerate(data["vocab"])}

    params_local = {
        "SIMILARITY_THRESHOLD": options.similarity,
        "SIMILARITY_REPRESENTATION": options.similarity_representation,
        "DATA_PATH": options.data_path,
        "MODEL": options.model,
        "EMBEDDING": options.embedding,
        "DATASET": options.dataset,
        "SAVE_MODEL": bool(options.save_model == "T"),
        "EARLY_STOPPING": bool(options.early_stopping == "T"),
        "EPOCH": options.epoch,
        "LEARNING_RATE": options.learning_rate,
        "MAX_SENT_LEN": max(
            len(sent)
            for sent in data["train_x"] + data["dev_x"] + data["test_x"]),
        "SELECTION_SIZE": options.selection_size,
        "BATCH_SIZE": options.batch_size,
        "WORD_DIM": 300,
        "VOCAB_SIZE": len(data["vocab"]),
        "CLASS_SIZE": len(data["classes"]),
        "FILTERS": [3, 4, 5],
        "FILTER_NUM": [100, 100, 100],
        "DROPOUT_EMBED": options.dropout_embed,
        "DROPOUT_MODEL": options.dropout_model,
        "DEVICE": options.device,
        "NO_CUDA": options.no_cuda,
        "SCORE_FN": options.scorefn,
        "N_AVERAGE": options.average,
        "HIDDEN_SIZE": options.hnodes,
        "HIDDEN_LAYERS": options.hlayers,
        "WEIGHT_DECAY": options.weight_decay,
        "LOG": not options.no_log,
        "ENCODER": options.encoder,
        "DECODER": options.decoder,
        "C": options.c
    }

    params.update(params_local)

    if params["LOG"]:
        logger_name = 'SS/{}_{}_{}_{}_{}'.format(
            getpass.getuser(),
            datetime.datetime.now().strftime("%d-%m-%y_%H:%M"),
            options.dataset, params["C"],
            str(uuid.uuid4())[:4])
        global_logger["lg"] = VisdomLogger(
            logger_name, "{}_{}".format(params["SIMILARITY_THRESHOLD"],
                                        params["SIMILARITY_REPRESENTATION"]))
        # global_logger["lg"].parameters_summary()
        print("visdom logger OK")
        # quit()

    params["CUDA"] = (not params["NO_CUDA"]) and torch.cuda.is_available()
    del params["NO_CUDA"]

    if params["CUDA"]:
        torch.cuda.set_device(params["DEVICE"])

    if params["EMBEDDING"] == "w2v":
        utils.load_word2vec()

    encoder = rnnae.EncoderRNN()
    # decoder = rnnae.DecoderRNN()
    decoder = rnnae.AttnDecoderRNN()
    feature_extractor = CNN2()

    if params["ENCODER"] != None:
        print("Loading encoder")
        encoder.load_state_dict(torch.load(params["ENCODER"]))

    if params["DECODER"] != None:
        print("Loading decoder")
        decoder.load_state_dict(torch.load(params["DECODER"]))

    if params["CUDA"]:
        encoder, decoder, feature_extractor = encoder.cuda(), decoder.cuda(
        ), feature_extractor.cuda()

    models["ENCODER"] = encoder
    models["DECODER"] = decoder
    models["FEATURE_EXTRACTOR"] = feature_extractor

    print("=" * 20 + "INFORMATION" + "=" * 20)
    for key, value in params.items():
        print("{}: {}".format(key.upper(), value))

    if params["EMBEDDING"] == "random" and params["SIMILARITY_THRESHOLD"] > 0:
        print("********** WARNING *********")
        print("Random embedding makes similarity threshold have no effect. \n")

    print("=" * 20 + "TRAINING STARTED" + "=" * 20)
    train.active_train()
    print("=" * 20 + "TRAINING FINISHED" + "=" * 20)
Beispiel #25
0
def train():
    with tf.device('/cpu:0'):
        train_text, train_y, train_pos1, train_pos2, train_x_text_clean, train_sentence_len = data_helpers.load_data_and_labels(
            FLAGS.train_path)
    with tf.device('/cpu:0'):
        test_text, test_y, test_pos1, test_pos2, test_x_text_clean, test_sentence_len = data_helpers.load_data_and_labels(
            FLAGS.test_path)
    # Build vocabulary
    # Example: x_text[3] = "A misty <e1>ridge</e1> uprises from the <e2>surge</e2>."
    # ['a misty ridge uprises from the surge <UNK> <UNK> ... <UNK>']
    # =>
    # [27 39 40 41 42  1 43  0  0 ... 0]
    # dimension = FLAGS.max_sentence_length
    # print("text:",x_text)
    vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
        FLAGS.max_sentence_length)
    vocab_processor.fit(train_text + test_text)
    train_x = np.array(list(vocab_processor.transform(train_text)))
    test_x = np.array(list(vocab_processor.transform(test_text)))
    train_text = np.array(train_text)
    print("train_text", train_text[0:2])
    test_text = np.array(test_text)
    print("\nText Vocabulary Size: {:d}".format(
        len(vocab_processor.vocabulary_)))
    print("train_x = {0}".format(train_x.shape))  # (8000,90)
    print("train_y = {0}".format(train_y.shape))  # (8000,19)
    print("test_x = {0}".format(test_x.shape))  # (2717, 90)
    print("test_y = {0}".format(test_y.shape))  # (2717,19)

    # Example: pos1[3] = [-2 -1  0  1  2   3   4 999 999 999 ... 999]
    # [95 96 97 98 99 100 101 999 999 999 ... 999]
    # =>
    # [11 12 13 14 15  16  21  17  17  17 ...  17]
    # dimension = MAX_SENTENCE_LENGTH
    pos_vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
        FLAGS.max_sentence_length)
    pos_vocab_processor.fit(train_pos1 + train_pos2 + test_pos1 + test_pos2)
    train_p1 = np.array(list(pos_vocab_processor.transform(train_pos1)))
    train_p2 = np.array(list(pos_vocab_processor.transform(train_pos2)))
    test_p1 = np.array(list(pos_vocab_processor.transform(test_pos1)))
    test_p2 = np.array(list(pos_vocab_processor.transform(test_pos2)))
    print("\nPosition Vocabulary Size: {:d}".format(
        len(pos_vocab_processor.vocabulary_)))
    print("train_p1 = {0}".format(train_p1.shape))  # (8000, 90)
    print("test_p1 = {0}".format(test_p1.shape))  # (2717, 90)
    print("")

    # Randomly shuffle data to split into train and test(dev)
    # np.random.seed(10)
    #
    # shuffle_indices = np.random.permutation(np.arange(len(y))) #len(y)=8000
    # x_shuffled = x[shuffle_indices]
    # p1_shuffled = p1[shuffle_indices]
    # p2_shuffled = p2[shuffle_indices]
    # y_shuffled = y[shuffle_indices]
    # print(x_shuffled, p1_shuffled,p2_shuffled,y_shuffled)

    # Split train/test set
    # TODO: This is very crude, should use cross-validation
    # dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y))) #x_train=7200, x_dev =800
    # x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    # p1_train, p1_dev = p1_shuffled[:dev_sample_index], p1_shuffled[dev_sample_index:]
    # p2_train, p2_dev = p2_shuffled[:dev_sample_index], p2_shuffled[dev_sample_index:]
    # y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
    # print("Train/Dev split: {:d}/{:d}\n".format(len(y_train), len(y_dev)))
    # print(x_train)
    # print(np.array(x_train))
    # print(x_dev)
    # print(np.array(x_dev))

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        session_conf.gpu_options.allow_growth = FLAGS.gpu_allow_growth
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn = TextCNN(
                sequence_length=FLAGS.max_sentence_length,  #90
                num_classes=train_y.shape[1],  #19
                text_vocab_size=len(vocab_processor.vocabulary_),  #19151
                text_embedding_size=FLAGS.text_embedding_size,  #300
                pos_vocab_size=len(pos_vocab_processor.vocabulary_),  #162
                pos_embedding_size=FLAGS.pos_embedding_dim,  #50
                filter_sizes=list(map(
                    int, FLAGS.filter_sizes.split(","))),  #2,3,4,5
                num_filters=FLAGS.num_filters,  #128
                l2_reg_lambda=FLAGS.l2_reg_lambda,  #1e-5
                use_elmo=(FLAGS.embeddings == 'elmo'))

            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdadeltaOptimizer(FLAGS.learning_rate,
                                                   FLAGS.decay_rate, 1e-6)
            gvs = optimizer.compute_gradients(cnn.loss)
            capped_gvs = [(tf.clip_by_value(grad, -1.0, 1.0), var)
                          for grad, var in gvs]
            train_op = optimizer.apply_gradients(capped_gvs,
                                                 global_step=global_step)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(
                os.path.join(os.path.curdir, "runs", timestamp))
            print("\nWriting to {}\n".format(out_dir))

            # Logger
            logger = Logger(out_dir)

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", cnn.loss)
            acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

            # Train Summaries
            train_summary_op = tf.summary.merge([loss_summary, acc_summary])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)

            # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=FLAGS.num_checkpoints)

            # Write vocabulary
            vocab_processor.save(os.path.join(out_dir, "vocab"))
            pos_vocab_processor.save(os.path.join(out_dir, "pos_vocab"))

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            if FLAGS.embeddings == "word2vec":
                pretrain_W = utils.load_word2vec(
                    'resource/GoogleNews-vectors-negative300.bin',
                    FLAGS.embedding_size, vocab_processor)
                sess.run(cnn.W_text.assign(pretrain_W))
                print("Success to load pre-trained word2vec model!\n")
            elif FLAGS.embeddings == "glove100":
                pretrain_W = utils.load_glove('resource/glove.6B.100d.txt',
                                              FLAGS.embedding_size,
                                              vocab_processor)
                sess.run(cnn.W_text.assign(pretrain_W))
                print("Success to load pre-trained glove100 model!\n")
            elif FLAGS.embeddings == "glove300":
                pretrain_W = utils.load_glove('resource/glove.840B.300d.txt',
                                              FLAGS.embedding_size,
                                              vocab_processor)
                sess.run(cnn.W_text.assign(pretrain_W))
                print("Success to load pre-trained glove300 model!\n")

            # Generate batches
            train_batches = data_helpers.batch_iter(
                list(zip(train_x, train_y, train_text, train_p1, train_p2)),
                FLAGS.batch_size, FLAGS.num_epochs)
            # Training loop. For each batch...
            best_f1 = 0.0  # best F1 so far, used to decide when to save a checkpoint
            for train_batch in train_batches:
                train_bx, train_by, train_btxt, train_bp1, train_bp2 = zip(
                    *train_batch)
                # print("train_bxt",list(train_btxt)[:2])
                # print(np.array(train_be1).shape) #(20, )
                # print(train_be1)
                feed_dict = {
                    cnn.input_text: train_bx,
                    cnn.input_y: train_by,
                    cnn.input_x_text: list(train_btxt),
                    cnn.input_p1: train_bp1,
                    cnn.input_p2: train_bp2,
                    cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
                }
                _, step, summaries, loss, accuracy = sess.run([
                    train_op, global_step, train_summary_op, cnn.loss,
                    cnn.accuracy
                ], feed_dict)
                train_summary_writer.add_summary(summaries, step)

                # Training log display
                if step % FLAGS.display_every == 0:
                    logger.logging_train(step, loss, accuracy)

                # Evaluation
                if step % FLAGS.evaluate_every == 0:
                    print("\nEvaluation:")
                    # Generate batches
                    test_batches = data_helpers.batch_iter(list(
                        zip(test_x, test_y, test_text, test_p1, test_p2)),
                                                           FLAGS.batch_size,
                                                           1,
                                                           shuffle=False)
                    # Training loop. For each batch...
                    losses = 0.0
                    accuracy = 0.0
                    predictions = []
                    iter_cnt = 0
                    for test_batch in test_batches:
                        test_bx, test_by, test_btxt, test_bp1, test_bp2 = zip(
                            *test_batch)
                        feed_dict = {
                            cnn.input_text: test_bx,
                            cnn.input_y: test_by,
                            cnn.input_x_text: list(test_btxt),
                            cnn.input_p1: test_bp1,
                            cnn.input_p2: test_bp2,
                            cnn.dropout_keep_prob: 1.0
                        }
                        loss, acc, pred = sess.run(
                            [cnn.loss, cnn.accuracy, cnn.predictions],
                            feed_dict)
                        losses += loss
                        accuracy += acc
                        predictions += pred.tolist()
                        iter_cnt += 1
                    losses /= iter_cnt
                    accuracy /= iter_cnt
                    predictions = np.array(predictions, dtype='int')

                    logger.logging_eval(step, losses, accuracy, predictions)  # report the averaged dev loss

                    # Model checkpoint
                    if best_f1 < logger.best_f1:
                        best_f1 = logger.best_f1
                        path = saver.save(sess,
                                          checkpoint_prefix +
                                          "-{:.3g}".format(best_f1),
                                          global_step=step)
                        print("Saved model checkpoint to {}\n".format(path))
Beispiel #26
0
import PySimpleGUI as sg
from utils import text_file, Pdf, load_word2vec, retrieve, print_output_sents
import random


word2vec = load_word2vec()


# List of all file names for display in '-FILE LIST-'
files_list = []

# Dictionary of all Pdf objects, with filenames as keys and the corresponding
# Pdf objects as values; used so the same Pdf object is not recreated every
# time another PDF is uploaded
pdf_obj = {}

# Dictionary of all text file objects
txt_obj = {}


# For printing colored output so that results from different documents can be
# told apart
cprint = sg.cprint

# List of possible background colors for cprint to use
background_colors = ['yellow', 'orange']

def make_win1():
    file_list_column = [
        [sg.Text('Upload file'), sg.In(size=(25, 1), enable_events=True, key='-FILES-'), sg.FileBrowse()],
        [sg.Checkbox('word2vec', key = '-WORD2VEC-')],
        [sg.Listbox(values=[], size=(40, 20), enable_events=True, select_mode='multiple', key='-FILE LIST-')]]