Ejemplo n.º 1
0
    def __init__(self, model_name, max_length, curdir):
        """Prepare the intent/slot dataset and build the joint model.

        Loads the train/valid/test splits from ``curdir``, maps intent
        labels to integer ids, encodes the inputs with a BERT tokenizer,
        encodes token-level slot labels, then builds and compiles
        ``self.intent_model``.

        Args:
            model_name: pretrained BERT tokenizer identifier.
            max_length: maximum encoded sequence length.
            curdir: directory handed to ``load_prepare_dataset``.
        """
        self.curdir = curdir
        (df_train, df_valid, df_test, intent_names,
         self.intent_map, self.slot_map) = load_prepare_dataset(curdir)

        splits = (("train", df_train), ("valid", df_valid), ("test", df_test))

        # Y's: integer intent ids per split (intent_train/_valid/_test).
        for split, frame in splits:
            setattr(self, "intent_" + split,
                    frame["intent_label"].map(self.intent_map).values)

        bert_tokenizer = BertTokenizer.from_pretrained(model_name)

        # X's: BERT-encoded word sequences per split (encoded_*).
        print('Encoding data...')
        for split, frame in splits:
            setattr(self, "encoded_" + split,
                    encode_dataset(bert_tokenizer, frame["words"], max_length))

        # Token-level slot labels aligned with the tokenized inputs (slot_*).
        for split, frame in splits:
            setattr(self, "slot_" + split,
                    encode_token_labels(frame["words"], frame["word_labels"],
                                        bert_tokenizer, self.slot_map,
                                        max_length))

        self.intent_model = SlotIntentDetectorModelBase(
            intent_num_labels=len(self.intent_map),
            slot_num_labels=len(self.slot_map))

        # One cross-entropy per output head (intent, slots), both on logits.
        self.intent_model.compile(
            optimizer=Adam(learning_rate=3e-5, epsilon=1e-08),
            loss=[
                SparseCategoricalCrossentropy(from_logits=True),
                SparseCategoricalCrossentropy(from_logits=True)
            ],
            metrics=[SparseCategoricalAccuracy('accuracy')])
Ejemplo n.º 2
0
 def predict(self, data_dir):
     """Run inference on the ATEC data under ``data_dir``.

     Encodes the dataset, builds the graph, restores the best saved
     parameters, and returns the argmax class index per example.

     NOTE(review): ``save_dir`` and ``desc`` are not locals or
     attributes here — presumably module-level globals; confirm at the
     call site.

     Args:
         data_dir: path passed to ``atec`` to load the input pairs.

     Returns:
         1-D array of predicted class indices.
     """
     teX1, teX2, _ = encode_dataset(self.text_encoder, atec(data_dir))
     teX, teM = self.transform_roc(teX1, teX2)
     self.build_graph()
     # Restore the best parameters checkpointed during training.
     self.sess.run([
         p.assign(ip) for p, ip in zip(
             self.params,
             joblib.load(os.path.join(save_dir, desc, 'best_params.jl')))
     ])
     # Argmax over class logits (was an assigned lambda; PEP 8 E731).
     return np.argmax(self.iter_predict(teX, teM), 1)
Ejemplo n.º 3
0
def predict_sub_instances(text_encoder, sub_instances):
    """Predict entailment labels for a batch of sub-instances.

    Each sub-instance is a mapping with "premise", "hypothesis" and
    "label" keys. Uses the module-level ``dataset`` name to select the
    prediction function and label decoder from ``pred_fns`` /
    ``label_decoders``.

    Args:
        text_encoder: encoder passed through to ``encode_dataset``.
        sub_instances: sequence of dicts; may be empty.

    Returns:
        List of predictions, decoded via the dataset's label decoder
        when one is configured; [] for empty input.
    """
    global dataset
    # Idiomatic emptiness check (was `if not len(sub_instances)`).
    if not sub_instances:
        return []
    prems, hyps, ys = zip(*[(sub["premise"], sub["hypothesis"], sub["label"])
                            for sub in sub_instances])
    test_set = encode_dataset([(prems, hyps, ys)], encoder=text_encoder)
    (tst_p, tst_h, teY) = test_set[0]
    teX, teM = transform_entailment(tst_p, tst_h)
    pred_fn = pred_fns[dataset]
    label_decoder = label_decoders[dataset]

    predictions = pred_fn(iter_predict(teX, teM))
    if label_decoder is not None:
        predictions = [label_decoder[prediction] for prediction in predictions]

    return predictions
Ejemplo n.º 4
0
    logger = ResultLogger(path=os.path.join(
        log_dir, '{}.jsonl'.format(desc)), **args.__dict__)
    text_encoder = TextEncoder(encoder_path, bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3
    max_len = n_ctx // 2 - 2

    if dataset == 'rocstories':
        (trX1, trX2, trX3, trY), (vaX1, vaX2, vaX3, vaY), (teX1, teX2, teX3) = encode_dataset(
            rocstories(data_dir, n_valid=n_valid), encoder=text_encoder)
        n_y = 2
        n_ctx = min(max(
            [len(x1[:max_len]) + max(len(x2[:max_len]),
                                     len(x3[:max_len])) for x1, x2, x3 in zip(trX1, trX2, trX3)]
            + [len(x1[:max_len]) + max(len(x2[:max_len]),
                                       len(x3[:max_len])) for x1, x2, x3 in zip(vaX1, vaX2, vaX3)]
            + [len(x1[:max_len]) + max(len(x2[:max_len]),
                                       len(x3[:max_len])) for x1, x2, x3 in zip(teX1, teX2, teX3)]
        ) + 3,
            n_ctx)
        vocab = n_vocab + n_special + n_ctx
        trX, trM = transform_roc(trX1, trX2, trX3)
        vaX, vaM = transform_roc(vaX1, vaX2, vaX3)
        if submit:
            teX, teM = transform_roc(teX1, teX2, teX3)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)

    logger = ResultLogger(path=os.path.join(log_dir,
                                            '{}log.json'.format(desc)),
                          **args.__dict__)
    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    print("Encoding dataset...")
    dataLoader = DataLoader()
    ((trX, trY), (vaX, vaY),
     (teX, )) = encode_dataset(*dataLoader.veracity(data_dir, topic=topic),
                               encoder=text_encoder)

    encoder['_start_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 2
    max_len = n_ctx - 2
    # Define maximum context as the minimum of [512, x] where x is the max sentence length
    n_ctx = min(
        max([len(x[:max_len])
             for x in trX] + [len(x[:max_len])
                              for x in vaX] + [len(x[:max_len])
                                               for x in teX]) + 3, n_ctx)

    vocab = n_vocab + n_special + n_ctx
    training_engine = TrainingEngine()
Ejemplo n.º 6
0
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device {} n_gpu {}".format(device, n_gpu))

    res_logger = ResultLogger(path=os.path.join(log_dir,
                                                '{}.jsonl'.format(desc)),
                              **args.__dict__)
    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    logger.info("Encoding dataset...")
    ((trX1, trX2, trX3, trY), (vaX1, vaX2, vaX3, vaY),
     (teX1, teX2, teX3)) = encode_dataset(*rocstories(data_dir,
                                                      n_valid=args.n_valid),
                                          encoder=text_encoder)
    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3
    max_len = n_ctx // 2 - 2
    n_ctx = min(
        max([
            len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
            for x1, x2, x3 in zip(trX1, trX2, trX3)
        ] + [
            len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
            for x1, x2, x3 in zip(vaX1, vaX2, vaX3)
        ] + [
            for step in ins['text']
        ]
        v_passage.append(text)
        v_ing.append(ins['ing'])
        v_gold.append(ins['gold'])
        v_all_ings.append(ins['all_ings'])

    dataset = (
        tlm_passage,
        tlm_ing,
    ), (t_ing, t_gold), (v_ing, v_gold)

    ((
        trlmX1,
        trlmX2,
    ), (trX2, trY), (teX2, teY)) = encode_dataset(*dataset,
                                                  encoder=text_encoder)

    trX1 = encode_dataset_whole(t_passage, encoder=text_encoder)
    teX1 = encode_dataset_whole(v_passage, encoder=text_encoder)
    trX3 = encode_dataset_whole(t_all_ings, encoder=text_encoder)
    teX3 = encode_dataset_whole(v_all_ings, encoder=text_encoder)

    print(n_ctx)
    vocab = n_vocab + n_special + n_ctx

    trlmX, trlmM = transform_recipe_whole_just_recipe(trlmX1, trlmX2, trlmX2)
    trlmX, valmX = trlmX[:-lmval], trlmX[-lmval:]
    trlmM, valmM = trlmM[:-lmval], trlmM[-lmval:]

    trX, trM = transform_recipe_whole(trX1, trX2, trX3)
    trX, vaX = trX[:-taskval], trX[-taskval:]
Ejemplo n.º 8
0
    log_dir = args.log_dir
    submission_dir = args.submission_dir

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)

    logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)),
                          **args.__dict__)
    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    print("Encoding dataset...")
    ((trX1, trY), (vaX1, vaY),
     teX1) = encode_dataset(*checkpoint5(data_dir, n_valid=args.n_valid),
                            encoder=text_encoder)

    #print(trX1)
    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3
    max_len = n_ctx // 2 - 2
    n_ctx = min(
        max([len(x1[:max_len])
             for x1 in trX1] + [len(x1[:max_len]) for x1 in vaX1] +
            [len(x1[:max_len]) for x1 in teX1]) + 2, n_ctx)

    vocab = n_vocab + n_special + n_ctx
    trX, trM = transform_checkpoint5(trX1)
    data_dir = args.data_dir
    log_dir = args.log_dir
    submission_dir = args.submission_dir

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)

    logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)),
                          **args.__dict__)
    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)
    start_time = time.time()
    print("Encoding dataset...")
    (trX, vaX) = encode_dataset(*getData(data_dir, n_valid=args.n_valid),
                                encoder=text_encoder)
    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)

    n_special = 3
    max_len = n_ctx // 2 - 2
    n_ctx = 626 * 2 + 4
    vocab = n_vocab + n_special + n_ctx
    print(vocab)
    trX, trM = transform_roc(trX)
    vaX, vaM = transform_roc(vaX)

    n_train = len(trX)
    n_valid = len(vaX)

    n_batch_train = args.n_batch * max(n_gpu, 1)
    data_dir = args.data_dir
    log_dir = args.log_dir
    submission_dir = args.submission_dir

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)

    logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)),
                          **args.__dict__)
    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)
    print("Encoding dataset...")
    firstsent, secondsent = getData(data_dir, n_valid=args.n_valid)
    firstbpe, secondbpe = encode_dataset(*(firstsent, secondsent),
                                         encoder=text_encoder)
    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)

    n_special = 3
    max_len = n_ctx // 2 - 2
    n_ctx = 1256
    vocab = n_vocab + n_special + n_ctx
    n_train = len(firstsent)
    n_valid = len(secondsent)

    n_batch_train = args.n_batch * max(n_gpu, 1)
    n_updates_total = (n_train // n_batch_train) * args.n_iter

    dh_model = LMModel(args, vocab, n_ctx)
Ejemplo n.º 11
0
    def train(self):
        """Build the multi-GPU train/eval graph, fine-tune on the ATEC
        sentence-pair data, and checkpoint the best model by validation F1.

        Reads many module-level globals (n_ctx, n_gpu, lr, b1, b2, e,
        lm_coef, max_grad_norm, data_dir, n_iter, pre_load, new_model,
        save_dir, desc, n_embd) and instance state set elsewhere
        (self.n_batch_train, self.text_encoder, self.sess, ...).
        """
        global_step = tf.train.get_or_create_global_step()
        # Fixed-batch placeholders feed the multi-GPU training towers;
        # the unsuffixed X/M/Y take variable batch sizes for evaluation.
        X_train = tf.placeholder(tf.int32, [self.n_batch_train, 2, n_ctx, 2])
        M_train = tf.placeholder(tf.float32, [self.n_batch_train, 2, n_ctx])
        X = tf.placeholder(tf.int32, [None, 2, n_ctx, 2])
        M = tf.placeholder(tf.float32, [None, 2, n_ctx])

        Y_train = tf.placeholder(tf.int32, [self.n_batch_train])
        Y = tf.placeholder(tf.int32, [None])

        #self.train, self.logits, self.clf_losses, self.lm_losses = self.mgpu_train(self.X_train, self.M_train, self.Y_train)

        # Split each input along the batch axis and build one tower per GPU.
        xs = [X_train, M_train, Y_train]
        gpu_ops = []
        gpu_grads = []
        xs = (tf.split(x, n_gpu, 0) for x in xs)
        optimizer = tf.train.AdamOptimizer(learning_rate=lr,
                                           beta1=b1,
                                           beta2=b2,
                                           epsilon=e)
        for i, xs in enumerate(zip(*xs)):
            # Reuse variables on every tower after the first.
            do_reuse = True if i > 0 else None
            with tf.device(assign_to_gpu(i, "/gpu:0")), tf.variable_scope(
                    tf.get_variable_scope(), reuse=do_reuse):
                logits, clf_losses, lm_losses = self.model(*xs,
                                                           train=True,
                                                           reuse=do_reuse)
                # Optionally blend the auxiliary LM loss into the objective.
                if lm_coef > 0:
                    train_loss = tf.reduce_mean(
                        clf_losses) + lm_coef * tf.reduce_mean(lm_losses)
                else:
                    train_loss = tf.reduce_mean(clf_losses)
                raw_grads_and_vars = optimizer.compute_gradients(train_loss)
                # Clip each gradient by global norm before cross-GPU averaging.
                grads_and_vars = [(tf.clip_by_global_norm([gv[0]],
                                                          max_grad_norm)[0][0],
                                   gv[1]) for gv in raw_grads_and_vars]
                gpu_grads.append(grads_and_vars)
                gpu_ops.append([logits, clf_losses, lm_losses])
        ops = [tf.concat(op, 0) for op in zip(*gpu_ops)]
        logits, clf_losses, lm_losses = ops
        grads = average_grads(gpu_grads)

        train_op = optimizer.apply_gradients(grads, global_step=global_step)
        clf_loss = tf.reduce_mean(clf_losses)
        saver = tf.train.Saver(max_to_keep=5)
        self.params = find_trainable_variables('model_lm')

        # Evaluation endpoints: multi-GPU path for full batches, single
        # path (variable batch) for the remainder.
        self.eval_mgpu_logits, self.eval_mgpu_clf_losses, self.eval_mgpu_lm_losses = self.mgpu_predict(
            X_train, M_train, Y_train)
        self.eval_logits, self.eval_clf_losses, self.eval_lm_losses = self.model(
            X, M, Y, train=False, reuse=True)
        self.eval_clf_loss = tf.reduce_mean(self.eval_clf_losses)
        self.eval_mgpu_clf_loss = tf.reduce_mean(self.eval_mgpu_clf_losses)

        summary_op = tf.get_collection(tf.GraphKeys.SUMMARIES)

        def trva_split(data, index):
            # Select the elements of `data` at the given indices.
            return [data[i] for i in index]

        x1, x2, y = encode_dataset(self.text_encoder, atec(data_dir))

        valid_index = np.load('data/valid_index.npy')
        if data_dir == 'data/para.tsv':
            # para.tsv presumably holds 4 augmented copies of the data —
            # replicate the validation indices into each quarter.
            valid_index = np.concatenate([
                valid_index, valid_index + len(y) // 4,
                valid_index + len(y) // 2, valid_index + 3 * len(y) // 4
            ])
        valid_index = valid_index.tolist()
        # Training set = complement of the validation indices.
        train_index = list(set(valid_index) ^ set(range(len(y))))
        trX1, trX2, trY = trva_split(x1, train_index), trva_split(
            x2, train_index), trva_split(y, train_index)
        vaX1, vaX2, vaY = trva_split(x1, valid_index), trva_split(
            x2, valid_index), trva_split(y, valid_index)
        trX, trM = self.transform_roc(trX1, trX2)
        vaX, vaM = self.transform_roc(vaX1, vaX2)

        n_train = len(trY)
        n_valid = len(vaY)
        self.n_updates_total = (n_train // self.n_batch_train) * n_iter
        # NOTE(review): the training loop below feeds self.X_train /
        # self.clf_loss / self.train, which are presumably created by
        # build_graph() — making much of the local graph construction
        # above redundant. Confirm which graph is actually trained.
        self.build_graph()
        if pre_load:
            # Load pretrained LM weights, rebuild the embedding matrix with
            # room for the special tokens, and assign into the graph.
            shapes = json.load(open('model/params_shapes.json'))
            offsets = np.cumsum([np.prod(shape) for shape in shapes])
            init_params = [
                np.load('model/params_{}.npy'.format(n)) for n in range(10)
            ]
            init_params = np.split(np.concatenate(init_params, 0),
                                   offsets)[:-1]
            init_params = [
                param.reshape(shape)
                for param, shape in zip(init_params, shapes)
            ]
            init_params[0] = init_params[0][:+n_ctx]
            init_params[0] = np.concatenate([
                init_params[1],
                (np.random.randn(self.n_special, n_embd) * 0.02).astype(
                    np.float32), init_params[0]
            ], 0)
            del init_params[1]

            if self.n_transfer == -1:
                self.n_transfer = 0
            else:
                # 1 embedding matrix + 12 tensors per transformer block.
                self.n_transfer = 1 + self.n_transfer * 12
            self.sess.run([
                p.assign(ip) for p, ip in zip(self.params[:self.n_transfer],
                                              init_params[:self.n_transfer])
            ])
        if not new_model:
            print('loading old model')
            self.load()
            print('load success')
        n_updates = 0
        n_epochs = 0
        self.save(os.path.join(save_dir, desc, 'best_params.jl'))
        self.best_score = 0

        def log():
            def iter_apply(Xs, Ms, Ys):
                # Accumulators: concatenate logits; sum the n-weighted losses.
                fns = [
                    lambda x: np.concatenate(x, 0), lambda x: float(np.sum(x))
                ]
                results = []
                for xmb, mmb, ymb in iter_data((Xs, Ms, Ys),
                                               n_batch=self.n_batch_train,
                                               truncate=False,
                                               verbose=True):
                    n = len(xmb)
                    # BUG FIX: `sess` was an unresolved name (NameError the
                    # first time log() ran); the session lives on self.sess,
                    # as used everywhere else in this method.
                    if n == self.n_batch_train:
                        res = self.sess.run(
                            [self.eval_mgpu_logits, self.eval_mgpu_clf_loss], {
                                X_train: xmb,
                                M_train: mmb,
                                Y_train: ymb
                            })
                    else:
                        res = self.sess.run(
                            [self.eval_logits, self.eval_clf_loss], {
                                X: xmb,
                                M: mmb,
                                Y: ymb
                            })
                    res = [r * n for r in res]
                    results.append(res)
                results = zip(*results)
                return [fn(res) for res, fn in zip(results, fns)]

            # global best_score
            tr_logits, tr_cost = iter_apply(trX[:n_valid], trM[:n_valid],
                                            trY[:n_valid])
            va_logits, va_cost = iter_apply(vaX, vaM, vaY)
            tr_cost = tr_cost / len(trY[:n_valid])
            va_cost = va_cost / n_valid
            tr_f1 = f1_score(trY[:n_valid], np.argmax(tr_logits, 1)) * 100.
            va_f1 = f1_score(vaY, np.argmax(va_logits, 1)) * 100.
            self.logger.log(n_epochs=n_epochs,
                            n_updates=n_updates,
                            tr_cost=tr_cost,
                            va_cost=va_cost,
                            tr_f1=tr_f1,
                            va_f1=va_f1)
            print('%d %d %.3f %.3f %.2f %.2f' %
                  (n_epochs, n_updates, tr_cost, va_cost, tr_f1, va_f1))
            score = va_f1
            # Checkpoint whenever validation F1 improves.
            if score > self.best_score:
                self.best_score = score
                self.save(os.path.join(save_dir, desc, 'best_params.jl'))

        for i in range(n_iter):
            for xmb, mmb, ymb in iter_data(
                (shuffle(trX, trM, trY, random_state=np.random)),
                    n_batch=self.n_batch_train,
                    truncate=True,
                    verbose=True):
                cost, _ = self.sess.run([self.clf_loss, self.train], {
                    self.X_train: xmb,
                    self.M_train: mmb,
                    self.Y_train: ymb
                })
                n_updates += 1
                # Periodic in-epoch evaluation/checkpointing.
                if n_updates % 1000 == 0:
                    log()
            n_epochs += 1
            log()
Ejemplo n.º 12
0
    def ccc_train(self):
        """Distributed LM pre-training under TF1's between-graph replication.

        Bootstraps the cluster membership, then runs either a parameter-
        server loop (blocks until every worker signals completion through a
        shared "done" queue) or a worker loop (builds the per-GPU training
        graph, trains from TFRecords, and periodically validates).

        NOTE(review): relies on many names not defined in this method —
        presumably module-level globals (bootstrap_host, bootstrap_port,
        num_ps, tfrecord_filename, n_ctx, n_iter, n_gpu, lr, b1, b2, e,
        max_grad_norm, save_dir, valid_dir, steps_to_validate) and instance
        state (self.n_batch_train, self.n_vocab, self.n_special,
        self.text_encoder). Verify against the enclosing module.
        """
        # Resolve hostnames and ports of other nodes
        host, hosts = client(bootstrap_host, bootstrap_port)

        # Create a cluster and identify the job name and task of this node
        cluster = tf.train.ClusterSpec({
            'ps': hosts[:num_ps],
            'worker': hosts[num_ps:]
        })

        # First num_ps hosts are parameter servers, the rest are workers.
        task = hosts.index(host)
        job_name = ('ps', 'worker')[task >= num_ps]
        task = cluster.job_tasks(job_name).index(host)
        tf_config = tf.ConfigProto(allow_soft_placement=True)
        tf_config.gpu_options.allow_growth = True
        server = tf.train.Server(cluster,
                                 job_name=job_name,
                                 task_index=task,
                                 config=tf_config)

        if job_name == 'ps':
            # create a shared queue on the parameter server which is visible on /job:ps/task:%d
            with tf.device('/job:ps/task:%d' % task):
                queue = tf.FIFOQueue(cluster.num_tasks('worker'),
                                     tf.int32,
                                     shared_name='done_queue%d' % task)

            # wait for the queue to be filled
            with tf.Session(server.target) as sess:
                for i in range(cluster.num_tasks('worker')):
                    sess.run(queue.dequeue())
                    print('ps:%d received "done" from worker:%d' % (task, i))
                print('ps:%d quitting' % task)

        elif job_name == 'worker':
            # Variables are placed on the ps tasks; ops on this worker.
            with tf.device(
                    tf.train.replica_device_setter(
                        worker_device='/job:worker/task:%d' % task,
                        cluster=cluster)):
                global_step = tf.train.get_or_create_global_step()

                # Input pipeline: padded batches of token-id sentences
                # read from TFRecords.
                sentences = self.batched_data(
                    tfrecord_filename,
                    self.single_example_parser,
                    self.n_batch_train,
                    padded_shapes=tf.Dimension(n_ctx),
                    num_epochs=n_iter)
                sentences = tf.cast(sentences, tf.int32)
                max_len = tf.shape(sentences)[1]  #sentences.get_shape()[1]
                xmb = tf.reshape(sentences,
                                 [self.n_batch_train, 1, max_len, 1])
                # Mask: 1.0 for real tokens, 0.0 for padding (sign of id).
                M_train = tf.cast(
                    tf.reshape(tf.sign(xmb), [self.n_batch_train, 1, max_len]),
                    tf.float32)
                # Position ids start after the vocabulary + special tokens.
                positions = tf.reshape(tf.range(
                    self.n_vocab + self.n_special,
                    self.n_vocab + self.n_special + max_len),
                                       shape=[1, 1, max_len, 1])
                #tf.constant(np.arange(self.n_vocab + self.n_special, self.n_vocab + self.n_special + max_len),shape=[1, 1, max_len, 1])
                positions = tf.tile(positions, [self.n_batch_train, 1, 1, 1])
                # Last axis packs (token id, position id) pairs.
                X_train = tf.concat([xmb, positions], axis=3)

                optimizer = tf.train.AdamOptimizer(learning_rate=lr,
                                                   beta1=b1,
                                                   beta2=b2,
                                                   epsilon=e)
                gpu_grads = []
                gpu_loss = []
                gpu_ppl = []
                # Split the batch across GPUs; one model tower per GPU.
                xs = [X_train, M_train]
                xs = (tf.split(x, n_gpu, 0) for x in xs)
                for i, xs in enumerate(zip(*xs)):
                    # Reuse variables on every tower after the first.
                    do_reuse = True if i > 0 else None
                    with tf.device(assign_to_gpu(i)), tf.variable_scope(
                            tf.get_variable_scope(), reuse=do_reuse):
                        lm_losses = self.model(*xs, train=True, num_ps=num_ps)
                        # Perplexity = e ** cross-entropy loss.
                        train_ppl_single = tf.reduce_mean(math.e**lm_losses)
                        train_loss_single = tf.reduce_mean(lm_losses)
                        gpu_loss.append(train_loss_single)
                        gpu_ppl.append(train_ppl_single)
                        # NOTE(review): this shadows the optimizer created
                        # above with a fresh one per tower — confirm intended.
                        optimizer = tf.train.AdamOptimizer(learning_rate=lr,
                                                           beta1=b1,
                                                           beta2=b2,
                                                           epsilon=e)
                        raw_grads_and_vars = optimizer.compute_gradients(
                            train_loss_single)
                        # Clip each gradient by global norm.
                        grads_and_vars = [
                            (tf.clip_by_global_norm([gv[0]],
                                                    max_grad_norm)[0][0],
                             gv[1]) for gv in raw_grads_and_vars
                        ]
                        gpu_grads.append(grads_and_vars)

                # Average metrics and gradients across towers, then apply.
                train_ppl = tf.reduce_mean(gpu_ppl)
                train_loss = tf.reduce_mean(gpu_loss)
                grads = average_grads(gpu_grads)

                train_op = optimizer.apply_gradients(grads,
                                                     global_step=global_step)

                saver = tf.train.Saver(max_to_keep=5)

                # Separate variable-batch placeholders for validation.
                X = tf.placeholder(tf.int32, [None, 1, n_ctx, 2])
                M = tf.placeholder(tf.float32, [None, 1, n_ctx])
                valid_lm_losses = self.model(X, M, train=False, reuse=True)
                valid_ppl = tf.reduce_mean(math.e**valid_lm_losses)
                valid_loss = tf.reduce_mean(valid_lm_losses)

                self.params = find_trainable_variables('model_lm')
                tf.summary.scalar('train_loss', train_loss)
                #tf.summary.scalar('valid_loss', valid_loss)
                tf.summary.scalar('train_ppl', train_ppl)
                #tf.summary.scalar('valid_ppl', valid_ppl)
                summary_op = tf.summary.merge_all()

            done_ops = []
            # create a shared queue on the worker which is visible on /job:ps/task:%d
            for i in range(cluster.num_tasks('ps')):
                with tf.device('/job:ps/task:%d' % i):
                    with tf.name_scope('done_queue'):
                        done_queue = tf.FIFOQueue(cluster.num_tasks('worker'),
                                                  tf.int32,
                                                  shared_name='done_queue' +
                                                  str(i))
                        done_ops.append(done_queue.enqueue(task))
            scaffold = tf.train.Scaffold(saver=saver)
            summary_hook = tf.train.SummarySaverHook(save_steps=1000,
                                                     output_dir=save_dir,
                                                     summary_op=summary_op)
            hooks = [
                summary_hook,  # tf.train.CheckpointSaverHook(save_secs=600, checkpoint_dir=save_dir, saver=saver),
                tf.train.StopAtStepHook(last_step=1000000),
                tf.train.LoggingTensorHook(
                    {
                        'step': global_step,
                        'train_loss': train_loss,
                        'ppl': train_ppl
                    },
                    every_n_iter=100),
                # Signals "done" to every ps when the session finishes.
                tf.train.FinalOpsHook([done_ops])
            ]
            # Pre-encode the validation set once, outside the session.
            valid_data = pre_train_valid(valid_dir)
            vaX1 = encode_dataset(self.text_encoder, pre_train(valid_data))[0]
            vaX, vaM = self.transform_roc(vaX1)
            with tf.train.MonitoredTrainingSession(master=server.target,
                                                   is_chief=(task == 0),
                                                   hooks=hooks,
                                                   save_checkpoint_secs=600,
                                                   checkpoint_dir=save_dir,
                                                   scaffold=scaffold) as sess:
                coord = tf.train.Coordinator()
                threads = tf.train.start_queue_runners(sess=sess, coord=coord)
                try:
                    while not coord.should_stop():

                        ppl, loss, _, step = sess.run([
                            train_ppl, train_loss, train_op, global_step
                        ])  #,options=run_options, run_metadata=run_metadata)
                        if step % steps_to_validate == 0:
                            va_cost = []
                            va_ppl = []
                            for xm, mm in iter_data((vaX, vaM),
                                                    n_batch=self.n_batch_train,
                                                    truncate=False,
                                                    verbose=True):

                                # NOTE(review): parameters are re-dumped on
                                # every validation batch, not once per
                                # validation pass — confirm intended.
                                ps = sess.run(self.params)
                                joblib.dump(ps,
                                            save_dir + 'model_lm.params',
                                            protocol=2)
                                res, ppl = sess.run([valid_loss, valid_ppl], {
                                    X: xm,
                                    M: mm
                                })
                                va_cost.append(np.sum(res))
                                va_ppl.append(np.sum(ppl))

                            va_cost = np.average(va_cost)
                            va_ppl = np.average(va_ppl)
                            tf.logging.info(
                                '=========n_steps:\t%d valid_cost:\t%.3f valid ppl:\t%.3f=========='
                                % (step, va_cost, va_ppl))

                except tf.errors.OutOfRangeError:
                    print('Epochs Complete!')
                finally:
                    coord.request_stop()
                coord.join(threads)
Ejemplo n.º 13
0
    print(args)
    globals().update(args.__dict__)
    random.seed(seed)
    np.random.seed(seed)
    tf.set_random_seed(seed)

    logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)),
                          **args.__dict__)
    text_encoder = TextEncoder(encoder_path, bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    # data preprocess
    (trX1, trX2, trX3, trX4, trX5, trY), \
    (vaX1, vaX2, vaX3, vaX4, vaX5, vaY), \
    (teX1, teX2, teX3, teX4, teX5, teY) = encode_dataset(text_encoder, race(data_dir))
    n_y = 4
    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3
    # max_len = n_ctx//2-2
    # n_ctx = min(max([len(x1[:max_len])+max(len(x2[:max_len]), len(x3[:max_len])) for x1, x2, x3 in zip(trX1, trX2, trX3)]+[len(x1[:max_len])+max(len(x2[:max_len]), len(x3[:max_len])) for x1, x2, x3 in zip(vaX1, vaX2, vaX3)]+[len(x1[:max_len])+max(len(x2[:max_len]), len(x3[:max_len])) for x1, x2, x3 in zip(teX1, teX2, teX3)])+3, n_ctx)
    max_len = n_ctx
    trX, trM = transform_race(trX1, trX2, trX3, trX4, trX5)
    vaX, vaM = transform_race(vaX1, vaX2, vaX3, vaX4, vaX5)
    if submit:
        teX, teM = transform_race(teX1, teX2, teX3, teX4, teX5)

    n_train = len(trY)
Ejemplo n.º 14
0
    print(args)
    globals().update(args.__dict__)
    random.seed(seed)
    np.random.seed(seed)
    tf.set_random_seed(seed)

    logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)), **args.__dict__)
    text_encoder = TextEncoder(encoder_path, bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    # data preprocess
    (vaX1, vaX2, vaX3, vaX4, vaX5, vaY), \
    (teX1, teX2, teX3, teX4, teX5, teY), \
    (m_teX1, m_teX2, m_teX3, m_teX4, m_teX5, m_teY), \
    (h_teX1, h_teX2, h_teX3, h_teX4, h_teX5, h_teY) = encode_dataset(text_encoder, race(data_dir, is_train=False))
    n_y = 4
    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3
    # max_len = n_ctx//2-2
    # n_ctx = min(max([len(x1[:max_len])+max(len(x2[:max_len]), len(x3[:max_len])) for x1, x2, x3 in zip(trX1, trX2, trX3)]+[len(x1[:max_len])+max(len(x2[:max_len]), len(x3[:max_len])) for x1, x2, x3 in zip(vaX1, vaX2, vaX3)]+[len(x1[:max_len])+max(len(x2[:max_len]), len(x3[:max_len])) for x1, x2, x3 in zip(teX1, teX2, teX3)])+3, n_ctx)
    max_len = n_ctx
    vaX, vaM = transform_race(vaX1, vaX2, vaX3, vaX4, vaX5)
    teX, teM = transform_race(teX1, teX2, teX3, teX4, teX5)
    m_teX, m_teM = transform_race(m_teX1, m_teX2, m_teX3, m_teX4, m_teX5)
    h_teX, h_teM = transform_race(h_teX1, h_teX2, h_teX3, h_teX4, h_teX5)

    n_valid = len(vaY)
Ejemplo n.º 15
0
    submission_dir = args.submission_dir

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)

    logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)),
                          **args.__dict__)
    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    print("Encoding dataset...")
    ((trX1, trX2, trX3, trY), (vaX1, vaX2, vaX3, vaY),
     (teX1, teX2, teX3)) = encode_dataset(*anli(data_dir,
                                                n_valid=args.n_valid),
                                          encoder=text_encoder)
    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3
    max_len = n_ctx // 2 - 2
    n_ctx = min(
        max([
            len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
            for x1, x2, x3 in zip(trX1, trX2, trX3)
        ] + [
            len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
            for x1, x2, x3 in zip(vaX1, vaX2, vaX3)
        ] + [
Ejemplo n.º 16
0
    v_ing.append('ok')
    v_gold.append([0])
    v_all_ings.append(['a','b'])
    '''
    for ins in val_file:
        text= [step.replace('-lrb-','(').replace('-rrb-', ')') for step in ins['ve_$replaced_text']]
        v_passage.append(text)
        v_ing.append(ins['ing'])
        v_gold.append(ins['gold'])
        v_all_ings.append(ins['all_ings'])
    #print(tlm_passage[0])


    a = (tlm_passage, tlm_ing,), (t_ing,t_gold),(v_ing,v_gold)

    ((trlmX1, trlmX2,),(trX2, trY),(vaX2, vaY)) = encode_dataset(*a,encoder = text_encoder)

    #trlmX1 = encode_dataset_whole(tlm_passage, encoder = text_encoder)
    trX1 = encode_dataset_whole(t_passage, encoder = text_encoder)
    vaX1 = encode_dataset_whole(v_passage, encoder = text_encoder)
    print(vaX1[0][1])
    trX3 = encode_dataset_whole(t_all_ings, encoder = text_encoder)
    vaX3 = encode_dataset_whole(v_all_ings, encoder = text_encoder)
    n_batch_train = args.n_batch * max(n_gpu, 1)
    print(n_ctx)
    vocab = n_vocab + n_special + n_ctx
    trlmX, trlmM = transform_recipe_whole_just_recipe(trlmX1, trlmX2,trlmX2)

    trlmX, valmX = trlmX[:-lmval], trlmX[-lmval:]
    trlmM, valmM = trlmM[:-lmval], trlmM[-lmval:]
    trX, trM = transform_recipe_whole(trX1, trX2, trX3)
Ejemplo n.º 17
0
    dataset = args.dataset

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)

    log_file = os.path.join(log_dir, '{}.jsonl'.format(dataset))
    logger = ResultLogger(path=log_file, **args.__dict__)
    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    print("Encoding dataset...")
    ((trX, trY), (vaX, vaY), (teX, teY)) = encode_dataset(
        *preprocess_fns[dataset](data_dir, sentence_pair=args.sentence_pair),
        encoder=text_encoder,
        skip_preprocess=args.skip_preprocess)
    encoder['_start_'] = len(encoder)
    if args.sentence_pair or args.force_delimiter:
        encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 2 + int('_delimiter_' in encoder)
    if args.sentence_pair:
        max_len = n_ctx // 2 - 2
    else:
        max_len = n_ctx - n_special
    if not args.force_max_ctx:
        if args.sentence_pair:
            n_ctx = min(
                sum(
Ejemplo n.º 18
0
    parser.add_argument('--e', type=float, default=1e-8)

    args = parser.parse_args()
    print(args)
    globals().update(args.__dict__)
    random.seed(seed)
    np.random.seed(seed)
    tf.set_random_seed(seed)
    #tf.random.set_seed(seed)

    logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)), **args.__dict__)
    text_encoder = TextEncoder(encoder_path, bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    (trX1, trX2, trX3, trY), (vaX1, vaX2, vaX3, vaY), (teX1, teX2, teX3) = encode_dataset(rocstories(data_dir), encoder=text_encoder)
    n_y = 2
    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3
    max_len = n_ctx//2-2
    n_ctx = min(max([len(x1[:max_len])+max(len(x2[:max_len]), len(x3[:max_len])) for x1, x2, x3 in zip(trX1, trX2, trX3)]+[len(x1[:max_len])+max(len(x2[:max_len]), len(x3[:max_len])) for x1, x2, x3 in zip(vaX1, vaX2, vaX3)]+[len(x1[:max_len])+max(len(x2[:max_len]), len(x3[:max_len])) for x1, x2, x3 in zip(teX1, teX2, teX3)])+3, n_ctx)
    trX, trM = transform_roc(trX1, trX2, trX3)
    vaX, vaM = transform_roc(vaX1, vaX2, vaX3)
    if submit:
        teX, teM = transform_roc(teX1, teX2, teX3)

    n_train = len(trY)
    n_valid = len(vaY)
# --- Top-level setup for IMDB fine-tuning ---
# NOTE(review): relies on names defined earlier in the original script
# (log_dir, desc, encoder_path, bpe_path, data_dir, n_ctx) — confirm they are
# in scope before this fragment runs.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
print("device", device, "n_gpu", n_gpu)

# logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)), **args.__dict__)
logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)))

# BPE text encoder; its vocabulary size is needed below to size `vocab`.
text_encoder = TextEncoder(encoder_path, bpe_path)
encoder = text_encoder.encoder
n_vocab = len(text_encoder.encoder)


print("Encoding dataset...")

# Small IMDB subsample (100 train / 1000 valid); the third split is discarded.
((trX, trY), (vaX, vaY), _) = encode_dataset(*imdb(data_dir, n_train=100, n_valid=1000),
                                        encoder=text_encoder)


# Append three control tokens to the end of the BPE vocabulary; their ids are
# len(encoder) at the moment of insertion.
encoder['_start_'] = len(encoder)
encoder['_delimiter_'] = len(encoder)
encoder['_classify_'] = len(encoder)
clf_token = encoder['_classify_']
n_special = 3
# max_len leaves 2 context positions free — presumably for _start_ and
# _classify_ (note n_special is 3; _delimiter_ seems unused here — TODO confirm).
max_len = n_ctx - 2
# Embedding-table size: tokens + special tokens + one row per context position
# (position embeddings appear to share the table — verify against the model).
vocab = n_vocab + n_special + n_ctx
def transform_imdb(X):
    n_batch = len(X)
    xmb = np.zeros((n_batch, n_ctx, 2), dtype=np.int32)
    mmb = np.zeros((n_batch, n_ctx), dtype=np.float32)
    start = encoder['_start_']
Ejemplo n.º 20
0
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3
    max_len = n_ctx // 2 - 2

    if dataset == 'rocstories':
        (trX1, trX2, trX3,
         trY), (vaX1, vaX2, vaX3,
                vaY), (teX1, teX2,
                       teX3) = encode_dataset(rocstories(data_dir,
                                                         n_valid=n_valid),
                                              encoder=text_encoder)
        n_y = 2
        n_ctx = min(
            max([
                len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
                for x1, x2, x3 in zip(trX1, trX2, trX3)
            ] + [
                len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
                for x1, x2, x3 in zip(vaX1, vaX2, vaX3)
            ] + [
                len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
                for x1, x2, x3 in zip(teX1, teX2, teX3)
            ]) + 3, n_ctx)
        vocab = n_vocab + n_special + n_ctx
        trX, trM = transform_roc(trX1, trX2, trX3)
Ejemplo n.º 21
0
    # torch.device object used throughout this script TODO add gpu setting
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)),
                          **args.__dict__)
    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    print("Encoding dataset...")
    (trX1, trX2, trX3,
     trY), (vaX1, vaX2, vaX3,
            vaY), (teX1, teX2,
                   teX3) = encode_dataset(rocstories(data_dir,
                                                     n_valid=args.n_valid),
                                          encoder=text_encoder)
    n_y = 2
    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3
    max_len = n_ctx // 2 - 2
    n_ctx = min(
        max([
            len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
            for x1, x2, x3 in zip(trX1, trX2, trX3)
        ] + [
            len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
            for x1, x2, x3 in zip(vaX1, vaX2, vaX3)
        val_gold.append(int(ins['gold']))

    #train_context[0] = '_delimiter_2'

    train_context[0] = 'Abercrombie'
    print(train_context[0])
    print('\n')
    print(train_op1[0])
    print('\n')
    a = (train_context, train_op1, train_op2, train_op3, train_op4,
         train_gold), (val_context, val_op1, val_op2, val_op3, val_op4,
                       val_gold)
    print("Encoding dataset...")
    ((trX1, trX2, trX3, trX4, trX5, trY),
     (vaX1, vaX2, vaX3, vaX4, vaX5,
      vaY)) = encode_dataset(*a, encoder=text_encoder)

    print(trX1[0])
    print('\n')
    print(trX2[0])
    print('\n')

    clf_token = encoder['_classify_']
    n_special = 5
    max_len = 510
    n_ctx = min(
        max([
            max(len(x2[:max_len]), len(x3[:max_len]), len(x4[:max_len]),
                len(x5[:max_len]))
            for x1, x2, x3, x4, x5 in zip(trX1, trX2, trX3, trX4, trX5)
        ] + [
        desc: None,
    }

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)

    logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)),
                          **args.__dict__)
    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    print("Encoding dataset...")
    ((trX, trY), (vaX, vaY),
     (teX, )) = encode_dataset(*multiclas(data_dir, n_valid=args.n_valid),
                               encoder=text_encoder)

    encoder['_start_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 2
    max_len = n_ctx - 2
    # Define maximum context as the minimum of [512, x] where x is the max sentence length
    n_ctx = min(
        max([len(x[:max_len])
             for x in trX] + [len(x[:max_len])
                              for x in vaX] + [len(x[:max_len])
                                               for x in teX]) + 3, n_ctx)

    vocab = n_vocab + n_special + n_ctx
    trX, trM = transform_sst2(trX)
Ejemplo n.º 24
0
                elif X1[i][j] == '[[/HL]]':
                    hl = 0
                else:
                    X += [X1[i][j]]
                    if hl == 1:
                        H += [hl1t]
                    else:
                        H += [hl2t]
            H1 += [H]
            X1n += [X]
        return X1n, H1

    (trX1, trX2, trX3, trX4, trX5, trX6, trX7, trX8, trX9,
     trY), (vaX1, vaX2, vaX3, vaX4, vaX5, vaX6, vaX7, vaX8, vaX9,
            vaY), (teX1, teX2, teX3, teX4, teX5, teX6, teX7, teX8,
                   teX9) = encode_dataset(race(data_dir), encoder=text_encoder)
    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    encoder['_hl1_'] = len(encoder)
    encoder['_hl2_'] = len(encoder)
    clf_token = encoder['_classify_']
    trX1, trH1 = get_hl(trX1, encoder['_hl1_'], encoder['_hl2_'])
    vaX1, vaH1 = get_hl(vaX1, encoder['_hl1_'], encoder['_hl2_'])
    teX1, teH1 = get_hl(teX1, encoder['_hl1_'], encoder['_hl2_'])

    trX7, trH7 = get_hl(trX7, encoder['_hl1_'], encoder['_hl2_'])
    vaX7, vaH7 = get_hl(vaX7, encoder['_hl1_'], encoder['_hl2_'])
    teX7, teH7 = get_hl(teX7, encoder['_hl1_'], encoder['_hl2_'])

    trX8, trH8 = get_hl(trX8, encoder['_hl1_'], encoder['_hl2_'])
Ejemplo n.º 25
0
                elif X1[i][j] == '[[/SQ]]':
                    sq = 0
                else:
                    X += [X1[i][j]]
                    if sq == 1:
                        S += [s1t]
                    else:
                        S += [s2t]
            S1 += [S]
            X1n += [X]
        return X1n, S1

    (trX1, trX2, trX3, trX4, trX5,
     trY), (vaX1, vaX2, vaX3, vaX4, vaX5,
            vaY), (teX1, teX2, teX3, teX4,
                   teX5) = encode_dataset(dream(data_dir),
                                          encoder=text_encoder)

    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    encoder['_speaker1_'] = len(encoder)
    encoder['_speaker2_'] = len(encoder)
    clf_token = encoder['_classify_']
    trX1, trS1 = get_speaker(trX1, encoder['_speaker1_'],
                             encoder['_speaker2_'])
    vaX1, vaS1 = get_speaker(vaX1, encoder['_speaker1_'],
                             encoder['_speaker2_'])
    teX1, teS1 = get_speaker(teX1, encoder['_speaker1_'],
                             encoder['_speaker2_'])

    n_special = len(encoder)
Ejemplo n.º 26
0
    data_dir = args.data_dir
    log_dir = args.log_dir
    submission_dir = args.submission_dir
    topic = args.topic

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)

    logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)), **args.__dict__)
    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    print("Encoding dataset...")
    ((trX, trY), (vaX, vaY), (teX, )) = encode_dataset(*stance(data_dir, topic=topic),
     encoder=text_encoder)

    encoder['_start_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 2
    max_len = n_ctx - 2
    # Define maximum context as the minimum of [512, x] where x is the max sentence length
    n_ctx = min(max(
    [len(x[:max_len]) for x in trX]
    + [len(x[:max_len]) for x in vaX]
    + [len(x[:max_len]) for x in teX]
    ) + 3, n_ctx)

    vocab = n_vocab + n_special + n_ctx
    trX, trM = transform_stance(trX)
Ejemplo n.º 27
0
 def encode(self, text_sequence, max_length):
     """Encode *text_sequence* via the module-level ``encode_dataset`` helper,
     using this instance's tokenizer and the given *max_length*."""
     tokenizer = self.tokenizer
     return encode_dataset(tokenizer, text_sequence, max_length)
Ejemplo n.º 28
0
    args = parser.parse_args()
    print(args)
    globals().update(args.__dict__)
    random.seed(seed)
    np.random.seed(seed)
    tf.set_random_seed(seed)

    logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)),
                          **args.__dict__)
    text_encoder = TextEncoder(encoder_path, bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    (trX1, trX2, trX3,
     trY), (vaX1, vaX2, vaX3,
            vaY), (teX1, teX2, teX3) = encode_dataset(rocstories(data_dir),
                                                      encoder=text_encoder)
    n_y = 2
    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3
    max_len = n_ctx // 2 - 2
    n_ctx = min(
        max([
            len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
            for x1, x2, x3 in zip(trX1, trX2, trX3)
        ] + [
            len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
            for x1, x2, x3 in zip(vaX1, vaX2, vaX3)
        ] + [
Ejemplo n.º 29
0
        print("device", device, "n_gpu", n_gpu)

        # Initialize the logger:
        logger = ResultLogger(path=os.path.join(log_dir,
                                                '{}.jsonl'.format(desc)),
                              **args.__dict__)

        # Initialize the text encoder with the vocabulary and encoder file:
        text_encoder = TextEncoder(encoder_path, bpe_path)
        encoder = text_encoder.encoder
        n_vocab = len(text_encoder.encoder)

        # Encode the different datasets using the text encoders:
        print("Encoding dataset...")
        ((trX, trY), (vaX, vaY),
         (teX, )) = encode_dataset(*stance(data_dir, train_file, test_file),
                                   encoder=text_encoder)

        encoder['_start_'] = len(encoder)
        encoder['_classify_'] = len(encoder)
        clf_token = encoder['_classify_']
        n_special = 2
        max_len = n_ctx - 2

        # Define maximum context as the minimum of [512, x] where x is the max sentence length:
        n_ctx = min(
            max([len(x[:max_len])
                 for x in trX] + [len(x[:max_len]) for x in vaX] +
                [len(x[:max_len]) for x in teX]) + 3, n_ctx)

        # Apply word embedding on the training and validation datasets:
        vocab = n_vocab + n_special + n_ctx
    print(args)
    globals().update(args.__dict__)
    random.seed(seed)
    np.random.seed(seed)
    tf.set_random_seed(seed)

    logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)), **args.__dict__)
    text_encoder = TextEncoder(encoder_path, bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    #(trX1, trX2, trX3, trY), (vaX1, vaX2, vaX3, vaY), (teX1, teX2, teX3) = encode_dataset(rocstories(data_dir), encoder=text_encoder)
    #enco_ry = ruoyao(data_dir)
    #(trX1,trX2,tyY), (vaX1, vaX2, vaY), (teX1, teX2) = ruoyao(data_dir)
    #print(trX1[0])
    (trX1,trX2,trY), (vaX1, vaX2, vaY), (teX1, teX2, teY) = encode_dataset(ruoyao(data_dir), encoder=text_encoder)
    n_y = 2
    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3
    max_len = n_ctx//2-2
    n_ctx = min(max([len(x1[:max_len])+len(x2[:max_len]) for x1, x2 in zip(trX1, trX2)]+[len(x1[:max_len])+len(x2[:max_len]) for x1, x2 in zip(vaX1, vaX2)]+[len(x1[:max_len])+len(x2[:max_len]) for x1, x2 in zip(teX1, teX2)])+3, n_ctx)
    trX, trM = transform_ruoyao(trX1, trX2)
    vaX, vaM = transform_ruoyao(vaX1, vaX2)
    if submit:
        teX, teM = transform_ruoyao(teX1, teX2)

    n_train = len(trY)
    n_valid = len(vaY)
Ejemplo n.º 31
0
    def train(self):
        """Build the multi-GPU TF1 training graph for the sentence-pair
        classifier, split the ATEC data into train/valid folds, and run the
        training loop inside a MonitoredTrainingSession.

        NOTE(review): reads many module-level globals (n_ctx, n_gpu, lr, b1,
        b2, e, lm_coef, max_grad_norm, pre_load, lm_dir, data_dir, save_dir,
        n_iter) — confirm they are populated (e.g. via globals().update)
        before calling.
        """
        global_step = tf.train.get_or_create_global_step()
        # Fixed-batch placeholders for the multi-GPU training path; the
        # leading dim must be divisible by n_gpu for tf.split below.
        X_train = tf.placeholder(tf.int32, [self.n_batch_train, 2, n_ctx, 2])
        M_train = tf.placeholder(tf.float32, [self.n_batch_train, 2, n_ctx])
        # Variable-batch placeholders for single-device evaluation.
        X = tf.placeholder(tf.int32, [None, 2, n_ctx, 2])
        M = tf.placeholder(tf.float32, [None, 2, n_ctx])

        Y_train = tf.placeholder(tf.int32, [self.n_batch_train])
        Y = tf.placeholder(tf.int32, [None])

        #self.train, self.logits, self.clf_losses, self.lm_losses = self.mgpu_train(self.X_train, self.M_train, self.Y_train)

        xs = [X_train, M_train, Y_train]
        gpu_ops = []
        gpu_grads = []
        # Split each training tensor into n_gpu shards along the batch dim.
        xs = (tf.split(x, n_gpu, 0) for x in xs)
        optimizer = tf.train.AdamOptimizer(learning_rate=lr,
                                           beta1=b1,
                                           beta2=b2,
                                           epsilon=e)
        # One model replica per GPU; `xs` is deliberately rebound to the
        # per-GPU (X, M, Y) shard tuple inside the loop.
        for i, xs in enumerate(zip(*xs)):
            # Reuse variables on every replica after the first.
            do_reuse = True if i > 0 else None
            with tf.device(assign_to_gpu(i, "/gpu:0")), tf.variable_scope(
                    tf.get_variable_scope(), reuse=do_reuse):
                logits, clf_losses, lm_losses = self.model(*xs,
                                                           train=True,
                                                           reuse=do_reuse)
                # Optional auxiliary language-modeling loss, weighted by lm_coef.
                if lm_coef > 0:
                    train_loss = tf.reduce_mean(
                        clf_losses) + lm_coef * tf.reduce_mean(lm_losses)
                else:
                    train_loss = tf.reduce_mean(clf_losses)
                raw_grads_and_vars = optimizer.compute_gradients(train_loss)
                # clip_by_global_norm is called on a single-tensor list, so
                # each gradient is clipped by its own norm individually
                # (not by the joint global norm across all variables).
                grads_and_vars = [(tf.clip_by_global_norm([gv[0]],
                                                          max_grad_norm)[0][0],
                                   gv[1]) for gv in raw_grads_and_vars]
                gpu_grads.append(grads_and_vars)
                gpu_ops.append([logits, clf_losses, lm_losses])
        # Re-assemble per-GPU outputs along the batch dim and average grads.
        ops = [tf.concat(op, 0) for op in zip(*gpu_ops)]
        logits, clf_losses, lm_losses = ops
        grads = average_grads(gpu_grads)

        train_op = optimizer.apply_gradients(grads, global_step=global_step)
        clf_loss = tf.reduce_mean(clf_losses)
        saver = tf.train.Saver(max_to_keep=5)
        self.params = find_trainable_variables('model_lm')
        # Optionally warm-start the 'model_lm' variables from a pre-trained
        # language model serialized with joblib.
        if pre_load:
            restore_op = [
                p.assign(ip) for p, ip in zip(
                    self.params, joblib.load(lm_dir + '/model_lm.params'))
            ]
        # Evaluation graphs: multi-GPU path for full batches, single-device
        # path (reuse=True) for the ragged final batch.
        self.eval_mgpu_logits, self.eval_mgpu_clf_losses, self.eval_mgpu_lm_losses = self.mgpu_predict(
            X_train, M_train, Y_train)
        self.eval_logits, self.eval_clf_losses, self.eval_lm_losses = self.model(
            X, M, Y, train=False, reuse=True)
        self.eval_clf_loss = tf.reduce_mean(self.eval_clf_losses)
        self.eval_mgpu_clf_loss = tf.reduce_mean(self.eval_mgpu_clf_losses)

        summary_op = tf.get_collection(tf.GraphKeys.SUMMARIES)

        # Select the rows of `data` at the given indices.
        def trva_split(data, index):
            return [data[i] for i in index]

        x1, x2, y = encode_dataset(self.text_encoder, atec(data_dir))

        # Fixed validation indices loaded from disk for reproducible splits.
        valid_index = np.load('data/valid_index.npy')
        if data_dir == 'data/para.tsv':
            # para.tsv presumably holds 4 augmented copies of the corpus, so
            # the validation index is replicated into each quarter — TODO confirm.
            valid_index = np.concatenate([
                valid_index, valid_index + len(y) // 4,
                valid_index + len(y) // 2, valid_index + 3 * len(y) // 4
            ])
        valid_index = valid_index.tolist()
        # Training indices = everything not in the validation set
        # (symmetric difference with the full index range).
        train_index = list(set(valid_index) ^ set(range(len(y))))
        trX1, trX2, trY = trva_split(x1, train_index), trva_split(
            x2, train_index), trva_split(y, train_index)
        vaX1, vaX2, vaY = trva_split(x1, valid_index), trva_split(
            x2, valid_index), trva_split(y, valid_index)
        trX, trM = self.transform_roc(trX1, trX2)
        vaX, vaM = self.transform_roc(vaX1, vaX2)

        n_train = len(trY)
        n_valid = len(vaY)
        self.n_updates_total = (n_train // self.n_batch_train) * n_iter

        # Closure over sess/n_epochs/n_updates from the training loop below;
        # evaluates train (first n_valid rows) and valid folds and logs F1.
        def log():
            # Run eval over Xs/Ms/Ys in minibatches; returns
            # [concatenated logits, summed loss] (losses re-weighted by batch
            # size so the sum is over examples, not batches).
            def iter_apply(Xs, Ms, Ys):
                fns = [
                    lambda x: np.concatenate(x, 0), lambda x: float(np.sum(x))
                ]
                results = []
                for xmb, mmb, ymb in iter_data((Xs, Ms, Ys),
                                               n_batch=self.n_batch_train,
                                               truncate=False,
                                               verbose=True):
                    n = len(xmb)
                    # Full batches go through the multi-GPU graph; the final
                    # short batch uses the variable-size single-device graph.
                    if n == self.n_batch_train:
                        res = sess.run(
                            [self.eval_mgpu_logits, self.eval_mgpu_clf_loss], {
                                X_train: xmb,
                                M_train: mmb,
                                Y_train: ymb
                            })
                    else:
                        res = sess.run([self.eval_logits, self.eval_clf_loss],
                                       {
                                           X: xmb,
                                           M: mmb,
                                           Y: ymb
                                       })
                    res = [r * n for r in res]
                    results.append(res)
                results = zip(*results)
                return [fn(res) for res, fn in zip(results, fns)]

            # global best_score
            tr_logits, tr_cost = iter_apply(trX[:n_valid], trM[:n_valid],
                                            trY[:n_valid])
            va_logits, va_cost = iter_apply(vaX, vaM, vaY)
            tr_cost = tr_cost / len(trY[:n_valid])
            va_cost = va_cost / n_valid
            tr_f1 = f1_score(trY[:n_valid], np.argmax(tr_logits, 1)) * 100.
            va_f1 = f1_score(vaY, np.argmax(va_logits, 1)) * 100.
            tf.logging.info(
                '%d %d %.3f %.3f %.2f %.2f' %
                (n_epochs, n_updates, tr_cost, va_cost, tr_f1, va_f1))

        # Session hooks: periodic console logging, periodic summaries, and
        # time-based checkpointing via the MonitoredTrainingSession below.
        scaffold = tf.train.Scaffold(saver=saver)
        log_hook = tf.train.LoggingTensorHook(
            {
                'step': global_step,
                'train_loss': clf_loss
            }, every_n_iter=100)
        summary_hook = tf.train.SummarySaverHook(save_steps=100,
                                                 output_dir=save_dir,
                                                 summary_op=summary_op)
        hooks = [summary_hook, log_hook]
        tf_config = tf.ConfigProto(allow_soft_placement=True)
        tf_config.gpu_options.allow_growth = True

        n_epochs = 0

        with tf.train.MonitoredTrainingSession(hooks=hooks,
                                               save_checkpoint_secs=600,
                                               checkpoint_dir=save_dir,
                                               scaffold=scaffold,
                                               config=tf_config) as sess:
            if pre_load:
                sess.run(restore_op)

            # n_iter epochs; each epoch reshuffles the training fold and
            # drops the ragged final batch (truncate=True).
            for i in range(n_iter):
                for xmb, mmb, ymb in iter_data(
                    (shuffle(trX, trM, trY, random_state=np.random)),
                        n_batch=self.n_batch_train,
                        truncate=True,
                        verbose=True):
                    cost, _, n_updates = sess.run(
                        [clf_loss, train_op, global_step], {
                            X_train: xmb,
                            M_train: mmb,
                            Y_train: ymb
                        })
                    # Evaluate every 100 optimizer updates and once per epoch.
                    if n_updates % 100 == 0:
                        log()
                n_epochs += 1
                log()