コード例 #1
0
ファイル: main.py プロジェクト: yanakk/RACE_AR_baselines
def main(args):
    """Evaluate a trained RACE model on the dev set and extract attention.

    Loads dev data (train too in debug mode), restores the vocabulary from
    a pickled dictionary, builds the model functions, runs one evaluation
    pass, and computes attention weights.

    Returns:
        (dev_acc, n_examples, prediction, all_examples, alpha) when
        ``args.test_only`` is set; otherwise returns ``None``.
    """
    logging.info('-' * 50)
    logging.info('Load data files..')
    question_belong = []
    if args.debug:
        # Debug mode: cap both splits at 100 examples for a fast run.
        logging.info('*' * 10 + ' Train')
        train_examples = utils.load_data(args.train_file, 100, relabeling=args.relabeling)
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_data(args.dev_file, 100, relabeling=args.relabeling, question_belong=question_belong)
    else:
        # Evaluation-only path: training data is not loaded here.
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_data(args.dev_file, args.max_dev, relabeling=args.relabeling, question_belong=question_belong)

    args.num_dev = len(dev_examples[0])

    logging.info('-' * 50)
    logging.info('Build dictionary..')
    # Vocabulary is restored from a previously built dictionary, not rebuilt.
    word_dict = pickle.load(open("../../obj/dict.pkl", "rb"))
    logging.info('-' * 50)
    embeddings = utils.gen_embeddings(word_dict, args.embedding_size, args.embedding_file)
    # gen_embeddings may change the effective vocab/embedding sizes.
    (args.vocab_size, args.embedding_size) = embeddings.shape
    logging.info('Compile functions..')
    train_fn, test_fn, attention_fn, params, all_params = build_fn(args, embeddings)
    logging.info('Done.')
    logging.info('-' * 50)
    logging.info(args)

    logging.info('-' * 50)
    logging.info('Initial test..')
    dev_x1, dev_x2, dev_x3, dev_y = utils.vectorize(dev_examples, word_dict, sort_by_len=not args.test_only, concat=args.concat)
    assert len(dev_x1) == args.num_dev
    all_dev = gen_examples(dev_x1, dev_x2, dev_x3, dev_y, args.batch_size, args.concat)
    dev_acc, n_examples, prediction, all_examples = eval_acc(test_fn, all_dev)

    logging.info('Dev accuracy: %.2f %%' % dev_acc.mean())
    print(dev_acc.mean())

    # Attention weights over the dev set, for later inspection.
    alpha = attention_func(attention_fn, all_dev)

    if args.test_only:
        return dev_acc, n_examples, prediction, all_examples, alpha
コード例 #2
0
ファイル: main_SAR.py プロジェクト: yanakk/RACE_AR_baselines
def main(args):
    """Train and evaluate the SAR model with an auxiliary feature channel.

    Unless ``args.test_only`` is set, a single data file is split into
    train/dev with a fixed seed, the auxiliary features (last tuple
    element) are preprocessed, and the model is trained, checkpointing
    whenever dev accuracy improves.

    Returns:
        (best_dev_acc, best_train_acc)
    """
    logging.info('-' * 50)
    logging.info('Load data files..')
    if not (args.test_only):
        logging.info('*' * 10 + ' All')
        all_examples = utils.load_data(args.all_file,
                                       100,
                                       relabeling=args.relabeling)
        # Reproducible random train/dev split over example indices.
        dev_ratio = args.dev_ratio
        sample_index = np.arange(len(all_examples[0]))
        random.seed(1000)
        dev_index = random.sample(sample_index,
                                  int(dev_ratio * len(sample_index)))
        train_index = np.setdiff1d(sample_index, dev_index)
        dev_examples = tuple_part(all_examples, dev_index)
        train_examples = tuple_part(all_examples, train_index)
        # Feature preprocessing: flatten, fit the transform on train only,
        # apply to both splits, then merge back into the example tuples.
        train_fea_flat_np = FeaExtract(train_examples[-1])
        dev_fea_flat_np = FeaExtract(dev_examples[-1])
        train_fea_flat_np2, dev_fea_flat_np2 = Prepocessing_func(
            train_fea_flat_np,
            dev_fea_flat_np,
            varian_ratio_tol=args.pca_ratio)
        train_fea_merge = FeaMerge(train_fea_flat_np2, train_examples[-1])
        dev_fea_merge = FeaMerge(dev_fea_flat_np2, dev_examples[-1])
        train_examples = train_examples[:-1] + (train_fea_merge, )
        dev_examples = dev_examples[:-1] + (dev_fea_merge, )
        args.num_train = len(train_examples[0])
    else:
        # Evaluation-only path: apply a previously fitted preprocessing.
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_data(args.dev_file,
                                       args.max_dev,
                                       relabeling=args.relabeling)
        dev_fea_flat_np = FeaExtract(dev_examples[-1])
        dev_fea_flat_np2 = PrepocessingApply_func(dev_fea_flat_np)
        dev_fea_merge = FeaMerge(dev_fea_flat_np2, dev_examples[-1])
        dev_examples = dev_examples[:-1] + (dev_fea_merge, )

    args.num_dev = len(dev_examples[0])
    # Width of the auxiliary feature vector (5th element of each example).
    args.mea_num = dev_examples[4][0].shape[-1]

    logging.info('-' * 50)
    logging.info('Build dictionary..')
    # Vocabulary is restored from a previously built dictionary, not rebuilt.
    word_dict = pickle.load(open("../../obj/dict.pkl", "rb"))
    logging.info('-' * 50)
    embeddings = utils.gen_embeddings(word_dict, args.embedding_size,
                                      args.embedding_file)
    (args.vocab_size, args.embedding_size) = embeddings.shape
    logging.info('Compile functions..')
    train_fn, test_fn, params, all_params = build_fn(args, embeddings)
    logging.info('Done.')
    logging.info('-' * 50)
    logging.info(args)

    logging.info('-' * 50)
    logging.info('Initial test..')
    dev_x1, dev_x2, dev_x3, dev_y, dev_x4 = utils.vectorize(
        dev_examples,
        word_dict,
        sort_by_len=not args.test_only,
        concat=args.concat)
    assert len(dev_x1) == args.num_dev
    all_dev = gen_examples(dev_x1, dev_x2, dev_x3, dev_y, dev_x4,
                           args.batch_size, args.concat)
    dev_acc, prediction = eval_acc(test_fn, all_dev)

    logging.info('Dev accuracy: %.2f %%' % dev_acc.mean())
    print(dev_acc.mean())

    best_dev_acc = dev_acc
    best_train_acc = 0
    if args.test_only:
        return dev_acc, best_train_acc
    # Save an initial checkpoint so a model file always exists.
    utils.save_params(args.model_file, all_params, epoch=0, n_updates=0)

    # Training
    logging.info('-' * 50)
    logging.info('Start training..')
    train_x1, train_x2, train_x3, train_y, train_x4 = utils.vectorize(
        train_examples, word_dict, concat=args.concat)
    assert len(train_x1) == args.num_train
    start_time = time.time()
    n_updates = 0

    all_train = gen_examples(train_x1, train_x2, train_x3, train_y, train_x4,
                             args.batch_size, args.concat)
    for epoch in range(args.num_epoches):
        np.random.shuffle(all_train)
        for idx, (mb_x1, mb_mask1, mb_x2, mb_mask2, mb_x3, mb_mask3, mb_y,
                  mb_x4, mb_mask4) in enumerate(all_train):

            # NOTE(review): x2 (question) inputs are not fed to train_fn here;
            # presumably build_fn's training function takes only x1/x3/y/x4 —
            # confirm against build_fn's signature.
            train_loss = train_fn(mb_x1, mb_mask1, mb_x3, mb_mask3, mb_y,
                                  mb_x4)
            n_updates += 1

            if n_updates % args.eval_iter == 0:
                print([x.get_value() for x in params])
                print([x.get_value() for x in all_params])
                # Evaluate on a train sample the same size as the dev set.
                samples = sorted(
                    np.random.choice(args.num_train,
                                     min(args.num_train, args.num_dev),
                                     replace=False))
                # x3 holds 4 options per question, hence the k*4+o indexing.
                sample_train = gen_examples(
                    [train_x1[k]
                     for k in samples], [train_x2[k] for k in samples],
                    [train_x3[k * 4 + o] for k in samples
                     for o in range(4)], [train_y[k] for k in samples],
                    [train_x4[k]
                     for k in samples], args.batch_size, args.concat)
                acc, pred = eval_acc(test_fn, sample_train)
                logging.info('Train accuracy: %.2f %%' % acc)
                train_acc, pred = eval_acc(test_fn, all_train)
                logging.info('train accuracy: %.2f %%' % train_acc)
                dev_acc, pred = eval_acc(test_fn, all_dev)
                logging.info('Dev accuracy: %.2f %%' % dev_acc)
                if dev_acc > best_dev_acc:
                    best_dev_acc = dev_acc
                    logging.info(
                        'Best dev accuracy: epoch = %d, n_updates = %d, acc = %.2f %%'
                        % (epoch, n_updates, best_dev_acc))
                    best_train_acc = acc
                    logging.info(
                        'Best train accuracy: epoch = %d, n_updates = %d, acc = %.2f %%'
                        % (epoch, n_updates, best_train_acc))
                    utils.save_params(
                        args.model_file,
                        all_params,
                        epoch=epoch,
                        n_updates=n_updates,
                    )

    return best_dev_acc, best_train_acc
コード例 #3
0
def main(args):
    """Train and evaluate a RACE baseline reader.

    Loads train/dev data (capped at 100 examples each in debug mode),
    restores the vocabulary from a pickled dictionary, builds the model,
    runs an initial dev evaluation, then trains, checkpointing whenever
    dev accuracy improves.

    Returns:
        ``None``. Progress and best accuracies are reported via logging,
        and the best parameters are written to ``args.model_file``.
    """
    logging.info('-' * 50)
    logging.info('Load data files..')
    question_belong = []
    if args.debug:
        # Debug mode: cap both splits at 100 examples for a fast run.
        logging.info('*' * 10 + ' Train')
        train_examples = utils.load_data(args.train_file,
                                         100,
                                         relabeling=args.relabeling)
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_data(args.dev_file,
                                       100,
                                       relabeling=args.relabeling,
                                       question_belong=question_belong)
    else:
        logging.info('*' * 10 + ' Train')
        train_examples = utils.load_data(args.train_file,
                                         relabeling=args.relabeling)
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_data(args.dev_file,
                                       args.max_dev,
                                       relabeling=args.relabeling,
                                       question_belong=question_belong)

    args.num_train = len(train_examples[0])
    args.num_dev = len(dev_examples[0])

    logging.info('-' * 50)
    logging.info('Build dictionary..')
    # Vocabulary is restored from a previously built dictionary, not rebuilt.
    word_dict = pickle.load(open("../obj/dict.pkl", "rb"))
    logging.info('-' * 50)
    embeddings = utils.gen_embeddings(word_dict, args.embedding_size,
                                      args.embedding_file)  # EMBEDDING
    (args.vocab_size, args.embedding_size) = embeddings.shape
    logging.info('Compile functions..')
    train_fn, test_fn, params, all_params = build_fn(args, embeddings)
    logging.info('Done.')
    logging.info('-' * 50)
    logging.info(args)

    logging.info('-' * 50)
    logging.info('Initial test..')
    dev_x1, dev_x2, dev_x3, dev_y = utils.vectorize(
        dev_examples,
        word_dict,
        sort_by_len=not args.test_only,
        concat=args.concat)
    assert len(dev_x1) == args.num_dev
    all_dev = gen_examples(dev_x1, dev_x2, dev_x3, dev_y, args.batch_size,
                           args.concat)
    dev_acc, pred = eval_acc(test_fn, all_dev)
    logging.info('Dev accuracy: %.2f %%' % dev_acc)
    best_acc = dev_acc
    if args.test_only:
        return
    # Save an initial checkpoint so a model file always exists.
    utils.save_params(args.model_file, all_params, epoch=0, n_updates=0)

    # Training
    logging.info('-' * 50)
    logging.info('Start training..')
    train_x1, train_x2, train_x3, train_y = utils.vectorize(train_examples,
                                                            word_dict,
                                                            concat=args.concat)
    assert len(train_x1) == args.num_train
    start_time = time.time()
    n_updates = 0

    all_train = gen_examples(train_x1, train_x2, train_x3, train_y,
                             args.batch_size, args.concat)
    for epoch in range(args.num_epoches):
        np.random.shuffle(all_train)
        for idx, (mb_x1, mb_mask1, mb_x2, mb_mask2, mb_x3, mb_mask3,
                  mb_y) in enumerate(all_train):

            train_loss = train_fn(mb_x1, mb_mask1, mb_x2, mb_mask2, mb_x3,
                                  mb_mask3, mb_y)
            if idx % 100 == 0:
                logging.info('#Examples = %d, max_len = %d' %
                             (len(mb_x1), mb_x1.shape[1]))
                logging.info(
                    'Epoch = %d, iter = %d (max = %d), loss = %.2f, elapsed time = %.2f (s)'
                    % (epoch, idx, len(all_train), train_loss,
                       time.time() - start_time))
            n_updates += 1

            if n_updates % args.eval_iter == 0:
                # Evaluate on a train sample the same size as the dev set.
                samples = sorted(
                    np.random.choice(args.num_train,
                                     min(args.num_train, args.num_dev),
                                     replace=False))
                # x3 holds 4 options per question, hence the k*4+o indexing.
                sample_train = gen_examples(
                    [train_x1[k]
                     for k in samples], [train_x2[k] for k in samples],
                    [train_x3[k * 4 + o] for k in samples
                     for o in range(4)], [train_y[k] for k in samples],
                    args.batch_size, args.concat)
                acc, pred = eval_acc(test_fn, sample_train)
                logging.info('Train accuracy: %.2f %%' % acc)
                dev_acc, pred = eval_acc(test_fn, all_dev)
                logging.info('Dev accuracy: %.2f %%' % dev_acc)
                if dev_acc > best_acc:
                    best_acc = dev_acc
                    logging.info(
                        'Best dev accuracy: epoch = %d, n_updates = %d, acc = %.2f %%'
                        % (epoch, n_updates, dev_acc))
                    utils.save_params(args.model_file,
                                      all_params,
                                      epoch=epoch,
                                      n_updates=n_updates)
コード例 #4
0
ファイル: main.py プロジェクト: BinbinBian/rc-cnn-dailymail
def main(args):
    """Train and evaluate an entity-prediction reader (CNN/Daily Mail style).

    Builds the vocabulary and entity-marker dictionary from the training
    data, runs an initial dev evaluation, then trains, checkpointing
    whenever dev accuracy improves.

    Fix: inside the eval loop the original computed dev accuracy inline in
    the logging call but never reassigned ``dev_acc``, so the
    ``dev_acc > best_acc`` comparison always used the stale initial value
    and best-model checkpoints were never taken after the first pass.
    """
    logging.info('-' * 50)
    logging.info('Load data files..')

    if args.debug:
        # Debug mode: cap both splits at 100 examples for a fast run.
        logging.info('*' * 10 + ' Train')
        train_examples = utils.load_data(args.train_file, 100, relabeling=args.relabeling)
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_data(args.dev_file, 100, relabeling=args.relabeling)
    else:
        logging.info('*' * 10 + ' Train')
        train_examples = utils.load_data(args.train_file, relabeling=args.relabeling)
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_data(args.dev_file, args.max_dev, relabeling=args.relabeling)

    args.num_train = len(train_examples[0])
    args.num_dev = len(dev_examples[0])

    logging.info('-' * 50)
    logging.info('Build dictionary..')
    word_dict = utils.build_dict(train_examples[0] + train_examples[1])
    # Labels are entity markers; index 0 is reserved for unknown entities.
    entity_markers = list(set([w for w in word_dict.keys()
                              if w.startswith('@entity')] + train_examples[2]))
    entity_markers = ['<unk_entity>'] + entity_markers
    entity_dict = {w: index for (index, w) in enumerate(entity_markers)}
    logging.info('Entity markers: %d' % len(entity_dict))
    args.num_labels = len(entity_dict)

    logging.info('-' * 50)
    # Load embedding file
    embeddings = utils.gen_embeddings(word_dict, args.embedding_size, args.embedding_file)
    (args.vocab_size, args.embedding_size) = embeddings.shape
    logging.info('Compile functions..')
    train_fn, test_fn, params = build_fn(args, embeddings)
    logging.info('Done.')

    logging.info('-' * 50)
    logging.info(args)

    logging.info('-' * 50)
    logging.info('Initial test..')
    dev_x1, dev_x2, dev_l, dev_y = utils.vectorize(dev_examples, word_dict, entity_dict)
    assert len(dev_x1) == args.num_dev
    all_dev = gen_examples(dev_x1, dev_x2, dev_l, dev_y, args.batch_size)
    dev_acc = eval_acc(test_fn, all_dev)
    logging.info('Dev accuracy: %.2f %%' % dev_acc)
    best_acc = dev_acc

    if args.test_only:
        return

    # Save an initial checkpoint so a model file always exists.
    utils.save_params(args.model_file, params, epoch=0, n_updates=0)

    # Training
    logging.info('-' * 50)
    logging.info('Start training..')
    train_x1, train_x2, train_l, train_y = utils.vectorize(train_examples, word_dict, entity_dict)
    assert len(train_x1) == args.num_train
    start_time = time.time()
    n_updates = 0

    all_train = gen_examples(train_x1, train_x2, train_l, train_y, args.batch_size)
    for epoch in range(args.num_epoches):
        np.random.shuffle(all_train)
        for idx, (mb_x1, mb_mask1, mb_x2, mb_mask2, mb_l, mb_y) in enumerate(all_train):
            logging.info('#Examples = %d, max_len = %d' % (len(mb_x1), mb_x1.shape[1]))
            train_loss = train_fn(mb_x1, mb_mask1, mb_x2, mb_mask2, mb_l, mb_y)
            logging.info('Epoch = %d, iter = %d (max = %d), loss = %.2f, elapsed time = %.2f (s)' %
                         (epoch, idx, len(all_train), train_loss, time.time() - start_time))
            n_updates += 1

            if n_updates % args.eval_iter == 0:
                # Evaluate on a train sample the same size as the dev set.
                samples = sorted(np.random.choice(args.num_train, min(args.num_train, args.num_dev),
                                                  replace=False))
                sample_train = gen_examples([train_x1[k] for k in samples],
                                            [train_x2[k] for k in samples],
                                            train_l[samples],
                                            [train_y[k] for k in samples],
                                            args.batch_size)
                logging.info('Train accuracy: %.2f %%' % eval_acc(test_fn, sample_train))
                # Refresh dev_acc before the best-model comparison (bug fix:
                # the original never reassigned it inside the loop).
                dev_acc = eval_acc(test_fn, all_dev)
                logging.info('Dev accuracy: %.2f %%' % dev_acc)
                if dev_acc > best_acc:
                    best_acc = dev_acc
                    logging.info('Best dev accuracy: epoch = %d, n_updates = %d, acc = %.2f %%'
                                 % (epoch, n_updates, dev_acc))
                    utils.save_params(args.model_file, params, epoch=epoch, n_updates=n_updates)
コード例 #5
0
def main(args):
    """Train and evaluate an entity-prediction reader.

    Loads train/dev data (capped at 100 examples each in debug mode),
    builds the vocabulary and entity-marker label dictionary from the
    training data, runs an initial dev evaluation, then trains,
    checkpointing whenever dev accuracy improves.
    """
    logging.info('-' * 50)
    logging.info('Load data files..')

    if args.debug:
        # Debug mode: cap both splits at 100 examples for a fast run.
        logging.info('*' * 10 + ' Train')
        train_examples = utils.load_data(args.train_file,
                                         100,
                                         relabeling=args.relabeling)
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_data(args.dev_file,
                                       100,
                                       relabeling=args.relabeling)
    else:
        logging.info('*' * 10 + ' Train')
        train_examples = utils.load_data(args.train_file,
                                         relabeling=args.relabeling)
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_data(args.dev_file,
                                       args.max_dev,
                                       relabeling=args.relabeling)

    args.num_train = len(train_examples[0])
    args.num_dev = len(dev_examples[0])

    logging.info('-' * 50)
    logging.info('Build dictionary..')
    word_dict = utils.build_dict(train_examples[0] + train_examples[1])
    # Labels are entity markers; index 0 is reserved for unknown entities.
    entity_markers = list(
        set([w for w in word_dict.keys() if w.startswith('@entity')] +
            train_examples[2]))
    entity_markers = ['<unk_entity>'] + entity_markers
    entity_dict = {w: index for (index, w) in enumerate(entity_markers)}
    logging.info('Entity markers: %d' % len(entity_dict))
    args.num_labels = len(entity_dict)

    logging.info('-' * 50)
    # Load embedding file
    embeddings = utils.gen_embeddings(word_dict, args.embedding_size,
                                      args.embedding_file)
    # gen_embeddings may change the effective vocab/embedding sizes.
    (args.vocab_size, args.embedding_size) = embeddings.shape
    logging.info('Compile functions..')
    train_fn, test_fn, params = build_fn(args, embeddings)
    logging.info('Done.')
    # Early exit for callers that only want the compiled model functions.
    if args.prepare_model:
        return train_fn, test_fn, params

    logging.info('-' * 50)
    logging.info(args)

    logging.info('-' * 50)
    logging.info('Intial test..')
    dev_x1, dev_x2, dev_l, dev_y = utils.vectorize(dev_examples, word_dict,
                                                   entity_dict)
    assert len(dev_x1) == args.num_dev
    all_dev = gen_examples(dev_x1, dev_x2, dev_l, dev_y, args.batch_size)
    dev_acc = eval_acc(test_fn, all_dev)
    logging.info('Dev accuracy: %.2f %%' % dev_acc)
    best_acc = dev_acc

    if args.test_only:
        return

    # Save an initial checkpoint so a model file always exists.
    utils.save_params(args.model_file, params, epoch=0, n_updates=0)

    # Training
    logging.info('-' * 50)
    logging.info('Start training..')
    train_x1, train_x2, train_l, train_y = utils.vectorize(
        train_examples, word_dict, entity_dict)
    assert len(train_x1) == args.num_train
    start_time = time.time()
    n_updates = 0

    all_train = gen_examples(train_x1, train_x2, train_l, train_y,
                             args.batch_size)
    for epoch in range(args.num_epoches):
        np.random.shuffle(all_train)
        for idx, (mb_x1, mb_mask1, mb_x2, mb_mask2, mb_l,
                  mb_y) in enumerate(all_train):
            logging.info('#Examples = %d, max_len = %d' %
                         (len(mb_x1), mb_x1.shape[1]))
            train_loss = train_fn(mb_x1, mb_mask1, mb_x2, mb_mask2, mb_l, mb_y)
            logging.info(
                'Epoch = %d, iter = %d (max = %d), loss = %.2f, elapsed time = %.2f (s)'
                % (epoch, idx, len(all_train), train_loss,
                   time.time() - start_time))
            n_updates += 1

            if n_updates % args.eval_iter == 0:
                # Evaluate on a train sample the same size as the dev set.
                samples = sorted(
                    np.random.choice(args.num_train,
                                     min(args.num_train, args.num_dev),
                                     replace=False))
                sample_train = gen_examples([train_x1[k] for k in samples],
                                            [train_x2[k] for k in samples],
                                            train_l[samples],
                                            [train_y[k] for k in samples],
                                            args.batch_size)
                logging.info('Train accuracy: %.2f %%' %
                             eval_acc(test_fn, sample_train))
                dev_acc = eval_acc(test_fn, all_dev)
                logging.info('Dev accuracy: %.2f %%' % dev_acc)
                if dev_acc > best_acc:
                    best_acc = dev_acc
                    logging.info(
                        'Best dev accuracy: epoch = %d, n_udpates = %d, acc = %.2f %%'
                        % (epoch, n_updates, dev_acc))
                    utils.save_params(args.model_file,
                                      params,
                                      epoch=epoch,
                                      n_updates=n_updates)
コード例 #6
0
def main(args):
    logging.info('-' * 50)
    logging.info('Build dictionary..')
    word_dict = pickle.load(open("../../obj/dict.pkl", "rb"))
    logging.info('-' * 50)
    embeddings = utils.gen_embeddings(word_dict, args.embedding_size,
                                      args.embedding_file)
    (args.vocab_size, args.embedding_size) = embeddings.shape

    logging.info('-' * 50)
    logging.info('Load data files..')
    best_dev_acc_vals = []
    best_train_acc_vals = []
    best_all_acc_vals = []
    best_n_updates_vals = []
    for val_id in range(args.cross_val):
        logging.info('Compile functions..')
        train_fn, test_fn, params, all_params = build_fn(args, embeddings)
        logging.info('Done.')
        logging.info('-' * 50)
        logging.info(args)
        if not (args.test_only):
            logging.info('*' * 10 + ' All')
            all_examples = utils.load_data(args.all_file,
                                           args,
                                           relabeling=args.relabeling)
            sample_index = np.arange(len(all_examples[0]))
            #            dev_ratio = args.dev_ratio
            #            random.seed(args.random_seed)
            #            dev_index= random.sample(sample_index, int(dev_ratio*len(sample_index)))
            val_sample_num = len(sample_index) * (1. / args.cross_val)
            if (val_id + 1) == args.cross_val:
                dev_index = sample_index[int(val_id * val_sample_num):]
            else:
                dev_index = sample_index[int(val_id * val_sample_num):int(
                    (val_id + 1) * val_sample_num)]
            train_index = np.setdiff1d(sample_index, dev_index)
            dev_examples = tuple_part(all_examples, dev_index)
            train_examples = tuple_part(all_examples, train_index)
            #feature preprocessing
            train_fea_flat_np = FeaExtract(train_examples[-1])
            dev_fea_flat_np = FeaExtract(dev_examples[-1])
            train_fea_flat_np2, dev_fea_flat_np2 = Prepocessing_func(
                train_fea_flat_np, dev_fea_flat_np, args)
            #            train_fea_flat_np2 = train_fea_flat_np
            #            dev_fea_flat_np2 = dev_fea_flat_np
            train_fea_merge = FeaMerge(train_fea_flat_np2, train_examples[-1])
            dev_fea_merge = FeaMerge(dev_fea_flat_np2, dev_examples[-1])
            train_examples = train_examples[:-1] + (train_fea_merge, )
            dev_examples = dev_examples[:-1] + (dev_fea_merge, )
            args.num_train = len(train_examples[0])
        else:
            logging.info('*' * 10 + ' Dev')
            dev_examples = utils.load_data(args.dev_file,
                                           args,
                                           args.max_dev,
                                           relabeling=args.relabeling)
            dev_fea_flat_np = FeaExtract(dev_examples[-1])
            dev_fea_flat_np2 = PrepocessingApply_func(dev_fea_flat_np, args)
            dev_fea_flat_np2 = dev_fea_flat_np
            dev_fea_merge = FeaMerge(dev_fea_flat_np2, dev_examples[-1])
            dev_examples = dev_examples[:-1] + (dev_fea_merge, )
        args.num_dev = len(dev_examples[0])

        logging.info('-' * 50)
        logging.info('Intial test..')
        dev_x1, dev_x2, dev_x3, dev_y, dev_x4 = utils.vectorize(
            dev_examples,
            word_dict,
            sort_by_len=not args.test_only,
            concat=args.concat)
        word_dict_r = {}
        word_dict_r[0] = "unk"
        assert len(dev_x1) == args.num_dev
        all_dev = gen_examples(dev_x1, dev_x2, dev_x3, dev_y, dev_x4,
                               args.batch_size, args.concat)
        dev_acc, _, prediction, test_probs, weights, _ = eval_acc(
            test_fn, all_dev)
        logging.info('Dev accuracy: %.2f %%' % dev_acc.mean())
        print(dev_acc.mean())

        best_dev_acc = dev_acc
        best_train_acc = 0
        best_all_acc = 0
        if args.test_only:
            best_dev_acc_vals.append(best_dev_acc)
            best_train_acc_vals.append(best_train_acc)
            best_all_acc_vals.append(best_all_acc)
            best_n_updates_vals.append(0)
            return best_dev_acc_vals, best_train_acc_vals, best_all_acc_vals, best_n_updates_vals
        utils.save_params(args.model_file, all_params, epoch=0, n_updates=0)
        # Training
        logging.info('-' * 50)
        logging.info('Start training..')
        train_x1, train_x2, train_x3, train_y, train_x4 = utils.vectorize(
            train_examples, word_dict, concat=args.concat)
        assert len(train_x1) == args.num_train
        start_time = time.time()
        n_updates = 0

        all_train = gen_examples(train_x1, train_x2, train_x3, train_y,
                                 train_x4, args.batch_size, args.concat)

        ini_train_acc, ini_train_label, _, ini_train_probs, train_weight, train_loss = eval_acc(
            test_fn, all_train)
        logging.info('initial train accuracy: acc = %.2f %%' % (ini_train_acc))
        pickle.dump(
            {
                'train_acc': ini_train_acc,
                'train_label': ini_train_label,
                'train_probs': ini_train_probs
            }, open('ini.pickle', 'wb'))

        ini_dev_acc, _, _, ini_dev_probs, _, _ = eval_acc(test_fn, all_dev)
        logging.info('initial dev accuracy: acc = %.2f %%' % (ini_dev_acc))
        ini_all_acc, _, _, ini_all_probs, _, _ = eval_acc(
            test_fn, all_train + all_dev)
        logging.info('initial all accuracy: acc = %.2f %%' % (ini_all_acc))
        best_dev_acc = 0
        best_n_updates = n_updates + 0
        fail_update_num = 0
        break_epoch = False
        loss_curve = [train_loss]
        train_acc_curve = [ini_train_acc]
        dev_acc_curve = [ini_dev_acc]
        weight_curve = []
        weight_curve.append(train_weight)
        para_curve = []
        para_curve.append([x.get_value() for x in params])
        logging.info([x.get_value() for x in params])
        for epoch in range(args.num_epoches):
            if break_epoch:
                fig_train.savefig(args.Type + str(val_id) + '.png')
                break


#            np.random.shuffle(all_train)
            for idx, (mb_x1, mb_mask1, mb_x2, mb_mask2, mb_x3, mb_mask3, mb_y,
                      mb_x4, mb_mask4) in enumerate(all_train):
                #early stopping
                if fail_update_num > args.update_fail_tol:
                    break_epoch = True
                    break

                train_loss = train_fn(mb_x1, mb_mask1, mb_x3, mb_mask3, mb_y,
                                      mb_x4)
                #if idx % 100 == 0:
                #                if n_updates % int(args.eval_iter/4) == 0:
                #                if n_updates % args.eval_iter == 0:
                if epoch % args.eval_iter == 0:
                    logging.info('#Examples = %d, max_len = %d' %
                                 (len(mb_x1), mb_x1.shape[1]))
                    logging.info(
                        'Epoch = %d, iter = %d (max = %d), loss = %.2f, elapsed time = %.2f (s)'
                        % (epoch, idx, len(all_train), train_loss,
                           time.time() - start_time))
                n_updates += 1

                #                if n_updates % args.eval_iter == 0:
                if epoch % args.eval_iter == 0:
                    logging.info([x.get_value() for x in params])
                    #print([x.get_value() for x in all_params])
                    samples = sorted(
                        np.random.choice(args.num_train,
                                         min(args.num_train, args.num_dev),
                                         replace=False))
                    sample_train = gen_examples([train_x1[k] for k in samples],
                                                [train_x2[k]
                                                 for k in samples], [
                                                     train_x3[k * 4 + o]
                                                     for k in samples
                                                     for o in range(4)
                                                 ],
                                                [train_y[k] for k in samples],
                                                [train_x4[k] for k in samples],
                                                args.batch_size, args.concat)
                    acc, _, pred, _, _, _ = eval_acc(test_fn, sample_train)
                    #logging.info('Train accuracy: %.2f %%' % acc)
                    train_acc, train_label, pred, train_probs, train_weight, train_loss = eval_acc(
                        test_fn, all_train)
                    logging.info('train accuracy: %.2f %%' % train_acc)
                    dev_acc, _, pred, _, weights, _ = eval_acc(
                        test_fn, all_dev)
                    logging.info('Dev accuracy: %.2f %%' % dev_acc)
                    all_acc, _, _, _, _, _ = eval_acc(test_fn,
                                                      all_train + all_dev)
                    logging.info('All accuracy: %.2f %%' % all_acc)
                    if args.show_loss:
                        loss_curve.append(train_loss)
                        train_acc_curve.append(train_acc)
                        dev_acc_curve.append(dev_acc)
                        if (epoch % (20 * args.eval_iter) == 0) & (idx == 0):
                            fig_train = plt.figure(num='training')
                            plt.clf()
                            plt.subplot(121)
                            plt.plot(-np.array(loss_curve), '.--')
                            plt.title('train_loss')
                            plt.subplot(122)
                            plt.title('accuracy')
                            plt.plot(train_acc_curve, '.--', label='train_acc')
                            plt.plot(dev_acc_curve, '.--', label='dev_acc')
                            plt.legend()
                            plt.pause(1)

                        weight_curve.append(train_weight)
                        para_curve.append([x.get_value() for x in params])
                        if (epoch % (20 * args.eval_iter) == 0) & (idx == 0):
                            fig_train_para = plt.figure(num='para')
                            plt.clf()
                            plt.subplot(121)
                            mlp_w_curve = np.array([
                                para_tmp[0].reshape((-1))
                                for para_tmp in para_curve
                            ])
                            mlp_w_curve0 = np.concatenate(
                                (mlp_w_curve[:, :-5], mlp_w_curve[:, -4:]),
                                axis=1)
                            mlp_w_curve1 = mlp_w_curve[:, -5]
                            plt.plot(mlp_w_curve0, '.--', color='grey')
                            plt.plot(mlp_w_curve1, '.--', color='k')
                            mlp_b_curve = np.array([
                                para_tmp[1].reshape((-1))
                                for para_tmp in para_curve
                            ])
                            plt.plot(mlp_b_curve, '.--', color='r')
                            plt.title('mlp_w')

                            plt.subplot(122)
                            word_weight = np.array([
                                np.array(weight_tmp[0]).reshape((-1))
                                for weight_tmp in weight_curve
                            ])
                            plt.plot(word_weight, '.--')
                            plt.title('word_weight')
                            plt.pause(1)

                    #print(weights[0])
                    if dev_acc > best_dev_acc:
                        fail_update_num = 0
                        best_n_updates = n_updates + 0
                        best_train_acc = train_acc
                        logging.info(
                            'Best train accuracy: epoch = %d, n_udpates = %d, acc = %.2f %%'
                            % (epoch, n_updates, best_train_acc))
                        best_dev_acc = dev_acc
                        logging.info(
                            'Best dev accuracy: epoch = %d, n_udpates = %d, acc = %.2f %%'
                            % (epoch, n_updates, best_dev_acc))
                        best_all_acc = all_acc
                        logging.info(
                            'Best all accuracy: epoch = %d, n_udpates = %d, acc = %.2f %%'
                            % (epoch, n_updates, best_all_acc))
                        utils.save_params(
                            args.model_file,
                            all_params,
                            epoch=epoch,
                            n_updates=n_updates,
                        )
                    else:
                        #updates that dev accuracy does not increase
                        fail_update_num = fail_update_num + 1
                    logging.info('failed updates: ' + str(fail_update_num))
                    logging.info('*' * 50)

        best_dev_acc_vals.append(best_dev_acc)
        best_train_acc_vals.append(best_train_acc)
        best_all_acc_vals.append(best_all_acc)
        best_n_updates_vals.append(best_n_updates)
        fig_train.savefig(args.Type + str(val_id) + '.png')
        fig_train_para.savefig(args.Type + str(val_id) + '_para.png')

        pickle.dump(
            {
                'loss_curve': loss_curve,
                'train_acc_curve': train_acc_curve,
                'dev_acc_curve': dev_acc_curve,
                'weight_curve': weight_curve,
                'para_curve': para_curve
            }, open('final.pickle', 'wb'))
    return best_dev_acc_vals, best_train_acc_vals, best_all_acc_vals, best_n_updates_vals
Code example #7
0
def main(args):
    """Train and evaluate an entity-prediction cloze-style reader.

    Pipeline: load train/dev examples, build the word dictionary from the
    training documents and questions, derive the entity-label dictionary,
    compile train/test functions via ``build_fn``, run an initial dev
    evaluation (optionally writing predictions and invoking an external
    scorer), then train with periodic evaluation, checkpointing whenever dev
    accuracy improves and early-stopping once none of the last three epochs
    improved (after more than 25 epochs).

    Args:
        args: parsed command-line namespace.  Reads (among others) ``debug``,
            ``cnn_train``, ``train_file``, ``dev_file``, ``relabeling``,
            ``remove_notfound``, ``max_dev``, ``max_words``,
            ``embedding_size``, ``embedding_file``, ``batch_size``,
            ``eval_iter``, ``num_epoches``, ``log_file``, ``test_only``,
            ``train_has_ids`` and ``dev_has_ids``; writes ``num_train``,
            ``num_dev``, ``num_labels``, ``vocab_size`` and
            ``embedding_size`` back onto it.
    """
    logging.info('-' * 50)
    logging.info('Load data files..')

    if args.debug:
        # Debug mode: tiny slices (5 train / 100 dev examples) for quick runs.
        logging.info('*' * 10 + ' Train')
        train_examples = utils.load_data(args.train_file, 5, relabeling=args.relabeling,
                                         remove_notfound=args.remove_notfound)
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_data(args.dev_file, 100, relabeling=args.relabeling,
                                       remove_notfound=False)
    #elif args.test_only:
    #    logging.info('*' * 10 + ' Train')
    #    #train_examples = utils.load_cnn_data(args.train_file, relabeling=args.relabeling)  # docs, qs, ans
    #    train_examples = utils.load_data(args.train_file, relabeling=args.relabeling, remove_notfound=args.remove_notfound)  # docs, qs, ans
    #    logging.info('*' * 10 + ' Dev')
    #    dev_examples = utils.load_data(args.dev_file, args.max_dev, relabeling=args.relabeling,
    #                                   remove_notfound=False)
    elif args.cnn_train:
        # CNN/Daily-Mail-style data path (separate loader; ids optional).
        logging.info('*' * 10 + ' Train')
        train_examples = utils.load_cnn_data(args.train_file, relabeling=args.relabeling, has_ids=args.train_has_ids)  # docs, qs, ans
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_cnn_data(args.dev_file, args.max_dev, relabeling=args.relabeling, has_ids=args.dev_has_ids)
    else:
        logging.info('*' * 10 + ' Train')
        train_examples = utils.load_data(args.train_file, relabeling=args.relabeling,
                                         remove_notfound=args.remove_notfound)  # docs, qs, ans
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_data(args.dev_file, args.max_dev, relabeling=args.relabeling,
                                       remove_notfound=False)

    # Each *_examples is indexed as [docs, questions, answers, ...].
    args.num_train = len(train_examples[0])
    args.num_dev = len(dev_examples[0])

    logging.info('-' * 50)
    logging.info('Build dictionary..')
    # Vocabulary is built from training docs + questions only (dev excluded).
    word_dict = utils.build_dict(train_examples[0] + train_examples[1],  # + dev_examples[0] + dev_examples[1],
                                 max_words=args.max_words)  # docs+qs
    # Label set: every '@entity*' token in the vocabulary plus every training
    # answer; index 0 is reserved for the unknown-entity fallback.
    entity_markers = list(set([w for w in word_dict.keys()
                              if w.startswith('@entity')] + train_examples[2]))
    entity_markers = ['<unk_entity>'] + entity_markers
    entity_dict = {w: index for (index, w) in enumerate(entity_markers)}
    inv_entity_dict = {index: w for w, index in entity_dict.items()}
    assert len(entity_dict) == len(inv_entity_dict)
    logging.info('Entity markers: %d' % len(entity_dict))
    args.num_labels = len(entity_dict)

    logging.info('-' * 50)
    # Load embedding file
    embeddings = utils.gen_embeddings(word_dict, args.embedding_size, args.embedding_file)
    # gen_embeddings may change the embedding size; record the actual shape.
    (args.vocab_size, args.embedding_size) = embeddings.shape
    logging.info('Compile functions..')
    train_fn, test_fn, params = build_fn(args, embeddings)
    logging.info('Done.')

    logging.info('-' * 50)
    logging.info(args)

    logging.info('-' * 50)
    logging.info('Intial test..')
    # Dev answers not found in the passage are always kept (remove_notfound=False).
    dev_x1, dev_x2, dev_l, dev_y, dev_ids = utils.vectorize(dev_examples, word_dict, entity_dict,
                                                   remove_notfound=False,
                                                   relabeling=args.relabeling)
    if dev_ids is not None:
        assert len(dev_y) == len(dev_ids)
    assert len(dev_x1) == args.num_dev
    all_dev = gen_examples(dev_x1, dev_x2, dev_l, dev_y, args.batch_size)
    dev_acc, dev_preds = eval_acc(test_fn, all_dev)

    if dev_ids is not None:
        # Map predicted label indices back to entity strings for output.
        assert len(dev_ids) == len(dev_preds) == len(dev_y)
        dev_preds_data = to_output_preds(dev_ids, dev_preds, inv_entity_dict, args.relabeling)
    logging.info('Dev accuracy: %.2f %%' % dev_acc)
    best_acc = dev_acc

    if args.log_file is not None:
        # Derive the run name from the log file and score predictions with
        # the external evaluator.
        assert args.log_file.endswith(".log")
        run_name = args.log_file[:args.log_file.find(".log")]
        if dev_ids is not None:
            preds_file_name = run_name + ".preds"
            utils.write_preds(dev_preds_data, preds_file_name)
            utils.external_eval(preds_file_name,
                                run_name + ".preds.scores",
                                eval_data="test" if "test" in os.path.basename(args.dev_file) else "dev")
    if args.test_only:
        return

    if args.log_file is not None:
        # Initial checkpoint so a model file exists before any training.
        utils.save_params(run_name + ".model", params, epoch=0, n_updates=0)

    # Training
    logging.info('-' * 50)
    logging.info('Start training..')
    train_x1, train_x2, train_l, train_y, train_ids = utils.vectorize(train_examples, word_dict, entity_dict,
                                                           remove_notfound=args.remove_notfound,
                                                           relabeling=args.relabeling)
    assert len(train_x1) == args.num_train
    start_time = time.time()
    n_updates = 0
    train_accs = []
    dev_accs = []
    all_train = gen_examples(train_x1, train_x2, train_l, train_y, args.batch_size)
    improved = []  # per-epoch flags: did dev accuracy improve this epoch?
    for epoch in range(args.num_epoches):
        ep_acc_improved = False
        np.random.shuffle(all_train)
        for idx, (mb_x1, mb_mask1, mb_x2, mb_mask2, mb_l, mb_y) in enumerate(all_train):
            logging.info('#Examples = %d, max_len = %d' % (len(mb_x1), mb_x1.shape[1]))
            train_loss = train_fn(mb_x1, mb_mask1, mb_x2, mb_mask2, mb_l, mb_y)
            logging.info('Epoch = %d, iter = %d (max = %d), loss = %.2f, elapsed time = %.2f (s)' %
                         (epoch, idx, len(all_train), train_loss, time.time() - start_time))
            n_updates += 1

            if n_updates % args.eval_iter == 0:
                # Evaluate on a random dev-sized sample of the training set
                # and on the full dev set.
                samples = sorted(np.random.choice(args.num_train, min(args.num_train, args.num_dev),
                                                  replace=False))
                # NOTE(review): train_l appears to be a numpy array (fancy
                # indexing with `samples`), unlike the list-indexed x1/x2/y.
                sample_train = gen_examples([train_x1[k] for k in samples],
                                            [train_x2[k] for k in samples],
                                            train_l[samples],
                                            [train_y[k] for k in samples],
                                            args.batch_size)
                train_acc, train_preds = eval_acc(test_fn, sample_train)
                train_accs.append(train_acc)
                logging.info('Train accuracy: %.2f %%' % train_acc)
                dev_acc, dev_preds = eval_acc(test_fn, all_dev)
                dev_accs.append(dev_acc)
                logging.info('Dev accuracy: %.2f %%' % dev_acc)
                utils.update_plot(args.eval_iter, train_accs, dev_accs, file_name=args.log_file + ".html")
                if dev_acc > best_acc:
                    # New best: checkpoint and re-score dev predictions.
                    ep_acc_improved = True
                    best_acc = dev_acc
                    logging.info('Best dev accuracy: epoch = %d, n_udpates = %d, acc = %.2f %%'
                                 % (epoch, n_updates, dev_acc))
                    if args.log_file is not None:
                        utils.save_params(run_name + ".model", params, epoch=epoch, n_updates=n_updates)
                        if dev_ids is not None:
                            dev_preds_data = to_output_preds(dev_ids, dev_preds, inv_entity_dict, args.relabeling)
                            utils.write_preds(dev_preds_data, preds_file_name)
                            utils.external_eval(preds_file_name, run_name + ".preds.scores", eval_data="dev")
        improved.append(ep_acc_improved)
        # Early stop: after 25+ epochs, quit when none of the last 3 epochs
        # improved dev accuracy.
        if len(improved) > 25 and sum(improved[-3:]) == 0:
            break
Code example #8
0
File: main.py  Project: pvk444/attentive-reader
def main(args):
    """Build, train and evaluate a TensorFlow (1.x) attentive reader.

    Constructs the whole graph inline: embedding lookup, bidirectional RNN
    encoders for document and question, a bilinear attention layer, and a
    dense softmax over entity labels (masked to entities present in the
    passage).  Then either restores a checkpoint and evaluates on the test
    set (``args.test_only``) or trains with SGD, saving a checkpoint and
    reporting dev accuracy every ``args.eval_iter`` updates.

    Args:
        args: parsed command-line namespace.  Reads ``debug``, ``train_file``,
            ``dev_file``, ``test_file``, ``test_only``, ``model_path``,
            ``rnn_type`` ('lstm' or 'gru'), ``hidden_size``, ``dropout_rate``,
            ``learning_rate``, ``embedding_size``, ``embedding_file``,
            ``batch_size``, ``eval_iter`` and ``num_epoches``; writes
            ``num_train``, ``num_dev``, ``num_labels``, ``vocab_size`` and
            ``embedding_size`` back onto it.
    """
    logging.info('-' * 50 + '')
    logging.info('Loading data...')
    if args.debug:
        # Debug mode: cap both splits at 100 examples.
        train_examples = utils.load_data(args.train_file, 100)
        dev_examples = utils.load_data(args.dev_file, 100)
    else:
        train_examples = utils.load_data(args.train_file)
        dev_examples = utils.load_data(args.dev_file)

    args.num_train = len(train_examples[1])
    args.num_dev = len(dev_examples[1])

    logging.info('-' * 50)
    logging.info('Building dictionary...')
    # Vocabulary from training docs + questions; label set from '@entity*'
    # tokens plus training answers, with index 0 as the unknown entity.
    word_dict = utils.build_dict(train_examples[0] + train_examples[1])
    entity_markers = list(
        set([w for w in word_dict.keys() if w.startswith('@entity')] +
            train_examples[2]))
    entity_markers = ['<entity_unk>'] + entity_markers
    entity_dict = {w: i for (i, w) in enumerate(entity_markers)}
    logging.info('# of Entity Markers: %d' % len(entity_dict))
    args.num_labels = len(entity_dict)

    logging.info('-' * 50)
    logging.info('Generating embedding...')
    embeddings = utils.gen_embeddings(word_dict, args.embedding_size,
                                      args.embedding_file)
    embeddings = embeddings.astype('float32')
    # gen_embeddings may change the embedding size; record the actual shape.
    args.vocab_size, args.embedding_size = embeddings.shape

    logging.info('-' * 50)
    logging.info('Creating TF computation graph...')

    # NOTE(review): if rnn_type is neither 'lstm' nor 'gru', the cell
    # variables below are never bound and a NameError follows — confirm that
    # the argument parser restricts the choices.
    if args.rnn_type == 'lstm':
        logging.info('Using LSTM Cells')
    elif args.rnn_type == 'gru':
        logging.info('Using GRU Cells')

    # tf.reset_default_graph()
    d_input = tf.placeholder(dtype=tf.int32,
                             shape=(None, None),
                             name="d_input")
    q_input = tf.placeholder(
        dtype=tf.int32, shape=(None, None),
        name="q_input")  # [batch_size, max_seq_length_for_batch]
    l_mask = tf.placeholder(dtype=tf.float32,
                            shape=(None, None),
                            name="l_mask")  # [batch_size, entity num]
    y = tf.placeholder(dtype=tf.int32, shape=None,
                       name="label")  # batch size vector
    y_1hot = tf.placeholder(
        dtype=tf.float32, shape=(None, None),
        name="label_1hot")  # onehot encoding of y [batch_size, entitydict]
    training = tf.placeholder(dtype=tf.bool)  # toggles dropout on/off

    # Embedding matrix initialized from the pre-trained vectors.
    word_embeddings = tf.get_variable(
        "glove",
        shape=(args.vocab_size, args.embedding_size),
        initializer=tf.constant_initializer(embeddings))

    # Bilinear attention weight between question and document encodings.
    W_bilinear = tf.Variable(
        tf.random_uniform((2 * args.hidden_size, 2 * args.hidden_size),
                          minval=-0.01,
                          maxval=0.01))

    with tf.variable_scope(
            'd_encoder'):  # Encoding Step for Passage (d_ for document)
        d_embed = tf.nn.embedding_lookup(
            word_embeddings, d_input
        )  # Apply embeddings: [batch, max passage length in batch, GloVe Dim]
        d_embed_dropout = tf.layers.dropout(
            d_embed, rate=args.dropout_rate,
            training=training)  # Apply Dropout to embedding layer
        if args.rnn_type == 'lstm':
            d_cell_fw = rnn.LSTMCell(args.hidden_size)
            d_cell_bw = rnn.LSTMCell(args.hidden_size)
        elif args.rnn_type == 'gru':
            d_cell_fw = rnn.GRUCell(
                args.hidden_size
            )  # TODO: kernel_initializer=tf.random_normal_initializer(0,0.1) not working for 1.1
            d_cell_bw = rnn.GRUCell(args.hidden_size)

        d_outputs, _ = tf.nn.bidirectional_dynamic_rnn(d_cell_fw,
                                                       d_cell_bw,
                                                       d_embed_dropout,
                                                       dtype=tf.float32)
        d_output = tf.concat(
            d_outputs, axis=-1
        )  # [batch, len, h], len is the max passage length, and h is the hidden size

    with tf.variable_scope('q_encoder'):  # Encoding Step for Question
        q_embed = tf.nn.embedding_lookup(word_embeddings, q_input)
        q_embed_dropout = tf.layers.dropout(q_embed,
                                            rate=args.dropout_rate,
                                            training=training)
        if args.rnn_type == 'lstm':
            q_cell_fw = rnn.LSTMCell(args.hidden_size)
            q_cell_bw = rnn.LSTMCell(args.hidden_size)
        elif args.rnn_type == 'gru':
            q_cell_fw = rnn.GRUCell(args.hidden_size)
            q_cell_bw = rnn.GRUCell(args.hidden_size)
        q_outputs, q_laststates = tf.nn.bidirectional_dynamic_rnn(
            q_cell_fw, q_cell_bw, q_embed_dropout, dtype=tf.float32)
        # Question encoding = concatenation of the final fw/bw states.
        if args.rnn_type == 'lstm':
            q_output = tf.concat([q_laststates[0][-1], q_laststates[1][-1]],
                                 axis=-1)  # (batch, h)
        elif args.rnn_type == 'gru':
            q_output = tf.concat(q_laststates, axis=-1)  # (batch, h)

    with tf.variable_scope('bilinear'):  # Bilinear Layer (Attention Step)
        # M computes the similarity between each passage word and the entire question encoding
        M = d_output * tf.expand_dims(tf.matmul(q_output, W_bilinear),
                                      axis=1)  # [batch, h] -> [batch, 1, h]
        # alpha represents the normalized weights representing how relevant the passage word is to the question
        alpha = tf.nn.softmax(tf.reduce_sum(M, axis=2))  # [batch, len]
        # this output contains the weighted combination of all contextual embeddings
        bilinear_output = tf.reduce_sum(d_output *
                                        tf.expand_dims(alpha, axis=2),
                                        axis=1)  # [batch, h]

    with tf.variable_scope('dense'):  # Prediction Step
        # the final output has dimension [batch, entity#], giving the probabilities of an entity being the answer for examples
        final_prob = tf.layers.dense(
            bilinear_output,
            units=args.num_labels,
            activation=tf.nn.softmax,
            kernel_initializer=tf.random_uniform_initializer(
                minval=-0.01, maxval=0.01))  # [batch, entity#]

    pred = final_prob * l_mask  # ignore entities that don't appear in the passage
    train_pred = pred / tf.expand_dims(
        tf.reduce_sum(pred, axis=1),
        axis=1)  # redistribute probabilities ignoring certain labels
    # Clip to keep the log in the cross-entropy numerically stable.
    train_pred = tf.clip_by_value(train_pred, 1e-7, 1.0 - 1e-7)

    test_pred = tf.cast(tf.argmax(pred, axis=-1), tf.int32)
    # `acc` is a count of correct predictions in the batch, not a rate.
    acc = tf.reduce_sum(tf.cast(tf.equal(test_pred, y), tf.int32))

    # Cross-entropy loss against the one-hot labels, SGD optimizer.
    loss_op = tf.reduce_mean(
        -tf.reduce_sum(y_1hot * tf.log(train_pred), reduction_indices=[1]))
    optimizer = tf.train.GradientDescentOptimizer(
        learning_rate=args.learning_rate)
    train_op = optimizer.minimize(loss_op)
    logging.info('Done!')

    logging.info('-' * 50)
    logging.info('Printing args...')
    logging.info(args)

    logging.info('-' * 50)
    logging.info('Initial Test...')
    dev_x1, dev_x2, dev_l, dev_y = utils.vectorize(dev_examples, word_dict,
                                                   entity_dict)
    all_dev = gen_examples(dev_x1, dev_x2, dev_l, dev_y, args.batch_size)

    dev_acc = 0.  # TODO: first dev accuracy displays here
    logging.info('Dev Accuracy: %.2f %%' % dev_acc)
    best_acc = dev_acc

    saver = tf.train.Saver()

    logging.info('-' * 50)
    logging.info('Testing...')
    if args.test_only:
        # NOTE(review): compares with `== None` (should be `is None`) and
        # *returns* the ValueError instead of raising it — confirm intent.
        if args.test_file == None:
            return ValueError("No test file specified")
        test_examples = utils.load_data(args.test_file)
        test_x1, test_x2, test_l, test_y = utils.vectorize(
            test_examples, word_dict, entity_dict)
        all_test = gen_examples(test_x1, test_x2, test_l, test_y,
                                args.batch_size)
        with tf.Session() as sess:
            # saver = tf.train.import_meta_graph(args.model_path + '.meta')
            saver.restore(sess, args.model_path)
            # TODO: which file to restore?

            # Accumulate correct-prediction counts over all test batches.
            correct = 0
            n_examples = 0
            for t_x1, t_mask1, t_x2, t_mask2, t_l, t_y in all_test:
                correct += sess.run(acc,
                                    feed_dict={
                                        d_input: t_x1,
                                        q_input: t_x2,
                                        y: t_y,
                                        l_mask: t_l,
                                        training: False
                                    })
                n_examples += len(t_x1)
            test_acc = correct * 100. / n_examples
            logging.info('Test Accuracy: %.2f %%' % test_acc)
        return

    logging.info('-' * 50)
    logging.info('Start training...')
    train_x1, train_x2, train_l, train_y = utils.vectorize(
        train_examples, word_dict, entity_dict)
    all_train = gen_examples(train_x1, train_x2, train_l, train_y,
                             args.batch_size)

    init = tf.global_variables_initializer()

    start_time = time.time()
    n_updates = 0
    with tf.Session() as sess:
        sess.run(init)
        for e in range(args.num_epoches):
            np.random.shuffle(all_train)
            for idx, (mb_x1, mb_mask1, mb_x2, mb_mask2, mb_l,
                      mb_y) in enumerate(all_train):
                logging.info(
                    'Batch Size = %d, # of Examples = %d, max_len = %d' %
                    (mb_x1.shape[0], len(mb_x1), mb_x1.shape[1]))

                y_label = np.zeros((mb_x1.shape[0], args.num_labels))
                for r, i in enumerate(
                        mb_y):  # convert (batch) -> (batch, entity_size)
                    y_label[r][i] = 1.

                _, train_loss = sess.run(
                    [train_op, loss_op],
                    feed_dict={
                        d_input: mb_x1,
                        q_input: mb_x2,
                        y_1hot: y_label,
                        l_mask: mb_l,
                        training: True
                    })
                logging.info(
                    'Epoch = %d, Iter = %d (max = %d), Loss = %.2f, Elapsed Time = %.2f (s)'
                    % (e, idx, len(all_train), train_loss,
                       time.time() - start_time))
                n_updates += 1

                # Periodic checkpoint + full dev evaluation.
                if n_updates % args.eval_iter == 0:
                    saver.save(sess, args.model_path, global_step=e)
                    correct = 0
                    n_examples = 0
                    for d_x1, d_mask1, d_x2, d_mask2, d_l, d_y in all_dev:
                        correct += sess.run(acc,
                                            feed_dict={
                                                d_input: d_x1,
                                                q_input: d_x2,
                                                y: d_y,
                                                l_mask: d_l,
                                                training: False
                                            })
                        n_examples += len(d_x1)
                    dev_acc = correct * 100. / n_examples
                    logging.info('Dev Accuracy: %.2f %%' % dev_acc)
                    if dev_acc > best_acc:
                        best_acc = dev_acc
                        logging.info(
                            'Best Dev Accuracy: epoch = %d, n_updates (iter) = %d, acc = %.2f %%'
                            % (e, n_updates, dev_acc))

        logging.info('-' * 50)
        logging.info('Training Finished...')
        logging.info("Model saved in file: %s" %
                     saver.save(sess, args.model_path))
Code example #9
0
File: newmain.py  Project: cltl/Profiling
def main(args):
    """Train and evaluate a multi-relation profiling classifier.

    Loads train/dev/test data, builds one dictionary per relation from the
    training set, compiles train/test functions via ``build_fn``, reports
    initial dev/test accuracy, then (unless ``args.test_only``) trains with
    periodic dev evaluation, checkpointing on improvement and exiting after
    ``MAX_NO_PROGRESS`` consecutive evaluations without improvement.

    Args:
        args: parsed command-line namespace.  Reads ``debug``, ``train_file``,
            ``dev_file``, ``test_file``, ``max_cat``, ``embedding_size``,
            ``batch_size``, ``eval_iter``, ``num_epoches``, ``model_file``,
            ``topk_accuracy``, ``test_print_allowed`` and ``test_only``;
            writes ``num_train``, ``num_dev``, ``relations`` and
            ``default_value`` back onto it.
    """
    logging.info('-' * 50)
    logging.info('Load data files..')
    if args.debug:
        # Debug mode: cap each split at 100 examples and reuse dev as test.
        logging.info('*' * 10 + ' Train')
        train_examples = utils.load_data(args.train_file, 100)
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_data(args.dev_file, 100)
        test_examples = dev_examples
    else:
        logging.info('*' * 10 + ' Train')
        train_examples = utils.load_data(args.train_file)
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_data(args.dev_file)
        test_examples = utils.load_data(args.test_file)
    args.num_train = len(train_examples)
    args.num_dev = len(dev_examples)
    args.relations = len(train_examples[0])

    logging.info('-' * 50)
    logging.info('Build dictionary..')
    # Fix: the dictionaries were previously built twice in a row (identical
    # logging + build_dict calls back to back); a single call is sufficient.
    word_dicts, inv_word_dicts = utils.build_dict(train_examples, args.max_cat)
    # Per-relation index of the empty-string (padding/unknown) entry.
    default_value = [word_dict[''] for word_dict in word_dicts]
    args.default_value = default_value
    embeddings = utils.gen_embeddings(word_dicts, args.embedding_size)
    train_fn, test_fn, params = build_fn(args, embeddings)
    logging.info('Done.')
    logging.info('-' * 50)
    logging.info(args)

    topk_acc = args.topk_accuracy

    labels_data = []
    if args.test_print_allowed:
        # NOTE(review): `labels_file` is not defined in this function —
        # presumably a module-level constant; confirm before relying on it.
        labels_data = pickle.load(open(labels_file, 'rb'))

    logging.info('-' * 50)
    logging.info('Intial test..')
    dev_data, dev_mask = utils.vectorize(dev_examples, word_dicts, args)
    all_dev = gen_examples(dev_data, dev_mask, args.batch_size)
    dev_acc = eval_acc(test_fn, all_dev, inv_word_dicts, topk_acc)
    logging.info('Dev accuracy: %s %%' % str(dev_acc))
    test_data, test_mask = utils.vectorize(test_examples, word_dicts, args, args.test_print_allowed, labels_data)
    all_test = gen_examples(test_data, test_mask, args.batch_size)
    test_acc = eval_acc(test_fn, all_test, inv_word_dicts, topk_acc, args.test_print_allowed, labels_data)
    logging.info('Test accuracy: %s %%' % str(test_acc))
    best_acc = dev_acc
    if args.test_only:
        return
    # Initial checkpoint so a model file exists even if training never
    # improves on the initial dev accuracy.
    utils.save_params(args.model_file, params, epoch=0, n_updates=0)

    # Training
    if args.num_epoches > 0:
        logging.info('-' * 50)
        logging.info('Start training..')
        train_data, train_mask = utils.vectorize(train_examples, word_dicts, args)
        start_time = time.time()
        n_updates = 0
        all_train_old = gen_examples(train_data, train_mask, args.batch_size)
        # Rebalance the minibatch list by oversampling.
        all_train = utils.oversample(all_train_old, args)
        no_progress = 0
    # When num_epoches <= 0 this loop body never runs, so the names defined
    # in the guarded block above are never referenced undefined.
    for epoch in range(args.num_epoches):
        np.random.shuffle(all_train)
        for idx, inps in enumerate(all_train):
            train_loss = train_fn(*inps)
            if idx % 1000 == 0:
                logging.info('Epoch = %d, iter = %d (max = %d), loss = %.2f, elapsed time = %.2f (s)' % (epoch, idx, len(all_train), train_loss, time.time() - start_time))
            n_updates += 1
            if n_updates % args.eval_iter == 0:
                # NOTE(review): the sampled-train-accuracy evaluation below is
                # disabled, but the sampling itself is kept so the global
                # numpy RNG state (and hence later shuffles) is unchanged.
                samples = sorted(np.random.choice(args.num_train, min(args.num_train, args.num_dev),
                                                  replace=False))
                train_data_sample = [train_data[j][samples] for j in range(args.relations)]
                train_mask_sample = [train_mask[j][samples] for j in range(args.relations)]
                sample_train = gen_examples(train_data_sample, train_mask_sample, args.batch_size)
                #acc = eval_acc(test_fn, sample_train)
                #logging.info('Train accuracy: %s %%' % str(acc))
                dev_acc = eval_acc(test_fn, all_dev, inv_word_dicts, topk_acc)
                logging.info('Dev accuracy: %s %%' % str(dev_acc))
                if dev_acc > best_acc:
                    best_acc = dev_acc
                    logging.info('Best dev accuracy!')
                    utils.save_params(args.model_file, params, epoch=epoch, n_updates=n_updates)
                    no_progress = 0
                else:
                    no_progress += 1
                    logging.info('Dev accuracy has not improved in the past %d evaluations' % no_progress)
                    if no_progress >= MAX_NO_PROGRESS:
                        logging.info("Reached the limit of stagnation. Exiting now...")
                        sys.exit(0)
Code example #10
0
def main(args):
    """End-to-end train/eval driver for the relation model.

    Loads the data splits, restores pre-built per-relation word
    dictionaries, compiles the train/test functions, runs an initial
    dev/test evaluation, then (unless ``args.test_only``) oversamples
    minibatches that lack positive examples for some relation and trains,
    checkpointing whenever dev accuracy improves and exiting after
    ``MAX_NO_PROGRESS`` evaluations without improvement.
    """
    logging.info('-' * 50)
    logging.info('Load data files..')
    if args.debug:
        # Debug mode: cap splits at 100 examples and reuse dev as test.
        logging.info('*' * 10 + ' Train')
        train_examples = utils.load_data(args.train_file, False, 100)
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_data(args.dev_file, False, 100)
        test_examples = dev_examples
    else:
        logging.info('*' * 10 + ' Train')
        train_examples = utils.load_data(args.train_file, False)
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_data(args.dev_file, False)
        test_examples = utils.load_data(args.test_file, False)
    args.num_train = len(train_examples)
    args.num_dev = len(dev_examples)
    #args.relations = len(train_examples[0])

    logging.info('-' * 50)
    logging.info('Build dictionary..')
    # Dictionaries are pre-built offline (one per relation slot).
    # Context managers close the pickle files promptly (the original
    # pickle.load(open(...)) leaked the file handles).
    with open('%s/train_dicts.pickle' % args.data_dir, 'rb') as f:
        word_dicts = pickle.load(f)
    with open('%s/train_inv_dicts.pickle' % args.data_dir, 'rb') as f:
        inv_word_dicts = pickle.load(f)
    # Id of the empty-string token in each dict, used as the padding value.
    default_value = []
    for word_dict in word_dicts:
        default_value.append(word_dict[''])
    #logging.info(word_dicts[1])
    #logging.info(inv_word_dicts[1])

    #utils.store_labels_to_pkl(inv_word_dicts)
    #sys.exit(0)
    args.default_value = default_value
    embeddings = utils.gen_embeddings(word_dicts, args.embedding_size)
    train_fn, test_fn, params = build_fn(args, embeddings)
    logging.info('Done.')
    logging.info('-' * 50)
    logging.info(args)

    topk_acc = args.topk_accuracy
    #topk_acc=1

    labels_data = []
    #if args.test_print_allowed:
    #    labels_data=pickle.load(open(labels_file, 'rb'))

    logging.info('-' * 50)
    logging.info('Intial test..')
    # Baseline dev/test accuracy with the freshly built parameters.
    dev_data, dev_mask = utils.vectorize(dev_examples, word_dicts, args)
    all_dev = gen_examples(dev_data, dev_mask, args.batch_size)
    dev_acc = eval_acc(test_fn, all_dev, inv_word_dicts, topk_acc)
    logging.info('Dev accuracy: %s %%' % str(dev_acc))
    test_data, test_mask = utils.vectorize(test_examples, word_dicts, args,
                                           args.test_print_allowed,
                                           labels_data)
    all_test = gen_examples(test_data, test_mask, args.batch_size)
    test_acc = eval_acc(test_fn, all_test, inv_word_dicts, topk_acc,
                        args.test_print_allowed, labels_data)
    logging.info('Test accuracy: %s %%' % str(test_acc))
    best_acc = dev_acc
    if args.test_only:
        return
    utils.save_params(args.model_file, params, epoch=0, n_updates=0)

    #utils.store_labels_to_pkl(inv_word_dicts)
    # Training
    if args.num_epoches > 0:
        logging.info('-' * 50)
        logging.info('Start training..')
        train_data, train_mask = utils.vectorize(train_examples, word_dicts,
                                                 args)
        start_time = time.time()
        n_updates = 0
        all_train_old = gen_examples(train_data, train_mask, args.batch_size)

        # Oversampling: for every minibatch that has no positive example
        # for some relation i (its mask row sums to 0), steal one random
        # positive column for that relation from another batch and append
        # it, so each batch carries at least one positive per relation.
        logging.info("start oversampling")
        all_train = []
        all_train_old2 = np.copy(all_train_old)
        for idx, inps in enumerate(all_train_old):
            #tmp_inp=[]
            new_inps = np.copy(inps)
            for i in range(args.relations):
                if inps[args.relations + i].sum() < 1:
                    np.random.shuffle(all_train_old2)
                    #inps[args.relations + i][0] = 1
                    for idx2, inps2 in enumerate(all_train_old2):
                        if inps2[args.relations + i].sum() >= 1:
                            random_index = utils.get_random_example_index(
                                inps2[args.relations + i])
                            my_col = [
                                inps2[j][random_index]
                                for j in range(2 * args.relations)
                            ]
                            # NOTE(review): the insert position is the current
                            # length of row i, i.e. the column lands at the end
                            # along axis=1 — confirm this matches the intended
                            # batch layout.
                            new_inps = np.insert(new_inps,
                                                 len(new_inps[i]),
                                                 my_col,
                                                 axis=1)
                            #new_inps[j+args.relations]=np.append(inps[j+args.relations], inps2[j+args.relations][random_index])
                            break
            all_train += [new_inps.astype(np.int32)]
        logging.info("done oversampling")
        #sys.exit(0)
        no_progress = 0
    for epoch in range(args.num_epoches):
        np.random.shuffle(all_train)
        for idx, inps in enumerate(all_train):
            train_loss = train_fn(*inps)
            if idx % 1000 == 0:
                #logging.info('#Examples = %d, max_len = %d' % (len(mb_x1), mb_x1.shape[1]))
                logging.info(
                    'Epoch = %d, iter = %d (max = %d), loss = %.2f, elapsed time = %.2f (s)'
                    % (epoch, idx, len(all_train), train_loss,
                       time.time() - start_time))
            n_updates += 1
            if n_updates % args.eval_iter == 0:
                # Periodic evaluation: draw a random train sample (the train
                # accuracy check itself is currently disabled) and score dev.
                samples = sorted(
                    np.random.choice(args.num_train,
                                     min(args.num_train, args.num_dev),
                                     replace=False))
                train_data_sample = [
                    train_data[j][samples] for j in range(args.relations)
                ]
                train_mask_sample = [
                    train_mask[j][samples] for j in range(args.relations)
                ]
                sample_train = gen_examples(train_data_sample,
                                            train_mask_sample, args.batch_size)
                #acc = eval_acc(test_fn, sample_train)
                #logging.info('Train accuracy: %s %%' % str(acc))
                dev_acc = eval_acc(test_fn, all_dev, inv_word_dicts, topk_acc)
                logging.info('Dev accuracy: %s %%' % str(dev_acc))
                #test_acc = eval_acc(test_fn, all_test)
                #logging.info('Test accuracy: %s %%' % str(test_acc))
                if dev_acc > best_acc:
                    # New best: checkpoint and reset the stagnation counter.
                    best_acc = dev_acc
                    logging.info('Best dev accuracy!')
                    utils.save_params(args.model_file,
                                      params,
                                      epoch=epoch,
                                      n_updates=n_updates)
                    no_progress = 0
                else:
                    no_progress += 1
                    logging.info(
                        'Dev accuracy has not improved in the past %d evaluations'
                        % no_progress)
                    if no_progress >= MAX_NO_PROGRESS:
                        logging.info(
                            "Reached the limit of stagnation. Exiting now...")
                        sys.exit(0)
コード例 #11
0
# --- Data loading and vocabulary -------------------------------------------
# NOTE(review): `file_name`, `embedding_size` and `embedding_file` are not
# defined in this snippet — presumably assigned earlier in the file; verify.
val_file_name = '/Users/yangsun/Desktop/dataset/validation_cnn.txt'
model_path = './model_path'

# Load a small (10-example) training slice and build its vocabulary.
documents, questions, answers = utils.load_data(file_name, 10)
word_dict = utils.build_dict(documents + questions)

# Load a 100-example validation slice; its dictionary is built here but not
# referenced anywhere below in this snippet.
documents_val, questions_val, answers_val = utils.load_data(val_file_name, 100)
word_dict_val = utils.build_dict(documents_val + questions_val)

# Candidate labels: every '@entity*' token in the vocabulary plus all gold
# answers, with an extra '<unk_entity>' slot prepended at index 0.
entity_markers = list(set([w for w in word_dict.keys() if w.startswith('@entity')] + answers))


entity_markers = ['<unk_entity>'] + entity_markers
entity_dict = {w: index for (index, w) in enumerate(entity_markers)}
num_labels = len(entity_dict)
# Pre-trained embeddings; the true vocab/embedding sizes come from the matrix.
embeddings = utils.gen_embeddings(word_dict, embedding_size, embedding_file)
vocab_size, embedding_size = embeddings.shape


# --- Graph inputs (TensorFlow 1.x placeholders) ----------------------------
# tf.reset_default_graph()
d_input = tf.placeholder(dtype=tf.int32, shape=(None, None), name="d_input")
q_input = tf.placeholder(dtype=tf.int32, shape=(None, None), name="q_input") # [batch_size, max_seq_length_for_batch]
l_mask = tf.placeholder(dtype=tf.float32, shape=(None, None), name="l_mask") # [batch_size, entity num]


y = tf.placeholder(dtype=tf.int32, shape=None, name="label") # batch size vector
y_1hot= tf.placeholder(dtype=tf.float32, shape=(None, None), name="label_1hot") # onehot encoding of y [batch_size, entitydict]
training = tf.placeholder(dtype=tf.bool)


# Embedding matrix variable initialized from the pre-trained vectors above.
word_embeddings = tf.get_variable("glove", shape=(vocab_size, embedding_size), initializer=tf.constant_initializer(embeddings))
コード例 #12
0
def main(args):
    """Train a TensorFlow reader model on the train/test/dev splits.

    Builds shared word/char dictionaries over all three splits, loads
    embeddings, optionally restores a checkpoint (``args.load``) to
    evaluate and dump dev answers, then trains — checkpointing whenever
    test accuracy improves.
    """
    # NOTE(review): pins the whole graph to GPU 1 — confirm the target
    # machine has a second GPU (soft placement below falls back otherwise).
    with tf.device("/gpu:1"):
        print('-' * 50)
        print('Load data files..')
        if args.debug:
            # Debug mode: cap each split at 100 examples.
            print('*' * 10 + ' Train')
            train_data = utils.load_data(args, 'train', 100)
            print('*' * 10 + ' Test')
            test_data = utils.load_data(args, 'test', 100)
            print('*' * 10 + ' Dev')
            dev_data = utils.load_data(args, 'dev', 100)
        else:
            print('*' * 10 + ' Train')
            train_data = utils.load_data(args, 'train')
            print('*' * 10 + ' Test')
            test_data = utils.load_data(args, 'test')
            print('*' * 10 + ' Dev')
            dev_data = utils.load_data(args, 'dev')

        print('-' * 50)
        print('Build dictionary..')
        # Dictionaries cover all three splits so no split produces OOV ids.
        args.word_dict, args.char_dict = utils.build_dict(train_data.data[0] +
                                                          train_data.data[1] +
                                                          test_data.data[0] +
                                                          test_data.data[1] +
                                                          dev_data.data[0] +
                                                          dev_data.data[1])
        print('-' * 50)
        # Load embedding file
        args.embeddings = utils.gen_embeddings(args.word_dict,
                                               args.embedding_size,
                                               args.embedding_file)
        (args.word_voc_size, args.embedding_size) = args.embeddings.shape
        args.char_voc_size = len(args.char_dict)
        # Only train/test are vectorized up front; dev is vectorized lazily
        # in the args.load branch below.
        train_data.vectorize(args.word_dict, args.char_dict)
        test_data.vectorize(args.word_dict, args.char_dict)

        # Allow TF to fall back to CPU for unplaceable ops and to grow GPU
        # memory on demand instead of grabbing it all at startup.
        config_gpu = tf.ConfigProto(allow_soft_placement=True,
                                    log_device_placement=False)
        config_gpu.gpu_options.allow_growth = True
        with tf.Session(config=config_gpu) as sess:

            model = Model(args)
            trainer = Trainer(args, model)
            evaluator = Evaluator(args, model)

            tf.global_variables_initializer().run()

            # Checkpoints go under a fresh timestamped directory per run.
            timestamp = str(int(time.time()))
            out_dir = os.path.join(args.out_dir, timestamp)
            checkpoint_dir = os.path.join(out_dir, "checkpoints")
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables())

            if args.load:
                # NOTE(review): restores a hard-coded checkpoint path —
                # confirm this run directory exists before using --load.
                cpkl = os.path.join(args.out_dir,
                                    "1506152182/checkpoints/model-17000")
                saver.restore(sess, cpkl)
                test_acc = evaluator.get_evaluation(
                    sess, test_data.gen_minbatches(args.batch_size))
                print('Test accuracy: %.2f %%' % test_acc)

                dev_data.vectorize(args.word_dict, args.char_dict)
                answers = evaluator.get_answers(
                    sess, dev_data.gen_minbatches(args.batch_size))
                gen_answer_file(answers)
                # return

            # Training
            print('-' * 50)
            print('Start training..')
            best_acc = 0
            start_time = time.time()
            last_time = start_time
            n_updates = 0
            batch100_time = 0

            for epoch in range(args.num_epoches):
                for idx, batch in enumerate(
                        train_data.gen_minbatches(args.batch_size,
                                                  shuffle=True)):

                    train_loss, train_op = trainer.step(sess, batch)
                    batch_time = time.time() - last_time
                    if idx % 20 == 0:
                        print(
                            'Epoch = %d, iter = %d, loss = %.2f, batch time = %.2f (s)'
                            % (epoch, idx, train_loss, batch_time))

                    n_updates += 1
                    batch100_time = batch100_time + batch_time
                    # Evalution
                    if n_updates % args.eval_iter == 0:
                        print('time pre 100 batches: %.2f (s)' %
                              (batch100_time))
                        batch100_time = 0
                        # Score a random contiguous train slice the same size
                        # as the test set, then the full test set.
                        start_examples = np.random.randint(
                            0,
                            train_data.num_examples - test_data.num_examples)
                        end_examples = start_examples + test_data.num_examples
                        train_acc = evaluator.get_evaluation(
                            sess,
                            train_data.gen_minbatches(args.batch_size,
                                                      start_examples,
                                                      end_examples))
                        print('Epoch = %d, iter = %d, train_acc = %.2f %%' %
                              (epoch, idx, train_acc))

                        test_acc = evaluator.get_evaluation(
                            sess, test_data.gen_minbatches(args.batch_size))
                        print(
                            'Epoch = %d, iter = %d, test_acc = %.2f %%, Best test accuracy: %.2f %%'
                            % (epoch, idx, test_acc, best_acc))

                        # Model selection is on TEST accuracy here, not dev.
                        if test_acc > best_acc:
                            best_acc = test_acc
                            print(
                                'Best test accuracy: epoch = %d, n_udpates = %d, acc = %.2f %%'
                                % (epoch, n_updates, test_acc))
                            path = saver.save(sess,
                                              checkpoint_prefix,
                                              global_step=n_updates)
                            print(
                                "Saved model checkpoint to {}\n".format(path))
                    last_time = time.time()
コード例 #13
0
def init():
    """Prepare data, dictionaries and embeddings for training/evaluation.

    Returns a tuple ``(embeddings, all_dev, all_train)``; ``all_train``
    is ``None`` when ``config.test_only`` is set.
    """
    base = config.data_path
    config.embedding_file = os.path.join(base, config.embedding_file)
    config.train_file = os.path.join(base, config.train_file)
    config.dev_file = os.path.join(base, config.dev_file)
    config.test_file = os.path.join(base, config.test_file)

    # Embedding dimensionality is inferred from the embedding file itself.
    config.embedding_size = utils.get_dim(config.embedding_file)

    # Route log output to a file when one is configured, stderr otherwise.
    log_kwargs = dict(level=logging.DEBUG,
                      format='%(asctime)s %(message)s', datefmt='%m-%d %H:%M')
    if config.log_file is not None:
        log_kwargs.update(filename=config.log_file, filemode='w')
    logging.basicConfig(**log_kwargs)

    # Load the train/dev splits (truncated in debug mode).
    logging.info('-' * 50)
    logging.info('Load data files..')
    if config.debug:
        logging.info('*' * 10 + ' Train')
        train_set = utils.load_data(config.train_file, 1000)
        logging.info('*' * 10 + ' Dev')
        dev_set = utils.load_data(config.dev_file, 100)
    else:
        logging.info('*' * 10 + ' Train')
        train_set = utils.load_data(config.train_file)
        logging.info('*' * 10 + ' Dev')
        dev_set = utils.load_data(config.dev_file)

    config.num_train = len(train_set[0])
    config.num_dev = len(dev_set[0])

    # Vocabulary over documents + questions; label dictionary over the
    # '@entity*' markers plus all training answers, with an '<unk_entity>'
    # slot prepended at index 0.
    logging.info('-' * 50)
    logging.info('Build dictionary..')
    word_dict = utils.build_dict(train_set[0] + train_set[1])
    seen_markers = [w for w in word_dict.keys() if w.startswith('@entity')]
    markers = ['<unk_entity>'] + list(set(seen_markers + train_set[2]))
    entity_dict = {w: index for (index, w) in enumerate(markers)}
    logging.info('Entity markers: %d' % len(entity_dict))
    config.num_labels = len(entity_dict)

    logging.info('-' * 50)
    logging.info('Load embedding file..')
    emb = utils.gen_embeddings(word_dict, config.embedding_size, config.embedding_file)
    (config.vocab_size, config.embedding_size) = emb.shape

    # Dump every configured flag for the record.
    flags = config.__dict__['__flags']
    flag_str = "\n" + "".join("\t%s:\t%s\n" % (k, flags[k]) for k in flags)
    logging.info(flag_str)

    # Vectorize the dev split.
    # d: document, q: question, a: answer,
    # l: whether the entity label occurs in the document.
    logging.info('-' * 50)
    logging.info('Vectorize test data..')
    dev_d, dev_q, dev_l, dev_a = utils.vectorize(dev_set, word_dict, entity_dict)
    assert len(dev_d) == config.num_dev
    all_dev = utils.gen_examples(dev_d, dev_q, dev_l, dev_a, config.batch_size)

    # Test-only runs never need the vectorized training data.
    if config.test_only:
        return emb, all_dev, None

    # Vectorize the training split.
    logging.info('-' * 50)
    logging.info('Vectorize training data..')
    train_d, train_q, train_l, train_a = utils.vectorize(train_set, word_dict, entity_dict)
    assert len(train_d) == config.num_train
    all_train = utils.gen_examples(train_d, train_q, train_l, train_a, config.batch_size)

    return emb, all_dev, all_train