Example 1
def main(unused_argv):
    vocab = dataset.Vocab(FLAGS.vocab_path, 200000)
    # Check for presence of required special tokens.
    assert vocab.tokenToId(dataset.PAD_TOKEN) > 0
    assert vocab.tokenToId(dataset.UNKNOWN_TOKEN) > 0
    assert vocab.tokenToId(dataset.SENTENCE_START) > 0
    assert vocab.tokenToId(dataset.SENTENCE_END) > 0
    assert vocab.tokenToId(dataset.WORD_BEGIN) > 0
    assert vocab.tokenToId(dataset.WORD_CONTINUE) > 0
    assert vocab.tokenToId(dataset.WORD_END) > 0

    params = selector.parameters(
        mode=FLAGS.mode,  # train, eval, decode
        min_lr=0.01,  # min learning rate.
        lr=0.1,  # learning rate
        batch_size=1,
        c_timesteps=600,  # context length
        q_timesteps=30,  # question length
        min_input_len=2,  # discard examples whose context or question is shorter than this many words
        hidden_size=200,  # for rnn cell and embedding
        emb_size=200,  # If 0, don't use embedding
        max_decode_steps=4,
        maxout_size=32,
        max_grad_norm=2)

    batcher = batch_reader.Generator(FLAGS.data_path,
                                     vocab,
                                     params,
                                     FLAGS.context_key,
                                     FLAGS.question_key,
                                     FLAGS.answer_key,
                                     FLAGS.max_context_sentences,
                                     FLAGS.max_question_sentences,
                                     bucketing=FLAGS.use_bucketing,
                                     truncate_input=FLAGS.truncate_input)

    tf.set_random_seed(FLAGS.random_seed)

    # The model is built identically in all three modes, so construct it once.
    model = selector.Model(params,
                           len(vocab),
                           num_cpus=FLAGS.num_cpus,
                           num_gpus=FLAGS.num_gpus)
    if params.mode == 'train':
        _train(model, batcher)
    elif params.mode == 'eval':
        _eval(model, batcher)
    elif params.mode == 'decode':
        machine = decoder.Decoder(model, batcher, params, vocab)
        machine.loop()
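
The `_train` and `_eval` helpers invoked above are not part of this snippet. Below is a minimal sketch of what `_train` could look like as a TF 1.x Supervisor loop; `model.build_graph()`, `model.run_train_step()`, `batcher.next_batch()`, and `FLAGS.log_root` are hypothetical names assumed for illustration, not taken from the original code.

def _train(model, batcher):
    # Sketch only: build_graph / run_train_step / next_batch / FLAGS.log_root
    # are assumed interfaces that do not appear in the example above.
    model.build_graph()
    saver = tf.train.Saver()
    sv = tf.train.Supervisor(logdir=FLAGS.log_root,
                             saver=saver,
                             save_model_secs=600)
    with sv.managed_session() as sess:
        while not sv.should_stop():
            batch = batcher.next_batch()
            loss = model.run_train_step(sess, batch)
            tf.logging.info('train loss: %f', loss)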
Example 2
def main():
    args = parse_args()
    model_dir = args.model_dir
    """LOAD CONFIG FILE"""
    config_files = glob.glob(os.path.join(model_dir, '*.ini'))
    assert len(config_files) == 1, 'Put only one config file in the directory'
    config_file = config_files[0]
    config = configparser.ConfigParser()
    config.read(config_file)
    """LOGGER"""
    logger = getLogger(__name__)
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter('[%(asctime)s] %(message)s')

    sh = logging.StreamHandler()
    sh.setLevel(logging.INFO)
    sh.setFormatter(formatter)
    logger.addHandler(sh)

    log_file = os.path.join(model_dir, 'log.txt')
    fh = logging.FileHandler(log_file)
    fh.setLevel(logging.INFO)
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    logger.info('Training start')
    """PARAMATER"""
    embed_size = int(config['Parameter']['embed_size'])
    hidden_size = int(config['Parameter']['hidden_size'])
    dropout = float(config['Parameter']['dropout'])
    weightdecay = float(config['Parameter']['weightdecay'])
    gradclip = float(config['Parameter']['gradclip'])
    vocab_size = int(config['Parameter']['vocab_size'])
    label_size = int(config['Parameter']['label_size'])
    """TRINING DETAIL"""
    gpu_id = args.gpu
    n_epoch = args.epoch
    batch_size = args.batch
    interval = args.interval
    """DATASET"""
    train_data_file = config['Dataset']['train_data_file']
    valid_data_file = config['Dataset']['valid_data_file']
    test_data_file = config['Dataset']['test_data_file']

    train_text, train_label = dataset.load(train_data_file)
    valid_text, valid_label = dataset.load(valid_data_file)
    test_text, test_label = dataset.load(test_data_file)

    logger.info('train size: {}, valid size: {}, test size: {}'.format(
        len(train_text), len(valid_text), len(test_text)))

    vocab = dataset.Vocab()
    vocab.build(train_text, vocab_size)
    dataset.save_pickle(os.path.join(model_dir, 'vocab.pkl'), vocab.vocab)
    train_vocab_size = len(vocab.vocab)

    label_dic = dataset.Label()
    label_dic.build(train_label, label_size)
    dataset.save_pickle(os.path.join(model_dir, 'label.pkl'), label_dic.dic)
    train_label_size = len(label_dic.dic)

    logger.info('vocab size: {}, label size: {}'.format(
        train_vocab_size, train_label_size))

    train_iter = dataset.Iterator(train_text,
                                  train_label,
                                  vocab,
                                  label_dic,
                                  batch_size,
                                  sort=True,
                                  shuffle=True)
    valid_iter = dataset.Iterator(valid_text,
                                  valid_label,
                                  vocab,
                                  label_dic,
                                  batch_size,
                                  sort=False,
                                  shuffle=False)
    test_iter = dataset.Iterator(test_text,
                                 test_label,
                                 vocab,
                                 label_dic,
                                 batch_size,
                                 sort=False,
                                 shuffle=False)
    """MODEL"""
    model = Tweet2vec(train_vocab_size, embed_size, hidden_size, dropout,
                      train_label_size)
    """OPTIMIZER"""
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(gradclip))
    optimizer.add_hook(chainer.optimizer.WeightDecay(weightdecay))
    """GPU"""
    if gpu_id >= 0:
        logger.info('Use GPU')
        chainer.cuda.get_device_from_id(gpu_id).use()
        model.to_gpu()
    """TRAIN"""
    sum_loss = 0
    loss_dic = {}
    for epoch in range(1, n_epoch + 1):
        for i, batch in enumerate(train_iter(), start=1):
            batch = dataset.converter(batch, gpu_id)
            loss = optimizer.target(*batch)
            sum_loss += loss.data
            optimizer.target.cleargrads()
            loss.backward()
            optimizer.update()

            if i % interval == 0:
                # report the loss accumulated over the last `interval` iterations
                logger.info('E{} ## iteration:{}, loss:{}'.format(
                    epoch, i, sum_loss))
                sum_loss = 0
        chainer.serializers.save_npz(
            os.path.join(model_dir, 'model_epoch{}.npz'.format(epoch)), model)
        # chainer.serializers.save_npz(model_dir + 'optimizer_epoch{0}.npz'.format(epoch), optimizer)
        """EVALUATE"""
        valid_loss = 0
        for batch in valid_iter():
            batch = dataset.converter(batch, gpu_id)
            with chainer.no_backprop_mode(), chainer.using_config(
                    'train', False):
                valid_loss += optimizer.target(*batch).data
        logger.info('E{} ## val loss:{}'.format(epoch, valid_loss))
        loss_dic[epoch] = valid_loss
        """TEST"""
        correct = 0
        total = 0
        for batch in test_iter():
            batch = dataset.converter(batch, gpu_id)
            proj_label = model.predict(*batch)
            for proj, gold in zip(proj_label, batch[1]):
                if proj == gold:
                    correct += 1
                total += 1
        logger.info('E{} ## test accuracy:{}'.format(epoch, correct / total))
    """MODEL SAVE"""
    best_epoch = min(loss_dic, key=loss_dic.get)
    logger.info('best_epoch:{}'.format(best_epoch))
    # Reload the best epoch's weights first: saving `model` directly would
    # store the last epoch, not the one with the lowest validation loss.
    chainer.serializers.load_npz(
        os.path.join(model_dir, 'model_epoch{}.npz'.format(best_epoch)), model)
    chainer.serializers.save_npz(os.path.join(model_dir, 'best_model.npz'),
                                 model)
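
`Tweet2vec` itself is defined elsewhere. A minimal Chainer sketch of the interface this loop relies on (a `__call__` that returns the loss and a `predict` that returns label ids) is shown below; the embed → bi-GRU → linear layout is an assumption, not the original architecture.

import chainer
import chainer.functions as F
import chainer.links as L


class Tweet2vec(chainer.Chain):
    """Sketch only: the layer choices are assumptions, not the original model."""

    def __init__(self, vocab_size, embed_size, hidden_size, dropout, label_size):
        super(Tweet2vec, self).__init__()
        self.dropout = dropout
        with self.init_scope():
            self.embed = L.EmbedID(vocab_size, embed_size)
            self.encoder = L.NStepBiGRU(1, embed_size, hidden_size, dropout)
            self.out = L.Linear(2 * hidden_size, label_size)

    def encode(self, xs):
        # xs: list of int32 id arrays, one per tweet
        exs = [self.embed(x) for x in xs]
        hy, _ = self.encoder(None, exs)               # hy: (2, batch, hidden)
        h = F.concat(F.separate(hy, axis=0), axis=1)  # (batch, 2 * hidden)
        return self.out(F.dropout(h, self.dropout))

    def __call__(self, xs, ts):
        return F.softmax_cross_entropy(self.encode(xs), self.xp.asarray(ts))

    def predict(self, xs, ts=None):
        with chainer.no_backprop_mode(), chainer.using_config('train', False):
            y = self.encode(xs)
        return chainer.cuda.to_cpu(self.xp.argmax(y.data, axis=1)).tolist()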
Example 3
def main():
    args = parse_args()
    model_dir = args.model_dir
    """LOAD CONFIG FILE"""
    config_files = glob.glob(os.path.join(model_dir, '*.ini'))
    assert len(config_files) == 1, 'Put only one config file in the directory'
    config_file = config_files[0]
    config = configparser.ConfigParser()
    config.read(config_file)
    """LOGGER"""
    logger = getLogger(__name__)
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter('[%(asctime)s] %(message)s')

    sh = logging.StreamHandler()
    sh.setLevel(logging.INFO)
    sh.setFormatter(formatter)
    logger.addHandler(sh)

    log_file = os.path.join(model_dir, 'log.txt')
    fh = logging.FileHandler(log_file)
    fh.setLevel(logging.INFO)
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    logger.info('Training start')
    """PARAMATER"""
    embed_size = int(config['Parameter']['embed_size'])
    hidden_size = int(config['Parameter']['hidden_size'])
    dropout = float(config['Parameter']['dropout'])
    """TRINING DETAIL"""
    gpu_id = args.gpu
    batch_size = args.batch
    interval = args.interval
    model_file = args.model
    """DATASET"""
    predict_data_file = config['Dataset']['predict_data_file']
    text, label = dataset.load_none_label(predict_data_file)

    vocab = dataset.Vocab()
    vocab.load(os.path.join(model_dir, 'vocab.pkl'))
    vocab_size = len(vocab.vocab)

    label_dic = dataset.Label()
    label_dic.load(os.path.join(model_dir, 'label.pkl'))
    label_size = len(label_dic.dic)

    # named pred_iter rather than iter, to avoid shadowing the builtin
    pred_iter = dataset.Iterator(text,
                                 label,
                                 vocab,
                                 label_dic,
                                 batch_size,
                                 sort=False,
                                 shuffle=False)
    """MODEL"""
    model = Tweet2vec(vocab_size, embed_size, hidden_size, dropout, label_size)
    chainer.serializers.load_npz(model_file, model)
    """GPU"""
    if gpu_id >= 0:
        logger.info('Use GPU')
        chainer.cuda.get_device_from_id(gpu_id).use()
        model.to_gpu()
    """PREDICT"""
    print(text)
    res = []
    for batch in iter():
        batch = dataset.converter(batch, gpu_id)
        proj_label = model.predict(*batch)
        print(proj_label)
        for p in proj_label:
            res.append(label_dic.reverse_dic[p])
    with open(predict_data_file + '.res', 'w') as f:
        for r, t in zip(res, text):
            f.write('{}\t{}\n'.format(r, t))
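
`dataset.converter` is project-specific and not shown in these examples. A plausible minimal version is sketched below, assuming each batch element is a `(token_ids, label_id)` pair and that texts stay as variable-length int32 arrays (an assumption; the real converter may instead pad to a fixed length).

import numpy as np
import chainer


def converter(batch, gpu_id):
    # Sketch only: assumes each element of `batch` is (token_ids, label_id).
    # Texts become a list of per-example arrays; labels a single array.
    # gpu_id < 0 keeps everything on the CPU.
    xs = [chainer.dataset.to_device(gpu_id, np.asarray(x, dtype=np.int32))
          for x, _ in batch]
    ts = chainer.dataset.to_device(
        gpu_id, np.asarray([t for _, t in batch], dtype=np.int32))
    return xs, ts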