Code example #1
File: train.py  Project: mana-ysh/pos-tagger
def evaluation(model, data_path, pos2id, vocab, sufvocab, args):
    if args.gpu > -1:
        cuda.get_device(args.gpu).use()
        xp = cuda.cupy
    else:
        xp = np
    model.make_oov_vector(args.gpu > -1)
    out_path = making_data(data_path, args.window)
    n_data = 0
    n_correct = 0
    sum_loss = xp.zeros((), dtype=xp.float32)
    for tags, contexts in line_iter(out_path, args.minibatch, False):
        batch_ts = xp.array([pos2id[tag] for tag in tags], dtype=xp.int32)
        # capitalization features are taken from the original casing,
        # before the optional lowercasing below
        batch_caps = xp.array([[get_capf(word) for word in context] for context in contexts], dtype=xp.int32)
        if args.lowercase:
            contexts = [[word.lower() for word in context] for context in contexts]
        batch_xs = xp.array([[vocab[word] for word in vocab.check_words(context)] for context in contexts], dtype=xp.int32)
        # maybe inefficient...
        batch_sufs = [[word[-2:] for word in context] for context in contexts]
        batch_sufs = xp.array([[sufvocab[suf] for suf in sufvocab.check_words(sufs)] for sufs in batch_sufs], dtype=xp.int32)
        batch_features = [batch_xs, batch_sufs, batch_caps]
        cur_batch_size = batch_ts.shape[0]
        ys, loss = model(batch_features, batch_ts)
        sum_loss += loss.data * cur_batch_size
        pred_labels = ys.data.argmax(1)
        n_correct += int((pred_labels == batch_ts).sum())
        n_data += cur_batch_size
    accuracy = float(n_correct) / n_data  # avoid integer division under Python 2
    return sum_loss, accuracy
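
The snippet above depends on a get_capf helper that is not shown on this page. Below is a minimal sketch of the kind of capitalization feature window-based taggers typically use; the 4-way scheme and the whole function body are assumptions, not the project's actual code:

def get_capf(word):
    # Hypothetical capitalization feature (assumed; the real helper in
    # mana-ysh/pos-tagger is not shown on this page).
    if not word:
        return 3  # treat empty strings as all-lowercase
    if word.isupper():
        return 0  # ALL-CAPS
    if word[0].isupper():
        return 1  # Initial capital
    if any(c.isupper() for c in word):
        return 2  # mixed case
    return 3      # all lowercase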
Code example #2
def test(args):
    vocab = Vocab()
    vocab.load(args.vocab)
    vocab.add_special_token()

    pos2id = Vocab()
    pos2id.load(args.poslist)

    if args.gpu > -1:
        cuda.get_device(args.gpu).use()
        xp = cuda.cupy
    else:
        xp = np

    model = WordnnTagger.load(args.model)

    out_path = making_data(args.test_path, model.window)

    if args.gpu > -1:
        model.to_gpu()
    model.make_oov_vector(args.gpu > -1)

    # start evaluation
    n_data = 0
    n_correct = 0
    sum_loss = xp.zeros((), dtype=xp.float32)
    start = time.time()
    for tags, contexts in line_iter(out_path, args.minibatch, False):
        batch_ts = xp.array([pos2id[tag] for tag in tags], dtype=xp.int32)
        batch_xs = xp.array(
            [[vocab[word] for word in vocab.check_words(context)]
             for context in contexts],
            dtype=xp.int32)
        cur_batch_size = batch_ts.shape[0]
        ys, loss = model(batch_xs, batch_ts)
        sum_loss += loss.data * cur_batch_size
        pred_labels = ys.data.argmax(1)
        n_correct += int((pred_labels == batch_ts).sum())
        n_data += cur_batch_size
    end = time.time()
    accuracy = float(n_correct) / n_data  # avoid integer division under Python 2
    print('test loss : {}'.format(sum_loss))
    print('test accuracy : {}'.format(accuracy))
    print('(time to run : {})'.format(end - start))
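
Vocab and its check_words method are also external to these snippets. Below is a minimal sketch under the assumption that check_words substitutes an <UNK> token for out-of-vocabulary words, so the vocab[word] lookups in the examples cannot fail; the project's real class may differ:

class Vocab(object):
    # Hypothetical reconstruction (assumed; the project's real Vocab is
    # not shown on this page).
    def __init__(self):
        self.w2id = {}

    def load(self, path, lowercase=False):
        # one vocabulary entry per line
        with open(path) as f:
            for line in f:
                word = line.strip()
                if lowercase:
                    word = word.lower()
                self.w2id.setdefault(word, len(self.w2id))

    def add_special_token(self, tokens=('<s>', '<UNK>')):
        for tok in tokens:
            self.w2id.setdefault(tok, len(self.w2id))

    def check_words(self, words):
        # map OOV words to <UNK> so later lookups always succeed
        return [w if w in self.w2id else '<UNK>' for w in words]

    def __getitem__(self, word):
        return self.w2id[word]

    def __len__(self):
        return len(self.w2id)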
Code example #3
File: train.py  Project: mana-ysh/pos-tagger
def train(args):
    if args.gpu > -1:
        cuda.get_device(args.gpu).use()
        xp = cuda.cupy
    else:
        xp = np

    if args.log:
        log_dir = args.log
    else:
        log_dir = os.path.join(os.path.abspath(os.path.dirname(__file__)), '{}_{}'.format(DIR_NAME, datetime.now().strftime('%Y%m%d_%H:%M')))

    if not os.path.exists(log_dir):
        os.mkdir(log_dir)

    # setting for logging
    logger = logging.getLogger()
    logging.basicConfig(level=logging.INFO)
    log_path = os.path.join(log_dir, 'log')
    file_handler = logging.FileHandler(log_path)
    fmt = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    file_handler.setFormatter(fmt)
    logger.addHandler(file_handler)

    logger.info('Arguments...')
    for arg, val in vars(args).items():
        logger.info('{} : {}'.format(arg, val))

    logger.info('Loading Vocab...')
    vocab = Vocab()
    vocab.load(args.vocab, args.lowercase)
    vocab.add_special_token()

    sufvocab = Vocab()
    sufvocab.load(args.sufvocab, args.lowercase)
    sufvocab.add_special_token(['s>', '<UNK>'])

    pos2id = Vocab()
    pos2id.load(args.poslist)

    logger.info('preparation for training data...')
    out_path = making_data(args.train_data, args.window)

    model = WordCSnnTagger(args.wembed, args.fembed, args.hidden, len(vocab), len(sufvocab), len(pos2id), args.window, args.objct, args.alpha)
    model.save_model_config(log_dir)

    if args.gpu > -1:
        model.to_gpu()

    opt = getattr(optimizers, args.opt)()
    opt.setup(model)
    opt.add_hook(optimizer.GradientClipping(args.gclip))
    opt.add_hook(optimizer.WeightDecay(args.wdecay))

    for epoch in range(args.epoch):
        logger.info('START epoch {}/{}'.format(epoch + 1, args.epoch))
        start = time.time()
        sum_loss = xp.zeros((), dtype=xp.float32)
        n_data = 0
        n_correct = 0
        for i, [tags, contexts] in enumerate(line_iter(out_path, args.minibatch)):
            batch_ts = xp.array([pos2id[tag] for tag in tags], dtype=xp.int32)
            # capitalization features are taken from the original casing,
            # before the optional lowercasing below
            batch_caps = xp.array([[get_capf(word) for word in context] for context in contexts], dtype=xp.int32)
            if args.lowercase:
                contexts = [[word.lower() for word in context] for context in contexts]
            batch_xs = xp.array([[vocab[word] for word in context] for context in contexts], dtype=xp.int32)
            batch_sufs = xp.array([[sufvocab[word[-2:]] for word in context] for context in contexts], dtype=xp.int32)
            batch_features = [batch_xs, batch_sufs, batch_caps]
            cur_batch_size = batch_ts.shape[0]
            ys, loss = model(batch_features, batch_ts)
            sum_loss += loss.data * cur_batch_size
            model.zerograds()
            loss.backward()
            opt.update()
            pred_labels = ys.data.argmax(1)
            n_correct += int((pred_labels == batch_ts).sum())
            n_data += cur_batch_size
            logger.info('done {} batches'.format(i + 1))
        logger.info('{} epoch train loss = {}'.format(epoch + 1, sum_loss))
        logger.info('{} epoch train accuracy = {}'.format(epoch + 1, float(n_correct) / n_data))
        logger.info('{} sec for training per epoch'.format(time.time() - start))

        if args.valid_data:
            start = time.time()
            valid_loss, valid_accuracy = evaluation(model, args.valid_data, pos2id, vocab, sufvocab, args)
            logger.info('{} epoch valid loss = {}'.format(epoch + 1, valid_loss))
            logger.info('{} epoch valid accuracy = {}'.format(epoch + 1, valid_accuracy))
            logger.info('{} sec for validation per epoch'.format(time.time() - start))

        if args.test_data:
            start = time.time()
            test_loss, test_accuracy = evaluation(model, args.test_data, pos2id, vocab, sufvocab, args)
            logger.info('{} epoch test loss = {}'.format(epoch + 1, test_loss))
            logger.info('{} epoch test accuracy = {}'.format(epoch + 1, test_accuracy))
            logger.info('{} sec for testing per epoch'.format(time.time() - start))

        logger.info('serializing...')
        prefix = '{}_{}ep_{}wembed_{}fembed_{}hidden_{}window_{}minibatch_{}opt'.format(DIR_NAME, epoch + 1, args.wembed, args.fembed, args.hidden, args.window, args.minibatch, args.opt)
        model_path = os.path.join(log_dir, prefix + '.model')
        model.save(model_path)

    logger.info('done training')
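
None of the examples show how train and test are invoked. Below is a hypothetical argparse driver; the flag names mirror the args.* attributes that train() and test() actually read, but the defaults and the --mode switch are assumptions:

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='window-based neural POS tagger')
    parser.add_argument('--mode', choices=['train', 'test'], default='train')
    parser.add_argument('--gpu', type=int, default=-1)       # -1 = CPU
    parser.add_argument('--vocab', required=True)
    parser.add_argument('--sufvocab')
    parser.add_argument('--poslist', required=True)
    parser.add_argument('--train_data')
    parser.add_argument('--valid_data')
    parser.add_argument('--test_data')
    parser.add_argument('--test_path')                        # used by test()
    parser.add_argument('--model')                            # used by test()
    parser.add_argument('--log')
    parser.add_argument('--window', type=int, default=2)
    parser.add_argument('--minibatch', type=int, default=32)
    parser.add_argument('--wembed', type=int, default=100)
    parser.add_argument('--fembed', type=int, default=10)
    parser.add_argument('--hidden', type=int, default=300)
    parser.add_argument('--epoch', type=int, default=10)
    parser.add_argument('--opt', default='Adam')              # any chainer.optimizers class
    parser.add_argument('--gclip', type=float, default=5.0)
    parser.add_argument('--wdecay', type=float, default=1e-4)
    parser.add_argument('--objct')
    parser.add_argument('--alpha', type=float, default=0.0)
    parser.add_argument('--lowercase', action='store_true')
    args = parser.parse_args()
    if args.mode == 'train':
        train(args)
    else:
        test(args)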