Example #1
def main():
    parser = argparse.ArgumentParser(
        description='Tuning with bi-directional RNN-CNN')
    parser.add_argument('--mode',
                        choices=['RNN', 'LSTM', 'GRU'],
                        help='architecture of rnn',
                        required=True)
    parser.add_argument('--num_epochs',
                        type=int,
                        default=100,
                        help='Number of training epochs')
    parser.add_argument('--batch_size',
                        type=int,
                        default=16,
                        help='Number of sentences in each batch')
    parser.add_argument('--hidden_size',
                        type=int,
                        default=128,
                        help='Number of hidden units in RNN')
    parser.add_argument('--tag_space',
                        type=int,
                        default=0,
                        help='Dimension of tag space')
    parser.add_argument('--num_layers',
                        type=int,
                        default=1,
                        help='Number of layers of RNN')
    parser.add_argument('--num_filters',
                        type=int,
                        default=30,
                        help='Number of filters in CNN')
    parser.add_argument('--char_dim',
                        type=int,
                        default=30,
                        help='Dimension of Character embeddings')
    parser.add_argument('--learning_rate',
                        type=float,
                        default=0.1,
                        help='Learning rate')
    parser.add_argument('--decay_rate',
                        type=float,
                        default=0.1,
                        help='Decay rate of learning rate')
    parser.add_argument('--gamma',
                        type=float,
                        default=0.0,
                        help='weight for regularization')
    parser.add_argument('--dropout',
                        choices=['std', 'variational'],
                        help='type of dropout',
                        required=True)
    parser.add_argument('--p_rnn',
                        nargs=2,
                        type=float,
                        required=True,
                        help='dropout rate for RNN')
    parser.add_argument('--p_in',
                        type=float,
                        default=0.33,
                        help='dropout rate for input embeddings')
    parser.add_argument('--p_out',
                        type=float,
                        default=0.33,
                        help='dropout rate for output layer')
    parser.add_argument('--schedule',
                        type=int,
                        help='schedule for learning rate decay')
    parser.add_argument('--unk_replace',
                        type=float,
                        default=0.,
                        help='The rate to replace a singleton word with UNK')
    parser.add_argument('--embedding',
                        choices=['glove', 'senna', 'sskip', 'polyglot'],
                        help='Embedding for words',
                        required=True)
    parser.add_argument('--embedding_dict', help='path for embedding dict')
    parser.add_argument(
        '--train')  # "data/POS-penn/wsj/split1/wsj1.train.original"
    parser.add_argument(
        '--dev')  # "data/POS-penn/wsj/split1/wsj1.dev.original"
    parser.add_argument(
        '--test')  # "data/POS-penn/wsj/split1/wsj1.test.original"

    args = parser.parse_args()

    logger = get_logger("NER")

    mode = args.mode
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    num_epochs = args.num_epochs
    batch_size = args.batch_size
    hidden_size = args.hidden_size
    num_filters = args.num_filters
    learning_rate = args.learning_rate
    momentum = 0.9
    decay_rate = args.decay_rate
    gamma = args.gamma
    schedule = args.schedule
    p_rnn = tuple(args.p_rnn)
    p_in = args.p_in
    p_out = args.p_out
    unk_replace = args.unk_replace
    embedding = args.embedding
    embedding_path = args.embedding_dict

    embedd_dict, embedd_dim = utils.load_embedding_dict(
        embedding, embedding_path)

    logger.info("Creating Alphabets")
    word_alphabet, char_alphabet, pos_alphabet, \
    chunk_alphabet, ner_alphabet = conll03_data.create_alphabets("data/alphabets/ner/", train_path, data_paths=[dev_path, test_path],
                                                                 embedd_dict=embedd_dict, max_vocabulary_size=50000)

    logger.info("Word Alphabet Size: %d" % word_alphabet.size())
    logger.info("Character Alphabet Size: %d" % char_alphabet.size())
    logger.info("POS Alphabet Size: %d" % pos_alphabet.size())
    logger.info("Chunk Alphabet Size: %d" % chunk_alphabet.size())
    logger.info("NER Alphabet Size: %d" % ner_alphabet.size())

    logger.info("Reading Data")
    use_gpu = torch.cuda.is_available()

    data_train = conll03_data.read_data_to_variable(train_path,
                                                    word_alphabet,
                                                    char_alphabet,
                                                    pos_alphabet,
                                                    chunk_alphabet,
                                                    ner_alphabet,
                                                    use_gpu=use_gpu)
    num_data = sum(data_train[1])
    num_labels = ner_alphabet.size()

    data_dev = conll03_data.read_data_to_variable(dev_path,
                                                  word_alphabet,
                                                  char_alphabet,
                                                  pos_alphabet,
                                                  chunk_alphabet,
                                                  ner_alphabet,
                                                  use_gpu=use_gpu,
                                                  volatile=True)
    data_test = conll03_data.read_data_to_variable(test_path,
                                                   word_alphabet,
                                                   char_alphabet,
                                                   pos_alphabet,
                                                   chunk_alphabet,
                                                   ner_alphabet,
                                                   use_gpu=use_gpu,
                                                   volatile=True)

    writer = CoNLL03Writer(word_alphabet, char_alphabet, pos_alphabet,
                           chunk_alphabet, ner_alphabet)

    def construct_word_embedding_table():
        scale = np.sqrt(3.0 / embedd_dim)
        table = np.empty([word_alphabet.size(), embedd_dim], dtype=np.float32)
        table[conll03_data.UNK_ID, :] = np.random.uniform(
            -scale, scale, [1, embedd_dim]).astype(np.float32)
        oov = 0
        for word, index in list(word_alphabet.items()):
            if word in embedd_dict:
                embedding = embedd_dict[word]
            elif word.lower() in embedd_dict:
                embedding = embedd_dict[word.lower()]
            else:
                embedding = np.random.uniform(
                    -scale, scale, [1, embedd_dim]).astype(np.float32)
                oov += 1
            table[index, :] = embedding
        print('oov: %d' % oov)
        return torch.from_numpy(table)

    word_table = construct_word_embedding_table()
    logger.info("constructing network...")

    char_dim = args.char_dim
    window = 3
    num_layers = args.num_layers
    tag_space = args.tag_space
    initializer = nn.init.xavier_uniform
    if args.dropout == 'std':
        network = BiRecurrentConv(embedd_dim,
                                  word_alphabet.size(),
                                  char_dim,
                                  char_alphabet.size(),
                                  num_filters,
                                  window,
                                  mode,
                                  hidden_size,
                                  num_layers,
                                  num_labels,
                                  tag_space=tag_space,
                                  embedd_word=word_table,
                                  p_in=p_in,
                                  p_out=p_out,
                                  p_rnn=p_rnn,
                                  initializer=initializer)
    else:
        network = BiVarRecurrentConv(embedd_dim,
                                     word_alphabet.size(),
                                     char_dim,
                                     char_alphabet.size(),
                                     num_filters,
                                     window,
                                     mode,
                                     hidden_size,
                                     num_layers,
                                     num_labels,
                                     tag_space=tag_space,
                                     embedd_word=word_table,
                                     p_in=p_in,
                                     p_out=p_out,
                                     p_rnn=p_rnn,
                                     initializer=initializer)
    if use_gpu:
        network.cuda()

    lr = learning_rate
    # optim = Adam(network.parameters(), lr=lr, betas=(0.9, 0.9), weight_decay=gamma)
    optim = SGD(network.parameters(),
                lr=lr,
                momentum=momentum,
                weight_decay=gamma,
                nesterov=True)
    logger.info(
        "Network: %s, num_layer=%d, hidden=%d, filter=%d, tag_space=%d" %
        (mode, num_layers, hidden_size, num_filters, tag_space))
    logger.info(
        "training: l2: %f, (#training data: %d, batch: %d, unk replace: %.2f)"
        % (gamma, num_data, batch_size, unk_replace))
    logger.info("dropout(in, out, rnn): (%.2f, %.2f, %s)" %
                (p_in, p_out, p_rnn))

    num_batches = num_data // batch_size + 1
    dev_f1 = 0.0
    dev_acc = 0.0
    dev_precision = 0.0
    dev_recall = 0.0
    test_f1 = 0.0
    test_acc = 0.0
    test_precision = 0.0
    test_recall = 0.0
    best_epoch = 0
    for epoch in range(1, num_epochs + 1):
        print(
            'Epoch %d (%s(%s), learning rate=%.4f, decay rate=%.4f (schedule=%d)): '
            % (epoch, mode, args.dropout, lr, decay_rate, schedule))
        train_err = 0.
        train_corr = 0.
        train_total = 0.

        start_time = time.time()
        num_back = 0
        network.train()
        for batch in range(1, num_batches + 1):
            word, char, _, _, labels, masks, lengths = conll03_data.get_batch_variable(
                data_train, batch_size, unk_replace=unk_replace)

            optim.zero_grad()
            loss, corr, _ = network.loss(
                word,
                char,
                labels,
                mask=masks,
                length=lengths,
                leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS)
            loss.backward()
            optim.step()

            num_tokens = masks.data.sum()
            train_err += loss.data[0] * num_tokens
            train_corr += corr.data[0]
            train_total += num_tokens

            time_ave = (time.time() - start_time) / batch
            time_left = (num_batches - batch) * time_ave

            # update log
            if batch % 100 == 0:
                sys.stdout.write("\b" * num_back)
                sys.stdout.write(" " * num_back)
                sys.stdout.write("\b" * num_back)
                log_info = 'train: %d/%d loss: %.4f, acc: %.2f%%, time left (estimated): %.2fs' % (
                    batch, num_batches, train_err / train_total,
                    train_corr * 100 / train_total, time_left)
                sys.stdout.write(log_info)
                sys.stdout.flush()
                num_back = len(log_info)

        sys.stdout.write("\b" * num_back)
        sys.stdout.write(" " * num_back)
        sys.stdout.write("\b" * num_back)
        print('train: %d loss: %.4f, acc: %.2f%%, time: %.2fs' %
              (num_batches, train_err / train_total,
               train_corr * 100 / train_total, time.time() - start_time))

        # evaluate performance on dev data
        network.eval()
        tmp_filename = 'tmp/%s_dev%d' % (str(uid), epoch)
        writer.start(tmp_filename)

        for batch in conll03_data.iterate_batch_variable(data_dev, batch_size):
            word, char, pos, chunk, labels, masks, lengths = batch
            _, _, preds = network.loss(
                word,
                char,
                labels,
                mask=masks,
                length=lengths,
                leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS)
            writer.write(word.data.cpu().numpy(),
                         pos.data.cpu().numpy(),
                         chunk.data.cpu().numpy(),
                         preds.data.cpu().numpy(),
                         labels.data.cpu().numpy(),
                         lengths.cpu().numpy())
        writer.close()
        acc, precision, recall, f1 = evaluate(tmp_filename)
        print(
            'dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%' %
            (acc, precision, recall, f1))

        if dev_f1 < f1:
            dev_f1 = f1
            dev_acc = acc
            dev_precision = precision
            dev_recall = recall
            best_epoch = epoch

            # evaluate on test data when better performance detected
            tmp_filename = 'tmp/%s_test%d' % (str(uid), epoch)
            writer.start(tmp_filename)

            for batch in conll03_data.iterate_batch_variable(
                    data_test, batch_size):
                word, char, pos, chunk, labels, masks, lengths = batch
                _, _, preds = network.loss(
                    word,
                    char,
                    labels,
                    mask=masks,
                    length=lengths,
                    leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS)
                writer.write(word.data.cpu().numpy(),
                             pos.data.cpu().numpy(),
                             chunk.data.cpu().numpy(),
                             preds.data.cpu().numpy(),
                             labels.data.cpu().numpy(),
                             lengths.cpu().numpy())
            writer.close()
            test_acc, test_precision, test_recall, test_f1 = evaluate(
                tmp_filename)

        print(
            "best dev  acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)"
            % (dev_acc, dev_precision, dev_recall, dev_f1, best_epoch))
        print(
            "best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)"
            % (test_acc, test_precision, test_recall, test_f1, best_epoch))

        if epoch % schedule == 0:
            lr = learning_rate / (1.0 + epoch * decay_rate)
            optim = SGD(network.parameters(),
                        lr=lr,
                        momentum=momentum,
                        weight_decay=gamma,
                        nesterov=True)
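
A minimal sketch of how this trainer might be launched, assuming it lives in a file such as train_ner.py; the script name and every data/embedding path below are placeholders, and only the flag names and choices come from the parser defined above.

# Hypothetical invocation; the script name and all paths are assumptions.
import subprocess

subprocess.run([
    "python", "train_ner.py",
    "--mode", "LSTM",                  # required: RNN, LSTM, or GRU
    "--dropout", "std",                # required: std or variational
    "--p_rnn", "0.33", "0.5",          # required: two RNN dropout rates
    "--embedding", "glove",            # required: word embedding type
    "--embedding_dict", "embeddings/glove.txt.gz",
    "--schedule", "5",                 # the `epoch % schedule` check above needs this
    "--train", "data/ner/train.conll",
    "--dev", "data/ner/dev.conll",
    "--test", "data/ner/test.conll",
], check=True)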
Example #2
def main():
    parser = argparse.ArgumentParser(
        description='NER with bi-directional RNN-CNN')
    parser.add_argument('--config',
                        type=str,
                        help='config file',
                        required=True)
    parser.add_argument('--num_epochs',
                        type=int,
                        default=100,
                        help='Number of training epochs')
    parser.add_argument('--batch_size',
                        type=int,
                        default=16,
                        help='Number of sentences in each batch')
    parser.add_argument('--loss_type',
                        choices=['sentence', 'token'],
                        default='sentence',
                        help='loss type (default: sentence)')
    parser.add_argument('--optim',
                        choices=['sgd', 'adam'],
                        help='type of optimizer',
                        required=True)
    parser.add_argument('--learning_rate',
                        type=float,
                        default=0.1,
                        help='Learning rate')
    parser.add_argument('--lr_decay',
                        type=float,
                        default=0.999995,
                        help='Decay rate of learning rate')
    parser.add_argument('--amsgrad', action='store_true', help='AMS Grad')
    parser.add_argument('--grad_clip',
                        type=float,
                        default=0,
                        help='max norm for gradient clip (default 0: no clip)')
    parser.add_argument('--warmup_steps',
                        type=int,
                        default=0,
                        metavar='N',
                        help='number of steps to warm up (default: 0)')
    parser.add_argument('--weight_decay',
                        type=float,
                        default=0.0,
                        help='weight for l2 norm decay')
    parser.add_argument('--unk_replace',
                        type=float,
                        default=0.,
                        help='The rate to replace a singleton word with UNK')
    parser.add_argument('--embedding',
                        choices=['glove', 'senna', 'sskip', 'polyglot'],
                        help='Embedding for words',
                        required=True)
    parser.add_argument('--embedding_dict', help='path for embedding dict')
    parser.add_argument('--train',
                        help='path for training file.',
                        required=True)
    parser.add_argument('--dev', help='path for dev file.', required=True)
    parser.add_argument('--test', help='path for test file.', required=True)
    parser.add_argument('--model_path',
                        help='path for saving model file.',
                        required=True)

    args = parser.parse_args()

    logger = get_logger("NER")

    args.cuda = torch.cuda.is_available()
    device = torch.device('cuda', 0) if args.cuda else torch.device('cpu')
    train_path = args.train
    dev_path = args.dev
    test_path = args.test

    num_epochs = args.num_epochs
    batch_size = args.batch_size
    optim = args.optim
    learning_rate = args.learning_rate
    lr_decay = args.lr_decay
    amsgrad = args.amsgrad
    warmup_steps = args.warmup_steps
    weight_decay = args.weight_decay
    grad_clip = args.grad_clip

    loss_ty_token = args.loss_type == 'token'
    unk_replace = args.unk_replace

    model_path = args.model_path
    model_name = os.path.join(model_path, 'model.pt')
    embedding = args.embedding
    embedding_path = args.embedding_dict

    print(args)

    embedd_dict, embedd_dim = utils.load_embedding_dict(
        embedding, embedding_path)

    logger.info("Creating Alphabets")
    alphabet_path = os.path.join(model_path, 'alphabets')
    word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet = conll03_data.create_alphabets(
        alphabet_path,
        train_path,
        data_paths=[dev_path, test_path],
        embedd_dict=embedd_dict,
        max_vocabulary_size=50000)

    logger.info("Word Alphabet Size: %d" % word_alphabet.size())
    logger.info("Character Alphabet Size: %d" % char_alphabet.size())
    logger.info("POS Alphabet Size: %d" % pos_alphabet.size())
    logger.info("Chunk Alphabet Size: %d" % chunk_alphabet.size())
    logger.info("NER Alphabet Size: %d" % ner_alphabet.size())

    logger.info("Reading Data")

    data_train = conll03_data.read_bucketed_data(train_path, word_alphabet,
                                                 char_alphabet, pos_alphabet,
                                                 chunk_alphabet, ner_alphabet)
    num_data = sum(data_train[1])
    num_labels = ner_alphabet.size()

    data_dev = conll03_data.read_data(dev_path, word_alphabet, char_alphabet,
                                      pos_alphabet, chunk_alphabet,
                                      ner_alphabet)
    data_test = conll03_data.read_data(test_path, word_alphabet, char_alphabet,
                                       pos_alphabet, chunk_alphabet,
                                       ner_alphabet)

    writer = CoNLL03Writer(word_alphabet, char_alphabet, pos_alphabet,
                           chunk_alphabet, ner_alphabet)

    def construct_word_embedding_table():
        scale = np.sqrt(3.0 / embedd_dim)
        table = np.empty([word_alphabet.size(), embedd_dim], dtype=np.float32)
        table[conll03_data.UNK_ID, :] = np.random.uniform(
            -scale, scale, [1, embedd_dim]).astype(np.float32)
        oov = 0
        for word, index in word_alphabet.items():
            if word in embedd_dict:
                embedding = embedd_dict[word]
            elif word.lower() in embedd_dict:
                embedding = embedd_dict[word.lower()]
            else:
                embedding = np.random.uniform(
                    -scale, scale, [1, embedd_dim]).astype(np.float32)
                oov += 1
            table[index, :] = embedding
        print('oov: %d' % oov)
        return torch.from_numpy(table)

    word_table = construct_word_embedding_table()

    logger.info("constructing network...")

    hyps = json.load(open(args.config, 'r'))
    json.dump(hyps,
              open(os.path.join(model_path, 'config.json'), 'w'),
              indent=2)
    dropout = hyps['dropout']
    crf = hyps['crf']
    bigram = hyps['bigram']
    assert embedd_dim == hyps['embedd_dim']
    char_dim = hyps['char_dim']
    mode = hyps['rnn_mode']
    hidden_size = hyps['hidden_size']
    out_features = hyps['out_features']
    num_layers = hyps['num_layers']
    p_in = hyps['p_in']
    p_out = hyps['p_out']
    p_rnn = hyps['p_rnn']
    activation = hyps['activation']

    if dropout == 'std':
        if crf:
            network = BiRecurrentConvCRF(embedd_dim,
                                         word_alphabet.size(),
                                         char_dim,
                                         char_alphabet.size(),
                                         mode,
                                         hidden_size,
                                         out_features,
                                         num_layers,
                                         num_labels,
                                         embedd_word=word_table,
                                         p_in=p_in,
                                         p_out=p_out,
                                         p_rnn=p_rnn,
                                         bigram=bigram,
                                         activation=activation)
        else:
            network = BiRecurrentConv(embedd_dim,
                                      word_alphabet.size(),
                                      char_dim,
                                      char_alphabet.size(),
                                      mode,
                                      hidden_size,
                                      out_features,
                                      num_layers,
                                      num_labels,
                                      embedd_word=word_table,
                                      p_in=p_in,
                                      p_out=p_out,
                                      p_rnn=p_rnn,
                                      activation=activation)
    elif dropout == 'variational':
        if crf:
            network = BiVarRecurrentConvCRF(embedd_dim,
                                            word_alphabet.size(),
                                            char_dim,
                                            char_alphabet.size(),
                                            mode,
                                            hidden_size,
                                            out_features,
                                            num_layers,
                                            num_labels,
                                            embedd_word=word_table,
                                            p_in=p_in,
                                            p_out=p_out,
                                            p_rnn=p_rnn,
                                            bigram=bigram,
                                            activation=activation)
        else:
            network = BiVarRecurrentConv(embedd_dim,
                                         word_alphabet.size(),
                                         char_dim,
                                         char_alphabet.size(),
                                         mode,
                                         hidden_size,
                                         out_features,
                                         num_layers,
                                         num_labels,
                                         embedd_word=word_table,
                                         p_in=p_in,
                                         p_out=p_out,
                                         p_rnn=p_rnn,
                                         activation=activation)
    else:
        raise ValueError('Unknown dropout type: {}'.format(dropout))

    network = network.to(device)

    optimizer, scheduler = get_optimizer(network.parameters(), optim,
                                         learning_rate, lr_decay, amsgrad,
                                         weight_decay, warmup_steps)
    model = "{}-CNN{}".format(mode, "-CRF" if crf else "")
    logger.info("Network: %s, num_layer=%d, hidden=%d, act=%s" %
                (model, num_layers, hidden_size, activation))
    logger.info(
        "training: l2: %f, (#training data: %d, batch: %d, unk replace: %.2f)"
        % (weight_decay, num_data, batch_size, unk_replace))
    logger.info("dropout(in, out, rnn): %s(%.2f, %.2f, %s)" %
                (dropout, p_in, p_out, p_rnn))
    print('# of Parameters: %d' %
          (sum([param.numel() for param in network.parameters()])))

    best_f1 = 0.0
    best_acc = 0.0
    best_precision = 0.0
    best_recall = 0.0
    test_f1 = 0.0
    test_acc = 0.0
    test_precision = 0.0
    test_recall = 0.0
    best_epoch = 0
    patient = 0
    num_batches = num_data // batch_size + 1
    result_path = os.path.join(model_path, 'tmp')
    if not os.path.exists(result_path):
        os.makedirs(result_path)
    for epoch in range(1, num_epochs + 1):
        start_time = time.time()
        train_loss = 0.
        num_insts = 0
        num_words = 0
        num_back = 0
        network.train()
        lr = scheduler.get_lr()[0]
        print('Epoch %d (%s, lr=%.6f, lr decay=%.6f, amsgrad=%s, l2=%.1e): ' %
              (epoch, optim, lr, lr_decay, amsgrad, weight_decay))
        if args.cuda:
            torch.cuda.empty_cache()
        gc.collect()
        for step, data in enumerate(
                iterate_data(data_train,
                             batch_size,
                             bucketed=True,
                             unk_replace=unk_replace,
                             shuffle=True)):
            optimizer.zero_grad()
            words = data['WORD'].to(device)
            chars = data['CHAR'].to(device)
            labels = data['NER'].to(device)
            masks = data['MASK'].to(device)

            nbatch = words.size(0)
            nwords = masks.sum().item()

            loss_total = network.loss(words, chars, labels, mask=masks).sum()
            if loss_ty_token:
                loss = loss_total.div(nwords)
            else:
                loss = loss_total.div(nbatch)
            loss.backward()
            if grad_clip > 0:
                clip_grad_norm_(network.parameters(), grad_clip)
            optimizer.step()
            scheduler.step()

            with torch.no_grad():
                num_insts += nbatch
                num_words += nwords
                train_loss += loss_total.item()

            # update log
            if step % 100 == 0:
                torch.cuda.empty_cache()
                sys.stdout.write("\b" * num_back)
                sys.stdout.write(" " * num_back)
                sys.stdout.write("\b" * num_back)
                curr_lr = scheduler.get_lr()[0]
                log_info = '[%d/%d (%.0f%%) lr=%.6f] loss: %.4f (%.4f)' % (
                    step, num_batches, 100. * step / num_batches, curr_lr,
                    train_loss / num_insts, train_loss / num_words)
                sys.stdout.write(log_info)
                sys.stdout.flush()
                num_back = len(log_info)

        sys.stdout.write("\b" * num_back)
        sys.stdout.write(" " * num_back)
        sys.stdout.write("\b" * num_back)
        print('total: %d (%d), loss: %.4f (%.4f), time: %.2fs' %
              (num_insts, num_words, train_loss / num_insts,
               train_loss / num_words, time.time() - start_time))
        print('-' * 100)

        # evaluate performance on dev data
        with torch.no_grad():
            outfile = os.path.join(result_path, 'pred_dev%d' % epoch)
            scorefile = os.path.join(result_path, "score_dev%d" % epoch)
            acc, precision, recall, f1 = eval(data_dev, network, writer,
                                              outfile, scorefile, device)
            print(
                'Dev  acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%'
                % (acc, precision, recall, f1))
            if best_f1 < f1:
                torch.save(network.state_dict(), model_name)
                best_f1 = f1
                best_acc = acc
                best_precision = precision
                best_recall = recall
                best_epoch = epoch

                # evaluate on test data when better performance detected
                outfile = os.path.join(result_path, 'pred_test%d' % epoch)
                scorefile = os.path.join(result_path, "score_test%d" % epoch)
                test_acc, test_precision, test_recall, test_f1 = eval(
                    data_test, network, writer, outfile, scorefile, device)
                print(
                    'test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%'
                    % (test_acc, test_precision, test_recall, test_f1))
                patient = 0
            else:
                patient += 1
            print('-' * 100)

            print(
                "Best dev  acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d (%d))"
                % (best_acc, best_precision, best_recall, best_f1, best_epoch,
                   patient))
            print(
                "Best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d (%d))"
                % (test_acc, test_precision, test_recall, test_f1, best_epoch,
                   patient))
            print('=' * 100)

        if patient > 4:
            logger.info('reset optimizer momentums')
            scheduler.reset_state()
            patient = 0
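
Unlike Example #1, this variant reads its model hyper-parameters from a JSON config (--config) rather than flags. Below is a minimal sketch of a config providing every key the code reads (dropout, crf, bigram, embedd_dim, char_dim, rnn_mode, hidden_size, out_features, num_layers, p_in, p_out, p_rnn, activation); the values are illustrative assumptions, only the key names come from the code, and embedd_dim must match the dimensionality of the loaded word embeddings.

# Illustrative config; values are assumptions, only the keys are taken from the code.
import json

hyps = {
    "dropout": "std",        # or "variational"
    "crf": True,             # use the CRF output layer
    "bigram": True,          # bigram potentials in the CRF
    "embedd_dim": 100,       # must equal the loaded embedding dimension
    "char_dim": 30,
    "rnn_mode": "LSTM",
    "hidden_size": 256,
    "out_features": 128,
    "num_layers": 1,
    "p_in": 0.33,
    "p_out": 0.5,
    "p_rnn": [0.33, 0.5],
    "activation": "elu",
}
with open("config.json", "w") as f:
    json.dump(hyps, f, indent=2)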
Example #3
def main():
    parser = argparse.ArgumentParser(
        description='Tuning with bi-directional RNN-CNN')
    parser.add_argument('--mode',
                        choices=['RNN', 'LSTM', 'GRU'],
                        help='architecture of rnn',
                        required=True)
    parser.add_argument('--num_epochs',
                        type=int,
                        default=100,
                        help='Number of training epochs')
    parser.add_argument('--num_layers',
                        type=int,
                        default=2,
                        help='Number of layers')
    parser.add_argument('--batch_size',
                        type=int,
                        default=16,
                        help='Number of sentences in each batch')
    parser.add_argument('--bidirectional', default=True)
    parser.add_argument('--hidden_size',
                        type=int,
                        default=128,
                        help='Number of hidden units in RNN')
    parser.add_argument('--tag_space',
                        type=int,
                        default=0,
                        help='Dimension of tag space')
    parser.add_argument('--num_filters',
                        type=int,
                        default=30,
                        help='Number of filters in CNN')
    parser.add_argument('--char_dim',
                        type=int,
                        default=30,
                        help='Dimension of Character embeddings')
    parser.add_argument('--learning_rate',
                        type=float,
                        default=0.1,
                        help='Learning rate')
    parser.add_argument('--decay_rate',
                        type=float,
                        default=0.1,
                        help='Decay rate of learning rate')
    parser.add_argument('--gamma',
                        type=float,
                        default=0.0,
                        help='weight for regularization')
    parser.add_argument('--dropout',
                        choices=['std', 'variational'],
                        help='type of dropout',
                        required=True)
    parser.add_argument('--p', type=float, default=0.5, help='dropout rate')
    parser.add_argument('--schedule',
                        type=int,
                        help='schedule for learning rate decay')
    parser.add_argument('--unk_replace',
                        type=float,
                        default=0.,
                        help='The rate to replace a singleton word with UNK')
    parser.add_argument('--embedding',
                        choices=['glove', 'senna', 'sskip', 'polyglot'],
                        help='Embedding for words',
                        required=True)
    parser.add_argument('--embedding_dict', help='path for embedding dict')
    parser.add_argument('--data_path')
    parser.add_argument('--modelname',
                        default="ASR_ERR_LSTM.json.pth.tar",
                        help='model name')
    parser.add_argument('--task',
                        default="MEDIA",
                        help='task name: MEDIA or ATIS')
    parser.add_argument('--optim',
                        default="SGD",
                        help='Optimizer: SGD or ADAM')
    parser.add_argument(
        '--train')  # "data/POS-penn/wsj/split1/wsj1.train.original"
    parser.add_argument(
        '--dev')  # "data/POS-penn/wsj/split1/wsj1.dev.original"
    parser.add_argument(
        '--test')  # "data/POS-penn/wsj/split1/wsj1.test.original"

    args = parser.parse_args()

    tim = datetime.now().strftime("%Y%m%d-%H%M%S")
    log_file = '%s/log/log_model_%s_mode_%s_num_epochs_%d_batch_size_%d_hidden_size_%d_num_layers_%d_optim_%s_lr_%f_tag_space_%s.txt' % (
        args.data_path, args.modelname, args.mode, args.num_epochs,
        args.batch_size, args.hidden_size, args.num_layers, args.optim,
        args.learning_rate, str(args.tag_space))
    logger = get_logger("SLU_BLSTM", log_file)

    mode = args.mode
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    num_epochs = args.num_epochs
    batch_size = args.batch_size
    hidden_size = args.hidden_size
    num_filters = args.num_filters
    learning_rate = args.learning_rate
    momentum = 0.9
    decay_rate = args.decay_rate
    gamma = args.gamma
    schedule = args.schedule
    data_path = args.data_path
    bidirectional = args.bidirectional
    p = args.p
    unk_replace = args.unk_replace
    embedding = args.embedding
    embedding_path = args.embedding_dict
    out_path = args.data_path
    embedd_dict, embedd_dim = utils.load_embedding_dict(
        embedding, embedding_path)
    logger.info("Creating Alphabets")
    word_alphabet, char_alphabet, target_alphabet = slu_data.create_alphabets(
        '%s/data_dic' % (data_path),
        train_path,
        data_paths=[dev_path, test_path],
        embedd_dict=embedd_dict,
        max_vocabulary_size=50000)

    logger.info("Word Alphabet Size: %d" % word_alphabet.size())
    logger.info("Character Alphabet Size: %d" % char_alphabet.size())
    logger.info("Target Alphabet Size: %d" % target_alphabet.size())
    logger.info("Bidirectionnal %s" % bidirectional)

    logger.info("Reading Data")
    use_gpu = torch.cuda.is_available()

    data_train = slu_data.read_data_to_variable(train_path,
                                                word_alphabet,
                                                char_alphabet,
                                                target_alphabet,
                                                use_gpu=use_gpu)
    num_data = sum(data_train[1])
    num_labels = target_alphabet.size()
    print(" num_labels", num_labels)
    data_dev = slu_data.read_data_to_variable(dev_path,
                                              word_alphabet,
                                              char_alphabet,
                                              target_alphabet,
                                              use_gpu=use_gpu,
                                              volatile=True)
    data_test = slu_data.read_data_to_variable(test_path,
                                               word_alphabet,
                                               char_alphabet,
                                               target_alphabet,
                                               use_gpu=use_gpu,
                                               volatile=True)
    writer = SLUWriter(word_alphabet, char_alphabet, target_alphabet)

    def construct_word_embedding_table():
        scale = np.sqrt(3.0 / embedd_dim)
        table = np.empty([word_alphabet.size(), embedd_dim], dtype=np.float32)
        table[slu_data.UNK_ID, :] = np.random.uniform(
            -scale, scale, [1, embedd_dim]).astype(np.float32)
        oov = 0
        for word, index in word_alphabet.items():
            if word in embedd_dict:
                embedding = embedd_dict[word]
            elif word.lower() in embedd_dict:
                embedding = embedd_dict[word.lower()]
            else:
                embedding = np.random.uniform(
                    -scale, scale, [1, embedd_dim]).astype(np.float32)
                oov += 1
            table[index, :] = embedding
        print('oov: %d' % oov)
        return torch.from_numpy(table)

    word_table = construct_word_embedding_table()

    char_dim = args.char_dim
    window = 3
    num_layers = args.num_layers
    tag_space = args.tag_space
    print(" embedd_dim ", embedd_dim)
    if args.dropout == 'std':
        network = BiRecurrentConv2(embedd_dim,
                                   word_alphabet.size(),
                                   char_dim,
                                   char_alphabet.size(),
                                   num_filters,
                                   window,
                                   mode,
                                   hidden_size,
                                   num_layers,
                                   num_labels,
                                   tag_space=tag_space,
                                   embedd_word=word_table,
                                   p_rnn=p,
                                   bidirectional=bidirectional)
    else:
        network = BiVarRecurrentConv(embedd_dim,
                                     word_alphabet.size(),
                                     char_dim,
                                     char_alphabet.size(),
                                     num_filters,
                                     window,
                                     mode,
                                     hidden_size,
                                     num_layers,
                                     num_labels,
                                     tag_space=tag_space,
                                     embedd_word=word_table,
                                     p_rnn=p)
    print(network)
    if use_gpu:
        network.cuda()

    lr = learning_rate
    if args.optim == "SGD":
        optim = SGD(network.parameters(),
                    lr=lr,
                    momentum=momentum,
                    weight_decay=gamma,
                    nesterov=True)
    else:
        optim = Adam(network.parameters(),
                     lr=lr,
                     betas=(0.9, 0.9),
                     weight_decay=gamma)
    logger.info(
        "Network: %s, num_layer=%d, hidden=%d, filter=%d, tag_space=%d" %
        (mode, num_layers, hidden_size, num_filters, tag_space))
    logger.info(
        "training: l2: %f, (#training data: %d, batch: %d, dropout: %.2f, unk replace: %.2f)"
        % (gamma, num_data, batch_size, p, unk_replace))
    num_batches = num_data // batch_size + 1
    dev_f1 = 0.0
    dev_acc = 0.0
    dev_precision = 0.0
    dev_recall = 0.0
    test_f1 = 0.0
    test_acc = 0.0
    test_precision = 0.0
    test_recall = 0.0
    best_epoch = 0
    model_path = ""
    for epoch in range(1, num_epochs + 1):
        print(
            'Epoch %d (%s(%s), learning rate=%.4f, decay rate=%.4f (schedule=%d)): '
            % (epoch, mode, args.dropout, lr, decay_rate, schedule))
        train_err = 0.
        train_corr = 0.
        train_total = 0.

        start_time = time.time()
        num_back = 0
        network.train()
        for batch in range(1, num_batches + 1):
            #for batch_train in slu_data.iterate_batch_variable(data_train, batch_size):
            word, char, labels, masks, lengths = slu_data.get_batch_variable(
                data_train, batch_size, unk_replace=unk_replace)
            optim.zero_grad()
            loss, corr, _ = network.loss(
                word,
                char,
                labels,
                mask=masks,
                length=lengths,
                leading_symbolic=slu_data.NUM_SYMBOLIC_TAGS)
            loss.backward()
            optim.step()

            num_tokens = masks.data.sum()
            #train_err += loss.data * num_tokens
            train_err += loss.data[0] * num_tokens
            #train_corr += corr.data
            train_corr += corr.data[0]
            train_total += num_tokens
            time_ave = (time.time() - start_time) / batch
            time_left = (num_batches - batch) * time_ave

            # update log
            if batch % 100 == 0:
                sys.stdout.write("\b" * num_back)
                sys.stdout.write(" " * num_back)
                sys.stdout.write("\b" * num_back)
                log_info = 'train: %d/%d loss: %.4f, acc: %.2f%%, time left (estimated): %.2fs' % (
                    batch, num_batches, train_err / train_total,
                    train_corr * 100 / train_total, time_left)
                sys.stdout.write(log_info)
                sys.stdout.flush()
                num_back = len(log_info)
        sys.stdout.write("\b" * num_back)
        sys.stdout.write(" " * num_back)
        sys.stdout.write("\b" * num_back)
        print('train: %d loss: %.4f, time: %.2fs' %
              (num_batches, train_err / train_total, time.time() - start_time))
        logger.info(
            'train: %d loss: %.4f, time: %.2fs' %
            (num_batches, train_err / train_total, time.time() - start_time))
        print('train: %d loss: %.4f, acc: %.2f%%, time: %.2fs' %
              (num_batches, train_err / train_total,
               train_corr * 100 / train_total, time.time() - start_time))
        loss_results = train_err / train_total
        # evaluate performance on dev data
        network.eval()
        tmp_filename = '%s/predictions/dev_%s_num_layers_%s_%s.txt' % (
            out_path, args.optim, str(args.num_layers), str(uid))
        writer.start(tmp_filename)
        all_target = []
        all_preds = []
        for batch in slu_data.iterate_batch_variable(data_dev, batch_size):
            word, char, labels, masks, lengths = batch
            _, _, preds = network.loss(
                word,
                char,
                labels,
                mask=masks,
                length=lengths,
                leading_symbolic=slu_data.NUM_SYMBOLIC_TAGS)
            writer.write(word.data.cpu().numpy(),
                         preds.data.cpu().numpy(),
                         labels.data.cpu().numpy(),
                         lengths.cpu().numpy())
        #    correct_tag, pred_tag=writer.tensor_to_list(preds.cpu().numpy(),labels.cpu().numpy(), lengths.cpu().numpy())
        #   all_target.extend(correct_tag)
        #  all_preds.extend(pred_tag)
        writer.close()
        # precision, recall,f1,acc=writer.evaluate(all_preds,all_target)
        acc, precision, recall, f1 = evaluate(tmp_filename, data_path, "dev",
                                              args.task, args.optim)
        print(
            'dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%' %
            (acc, precision, recall, f1))
        logger.info(
            'dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%' %
            (acc, precision, recall, f1))
        if dev_acc < acc:
            dev_f1 = f1
            dev_acc = acc
            dev_precision = precision
            dev_recall = recall
            best_epoch = epoch

            # save best model
            model_path = "%s/models/best_model_%s_mode_%s_num_epochs_%d_batch_size_%d_hidden_size_%d_num_layers_%d_bestdevacc_%f_bestepoch_%d_optim_%s_lr_%f_tag_space_%s" % (
                args.data_path, args.modelname, mode, num_epochs, batch_size,
                hidden_size, args.num_layers, dev_acc, best_epoch, args.optim,
                args.learning_rate, str(tag_space))
            torch.save(network, model_path)

            # evaluate on test data when better performance detected
            """
            tmp_filename = '%s/tmp/%s_test%d' % (data_path,tim, epoch)
            writer.start(tmp_filename)

            for batch in slu_data.iterate_batch_variable(data_test, batch_size):
                word, features, sents, char, labels, masks, lengths = batch
                _, _, preds,probs = network.loss(features, char, labels, mask=masks, length=lengths,
                                              leading_symbolic=slu_data.NUM_SYMBOLIC_TAGS)
                writer.write(word.data.cpu().numpy(),sents.data.cpu().numpy(),
                             preds.data.cpu().numpy(), probs.data.cpu().numpy(), labels.data.cpu().numpy(), lengths.cpu().numpy())
            writer.close()
            test_acc, test_precision, test_recall, test_f1 = evaluate(tmp_filename, data_path,"test",tim)
            """

        logger.info(
            "best dev  acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)"
            % (dev_acc, dev_precision, dev_recall, dev_f1, best_epoch))
        #        logger.info("best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)" % (
        #           test_acc, test_precision, test_recall, test_f1, best_epoch))

        if epoch % schedule == 0:
            lr = learning_rate / (1.0 + epoch * decay_rate)
            if args.optim == "SGD":
                optim = SGD(network.parameters(),
                            lr=lr,
                            momentum=momentum,
                            weight_decay=gamma,
                            nesterov=True)
            else:
                optim = Adam(network.parameters(),
                             lr=lr,
                             betas=(0.9, 0.9),
                             weight_decay=gamma)

    # end epoch
    # test evaluation
    # load model
    print("model path ", model_path)
    network = torch.load(model_path)
    if use_gpu:
        network.cuda()
    # mode eval
    network.eval()
    # evaluate on test dev when better performance detected
    tmp_filename = '%s/predictions/dev_best_model_%s_mode_%s_num_epochs_%d_batch_size_%d_hidden_size_%d_num_layers_%d_bestdevacc_%f_bestF1_%f_bestepoch_%d_optim_%s_lr_%f_tag_space_%s' % (
        out_path, args.modelname, mode, num_epochs, batch_size, hidden_size,
        num_layers, dev_acc, dev_f1, best_epoch, args.optim,
        args.learning_rate, tag_space)

    #tmp_filename = '%s/predictions/dev_best_model_%s_mode_%s_num_epochs_%d_batch_size_%d_hidden_size_%d_num_layers_%d_bestdevacc_%f_bestepoch_%d_optim_%s_lr_%f_tag_space_%s' % (out_path, args.modelname, mode, num_epochs, batch_size, hidden_size, num_layers, dev_acc, best_epoch, args.optim, args.learning_rate, tag_space)
    #tmp_filename = '%s/predictions/dev_bestmodel_devacc_%f_epoch_%d' % (out_path, dev_acc, best_epoch)
    writer.start(tmp_filename)
    all_target = []
    all_preds = []
    for batch in slu_data.iterate_batch_variable(data_dev, batch_size):
        word, char, labels, masks, lengths = batch
        _, _, preds = network.loss(word,
                                   char,
                                   labels,
                                   mask=masks,
                                   length=lengths,
                                   leading_symbolic=slu_data.NUM_SYMBOLIC_TAGS)
        writer.write(word.data.cpu().numpy(),
                     preds.data.cpu().numpy(),
                     labels.data.cpu().numpy(),
                     lengths.cpu().numpy())
    writer.close()
    dev_acc, dev_precision, dev_recall, dev_f1 = evaluate(
        tmp_filename, data_path, "dev", args.task, args.optim)

    # evaluate on test data when better performance detected
    tmp_filename = '%s/predictions/test_best_model_%s_mode_%s_num_epochs_%d_batch_size_%d_hidden_size_%d_num_layers_%d_bestdevacc_%f_bestF1_%f_bestepoch_%d_optim_%s_lr_%f_tag_space_%s' % (
        out_path, args.modelname, mode, num_epochs, batch_size, hidden_size,
        num_layers, dev_acc, dev_f1, best_epoch, args.optim,
        args.learning_rate, tag_space)
    #    tmp_filename = '%s/predictions/test_best_model_%s_mode_%s_num_epochs_%d_batch_size_%d_hidden_size_%d_num_layers_%d_bestdevacc_%f_bestepoch_%d_optim_%s_lr_%f_tag_space_%s' % (out_path,args.modelname,mode,num_epochs,batch_size,hidden_size,num_layers,dev_acc,best_epoch, args.optim, args.learning_rate, tag_space)
    writer.start(tmp_filename)
    all_target = []
    all_preds = []
    for batch in slu_data.iterate_batch_variable(data_test, batch_size):
        word, char, labels, masks, lengths = batch
        _, _, preds = network.loss(word,
                                   char,
                                   labels,
                                   mask=masks,
                                   length=lengths,
                                   leading_symbolic=slu_data.NUM_SYMBOLIC_TAGS)
        writer.write(word.data.cpu().numpy(),
                     preds.data.cpu().numpy(),
                     labels.data.cpu().numpy(),
                     lengths.cpu().numpy())
    writer.close()
    test_acc, test_precision, test_recall, test_f1 = evaluate(
        tmp_filename, data_path, "test", args.task, args.optim)
    print(
        "best dev  acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)"
        % (dev_acc, dev_precision, dev_recall, dev_f1, best_epoch))
    print(
        "best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)"
        % (test_acc, test_precision, test_recall, test_f1, best_epoch))
    logger.info(
        "best dev  acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)"
        % (dev_acc, dev_precision, dev_recall, dev_f1, best_epoch))
    logger.info(
        "best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)"
        % (test_acc, test_precision, test_recall, test_f1, best_epoch))
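
A minimal sketch of launching this SLU variant; the script name and all paths are placeholders. Note that --data_path is expected to contain log/, predictions/, models/ and data_dic sub-directories, since the format strings above build file names inside them.

# Hypothetical invocation; the script name and paths are assumptions.
import subprocess

subprocess.run([
    "python", "train_slu.py",
    "--mode", "LSTM",                     # required: RNN, LSTM, or GRU
    "--dropout", "std",                   # required: std or variational
    "--embedding", "glove",               # required: word embedding type
    "--embedding_dict", "embeddings/glove.txt.gz",
    "--data_path", "experiments/media",   # needs log/, predictions/, models/, data_dic
    "--task", "MEDIA",
    "--optim", "SGD",
    "--schedule", "5",                    # drives the periodic learning-rate decay
    "--train", "data/media.train",
    "--dev", "data/media.dev",
    "--test", "data/media.test",
], check=True)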
Example #4
def main():
    parser = argparse.ArgumentParser(
        description='Tuning with bi-directional RNN-CNN')
    parser.add_argument('--mode',
                        choices=['RNN', 'LSTM', 'GRU'],
                        help='architecture of rnn',
                        required=True)
    parser.add_argument('--num_epochs',
                        type=int,
                        default=1000,
                        help='Number of training epochs')
    parser.add_argument('--batch_size',
                        type=int,
                        default=16,
                        help='Number of sentences in each batch')
    parser.add_argument('--hidden_size',
                        type=int,
                        default=128,
                        help='Number of hidden units in RNN')
    parser.add_argument('--num_filters',
                        type=int,
                        default=30,
                        help='Number of filters in CNN')
    parser.add_argument('--learning_rate',
                        type=float,
                        default=0.1,
                        help='Learning rate')
    parser.add_argument('--decay_rate',
                        type=float,
                        default=0.1,
                        help='Decay rate of learning rate')
    parser.add_argument('--gamma',
                        type=float,
                        default=0.0,
                        help='weight for regularization')
    parser.add_argument('--dropout',
                        choices=['std', 'variational'],
                        help='type of dropout',
                        required=True)
    parser.add_argument('--p', type=float, default=0.5, help='dropout rate')
    parser.add_argument('--schedule',
                        type=int,
                        help='schedule for learning rate decay')
    parser.add_argument('--output_prediction',
                        action='store_true',
                        help='Output predictions to temp files')
    parser.add_argument(
        '--train')  # "data/POS-penn/wsj/split1/wsj1.train.original"
    parser.add_argument(
        '--dev')  # "data/POS-penn/wsj/split1/wsj1.dev.original"
    parser.add_argument(
        '--test')  # "data/POS-penn/wsj/split1/wsj1.test.original"

    args = parser.parse_args()

    logger = get_logger("POSTagger")

    mode = args.mode
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    num_epochs = args.num_epochs
    batch_size = args.batch_size
    hidden_size = args.hidden_size
    num_filters = args.num_filters
    learning_rate = args.learning_rate
    momentum = 0.9
    decay_rate = args.decay_rate
    gamma = args.gamma
    schedule = args.schedule
    p = args.p
    output_predict = args.output_prediction

    embedd_dict, embedd_dim = utils.load_word_embedding_dict(
        'glove', "data/glove/glove.6B/glove.6B.100d.gz")
    logger.info("Creating Alphabets")
    word_alphabet, char_alphabet, pos_alphabet, \
    type_alphabet = conllx_data.create_alphabets("data/alphabets/pos/", train_path, data_paths=[dev_path,test_path],
                                                 max_vocabulary_size=50000, embedd_dict=embedd_dict)

    logger.info("Word Alphabet Size: %d" % word_alphabet.size())
    logger.info("Character Alphabet Size: %d" % char_alphabet.size())
    logger.info("POS Alphabet Size: %d" % pos_alphabet.size())

    logger.info("Reading Data")
    use_gpu = torch.cuda.is_available()

    data_train = conllx_data.read_data_to_variable(train_path,
                                                   word_alphabet,
                                                   char_alphabet,
                                                   pos_alphabet,
                                                   type_alphabet,
                                                   use_gpu=use_gpu)
    # data_train = conllx_data.read_data(train_path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet)
    # num_data = sum([len(bucket) for bucket in data_train])
    num_data = sum(data_train[1])
    num_labels = pos_alphabet.size()

    data_dev = conllx_data.read_data_to_variable(dev_path,
                                                 word_alphabet,
                                                 char_alphabet,
                                                 pos_alphabet,
                                                 type_alphabet,
                                                 use_gpu=use_gpu)
    data_test = conllx_data.read_data_to_variable(test_path,
                                                  word_alphabet,
                                                  char_alphabet,
                                                  pos_alphabet,
                                                  type_alphabet,
                                                  use_gpu=use_gpu)

    def construct_word_embedding_table():
        scale = np.sqrt(3.0 / embedd_dim)
        table = np.empty([word_alphabet.size(), embedd_dim], dtype=np.float32)
        table[conllx_data.UNK_ID, :] = np.random.uniform(
            -scale, scale, [1, embedd_dim]).astype(np.float32)
        oov = 0
        for word, index in word_alphabet.items():
            if word in embedd_dict:
                embedding = embedd_dict[word]
            elif word.lower() in embedd_dict:
                embedding = embedd_dict[word.lower()]
            else:
                embedding = np.random.uniform(
                    -scale, scale, [1, embedd_dim]).astype(np.float32)
                oov += 1
            table[index, :] = embedding
        print('oov: %d' % oov)
        return torch.from_numpy(table)

    word_table = construct_word_embedding_table()
    logger.info("constructing network...")

    char_dim = 30
    window = 3
    num_layers = 1
    if args.dropout == 'std':
        network = BiRecurrentConv(embedd_dim,
                                  word_alphabet.size(),
                                  char_dim,
                                  char_alphabet.size(),
                                  num_filters,
                                  window,
                                  mode,
                                  hidden_size,
                                  num_layers,
                                  num_labels,
                                  embedd_word=word_table,
                                  p_rnn=p)
    else:
        network = BiVarRecurrentConv(embedd_dim,
                                     word_alphabet.size(),
                                     char_dim,
                                     char_alphabet.size(),
                                     num_filters,
                                     window,
                                     mode,
                                     hidden_size,
                                     num_layers,
                                     num_labels,
                                     embedd_word=word_table,
                                     p_rnn=p)
    if use_gpu:
        network.cuda()

    lr = learning_rate
    # optim = Adam(network.parameters(), lr=lr, betas=(0.9, 0.9), weight_decay=gamma)
    optim = SGD(network.parameters(),
                lr=lr,
                momentum=momentum,
                weight_decay=gamma,
                nesterov=True)
    logger.info("Network: %s, num_layer=%d, hidden=%d, filter=%d" %
                (mode, num_layers, hidden_size, num_filters))
    logger.info(
        "training: l2: %f, (#training data: %d, batch: %d, dropout: %.2f)" %
        (gamma, num_data, batch_size, p))

    num_batches = num_data // batch_size + 1
    dev_correct = 0.0
    best_epoch = 0
    test_correct = 0.0
    test_total = 0
    for epoch in range(1, num_epochs + 1):
        print('Epoch %d (%s(%s), learning rate=%.4f, decay rate=%.4f (%d)): ' %
              (epoch, mode, args.dropout, lr, decay_rate, schedule))
        train_err = 0.
        train_corr = 0.
        train_total = 0.

        start_time = time.time()
        num_back = 0
        network.train()
        for batch in range(1, num_batches + 1):
            word, char, labels, _, _, masks, lengths = conllx_data.get_batch_variable(
                data_train, batch_size)

            optim.zero_grad()
            loss, corr, _ = network.loss(
                word,
                char,
                labels,
                mask=masks,
                leading_symbolic=conllx_data.NUM_SYMBOLIC_TAGS)
            loss.backward()
            optim.step()

            num_tokens = masks.data.sum()
            train_err += loss.data[0] * num_tokens
            train_corr += corr.data[0]
            train_total += num_tokens

            time_ave = (time.time() - start_time) / batch
            time_left = (num_batches - batch) * time_ave

            # update log
            if batch % 100 == 0:
                sys.stdout.write("\b" * num_back)
                log_info = 'train: %d/%d loss: %.4f, acc: %.2f%%, time left (estimated): %.2fs' % (
                    batch, num_batches, train_err / train_total,
                    train_corr * 100 / train_total, time_left)
                sys.stdout.write(log_info)
                num_back = len(log_info)

        sys.stdout.write("\b" * num_back)
        print('train: %d loss: %.4f, acc: %.2f%%, time: %.2fs' %
              (epoch * num_batches, train_err / train_total,
               train_corr * 100 / train_total, time.time() - start_time))

        # evaluate performance on dev data
        network.eval()
        dev_corr = 0.0
        dev_total = 0
        for batch in conllx_data.iterate_batch_variable(data_dev, batch_size):
            word, char, labels, _, _, masks, lengths = batch
            _, corr, preds = network.loss(
                word,
                char,
                labels,
                mask=masks,
                leading_symbolic=conllx_data.NUM_SYMBOLIC_TAGS)
            num_tokens = masks.data.sum()
            dev_corr += corr.data[0]
            dev_total += num_tokens
        print('dev corr: %d, total: %d, acc: %.2f%%' %
              (dev_corr, dev_total, dev_corr * 100 / dev_total))

        if dev_correct < dev_corr:
            dev_correct = dev_corr
            best_epoch = epoch

            # evaluate on test data when better performance detected
            test_corr = 0.0
            test_total = 0
            for batch in conllx_data.iterate_batch_variable(
                    data_test, batch_size):
                word, char, labels, _, _, masks, lengths = batch
                _, corr, preds = network.loss(
                    word,
                    char,
                    labels,
                    mask=masks,
                    leading_symbolic=conllx_data.NUM_SYMBOLIC_TAGS)
                num_tokens = masks.data.sum()
                test_corr += corr.data[0]
                test_total += num_tokens
            test_correct = test_corr
        print("best dev  corr: %d, total: %d, acc: %.2f%% (epoch: %d)" %
              (dev_correct, dev_total, dev_correct * 100 / dev_total,
               best_epoch))
        print("best test corr: %d, total: %d, acc: %.2f%% (epoch: %d)" %
              (test_correct, test_total, test_correct * 100 / test_total,
               best_epoch))

        if epoch % schedule == 0:
            lr = learning_rate / (1.0 + epoch * decay_rate)
            optim = SGD(network.parameters(),
                        lr=lr,
                        momentum=momentum,
                        weight_decay=gamma,
                        nesterov=True)
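
A minimal sketch of running this POS-tagging variant; the script name is a placeholder, the GloVe path is hard-coded inside the script, and the data paths below simply echo the comments next to the --train/--dev/--test arguments. --schedule must be supplied because it drives the periodic learning-rate decay.

# Hypothetical invocation; the script name is an assumption.
import subprocess

subprocess.run([
    "python", "train_pos.py",
    "--mode", "LSTM",        # required: RNN, LSTM, or GRU
    "--dropout", "std",      # required: std or variational
    "--schedule", "10",      # LR is decayed every `schedule` epochs
    "--train", "data/POS-penn/wsj/split1/wsj1.train.original",
    "--dev", "data/POS-penn/wsj/split1/wsj1.dev.original",
    "--test", "data/POS-penn/wsj/split1/wsj1.test.original",
], check=True)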