Example no. 1
0
    def __init__(self, train, test, embeddings_filename, batch_size=1):
        """Set up an NER BiLSTM-CRF experiment with fixed hyper-parameters.

        Builds the word/char/POS/chunk/NER alphabets from the training
        file, loads pretrained word embeddings, vectorizes the test set,
        and prepares a CoNLL-03 writer for decoded output.

        Args:
            train: path to the CoNLL-03 training file (alphabet source).
            test: path to the test file; vectorized into ``self.data_test``.
            embeddings_filename: path of the pretrained embedding dictionary.
            batch_size: number of sentences per batch (default 1).
        """
        self.train_path = train
        self.test_path = test
        # Fixed hyper-parameters (hard-coded here rather than parsed from a CLI).
        self.mode = 'LSTM'
        self.dropout = 'std'
        self.num_epochs = 1
        self.batch_size = batch_size
        self.hidden_size = 256
        self.num_filters = 30
        self.learning_rate = 0.01
        self.momentum = 0.9
        self.decay_rate = 0.05
        self.gamma = 0.0
        self.schedule = 1
        # RNN dropout as (input, hidden) rates.
        self.p_rnn = tuple([0.33, 0.5])
        self.p_in = 0.33
        self.p_out = 0.5
        self.unk_replace = 0.0
        self.bigram = True
        self.embedding = 'glove'
        self.logger = get_logger("NERCRF")
        self.char_dim = 30
        self.window = 3
        self.num_layers = 1
        self.tag_space = 128
        # NOTE(review): nn.init.xavier_uniform is deprecated in modern PyTorch
        # (renamed xavier_uniform_); kept as-is for compatibility with the
        # rest of this codebase.
        self.initializer = nn.init.xavier_uniform

        self.use_gpu = torch.cuda.is_available()

        self.embedd_dict, self.embedd_dim = utils.load_embedding_dict(
            self.embedding, embeddings_filename)
        # Alphabets are built from the train file and extended with the test
        # file so no test token is unseen at decode time.
        self.word_alphabet, self.char_alphabet, self.pos_alphabet, \
        self.chunk_alphabet, self.ner_alphabet = conll03_data.create_alphabets("data/alphabets/ner_crf/", self.train_path, data_paths=[self.test_path],
        embedd_dict=self.embedd_dict, max_vocabulary_size=50000)
        self.word_table = self.construct_word_embedding_table()

        self.logger.info("Word Alphabet Size: %d" % self.word_alphabet.size())
        self.logger.info("Character Alphabet Size: %d" %
                         self.char_alphabet.size())
        self.logger.info("POS Alphabet Size: %d" % self.pos_alphabet.size())
        self.logger.info("Chunk Alphabet Size: %d" %
                         self.chunk_alphabet.size())
        self.logger.info("NER Alphabet Size: %d" % self.ner_alphabet.size())
        self.num_labels = self.ner_alphabet.size()

        # volatile=True marks inference-only Variables (pre-0.4 PyTorch API,
        # superseded by torch.no_grad()).
        self.data_test = conll03_data.read_data_to_variable(
            self.test_path,
            self.word_alphabet,
            self.char_alphabet,
            self.pos_alphabet,
            self.chunk_alphabet,
            self.ner_alphabet,
            use_gpu=self.use_gpu,
            volatile=True)
        self.writer = CoNLL03Writer(self.word_alphabet, self.char_alphabet,
                                    self.pos_alphabet, self.chunk_alphabet,
                                    self.ner_alphabet)
Example no. 2
0
def extract_features(data_name, feat_name):
    """Dump NER-network feature vectors for mention head tokens.

    Loads pickled ``(sent, mention, ont_types, weight)`` tuples from
    ``temp/<data_name>``, runs each sentence through the tuned NER network
    stored in ``temp/ner_tuned.pt``, extracts the network feature vector at
    the mention's head-token position, and pickles the resulting
    ``(vector, ont_types, weight)`` tuples to ``temp/<feat_name>``.

    Args:
        data_name: file name (under ``temp/``) of the pickled input data.
        feat_name: file name (under ``temp/``) for the pickled features.
    """
    with open('temp/' + data_name, 'rb') as f:
        train_data = pickle.load(f)
    print(len(train_data))

    network = torch.load('temp/ner_tuned.pt')
    word_alphabet, char_alphabet, pos_alphabet, \
        chunk_alphabet, ner_alphabet = conll03_data.create_alphabets("ner_alphabet/", None)

    feats = []
    for sent, mention, ont_types, weight in train_data:
        # Serialize the sentence in CoNLL-03 column format so the standard
        # data reader can vectorize it (dummy POS/chunk columns, 'O' labels).
        with open('tmp', 'w') as f:
            for i, word in enumerate(sent.words):
                f.write('{0} {1} -- -- O\n'.format(i + 1, word.word))
        sent_data = conll03_data.read_data_to_variable('tmp',
                                                       word_alphabet,
                                                       char_alphabet,
                                                       pos_alphabet,
                                                       chunk_alphabet,
                                                       ner_alphabet,
                                                       use_gpu=False,
                                                       volatile=True)
        # BUGFIX: os.remove is portable and avoids spawning a shell
        # (previously `os.system('rm tmp')`).
        os.remove('tmp')
        # BUGFIX: `.next()` is Python-2-only; the `next()` builtin works on
        # both Python 2.6+ and Python 3.
        word, char, pos, chunk, labels, masks, lengths = next(
            conll03_data.iterate_batch_variable(sent_data, 1))
        feat = network.feature(word,
                               char,
                               target=labels,
                               mask=masks,
                               leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS)
        print(feat.size())
        # Feature vector of the single sentence at the mention's head index.
        feat_vec = feat[0, mention['head_index'], :]
        feats.append((feat_vec.data.numpy(), ont_types, weight))
        print(np.shape(feats[-1][0]))

    with open('temp/' + feat_name, 'wb') as f:
        pickle.dump(feats, f)
Example no. 3
0
def main():
    """Command-line driver: train a bi-directional RNN-CNN-CRF NER tagger.

    Parses hyper-parameters from the command line, builds alphabets and a
    pretrained word-embedding table from the train/dev/test files,
    constructs a (variational-)dropout BiRNN-CNN-CRF network, and trains it
    with SGD, decaying the learning rate on a fixed schedule.  The dev set
    is evaluated every epoch; the test set is re-evaluated whenever dev F1
    improves.
    """
    parser = argparse.ArgumentParser(
        description='Tuning with bi-directional RNN-CNN-CRF')
    parser.add_argument('--mode',
                        choices=['RNN', 'LSTM', 'GRU'],
                        help='architecture of rnn',
                        required=True)
    parser.add_argument('--num_epochs',
                        type=int,
                        default=100,
                        help='Number of training epochs')
    parser.add_argument('--batch_size',
                        type=int,
                        default=16,
                        help='Number of sentences in each batch')
    parser.add_argument('--hidden_size',
                        type=int,
                        default=128,
                        help='Number of hidden units in RNN')
    parser.add_argument('--tag_space',
                        type=int,
                        default=0,
                        help='Dimension of tag space')
    parser.add_argument('--num_layers',
                        type=int,
                        default=1,
                        help='Number of layers of RNN')
    parser.add_argument('--num_filters',
                        type=int,
                        default=30,
                        help='Number of filters in CNN')
    parser.add_argument('--char_dim',
                        type=int,
                        default=30,
                        help='Dimension of Character embeddings')
    parser.add_argument('--learning_rate',
                        type=float,
                        default=0.015,
                        help='Learning rate')
    parser.add_argument('--decay_rate',
                        type=float,
                        default=0.1,
                        help='Decay rate of learning rate')
    parser.add_argument('--gamma',
                        type=float,
                        default=0.0,
                        help='weight for regularization')
    parser.add_argument('--dropout',
                        choices=['std', 'variational'],
                        help='type of dropout',
                        required=True)
    parser.add_argument('--p_rnn',
                        nargs=2,
                        type=float,
                        required=True,
                        help='dropout rate for RNN')
    parser.add_argument('--p_in',
                        type=float,
                        default=0.33,
                        help='dropout rate for input embeddings')
    parser.add_argument('--p_out',
                        type=float,
                        default=0.33,
                        help='dropout rate for output layer')
    parser.add_argument('--bigram',
                        action='store_true',
                        help='bi-gram parameter for CRF')
    parser.add_argument('--schedule',
                        type=int,
                        help='schedule for learning rate decay')
    parser.add_argument('--unk_replace',
                        type=float,
                        default=0.,
                        help='The rate to replace a singleton word with UNK')
    parser.add_argument('--embedding',
                        choices=['glove', 'senna', 'sskip', 'polyglot'],
                        help='Embedding for words',
                        required=True)
    parser.add_argument('--embedding_dict', help='path for embedding dict')
    parser.add_argument(
        '--train')  # "data/POS-penn/wsj/split1/wsj1.train.original"
    parser.add_argument(
        '--dev')  # "data/POS-penn/wsj/split1/wsj1.dev.original"
    parser.add_argument(
        '--test')  # "data/POS-penn/wsj/split1/wsj1.test.original"

    args = parser.parse_args()

    logger = get_logger("NERCRF")

    # Unpack arguments into locals for readability below.
    mode = args.mode
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    num_epochs = args.num_epochs
    batch_size = args.batch_size
    hidden_size = args.hidden_size
    num_filters = args.num_filters
    learning_rate = args.learning_rate
    momentum = 0.9
    decay_rate = args.decay_rate
    gamma = args.gamma
    schedule = args.schedule
    p_rnn = tuple(args.p_rnn)
    p_in = args.p_in
    p_out = args.p_out
    unk_replace = args.unk_replace
    bigram = args.bigram
    embedding = args.embedding
    embedding_path = args.embedding_dict

    embedd_dict, embedd_dim = utils.load_embedding_dict(
        embedding, embedding_path)

    logger.info("Creating Alphabets")
    # Alphabets are built from the train file and extended with dev/test so
    # no evaluation token is unseen at decode time.
    word_alphabet, char_alphabet, pos_alphabet, \
    chunk_alphabet, ner_alphabet = conll03_data.create_alphabets("data/alphabets/ner_crf/", train_path, data_paths=[dev_path, test_path],
                                                                 embedd_dict=embedd_dict, max_vocabulary_size=50000)

    logger.info("Word Alphabet Size: %d" % word_alphabet.size())
    logger.info("Character Alphabet Size: %d" % char_alphabet.size())
    logger.info("POS Alphabet Size: %d" % pos_alphabet.size())
    logger.info("Chunk Alphabet Size: %d" % chunk_alphabet.size())
    logger.info("NER Alphabet Size: %d" % ner_alphabet.size())

    logger.info("Reading Data")
    use_gpu = torch.cuda.is_available()

    data_train = conll03_data.read_data_to_variable(train_path,
                                                    word_alphabet,
                                                    char_alphabet,
                                                    pos_alphabet,
                                                    chunk_alphabet,
                                                    ner_alphabet,
                                                    use_gpu=use_gpu)
    num_data = sum(data_train[1])
    num_labels = ner_alphabet.size()

    # volatile=True marks inference-only Variables (pre-0.4 PyTorch API).
    data_dev = conll03_data.read_data_to_variable(dev_path,
                                                  word_alphabet,
                                                  char_alphabet,
                                                  pos_alphabet,
                                                  chunk_alphabet,
                                                  ner_alphabet,
                                                  use_gpu=use_gpu,
                                                  volatile=True)
    data_test = conll03_data.read_data_to_variable(test_path,
                                                   word_alphabet,
                                                   char_alphabet,
                                                   pos_alphabet,
                                                   chunk_alphabet,
                                                   ner_alphabet,
                                                   use_gpu=use_gpu,
                                                   volatile=True)

    writer = CoNLL03Writer(word_alphabet, char_alphabet, pos_alphabet,
                           chunk_alphabet, ner_alphabet)

    def construct_word_embedding_table():
        # Initialize unknown/OOV rows uniformly in [-scale, scale]; known
        # words copy their pretrained vector (case-insensitive fallback).
        scale = np.sqrt(3.0 / embedd_dim)
        table = np.empty([word_alphabet.size(), embedd_dim], dtype=np.float32)
        table[conll03_data.UNK_ID, :] = np.random.uniform(
            -scale, scale, [1, embedd_dim]).astype(np.float32)
        oov = 0
        for word, index in word_alphabet.items():
            if word in embedd_dict:
                embedding = embedd_dict[word]
            elif word.lower() in embedd_dict:
                embedding = embedd_dict[word.lower()]
            else:
                embedding = np.random.uniform(
                    -scale, scale, [1, embedd_dim]).astype(np.float32)
                oov += 1
            table[index, :] = embedding
        print('oov: %d' % oov)
        return torch.from_numpy(table)

    word_table = construct_word_embedding_table()
    logger.info("constructing network...")

    char_dim = args.char_dim
    window = 3
    num_layers = args.num_layers
    tag_space = args.tag_space
    # NOTE(review): nn.init.xavier_uniform is deprecated in modern PyTorch
    # (renamed xavier_uniform_); kept for compatibility with this codebase.
    initializer = nn.init.xavier_uniform
    if args.dropout == 'std':
        network = BiRecurrentConvCRF(embedd_dim,
                                     word_alphabet.size(),
                                     char_dim,
                                     char_alphabet.size(),
                                     num_filters,
                                     window,
                                     mode,
                                     hidden_size,
                                     num_layers,
                                     num_labels,
                                     tag_space=tag_space,
                                     embedd_word=word_table,
                                     p_in=p_in,
                                     p_out=p_out,
                                     p_rnn=p_rnn,
                                     bigram=bigram,
                                     initializer=initializer)
    else:
        network = BiVarRecurrentConvCRF(embedd_dim,
                                        word_alphabet.size(),
                                        char_dim,
                                        char_alphabet.size(),
                                        num_filters,
                                        window,
                                        mode,
                                        hidden_size,
                                        num_layers,
                                        num_labels,
                                        tag_space=tag_space,
                                        embedd_word=word_table,
                                        p_in=p_in,
                                        p_out=p_out,
                                        p_rnn=p_rnn,
                                        bigram=bigram,
                                        initializer=initializer)

    if use_gpu:
        network.cuda()

    lr = learning_rate
    optim = SGD(network.parameters(),
                lr=lr,
                momentum=momentum,
                weight_decay=gamma,
                nesterov=True)
    logger.info(
        "Network: %s, num_layer=%d, hidden=%d, filter=%d, tag_space=%d, crf=%s"
        % (mode, num_layers, hidden_size, num_filters, tag_space,
           'bigram' if bigram else 'unigram'))
    logger.info(
        "training: l2: %f, (#training data: %d, batch: %d, unk replace: %.2f)"
        % (gamma, num_data, batch_size, unk_replace))
    logger.info("dropout(in, out, rnn): (%.2f, %.2f, %s)" %
                (p_in, p_out, p_rnn))

    # BUGFIX: floor division — on Python 3 the original `/` produced a float
    # and `range(1, num_batches + 1)` below raised TypeError. `//` is
    # identical on Python 2 ints, so behavior there is unchanged.
    num_batches = num_data // batch_size + 1
    dev_f1 = 0.0
    dev_acc = 0.0
    dev_precision = 0.0
    dev_recall = 0.0
    test_f1 = 0.0
    test_acc = 0.0
    test_precision = 0.0
    test_recall = 0.0
    best_epoch = 0
    for epoch in range(1, num_epochs + 1):
        print(
            'Epoch %d (%s(%s), learning rate=%.4f, decay rate=%.4f (schedule=%d)): '
            % (epoch, mode, args.dropout, lr, decay_rate, schedule))
        train_err = 0.
        train_total = 0.

        start_time = time.time()
        num_back = 0
        network.train()
        for batch in range(1, num_batches + 1):
            word, char, _, _, labels, masks, lengths = conll03_data.get_batch_variable(
                data_train, batch_size, unk_replace=unk_replace)

            optim.zero_grad()
            loss = network.loss(word, char, labels, mask=masks)
            loss.backward()
            optim.step()

            num_inst = word.size(0)
            # NOTE(review): loss.data[0] is the pre-0.4 PyTorch scalar access
            # (loss.item() in modern versions); kept for API consistency.
            train_err += loss.data[0] * num_inst
            train_total += num_inst

            time_ave = (time.time() - start_time) / batch
            time_left = (num_batches - batch) * time_ave

            # Overwrite the previous progress line in place every 100 batches.
            if batch % 100 == 0:
                sys.stdout.write("\b" * num_back)
                sys.stdout.write(" " * num_back)
                sys.stdout.write("\b" * num_back)
                log_info = 'train: %d/%d loss: %.4f, time left (estimated): %.2fs' % (
                    batch, num_batches, train_err / train_total, time_left)
                sys.stdout.write(log_info)
                sys.stdout.flush()
                num_back = len(log_info)

        sys.stdout.write("\b" * num_back)
        sys.stdout.write(" " * num_back)
        sys.stdout.write("\b" * num_back)
        print('train: %d loss: %.4f, time: %.2fs' %
              (num_batches, train_err / train_total, time.time() - start_time))

        # Evaluate performance on dev data via the CoNLL-03 scorer files.
        # NOTE(review): assumes a 'tmp/' directory exists in the CWD.
        network.eval()
        tmp_filename = 'tmp/%s_dev%d' % (str(uid), epoch)
        writer.start(tmp_filename)

        for batch in conll03_data.iterate_batch_variable(data_dev, batch_size):
            word, char, pos, chunk, labels, masks, lengths = batch
            preds, _ = network.decode(
                word,
                char,
                target=labels,
                mask=masks,
                leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS)
            writer.write(word.data.cpu().numpy(),
                         pos.data.cpu().numpy(),
                         chunk.data.cpu().numpy(),
                         preds.cpu().numpy(),
                         labels.data.cpu().numpy(),
                         lengths.cpu().numpy())
        writer.close()
        acc, precision, recall, f1 = evaluate(tmp_filename)
        print(
            'dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%' %
            (acc, precision, recall, f1))

        if dev_f1 < f1:
            dev_f1 = f1
            dev_acc = acc
            dev_precision = precision
            dev_recall = recall
            best_epoch = epoch

            # Evaluate on test data when better dev performance is detected.
            tmp_filename = 'tmp/%s_test%d' % (str(uid), epoch)
            writer.start(tmp_filename)

            for batch in conll03_data.iterate_batch_variable(
                    data_test, batch_size):
                word, char, pos, chunk, labels, masks, lengths = batch
                preds, _ = network.decode(
                    word,
                    char,
                    target=labels,
                    mask=masks,
                    leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS)
                writer.write(word.data.cpu().numpy(),
                             pos.data.cpu().numpy(),
                             chunk.data.cpu().numpy(),
                             preds.cpu().numpy(),
                             labels.data.cpu().numpy(),
                             lengths.cpu().numpy())
            writer.close()
            test_acc, test_precision, test_recall, test_f1 = evaluate(
                tmp_filename)

        print(
            "best dev  acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)"
            % (dev_acc, dev_precision, dev_recall, dev_f1, best_epoch))
        print(
            "best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)"
            % (test_acc, test_precision, test_recall, test_f1, best_epoch))

        # Learning-rate decay: rebuild the optimizer (resets momentum) with
        # the decayed rate every `schedule` epochs.
        if epoch % schedule == 0:
            lr = learning_rate / (1.0 + epoch * decay_rate)
            optim = SGD(network.parameters(),
                        lr=lr,
                        momentum=momentum,
                        weight_decay=gamma,
                        nesterov=True)
Example no. 4
0
def main():
    """Command-line driver: train a bi-directional RNN-CNN(-CRF) NER tagger.

    Hyper-parameters come from a JSON config file (``--config``); training
    options come from the command line.  Builds alphabets and a pretrained
    embedding table, constructs one of four network variants
    (std/variational dropout x CRF/no-CRF), trains with the configured
    optimizer and LR scheduler, evaluates dev every epoch, re-evaluates
    test on dev-F1 improvement, and resets scheduler state after 5 epochs
    without improvement.
    """
    parser = argparse.ArgumentParser(
        description='NER with bi-directional RNN-CNN')
    parser.add_argument('--config',
                        type=str,
                        help='config file',
                        required=True)
    parser.add_argument('--num_epochs',
                        type=int,
                        default=100,
                        help='Number of training epochs')
    parser.add_argument('--batch_size',
                        type=int,
                        default=16,
                        help='Number of sentences in each batch')
    parser.add_argument('--loss_type',
                        choices=['sentence', 'token'],
                        default='sentence',
                        help='loss type (default: sentence)')
    parser.add_argument('--optim',
                        choices=['sgd', 'adam'],
                        help='type of optimizer',
                        required=True)
    parser.add_argument('--learning_rate',
                        type=float,
                        default=0.1,
                        help='Learning rate')
    parser.add_argument('--lr_decay',
                        type=float,
                        default=0.999995,
                        help='Decay rate of learning rate')
    parser.add_argument('--amsgrad', action='store_true', help='AMS Grad')
    parser.add_argument('--grad_clip',
                        type=float,
                        default=0,
                        help='max norm for gradient clip (default 0: no clip')
    parser.add_argument('--warmup_steps',
                        type=int,
                        default=0,
                        metavar='N',
                        help='number of steps to warm up (default: 0)')
    parser.add_argument('--weight_decay',
                        type=float,
                        default=0.0,
                        help='weight for l2 norm decay')
    parser.add_argument('--unk_replace',
                        type=float,
                        default=0.,
                        help='The rate to replace a singleton word with UNK')
    parser.add_argument('--embedding',
                        choices=['glove', 'senna', 'sskip', 'polyglot'],
                        help='Embedding for words',
                        required=True)
    parser.add_argument('--embedding_dict', help='path for embedding dict')
    parser.add_argument('--train',
                        help='path for training file.',
                        required=True)
    parser.add_argument('--dev', help='path for dev file.', required=True)
    parser.add_argument('--test', help='path for test file.', required=True)
    parser.add_argument('--model_path',
                        help='path for saving model file.',
                        required=True)

    args = parser.parse_args()

    logger = get_logger("NER")

    # Unpack arguments into locals for readability below.
    args.cuda = torch.cuda.is_available()
    device = torch.device('cuda', 0) if args.cuda else torch.device('cpu')
    train_path = args.train
    dev_path = args.dev
    test_path = args.test

    num_epochs = args.num_epochs
    batch_size = args.batch_size
    optim = args.optim
    learning_rate = args.learning_rate
    lr_decay = args.lr_decay
    amsgrad = args.amsgrad
    warmup_steps = args.warmup_steps
    weight_decay = args.weight_decay
    grad_clip = args.grad_clip

    # 'token' normalizes loss by word count, 'sentence' by batch size.
    loss_ty_token = args.loss_type == 'token'
    unk_replace = args.unk_replace

    model_path = args.model_path
    model_name = os.path.join(model_path, 'model.pt')
    embedding = args.embedding
    embedding_path = args.embedding_dict

    print(args)

    embedd_dict, embedd_dim = utils.load_embedding_dict(
        embedding, embedding_path)

    logger.info("Creating Alphabets")
    # Alphabets are built from the train file and extended with dev/test so
    # no evaluation token is unseen at decode time.
    alphabet_path = os.path.join(model_path, 'alphabets')
    word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet = conll03_data.create_alphabets(
        alphabet_path,
        train_path,
        data_paths=[dev_path, test_path],
        embedd_dict=embedd_dict,
        max_vocabulary_size=50000)

    logger.info("Word Alphabet Size: %d" % word_alphabet.size())
    logger.info("Character Alphabet Size: %d" % char_alphabet.size())
    logger.info("POS Alphabet Size: %d" % pos_alphabet.size())
    logger.info("Chunk Alphabet Size: %d" % chunk_alphabet.size())
    logger.info("NER Alphabet Size: %d" % ner_alphabet.size())

    logger.info("Reading Data")

    # Training data is bucketed by length for efficient batching.
    data_train = conll03_data.read_bucketed_data(train_path, word_alphabet,
                                                 char_alphabet, pos_alphabet,
                                                 chunk_alphabet, ner_alphabet)
    num_data = sum(data_train[1])
    num_labels = ner_alphabet.size()

    data_dev = conll03_data.read_data(dev_path, word_alphabet, char_alphabet,
                                      pos_alphabet, chunk_alphabet,
                                      ner_alphabet)
    data_test = conll03_data.read_data(test_path, word_alphabet, char_alphabet,
                                       pos_alphabet, chunk_alphabet,
                                       ner_alphabet)

    writer = CoNLL03Writer(word_alphabet, char_alphabet, pos_alphabet,
                           chunk_alphabet, ner_alphabet)

    def construct_word_embedding_table():
        # Initialize unknown/OOV rows uniformly in [-scale, scale]; known
        # words copy their pretrained vector (case-insensitive fallback).
        scale = np.sqrt(3.0 / embedd_dim)
        table = np.empty([word_alphabet.size(), embedd_dim], dtype=np.float32)
        table[conll03_data.UNK_ID, :] = np.random.uniform(
            -scale, scale, [1, embedd_dim]).astype(np.float32)
        oov = 0
        for word, index in word_alphabet.items():
            if word in embedd_dict:
                embedding = embedd_dict[word]
            elif word.lower() in embedd_dict:
                embedding = embedd_dict[word.lower()]
            else:
                embedding = np.random.uniform(
                    -scale, scale, [1, embedd_dim]).astype(np.float32)
                oov += 1
            table[index, :] = embedding
        print('oov: %d' % oov)
        return torch.from_numpy(table)

    word_table = construct_word_embedding_table()

    logger.info("constructing network...")

    # Load hyper-parameters from the JSON config and copy it next to the
    # model so the run is reproducible.
    hyps = json.load(open(args.config, 'r'))
    json.dump(hyps,
              open(os.path.join(model_path, 'config.json'), 'w'),
              indent=2)
    dropout = hyps['dropout']
    crf = hyps['crf']
    bigram = hyps['bigram']
    assert embedd_dim == hyps['embedd_dim']
    char_dim = hyps['char_dim']
    mode = hyps['rnn_mode']
    hidden_size = hyps['hidden_size']
    out_features = hyps['out_features']
    num_layers = hyps['num_layers']
    p_in = hyps['p_in']
    p_out = hyps['p_out']
    p_rnn = hyps['p_rnn']
    activation = hyps['activation']

    # Four variants: {std, variational} dropout x {CRF, softmax} output.
    if dropout == 'std':
        if crf:
            network = BiRecurrentConvCRF(embedd_dim,
                                         word_alphabet.size(),
                                         char_dim,
                                         char_alphabet.size(),
                                         mode,
                                         hidden_size,
                                         out_features,
                                         num_layers,
                                         num_labels,
                                         embedd_word=word_table,
                                         p_in=p_in,
                                         p_out=p_out,
                                         p_rnn=p_rnn,
                                         bigram=bigram,
                                         activation=activation)
        else:
            network = BiRecurrentConv(embedd_dim,
                                      word_alphabet.size(),
                                      char_dim,
                                      char_alphabet.size(),
                                      mode,
                                      hidden_size,
                                      out_features,
                                      num_layers,
                                      num_labels,
                                      embedd_word=word_table,
                                      p_in=p_in,
                                      p_out=p_out,
                                      p_rnn=p_rnn,
                                      activation=activation)
    elif dropout == 'variational':
        if crf:
            network = BiVarRecurrentConvCRF(embedd_dim,
                                            word_alphabet.size(),
                                            char_dim,
                                            char_alphabet.size(),
                                            mode,
                                            hidden_size,
                                            out_features,
                                            num_layers,
                                            num_labels,
                                            embedd_word=word_table,
                                            p_in=p_in,
                                            p_out=p_out,
                                            p_rnn=p_rnn,
                                            bigram=bigram,
                                            activation=activation)
        else:
            network = BiVarRecurrentConv(embedd_dim,
                                         word_alphabet.size(),
                                         char_dim,
                                         char_alphabet.size(),
                                         mode,
                                         hidden_size,
                                         out_features,
                                         num_layers,
                                         num_labels,
                                         embedd_word=word_table,
                                         p_in=p_in,
                                         p_out=p_out,
                                         p_rnn=p_rnn,
                                         activation=activation)
    else:
        raise ValueError('Unkown dropout type: {}'.format(dropout))

    network = network.to(device)

    optimizer, scheduler = get_optimizer(network.parameters(), optim,
                                         learning_rate, lr_decay, amsgrad,
                                         weight_decay, warmup_steps)
    model = "{}-CNN{}".format(mode, "-CRF" if crf else "")
    logger.info("Network: %s, num_layer=%d, hidden=%d, act=%s" %
                (model, num_layers, hidden_size, activation))
    logger.info(
        "training: l2: %f, (#training data: %d, batch: %d, unk replace: %.2f)"
        % (weight_decay, num_data, batch_size, unk_replace))
    logger.info("dropout(in, out, rnn): %s(%.2f, %.2f, %s)" %
                (dropout, p_in, p_out, p_rnn))
    print('# of Parameters: %d' %
          (sum([param.numel() for param in network.parameters()])))

    # Best-so-far dev metrics and the matching test metrics.
    best_f1 = 0.0
    best_acc = 0.0
    best_precision = 0.0
    best_recall = 0.0
    test_f1 = 0.0
    test_acc = 0.0
    test_precision = 0.0
    test_recall = 0.0
    best_epoch = 0
    patient = 0  # epochs since last dev-F1 improvement
    num_batches = num_data // batch_size + 1
    result_path = os.path.join(model_path, 'tmp')
    if not os.path.exists(result_path):
        os.makedirs(result_path)
    for epoch in range(1, num_epochs + 1):
        start_time = time.time()
        train_loss = 0.
        num_insts = 0
        num_words = 0
        num_back = 0
        network.train()
        # NOTE(review): scheduler.get_lr() is deprecated in newer PyTorch
        # (use get_last_lr()); kept as written for this codebase's version.
        lr = scheduler.get_lr()[0]
        print('Epoch %d (%s, lr=%.6f, lr decay=%.6f, amsgrad=%s, l2=%.1e): ' %
              (epoch, optim, lr, lr_decay, amsgrad, weight_decay))
        if args.cuda:
            torch.cuda.empty_cache()
        gc.collect()
        for step, data in enumerate(
                iterate_data(data_train,
                             batch_size,
                             bucketed=True,
                             unk_replace=unk_replace,
                             shuffle=True)):
            optimizer.zero_grad()
            words = data['WORD'].to(device)
            chars = data['CHAR'].to(device)
            labels = data['NER'].to(device)
            masks = data['MASK'].to(device)

            nbatch = words.size(0)
            nwords = masks.sum().item()

            # Normalize the summed loss per token or per sentence,
            # depending on --loss_type.
            loss_total = network.loss(words, chars, labels, mask=masks).sum()
            if loss_ty_token:
                loss = loss_total.div(nwords)
            else:
                loss = loss_total.div(nbatch)
            loss.backward()
            if grad_clip > 0:
                clip_grad_norm_(network.parameters(), grad_clip)
            optimizer.step()
            scheduler.step()

            with torch.no_grad():
                num_insts += nbatch
                num_words += nwords
                train_loss += loss_total.item()

            # Overwrite the previous progress line in place every 100 steps.
            if step % 100 == 0:
                torch.cuda.empty_cache()
                sys.stdout.write("\b" * num_back)
                sys.stdout.write(" " * num_back)
                sys.stdout.write("\b" * num_back)
                curr_lr = scheduler.get_lr()[0]
                log_info = '[%d/%d (%.0f%%) lr=%.6f] loss: %.4f (%.4f)' % (
                    step, num_batches, 100. * step / num_batches, curr_lr,
                    train_loss / num_insts, train_loss / num_words)
                sys.stdout.write(log_info)
                sys.stdout.flush()
                num_back = len(log_info)

        sys.stdout.write("\b" * num_back)
        sys.stdout.write(" " * num_back)
        sys.stdout.write("\b" * num_back)
        print('total: %d (%d), loss: %.4f (%.4f), time: %.2fs' %
              (num_insts, num_words, train_loss / num_insts,
               train_loss / num_words, time.time() - start_time))
        print('-' * 100)

        # Evaluate performance on dev data; `eval` here is a project-level
        # evaluation helper (it shadows the Python builtin).
        with torch.no_grad():
            outfile = os.path.join(result_path, 'pred_dev%d' % epoch)
            scorefile = os.path.join(result_path, "score_dev%d" % epoch)
            acc, precision, recall, f1 = eval(data_dev, network, writer,
                                              outfile, scorefile, device)
            print(
                'Dev  acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%'
                % (acc, precision, recall, f1))
            if best_f1 < f1:
                # Checkpoint on dev improvement, then score the test set.
                torch.save(network.state_dict(), model_name)
                best_f1 = f1
                best_acc = acc
                best_precision = precision
                best_recall = recall
                best_epoch = epoch

                # evaluate on test data when better performance detected
                outfile = os.path.join(result_path, 'pred_test%d' % epoch)
                scorefile = os.path.join(result_path, "score_test%d" % epoch)
                test_acc, test_precision, test_recall, test_f1 = eval(
                    data_test, network, writer, outfile, scorefile, device)
                print(
                    'test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%'
                    % (test_acc, test_precision, test_recall, test_f1))
                patient = 0
            else:
                patient += 1
            print('-' * 100)

            print(
                "Best dev  acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d (%d))"
                % (best_acc, best_precision, best_recall, best_f1, best_epoch,
                   patient))
            print(
                "Best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d (%d))"
                % (test_acc, test_precision, test_recall, test_f1, best_epoch,
                   patient))
            print('=' * 100)

        # After 5 epochs without dev improvement, reset the scheduler's
        # optimizer momentums and restart the patience counter.
        if patient > 4:
            logger.info('reset optimizer momentums')
            scheduler.reset_state()
            patient = 0
Example no. 5
0
def regen_train_data(nlp, fpath):
    """Regenerate weakly-supervised NER training data from LTF documents.

    Walks ``../../data/ltf/``, keeps only files whose name appears in the
    ``.dump`` listing under ``../../data/txt/``, and collects two parallel
    lists:

    * ``train_data`` -- (sentence, mention, ontology types, confidence)
    * ``train_feat`` -- (feature vector, ontology types, confidence)

    Named-entity mentions are kept when the ontology decision tree agrees
    with the tagged type (``coherence > 0``); nominal mentions are kept when
    their headword resolves in the ontology, and their feature vectors are
    extracted by running the pre-trained CRF network on a one-sentence CoNLL
    file written to ``tmp``.

    Results are pickled to ``temp/<fpath>data.dump`` / ``temp/<fpath>feat.dump``.

    Args:
        nlp: spaCy-style pipeline passed through to ``extract_nominals``.
        fpath: prefix for the output dump file names.
    """
    ontology = OntologyType()
    decisions = ontology.load_decision_tree()
    # NOTE(review): torch.load of a fully pickled model -- assumes the
    # checkpoint is trusted (pickle can execute arbitrary code).
    network = torch.load('temp/ner_tuned.pt')
    word_alphabet, char_alphabet, pos_alphabet, \
        chunk_alphabet, ner_alphabet = conll03_data.create_alphabets("ner_alphabet/", None)

    # Names (without the '.dump' suffix) of documents we may use for training.
    train_set = set()
    for fname in os.listdir('../../data/txt/'):
        if fname.endswith('.dump'):
            train_set.add(fname[:-5])
    print(train_set)

    train_data = []
    train_feat = []

    for root, dirs, files in os.walk('../../data/ltf/'):
        for file in files:
            if file in train_set:
                print(file)
                sents, doc = read_ltf_offset(os.path.join(root, file))
                for sent in sents:
                    named_ents, ners, feats = extract_ner(sent)
                    # Keep NER mentions whose decision-tree type agrees
                    # with the tagged type.
                    for mention, feat in zip(named_ents, feats):
                        prdt_type = infer_type(feat, decisions)
                        coherence = type_coherence(mention['type'], prdt_type,
                                                   ontology)
                        if coherence > 0:
                            train_data.append(
                                (sent, mention,
                                 [prdt_type] + ontology.lookup_all(prdt_type),
                                 coherence))
                            train_feat.append(
                                (feat,
                                 [prdt_type] + ontology.lookup_all(prdt_type),
                                 coherence))
                            print(sent.get_text())
                    # Nominal mentions: keep those with an ontology entry and
                    # extract their feature vector from the CRF network.
                    nominals = extract_nominals(sent, nlp, ners)
                    for mention in nominals:
                        ont_types = ontology.lookup_all(mention['headword'])
                        if ont_types:
                            train_data.append((sent, mention, ont_types, 1.0))
                            # Write the sentence as a dummy CoNLL file so it can
                            # be read back through the standard data pipeline.
                            with open('tmp', 'w') as f:
                                for i, word in enumerate(sent.words):
                                    f.write('{0} {1} -- -- O\n'.format(
                                        i + 1, word.word))
                            sent_data = conll03_data.read_data_to_variable(
                                'tmp',
                                word_alphabet,
                                char_alphabet,
                                pos_alphabet,
                                chunk_alphabet,
                                ner_alphabet,
                                use_gpu=False,
                                volatile=True)
                            # os.remove instead of shelling out `rm` (portable,
                            # raises on failure instead of silently continuing).
                            os.remove('tmp')
                            # next(...) builtin: iterator.next() is Python 2
                            # only and crashes under Python 3.
                            word, char, pos, chunk, labels, masks, lengths = next(
                                conll03_data.iterate_batch_variable(sent_data, 1))
                            feat = network.feature(
                                word,
                                char,
                                target=labels,
                                mask=masks,
                                leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS
                            )
                            # Feature vector of the mention's head token.
                            feat_vec = feat[0, mention['head_index'], :]
                            train_feat.append(
                                (feat_vec.data.numpy(), ont_types, 1.0))
                            print(sent.get_text())

    with open('temp/' + fpath + 'data.dump', 'wb') as f:
        pickle.dump(train_data, f)
    with open('temp/' + fpath + 'feat.dump', 'wb') as f:
        pickle.dump(train_feat, f)
def main():
    """Train a (graph-augmented) BiRecurrentConvCRF NER tagger.

    Parses CLI arguments, builds alphabets and datasets, constructs the
    network (GCN variant when ``--dropout gcn``), then runs the training
    loop: evaluate on dev each epoch, evaluate on test whenever dev F1
    improves, and append the best scores to the result file.
    """
    # Arguments parser
    parser = argparse.ArgumentParser(description='Tuning with DNN Model for NER')
    # Model Hyperparameters
    parser.add_argument('--mode', choices=['RNN', 'LSTM', 'GRU'], help='architecture of rnn', default='LSTM')
    parser.add_argument('--encoder_mode', choices=['cnn', 'lstm'], help='Encoder type for sentence encoding',
                        default='lstm')
    parser.add_argument('--char_method', choices=['cnn', 'lstm'], help='Method to create character-level embeddings',
                        required=True)
    parser.add_argument('--hidden_size', type=int, default=128, help='Number of hidden units in RNN for sentence level')
    parser.add_argument('--char_hidden_size', type=int, default=30, help='Output character-level embeddings size')
    parser.add_argument('--char_dim', type=int, default=30, help='Dimension of Character embeddings')
    parser.add_argument('--tag_space', type=int, default=0, help='Dimension of tag space')
    parser.add_argument('--num_layers', type=int, default=1, help='Number of layers of RNN')
    parser.add_argument('--dropout', choices=['std', 'gcn'], help='Dropout method',
                        default='gcn')
    parser.add_argument('--p_em', type=float, default=0.33, help='dropout rate for input embeddings')
    parser.add_argument('--p_in', type=float, default=0.33, help='dropout rate for input of RNN model')
    parser.add_argument('--p_rnn', nargs=3, type=float, required=True, help='dropout rate for RNN')
    parser.add_argument('--p_tag', type=float, default=0.33, help='dropout rate for output layer')
    parser.add_argument('--bigram', action='store_true', help='bi-gram parameter for CRF')

    parser.add_argument('--adj_attn', choices=['cossim', 'flex_cossim', 'flex_cossim2', 'concat', '', 'multihead'],
                        default='')

    # Data loading and storing params
    parser.add_argument('--embedding_dict', help='path for embedding dict')
    parser.add_argument('--dataset_name', type=str, default='alexa', help='Which dataset to use')
    parser.add_argument('--train', type=str, required=True, help='Path of train set')
    parser.add_argument('--dev', type=str, required=True, help='Path of dev set')
    parser.add_argument('--test', type=str, required=True, help='Path of test set')
    parser.add_argument('--results_folder', type=str, default='results', help='The folder to store results')
    parser.add_argument('--alphabets_folder', type=str, default='data/alphabets',
                        help='The folder to store alphabets files')

    # Training parameters
    parser.add_argument('--cuda', action='store_true', help='whether using GPU')
    parser.add_argument('--num_epochs', type=int, default=100, help='Number of training epochs')
    parser.add_argument('--batch_size', type=int, default=16, help='Number of sentences in each batch')
    parser.add_argument('--learning_rate', type=float, default=0.001, help='Base learning rate')
    parser.add_argument('--decay_rate', type=float, default=0.95, help='Decay rate of learning rate')
    parser.add_argument('--schedule', type=int, default=3, help='schedule for learning rate decay')
    parser.add_argument('--gamma', type=float, default=0.0, help='weight for l2 regularization')
    parser.add_argument('--max_norm', type=float, default=1., help='Max norm for gradients')
    parser.add_argument('--gpu_id', type=int, nargs='+', required=True, help='which gpu to use for training')

    parser.add_argument('--learning_rate_gcn', type=float, default=5e-4, help='Base learning rate')
    parser.add_argument('--gcn_warmup', type=int, default=200, help='Base learning rate')
    parser.add_argument('--pretrain_lstm', type=float, default=10, help='Base learning rate')

    parser.add_argument('--adj_loss_lambda', type=float, default=0.)
    parser.add_argument('--lambda1', type=float, default=1.)
    parser.add_argument('--lambda2', type=float, default=0.)
    parser.add_argument('--seed', type=int, default=None)

    # Misc
    parser.add_argument('--embedding', choices=['glove', 'senna', 'alexa'], help='Embedding for words', required=True)
    parser.add_argument('--restore', action='store_true', help='whether restore from stored parameters')
    parser.add_argument('--save_checkpoint', type=str, default='', help='the path to save the model')
    parser.add_argument('--o_tag', type=str, default='O', help='The default tag for outside tag')
    parser.add_argument('--unk_replace', type=float, default=0., help='The rate to replace a singleton word with UNK')
    parser.add_argument('--evaluate_raw_format', action='store_true', help='The tagging format for evaluation')


    parser.add_argument('--eval_type', type=str, default="micro_f1",choices=['micro_f1', 'acc'])
    parser.add_argument('--show_network', action='store_true', help='whether to display the network structure')
    parser.add_argument('--smooth', action='store_true', help='whether to skip all pdb break points')

    parser.add_argument('--uid', type=str, default='temp')
    parser.add_argument('--misc', type=str, default='')

    args = parser.parse_args()
    show_var(['args'])

    uid = args.uid
    results_folder = args.results_folder
    dataset_name = args.dataset_name
    use_tensorboard = True

    save_dset_dir = '{}../dset/{}/graph'.format(results_folder, dataset_name)
    result_file_path = '{}/{dataset}_{uid}_result'.format(results_folder, dataset=dataset_name, uid=uid)

    save_loss_path = '{}/{dataset}_{uid}_loss'.format(results_folder, dataset=dataset_name, uid=uid)
    save_lr_path = '{}/{dataset}_{uid}_lr'.format(results_folder, dataset=dataset_name, uid='temp')
    save_tb_path = '{}/tensorboard/'.format(results_folder)

    logger = get_logger("NERCRF")
    loss_recorder = LossRecorder(uid=uid)
    record = TensorboardLossRecord(use_tensorboard, save_tb_path, uid=uid)

    # rename the parameters
    mode = args.mode
    encoder_mode = args.encoder_mode
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    num_epochs = args.num_epochs
    batch_size = args.batch_size
    hidden_size = args.hidden_size
    char_hidden_size = args.char_hidden_size
    char_method = args.char_method
    learning_rate = args.learning_rate
    momentum = 0.9
    decay_rate = args.decay_rate
    gamma = args.gamma
    max_norm = args.max_norm
    schedule = args.schedule
    dropout = args.dropout
    p_em = args.p_em
    p_rnn = tuple(args.p_rnn)
    p_in = args.p_in
    p_tag = args.p_tag
    unk_replace = args.unk_replace
    bigram = args.bigram
    embedding = args.embedding
    embedding_path = args.embedding_dict
    evaluate_raw_format = args.evaluate_raw_format
    o_tag = args.o_tag
    restore = args.restore
    save_checkpoint = args.save_checkpoint
    alphabets_folder = args.alphabets_folder
    use_elmo = False
    p_em_vec = 0.
    graph_model = 'gnn'
    coref_edge_filt = ''

    learning_rate_gcn = args.learning_rate_gcn
    gcn_warmup = args.gcn_warmup
    pretrain_lstm = args.pretrain_lstm

    adj_loss_lambda = args.adj_loss_lambda
    lambda1 = args.lambda1
    lambda2 = args.lambda2

    if args.smooth:
        # disable all pdb.set_trace() breakpoints left in the codebase
        import pdb
        pdb.set_trace = lambda: None

    misc = "{}".format(str(args.misc))

    score_file = "{}/{dataset}_{uid}_score".format(results_folder, dataset=dataset_name, uid=uid)

    for folder in [results_folder, alphabets_folder, save_dset_dir]:
        if not os.path.exists(folder):
            os.makedirs(folder)

    def set_seed(seed):
        # seed every RNG we use; fall back to a time-derived seed
        if not seed:
            seed = int(show_time())
        print("[Info] seed set to: {}".format(seed))
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True

    set_seed(args.seed)

    embedd_dict, embedd_dim = utils.load_embedding_dict(embedding, embedding_path)

    logger.info("Creating Alphabets")
    word_alphabet, char_alphabet, ner_alphabet = conll03_data.create_alphabets(
        "{}/{}/".format(alphabets_folder, dataset_name), train_path, data_paths=[dev_path, test_path],
        embedd_dict=embedd_dict, max_vocabulary_size=50000)

    logger.info("Word Alphabet Size: %d" % word_alphabet.size())
    logger.info("Character Alphabet Size: %d" % char_alphabet.size())
    logger.info("NER Alphabet Size: %d" % ner_alphabet.size())

    logger.info("Reading Data")
    device = torch.device('cuda') if args.cuda else torch.device('cpu')
    print(device)

    data_train = conll03_data.read_data(train_path, word_alphabet, char_alphabet,
                                        ner_alphabet,
                                        graph_model, batch_size, ori_order=False,
                                        total_batch="{}x".format(num_epochs + 1),
                                        unk_replace=unk_replace, device=device,
                                        save_path=save_dset_dir + '/train', coref_edge_filt=coref_edge_filt
                                        )
    # , shuffle=True,
    num_data = data_train.data_len
    num_labels = ner_alphabet.size()
    graph_types = data_train.meta_info['graph_types']

    data_dev = conll03_data.read_data(dev_path, word_alphabet, char_alphabet,
                                      ner_alphabet,
                                      graph_model, batch_size, ori_order=True, unk_replace=unk_replace, device=device,
                                      save_path=save_dset_dir + '/dev',
                                      coref_edge_filt=coref_edge_filt)

    data_test = conll03_data.read_data(test_path, word_alphabet, char_alphabet,
                                       ner_alphabet,
                                       graph_model, batch_size, ori_order=True, unk_replace=unk_replace, device=device,
                                       save_path=save_dset_dir + '/test',
                                       coref_edge_filt=coref_edge_filt)

    writer = CoNLL03Writer(word_alphabet, char_alphabet, ner_alphabet)

    def construct_word_embedding_table():
        # Initialize the word-embedding table from the pre-trained dict;
        # unseen words get a uniform(-scale, scale) vector and are counted as OOV.
        scale = np.sqrt(3.0 / embedd_dim)
        table = np.empty([word_alphabet.size(), embedd_dim], dtype=np.float32)
        table[conll03_data.UNK_ID, :] = np.random.uniform(-scale, scale, [1, embedd_dim]).astype(np.float32)
        oov = 0
        for word, index in word_alphabet.items():
            if word in embedd_dict:
                embedding = embedd_dict[word]
            elif word.lower() in embedd_dict:
                embedding = embedd_dict[word.lower()]
            else:
                embedding = np.random.uniform(-scale, scale, [1, embedd_dim]).astype(np.float32)
                oov += 1
            table[index, :] = embedding
        print('oov: %d' % oov)
        return torch.from_numpy(table)

    word_table = construct_word_embedding_table()
    logger.info("constructing network...")

    # network hyper-parameters not exposed on the CLI
    char_dim = args.char_dim
    window = 3
    num_layers = args.num_layers
    tag_space = args.tag_space
    initializer = nn.init.xavier_uniform_

    p_gcn = [0.5, 0.5]

    d_graph = 256
    d_out = 256
    d_inner_hid = 128
    d_k = 32
    d_v = 32
    n_head = 4
    n_gcn_layer = 1

    p_rnn2 = [0.0, 0.5, 0.5]

    adj_attn = args.adj_attn
    mask_singles = True
    post_lstm = 1
    position_enc_mode = 'none'

    adj_memory = False

    if dropout == 'gcn':
        network = BiRecurrentConvGraphCRF(embedd_dim, word_alphabet.size(), char_dim, char_alphabet.size(),
                                          char_hidden_size, window, mode, encoder_mode, hidden_size, num_layers,
                                          num_labels,
                                          graph_model, n_head, d_graph, d_inner_hid, d_k, d_v, p_gcn, n_gcn_layer,
                                          d_out, post_lstm=post_lstm, mask_singles=mask_singles,
                                          position_enc_mode=position_enc_mode, adj_attn=adj_attn,
                                          adj_loss_lambda=adj_loss_lambda,
                                          tag_space=tag_space, embedd_word=word_table,
                                          use_elmo=use_elmo, p_em_vec=p_em_vec, p_em=p_em, p_in=p_in, p_tag=p_tag,
                                          p_rnn=p_rnn, p_rnn2=p_rnn2,
                                          bigram=bigram, initializer=initializer)

    elif dropout == 'std':
        network = BiRecurrentConvCRF(embedd_dim, word_alphabet.size(), char_dim, char_alphabet.size(), char_hidden_size,
                                     window, mode, encoder_mode, hidden_size, num_layers, num_labels,
                                     tag_space=tag_space, embedd_word=word_table, use_elmo=use_elmo, p_em_vec=p_em_vec,
                                     p_em=p_em, p_in=p_in, p_tag=p_tag, p_rnn=p_rnn, bigram=bigram,
                                     initializer=initializer)

    # whether restore from trained model
    if restore:
        network.load_state_dict(torch.load(save_checkpoint + '_best.pth'))  # load trained model

    logger.info("cuda()ing network...")

    network = network.to(device)

    # pick a fixed dev sample for attention visualization across epochs
    if dataset_name == 'conll03' and data_dev.data_len > 26:
        sample = data_dev.pad_batch(data_dev.dataset[25:26])
    else:
        sample = data_dev.pad_batch(data_dev.dataset[:1])
    plot_att_change(sample, network, record, save_tb_path + 'att/', uid='temp', epoch=0, device=device,
                    word_alphabet=word_alphabet, show_net=args.show_network,
                    graph_types=data_train.meta_info['graph_types'])

    logger.info("finished cuda()ing network...")

    optim = Optimizer('sgd', 'adam', network, dropout, lr=learning_rate,
                      lr_gcn=learning_rate_gcn,
                      wd=0., wd_gcn=0., momentum=momentum, lr_decay=decay_rate, schedule=schedule,
                      gcn_warmup=gcn_warmup,
                      pretrain_lstm=pretrain_lstm)
    logger.info(
        "Network: %s, encoder_mode=%s, num_layer=%d, hidden=%d, char_hidden_size=%d, char_method=%s, tag_space=%d, crf=%s" % \
        (mode, encoder_mode, num_layers, hidden_size, char_hidden_size, char_method, tag_space,
         'bigram' if bigram else 'unigram'))
    logger.info("training: l2: %f, (#training data: %d, batch: %d, unk replace: %.2f)" % (
        gamma, num_data, batch_size, unk_replace))
    logger.info("dropout(in, out, rnn): (%.2f, %.2f, %s)" % (p_in, p_tag, p_rnn))

    num_batches = num_data // batch_size + 1
    # best-on-dev scores (and the test scores at that epoch)
    dev_f1 = 0.0
    dev_acc = 0.0
    dev_precision = 0.0
    dev_recall = 0.0
    test_f1 = 0.0
    test_acc = 0.0
    test_precision = 0.0
    test_recall = 0.0
    best_epoch = 0
    # best-ever test scores across all dev-improving epochs
    best_test_f1 = 0.0
    best_test_acc = 0.0
    best_test_precision = 0.0
    best_test_recall = 0.0
    best_test_epoch = 0.0

    loss_recorder.start(save_loss_path, mode='w', misc=misc)
    fwrite('', save_lr_path)
    fwrite(json.dumps(vars(args)) + '\n', result_file_path)

    for epoch in range(1, num_epochs + 1):
        show_var(['misc'])

        lr_state = 'Epoch %d (uid=%s, lr=%.2E, lr_gcn=%.2E, decay rate=%.4f): ' % (
            epoch, uid, Decimal(optim.curr_lr), Decimal(optim.curr_lr_gcn), decay_rate)
        print(lr_state)
        fwrite(lr_state[:-2] + '\n', save_lr_path, mode='a')

        train_err = 0.
        train_err2 = 0.
        train_total = 0.

        start_time = time.time()
        num_back = 0
        network.train()
        for batch_i in range(1, num_batches + 1):

            batch_doc = data_train.next()
            char, word, posi, labels, feats, adjs, words_en = [batch_doc[i] for i in [
                "chars", "word_ids", "posi", "ner_ids", "feat_ids", "adjs", "words_en"]]

            sent_word, sent_char, sent_labels, sent_mask, sent_length, _ = network._doc2sent(
                word, char, labels)

            optim.zero_grad()

            adjs_into_model = adjs if adj_memory else adjs.clone()

            loss, (ner_loss, adj_loss) = network.loss(None, word, char, adjs_into_model, labels,
                                                      graph_types=graph_types, lambda1=lambda1, lambda2=lambda2)

            # loss = network.loss(_, sent_word, sent_char, sent_labels, mask=sent_mask)
            loss.backward()
            # Clip gradients every step, between backward() and step().
            # (The original one-shot call before the loop clipped nothing,
            # since no gradients existed yet, so --max_norm was never applied.)
            nn.utils.clip_grad_norm_(network.parameters(), max_norm)
            optim.step()

            with torch.no_grad():
                num_inst = sent_mask.size(0)
                train_err += ner_loss * num_inst
                train_err2 += adj_loss * num_inst
                train_total += num_inst

            time_ave = (time.time() - start_time) / batch_i
            time_left = (num_batches - batch_i) * time_ave

            # update log
            if batch_i % 20 == 0:
                sys.stdout.write("\b" * num_back)
                sys.stdout.write(" " * num_back)
                sys.stdout.write("\b" * num_back)
                log_info = 'train: %d/%d loss1: %.4f, loss2: %.4f, time left (estimated): %.2fs' % (
                    batch_i, num_batches, train_err / train_total, train_err2 / train_total, time_left)
                sys.stdout.write(log_info)
                sys.stdout.flush()
                num_back = len(log_info)

            optim.update(epoch, batch_i, num_batches, network)

        sys.stdout.write("\b" * num_back)
        sys.stdout.write(" " * num_back)
        sys.stdout.write("\b" * num_back)
        print('train: %d loss: %.4f, loss2: %.4f, time: %.2fs' % (
            num_batches, train_err / train_total, train_err2 / train_total, time.time() - start_time))

        # evaluate performance on dev data
        with torch.no_grad():
            network.eval()
            tmp_filename = "{}/{dataset}_{uid}_output_dev".format(results_folder, dataset=dataset_name, uid=uid)

            writer.start(tmp_filename)

            for batch in data_dev:
                char, word, posi, labels, feats, adjs, words_en = [batch[i] for i in [
                    "chars", "word_ids", "posi", "ner_ids", "feat_ids", "adjs", "words_en"]]
                sent_word, sent_char, sent_labels, sent_mask, sent_length, _ = network._doc2sent(
                    word, char, labels)

                preds, _ = network.decode(
                    None, word, char, adjs.clone(), target=labels, leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS,
                    graph_types=graph_types)
                # preds, _ = network.decode(_, sent_word, sent_char, target=sent_labels, mask=sent_mask,
                #                           leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS)
                writer.write(sent_word.cpu().numpy(), preds.cpu().numpy(), sent_labels.cpu().numpy(),
                             sent_length.cpu().numpy())
            writer.close()


            if args.eval_type == "acc":
                # token accuracy mode: rank checkpoints by accuracy instead of F1
                acc, precision, recall, f1 = evaluate_tokenacc(tmp_filename)
                f1 = acc
            else:
                acc, precision, recall, f1 = evaluate(tmp_filename, score_file, evaluate_raw_format, o_tag)

            print('dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%' % (acc, precision, recall, f1))

            # plot loss and attention
            record.plot_loss(epoch, train_err / train_total, f1)

            plot_att_change(sample, network, record, save_tb_path + 'att/', uid="{}_{:03d}".format(uid, epoch),
                            epoch=epoch, device=device,
                            word_alphabet=word_alphabet, show_net=False, graph_types=graph_types)

            if dev_f1 < f1:
                dev_f1 = f1
                dev_acc = acc
                dev_precision = precision
                dev_recall = recall
                best_epoch = epoch

                # evaluate on test data when better performance detected
                tmp_filename = "{}/{dataset}_{uid}_output_test".format(results_folder, dataset=dataset_name,
                                                                       uid=uid)
                writer.start(tmp_filename)

                for batch in data_test:
                    char, word, posi, labels, feats, adjs, words_en = [batch[i] for i in [
                        "chars", "word_ids", "posi", "ner_ids", "feat_ids", "adjs", "words_en"]]
                    sent_word, sent_char, sent_labels, sent_mask, sent_length, _ = network._doc2sent(
                        word, char, labels)

                    preds, _ = network.decode(
                        None, word, char, adjs.clone(), target=labels, leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS,
                        graph_types=graph_types)
                    # preds, _ = network.decode(_, sent_word, sent_char, target=sent_labels, mask=sent_mask,
                    #                           leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS)

                    writer.write(sent_word.cpu().numpy(), preds.cpu().numpy(), sent_labels.cpu().numpy(),
                                 sent_length.cpu().numpy())
                writer.close()

                if args.eval_type == "acc":
                    test_acc, test_precision, test_recall, test_f1 = evaluate_tokenacc(tmp_filename)
                    test_f1 = test_acc
                else:
                    test_acc, test_precision, test_recall, test_f1 = evaluate(
                        tmp_filename, score_file, evaluate_raw_format, o_tag)

                if best_test_f1 < test_f1:
                    best_test_acc, best_test_precision, best_test_recall, best_test_f1 = test_acc, test_precision, test_recall, test_f1
                    best_test_epoch = epoch

                # save the model parameters
                if save_checkpoint:
                    torch.save(network.state_dict(), save_checkpoint + '_best.pth')

            print("best dev  acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)" % (
                dev_acc, dev_precision, dev_recall, dev_f1, best_epoch))
            print("best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)" % (
                test_acc, test_precision, test_recall, test_f1, best_epoch))
            print("overall best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)" % (
                best_test_acc, best_test_precision, best_test_recall, best_test_f1, best_test_epoch))

        # optim.update(epoch, 1, num_batches, network)
        loss_recorder.write(epoch, train_err / train_total, train_err2 / train_total,
                            Decimal(optim.curr_lr), Decimal(optim.curr_lr_gcn), f1, best_test_f1, test_f1)
    with open(result_file_path, 'a') as ofile:
        ofile.write("best dev  acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)\n" % (
            dev_acc, dev_precision, dev_recall, dev_f1, best_epoch))
        ofile.write("best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)\n" % (
            test_acc, test_precision, test_recall, test_f1, best_epoch))
        ofile.write("overall best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)\n\n" % (
            best_test_acc, best_test_precision, best_test_recall, best_test_f1, best_test_epoch))

    record.close()

    print('Training finished!')
def main():
    """Fine-tune a pre-trained CoNLL03 CRF tagger on new target NER types.

    Loads a trained ``BiRecurrentConvCRF`` checkpoint, extends the NER
    alphabet with VEH/WEA tags and the word alphabet with unseen target
    words, transplants the old CRF weights into a larger CRF, then trains
    on the target data, evaluating on dev each epoch and on test every 10
    epochs. The tuned model and extended alphabets are saved at the end.
    """
    embedding = 'glove'
    embedding_path = '/media/xianyang/OS/workspace/ner/glove.6B/glove.6B.100d.txt'
    word_alphabet, char_alphabet, pos_alphabet, \
    chunk_alphabet, ner_alphabet = conll03_data.create_alphabets("/media/xianyang/OS/workspace/ner/NeuroNLP2/data/alphabets/ner_crf/", None)
    char_dim = 30
    num_filters = 30
    window = 3
    mode = 'LSTM'
    hidden_size = 256
    num_layers = 1
    num_labels = ner_alphabet.size()
    tag_space = 128
    p = 0.5
    bigram = True
    embedd_dim = 100
    use_gpu = False

    print(len(word_alphabet.get_content()['instances']))
    print(ner_alphabet.get_content())

    # writer = CoNLL03Writer(word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet)
    network = BiRecurrentConvCRF(embedd_dim,
                                 word_alphabet.size(),
                                 char_dim,
                                 char_alphabet.size(),
                                 num_filters,
                                 window,
                                 mode,
                                 hidden_size,
                                 num_layers,
                                 num_labels,
                                 tag_space=tag_space,
                                 embedd_word=None,
                                 p_rnn=p,
                                 bigram=bigram)
    network.load_state_dict(torch.load('temp/23df51_model45'))

    # new target-domain tags: vehicles and weapons
    ner_alphabet.add('B-VEH')
    ner_alphabet.add('I-VEH')
    ner_alphabet.add('B-WEA')
    ner_alphabet.add('I-WEA')
    num_new_word = 0

    # Register every target-domain word missing from the source alphabet
    # (index 0 is the not-found sentinel of get_index).
    with open('temp/target.train.conll', 'r') as f:
        sents = []
        sent_buffer = []
        for line in f:
            if len(line) <= 1:
                sents.append(sent_buffer)
                sent_buffer = []
            else:
                idx, word, _, _, ner = line.strip().split()
                if word_alphabet.get_index(word) == 0:
                    word_alphabet.add(word)
                    num_new_word += 1
                sent_buffer.append((word_alphabet.get_index(word),
                                    ner_alphabet.get_index(ner)))

    print(len(word_alphabet.get_content()['instances']))
    print(ner_alphabet.get_content())

    # Grow the embedding table: keep pre-trained rows, zero-init new words.
    # NOTE(review): np.concatenate on a torch tensor relies on implicit
    # conversion and yields float64 rows for the zeros -- confirm dtype is
    # handled inside Embedding.
    init_embed = network.word_embedd.weight.data
    init_embed = np.concatenate(
        (init_embed, np.zeros((num_new_word, embedd_dim))), axis=0)
    network.word_embedd = Embedding(word_alphabet.size(), embedd_dim,
                                    torch.from_numpy(init_embed))

    # Build a larger CRF and copy the old label weights into it, so the
    # original labels keep their learned parameters.
    old_crf = network.crf
    new_crf = ChainCRF(tag_space, ner_alphabet.size(), bigram=bigram)
    trans_matrix = np.zeros((new_crf.num_labels, old_crf.num_labels))
    for i in range(old_crf.num_labels):
        trans_matrix[i, i] = 1
    new_crf.state_nn.weight.data = torch.FloatTensor(
        np.dot(trans_matrix, old_crf.state_nn.weight.data))
    network.crf = new_crf

    target_train_data = conll03_data.read_data_to_variable(
        'temp/target.train.conll',
        word_alphabet,
        char_alphabet,
        pos_alphabet,
        chunk_alphabet,
        ner_alphabet,
        use_gpu=False,
        volatile=False)
    target_dev_data = conll03_data.read_data_to_variable(
        'temp/target.dev.conll',
        word_alphabet,
        char_alphabet,
        pos_alphabet,
        chunk_alphabet,
        ner_alphabet,
        use_gpu=False,
        volatile=False)
    target_test_data = conll03_data.read_data_to_variable(
        'temp/target.test.conll',
        word_alphabet,
        char_alphabet,
        pos_alphabet,
        chunk_alphabet,
        ner_alphabet,
        use_gpu=False,
        volatile=False)

    num_epoch = 50
    batch_size = 32
    num_data = sum(target_train_data[1])
    # floor division: `/` yields a float on Python 3, which crashes range()
    num_batches = num_data // batch_size + 1
    unk_replace = 0.0
    # optim = SGD(network.parameters(), lr=0.001, momentum=0.9, weight_decay=0.0, nesterov=True)
    optim = Adam(network.parameters(), lr=1e-3)

    for epoch in range(1, num_epoch + 1):
        train_err = 0.
        train_total = 0.
        start_time = time.time()
        num_back = 0
        network.train()

        for batch in range(1, num_batches + 1):
            word, char, _, _, labels, masks, lengths = conll03_data.get_batch_variable(
                target_train_data, batch_size, unk_replace=unk_replace)

            optim.zero_grad()
            loss = network.loss(word, char, labels, mask=masks)
            loss.backward()
            optim.step()

            num_inst = word.size(0)
            # .item(): indexing a 0-dim loss tensor (`loss.data[0]`) was
            # removed in PyTorch >= 0.5
            train_err += loss.item() * num_inst
            train_total += num_inst

            time_ave = (time.time() - start_time) / batch
            time_left = (num_batches - batch) * time_ave

            if batch % 20 == 0:
                sys.stdout.write("\b" * num_back)
                sys.stdout.write(" " * num_back)
                sys.stdout.write("\b" * num_back)
                log_info = 'train: %d loss: %.4f, time: %.2fs' % (
                    num_batches, train_err / train_total,
                    time.time() - start_time)
                print(log_info)
                num_back = len(log_info)

        # per-epoch dev evaluation
        writer = CoNLL03Writer(word_alphabet, char_alphabet, pos_alphabet,
                               chunk_alphabet, ner_alphabet)
        # os.remove instead of shelling out `rm` (portable); guard because
        # the file does not exist on the first epoch
        if os.path.exists('temp/output.txt'):
            os.remove('temp/output.txt')
        writer.start('temp/output.txt')
        network.eval()
        for batch in conll03_data.iterate_batch_variable(
                target_dev_data, batch_size):
            word, char, pos, chunk, labels, masks, lengths, _ = batch
            preds, _, _ = network.decode(
                word,
                char,
                target=labels,
                mask=masks,
                leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS)
            writer.write(word.data.cpu().numpy(),
                         pos.data.cpu().numpy(),
                         chunk.data.cpu().numpy(),
                         preds.cpu().numpy(),
                         labels.data.cpu().numpy(),
                         lengths.cpu().numpy())
        writer.close()

        acc, precision, recall, f1 = evaluate('temp/output.txt')
        log_info = 'dev: %f %f %f %f' % (acc, precision, recall, f1)
        print(log_info)

        # test evaluation every 10 epochs
        if epoch % 10 == 0:
            writer = CoNLL03Writer(word_alphabet, char_alphabet, pos_alphabet,
                                   chunk_alphabet, ner_alphabet)
            if os.path.exists('temp/output.txt'):
                os.remove('temp/output.txt')
            writer.start('temp/output.txt')
            network.eval()
            for batch in conll03_data.iterate_batch_variable(
                    target_test_data, batch_size):
                word, char, pos, chunk, labels, masks, lengths, _ = batch
                preds, _, _ = network.decode(
                    word,
                    char,
                    target=labels,
                    mask=masks,
                    leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS)
                writer.write(word.data.cpu().numpy(),
                             pos.data.cpu().numpy(),
                             chunk.data.cpu().numpy(),
                             preds.cpu().numpy(),
                             labels.data.cpu().numpy(),
                             lengths.cpu().numpy())
            writer.close()

            acc, precision, recall, f1 = evaluate('temp/output.txt')
            log_info = 'test: %f %f %f %f' % (acc, precision, recall, f1)
            print(log_info)

    # persist the tuned model and the extended alphabets
    torch.save(network, 'temp/tuned_0905.pt')
    alphabet_directory = '0905_alphabet/'
    word_alphabet.save(alphabet_directory)
    char_alphabet.save(alphabet_directory)
    pos_alphabet.save(alphabet_directory)
    chunk_alphabet.save(alphabet_directory)
    ner_alphabet.save(alphabet_directory)
Ejemplo n.º 8
0
def main():
    """Train and evaluate a recurrent-conv CRF NER tagger end to end.

    Parses CLI arguments, builds alphabets and pretrained word embeddings,
    constructs the network variant selected by ``--dropout``, then trains
    with SGD (learning rate re-decayed every ``--schedule`` epochs).  The
    dev set is scored after every epoch; the test set is scored whenever
    dev F1 improves.  Best scores are appended to ``--result_file_path``.
    """
    # Arguments parser
    parser = argparse.ArgumentParser(
        description='Tuning with DNN Model for NER')
    # Model Hyperparameters
    parser.add_argument('--mode',
                        choices=['RNN', 'LSTM', 'GRU'],
                        help='architecture of rnn',
                        default='LSTM')
    parser.add_argument('--encoder_mode',
                        choices=['cnn', 'lstm'],
                        help='Encoder type for sentence encoding',
                        default='lstm')
    parser.add_argument('--char_method',
                        choices=['cnn', 'lstm'],
                        help='Method to create character-level embeddings',
                        required=True)
    parser.add_argument(
        '--hidden_size',
        type=int,
        default=128,
        help='Number of hidden units in RNN for sentence level')
    parser.add_argument('--char_hidden_size',
                        type=int,
                        default=30,
                        help='Output character-level embeddings size')
    parser.add_argument('--char_dim',
                        type=int,
                        default=30,
                        help='Dimension of Character embeddings')
    parser.add_argument('--tag_space',
                        type=int,
                        default=0,
                        help='Dimension of tag space')
    parser.add_argument('--num_layers',
                        type=int,
                        default=1,
                        help='Number of layers of RNN')
    parser.add_argument('--dropout',
                        choices=['std', 'weight_drop'],
                        help='Dropout method',
                        default='weight_drop')
    parser.add_argument('--p_em',
                        type=float,
                        default=0.33,
                        help='dropout rate for input embeddings')
    parser.add_argument('--p_in',
                        type=float,
                        default=0.33,
                        help='dropout rate for input of RNN model')
    parser.add_argument('--p_rnn',
                        nargs=2,
                        type=float,
                        required=True,
                        help='dropout rate for RNN')
    parser.add_argument('--p_out',
                        type=float,
                        default=0.33,
                        help='dropout rate for output layer')
    parser.add_argument('--bigram',
                        action='store_true',
                        help='bi-gram parameter for CRF')

    # Data loading and storing params
    parser.add_argument('--embedding_dict', help='path for embedding dict')
    parser.add_argument('--dataset_name',
                        type=str,
                        default='alexa',
                        help='Which dataset to use')
    parser.add_argument('--train',
                        type=str,
                        required=True,
                        help='Path of train set')
    parser.add_argument('--dev',
                        type=str,
                        required=True,
                        help='Path of dev set')
    parser.add_argument('--test',
                        type=str,
                        required=True,
                        help='Path of test set')
    parser.add_argument('--results_folder',
                        type=str,
                        default='results',
                        help='The folder to store results')
    parser.add_argument('--tmp_folder',
                        type=str,
                        default='tmp',
                        help='The folder to store tmp files')
    parser.add_argument('--alphabets_folder',
                        type=str,
                        default='data/alphabets',
                        help='The folder to store alphabets files')
    parser.add_argument('--result_file_name',
                        type=str,
                        default='hyperparameters_tuning',
                        help='File name to store some results')
    parser.add_argument('--result_file_path',
                        type=str,
                        default='results/hyperparameters_tuning',
                        help='File name to store some results')

    # Training parameters
    parser.add_argument('--cuda',
                        action='store_true',
                        help='whether using GPU')
    parser.add_argument('--num_epochs',
                        type=int,
                        default=100,
                        help='Number of training epochs')
    parser.add_argument('--batch_size',
                        type=int,
                        default=16,
                        help='Number of sentences in each batch')
    parser.add_argument('--learning_rate',
                        type=float,
                        default=0.001,
                        help='Base learning rate')
    parser.add_argument('--decay_rate',
                        type=float,
                        default=0.95,
                        help='Decay rate of learning rate')
    parser.add_argument('--schedule',
                        type=int,
                        default=3,
                        help='schedule for learning rate decay')
    parser.add_argument('--gamma',
                        type=float,
                        default=0.0,
                        help='weight for l2 regularization')
    parser.add_argument('--max_norm',
                        type=float,
                        default=1.,
                        help='Max norm for gradients')
    parser.add_argument('--gpu_id',
                        type=int,
                        nargs='+',
                        required=True,
                        help='which gpu to use for training')

    # Misc
    parser.add_argument('--embedding',
                        choices=['glove', 'senna', 'alexa'],
                        help='Embedding for words',
                        required=True)
    parser.add_argument('--restore',
                        action='store_true',
                        help='whether restore from stored parameters')
    parser.add_argument('--save_checkpoint',
                        type=str,
                        default='',
                        help='the path to save the model')
    parser.add_argument('--o_tag',
                        type=str,
                        default='O',
                        help='The default tag for outside tag')
    parser.add_argument('--unk_replace',
                        type=float,
                        default=0.,
                        help='The rate to replace a singleton word with UNK')
    parser.add_argument('--evaluate_raw_format',
                        action='store_true',
                        help='The tagging format for evaluation')

    args = parser.parse_args()

    logger = get_logger("NERCRF")

    # rename the parameters
    mode = args.mode
    encoder_mode = args.encoder_mode
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    num_epochs = args.num_epochs
    batch_size = args.batch_size
    hidden_size = args.hidden_size
    char_hidden_size = args.char_hidden_size
    char_method = args.char_method
    learning_rate = args.learning_rate
    momentum = 0.9  # fixed SGD momentum; not exposed via CLI
    decay_rate = args.decay_rate
    gamma = args.gamma
    max_norm = args.max_norm
    schedule = args.schedule
    dropout = args.dropout
    p_em = args.p_em
    p_rnn = tuple(args.p_rnn)
    p_in = args.p_in
    p_out = args.p_out
    unk_replace = args.unk_replace
    bigram = args.bigram
    embedding = args.embedding
    embedding_path = args.embedding_dict
    dataset_name = args.dataset_name
    result_file_name = args.result_file_name
    evaluate_raw_format = args.evaluate_raw_format
    o_tag = args.o_tag
    restore = args.restore
    save_checkpoint = args.save_checkpoint
    gpu_id = args.gpu_id
    results_folder = args.results_folder
    tmp_folder = args.tmp_folder
    alphabets_folder = args.alphabets_folder
    use_elmo = False
    p_em_vec = 0.
    result_file_path = args.result_file_path

    # tmp files are keyed by gpu id so parallel tuning runs don't collide
    score_file = "%s/score_gpu_%s" % (tmp_folder, '-'.join(map(str, gpu_id)))

    if not os.path.exists(results_folder):
        os.makedirs(results_folder)
    if not os.path.exists(tmp_folder):
        os.makedirs(tmp_folder)
    if not os.path.exists(alphabets_folder):
        os.makedirs(alphabets_folder)

    embedd_dict, embedd_dim = utils.load_embedding_dict(
        embedding, embedding_path)

    logger.info("Creating Alphabets")
    word_alphabet, char_alphabet, ner_alphabet = conll03_data.create_alphabets(
        "{}/{}/".format(alphabets_folder, dataset_name),
        train_path,
        data_paths=[dev_path, test_path],
        embedd_dict=embedd_dict,
        max_vocabulary_size=50000)

    logger.info("Word Alphabet Size: %d" % word_alphabet.size())
    logger.info("Character Alphabet Size: %d" % char_alphabet.size())
    logger.info("NER Alphabet Size: %d" % ner_alphabet.size())

    logger.info("Reading Data")
    device = torch.device('cuda') if args.cuda else torch.device('cpu')
    print(device)

    data_train = conll03_data.read_data_to_tensor(train_path,
                                                  word_alphabet,
                                                  char_alphabet,
                                                  ner_alphabet,
                                                  device=device)
    num_data = sum(data_train[1])
    num_labels = ner_alphabet.size()

    data_dev = conll03_data.read_data_to_tensor(dev_path,
                                                word_alphabet,
                                                char_alphabet,
                                                ner_alphabet,
                                                device=device)
    data_test = conll03_data.read_data_to_tensor(test_path,
                                                 word_alphabet,
                                                 char_alphabet,
                                                 ner_alphabet,
                                                 device=device)

    writer = CoNLL03Writer(word_alphabet, char_alphabet, ner_alphabet)

    def construct_word_embedding_table():
        # Build the initial embedding matrix: pretrained vectors where
        # available (exact, then lowercased match), uniform random otherwise.
        scale = np.sqrt(3.0 / embedd_dim)
        table = np.empty([word_alphabet.size(), embedd_dim], dtype=np.float32)
        table[conll03_data.UNK_ID, :] = np.random.uniform(
            -scale, scale, [1, embedd_dim]).astype(np.float32)
        oov = 0
        for word, index in word_alphabet.items():
            if word in embedd_dict:
                embedding = embedd_dict[word]
            elif word.lower() in embedd_dict:
                embedding = embedd_dict[word.lower()]
            else:
                embedding = np.random.uniform(
                    -scale, scale, [1, embedd_dim]).astype(np.float32)
                oov += 1
            table[index, :] = embedding
        print('oov: %d' % oov)
        return torch.from_numpy(table)

    word_table = construct_word_embedding_table()
    logger.info("constructing network...")

    char_dim = args.char_dim
    window = 3
    num_layers = args.num_layers
    tag_space = args.tag_space
    initializer = nn.init.xavier_uniform_
    if args.dropout == 'std':
        network = BiRecurrentConvCRF(embedd_dim,
                                     word_alphabet.size(),
                                     char_dim,
                                     char_alphabet.size(),
                                     char_hidden_size,
                                     window,
                                     mode,
                                     encoder_mode,
                                     hidden_size,
                                     num_layers,
                                     num_labels,
                                     tag_space=tag_space,
                                     embedd_word=word_table,
                                     use_elmo=use_elmo,
                                     p_em_vec=p_em_vec,
                                     p_em=p_em,
                                     p_in=p_in,
                                     p_out=p_out,
                                     p_rnn=p_rnn,
                                     bigram=bigram,
                                     initializer=initializer)
    elif args.dropout == 'var':
        network = BiVarRecurrentConvCRF(embedd_dim,
                                        word_alphabet.size(),
                                        char_dim,
                                        char_alphabet.size(),
                                        char_hidden_size,
                                        window,
                                        mode,
                                        encoder_mode,
                                        hidden_size,
                                        num_layers,
                                        num_labels,
                                        tag_space=tag_space,
                                        embedd_word=word_table,
                                        use_elmo=use_elmo,
                                        p_em_vec=p_em_vec,
                                        p_em=p_em,
                                        p_in=p_in,
                                        p_out=p_out,
                                        p_rnn=p_rnn,
                                        bigram=bigram,
                                        initializer=initializer)
    else:
        network = BiWeightDropRecurrentConvCRF(embedd_dim,
                                               word_alphabet.size(),
                                               char_dim,
                                               char_alphabet.size(),
                                               char_hidden_size,
                                               window,
                                               mode,
                                               encoder_mode,
                                               hidden_size,
                                               num_layers,
                                               num_labels,
                                               tag_space=tag_space,
                                               embedd_word=word_table,
                                               p_em=p_em,
                                               p_in=p_in,
                                               p_out=p_out,
                                               p_rnn=p_rnn,
                                               bigram=bigram,
                                               initializer=initializer)

    network = network.to(device)

    lr = learning_rate
    optim = SGD(network.parameters(),
                lr=lr,
                momentum=momentum,
                weight_decay=gamma,
                nesterov=True)
    # optim = Adam(network.parameters(), lr=lr, weight_decay=gamma, amsgrad=True)
    logger.info("Network: %s, encoder_mode=%s, num_layer=%d, hidden=%d, char_hidden_size=%d, char_method=%s, tag_space=%d, crf=%s" % \
        (mode, encoder_mode, num_layers, hidden_size, char_hidden_size, char_method, tag_space, 'bigram' if bigram else 'unigram'))
    logger.info(
        "training: l2: %f, (#training data: %d, batch: %d, unk replace: %.2f)"
        % (gamma, num_data, batch_size, unk_replace))
    logger.info("dropout(in, out, rnn): (%.2f, %.2f, %s)" %
                (p_in, p_out, p_rnn))

    num_batches = num_data // batch_size + 1
    dev_f1 = 0.0
    dev_acc = 0.0
    dev_precision = 0.0
    dev_recall = 0.0
    test_f1 = 0.0
    test_acc = 0.0
    test_precision = 0.0
    test_recall = 0.0
    best_epoch = 0
    best_test_f1 = 0.0
    best_test_acc = 0.0
    best_test_precision = 0.0
    best_test_recall = 0.0
    best_test_epoch = 0  # epoch number; keep it an int for the %d format below
    for epoch in range(1, num_epochs + 1):
        print(
            'Epoch %d (%s(%s), learning rate=%.4f, decay rate=%.4f (schedule=%d)): '
            % (epoch, mode, args.dropout, lr, decay_rate, schedule))

        train_err = 0.
        train_total = 0.

        start_time = time.time()
        num_back = 0
        network.train()
        for batch in range(1, num_batches + 1):
            _, word, char, labels, masks, lengths = conll03_data.get_batch_tensor(
                data_train, batch_size, unk_replace=unk_replace)

            optim.zero_grad()
            loss = network.loss(_, word, char, labels, mask=masks)
            loss.backward()
            # Fix: clip gradients on every step, between backward() and
            # step().  The original code called clip_grad_norm_ exactly once
            # before the training loop, when no gradients existed yet, so
            # --max_norm never had any effect on training.
            nn.utils.clip_grad_norm_(network.parameters(), max_norm)
            optim.step()

            with torch.no_grad():
                num_inst = word.size(0)
                train_err += loss * num_inst
                train_total += num_inst

            time_ave = (time.time() - start_time) / batch
            time_left = (num_batches - batch) * time_ave

            # update log
            if batch % 20 == 0:
                sys.stdout.write("\b" * num_back)
                sys.stdout.write(" " * num_back)
                sys.stdout.write("\b" * num_back)
                log_info = 'train: %d/%d loss: %.4f, time left (estimated): %.2fs' % (
                    batch, num_batches, train_err / train_total, time_left)
                sys.stdout.write(log_info)
                sys.stdout.flush()
                num_back = len(log_info)

        sys.stdout.write("\b" * num_back)
        sys.stdout.write(" " * num_back)
        sys.stdout.write("\b" * num_back)
        print('train: %d loss: %.4f, time: %.2fs' %
              (num_batches, train_err / train_total, time.time() - start_time))

        # evaluate performance on dev data
        with torch.no_grad():
            network.eval()
            tmp_filename = '%s/gpu_%s_dev' % (tmp_folder, '-'.join(
                map(str, gpu_id)))
            writer.start(tmp_filename)

            for batch in conll03_data.iterate_batch_tensor(
                    data_dev, batch_size):
                _, word, char, labels, masks, lengths = batch
                preds, _ = network.decode(
                    _,
                    word,
                    char,
                    target=labels,
                    mask=masks,
                    leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS)
                writer.write(word.cpu().numpy(),
                             preds.cpu().numpy(),
                             labels.cpu().numpy(),
                             lengths.cpu().numpy())
            writer.close()
            acc, precision, recall, f1 = evaluate(tmp_filename, score_file,
                                                  evaluate_raw_format, o_tag)
            print(
                'dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%'
                % (acc, precision, recall, f1))

            if dev_f1 < f1:
                dev_f1 = f1
                dev_acc = acc
                dev_precision = precision
                dev_recall = recall
                best_epoch = epoch

                # evaluate on test data when better performance detected
                tmp_filename = '%s/gpu_%s_test' % (tmp_folder, '-'.join(
                    map(str, gpu_id)))
                writer.start(tmp_filename)

                for batch in conll03_data.iterate_batch_tensor(
                        data_test, batch_size):
                    _, word, char, labels, masks, lengths = batch
                    preds, _ = network.decode(
                        _,
                        word,
                        char,
                        target=labels,
                        mask=masks,
                        leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS)
                    writer.write(word.cpu().numpy(),
                                 preds.cpu().numpy(),
                                 labels.cpu().numpy(),
                                 lengths.cpu().numpy())
                writer.close()
                test_acc, test_precision, test_recall, test_f1 = evaluate(
                    tmp_filename, score_file, evaluate_raw_format, o_tag)
                if best_test_f1 < test_f1:
                    best_test_acc, best_test_precision, best_test_recall, best_test_f1 = test_acc, test_precision, test_recall, test_f1
                    best_test_epoch = epoch

            print(
                "best dev  acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)"
                % (dev_acc, dev_precision, dev_recall, dev_f1, best_epoch))
            print(
                "best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)"
                % (test_acc, test_precision, test_recall, test_f1, best_epoch))
            print(
                "overall best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)"
                % (best_test_acc, best_test_precision, best_test_recall,
                   best_test_f1, best_test_epoch))

        # decay the learning rate by rebuilding the optimizer (resets momentum)
        if epoch % schedule == 0:
            lr = learning_rate / (1.0 + epoch * decay_rate)
            optim = SGD(network.parameters(),
                        lr=lr,
                        momentum=momentum,
                        weight_decay=gamma,
                        nesterov=True)

    with open(result_file_path, 'a') as ofile:
        ofile.write(
            "best dev  acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)\n"
            % (dev_acc, dev_precision, dev_recall, dev_f1, best_epoch))
        ofile.write(
            "best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)\n"
            % (test_acc, test_precision, test_recall, test_f1, best_epoch))
        ofile.write(
            "overall best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)\n\n"
            % (best_test_acc, best_test_precision, best_test_recall,
               best_test_f1, best_test_epoch))
    print('Training finished!')
def sample():
    """Select the most uncertain unannotated sentences for active learning.

    Loads a saved model, decodes 'temp/unannotated.conll', ranks sentences
    by length-normalized decoder confidence with a min-heap (least confident
    first), skips sentences that are too short or already annotated, and
    writes up to max_sents sentences / max_words words to 'temp/query.conll'
    as CoNLL stubs ready for manual labeling.
    """
    network = torch.load('temp/ner_active.pt')
    word_alphabet, char_alphabet, pos_alphabet, \
        chunk_alphabet, ner_alphabet = conll03_data.create_alphabets("active_alphabet/", None)

    unannotated_data = conll03_data.read_data_to_variable(
        'temp/unannotated.conll',
        word_alphabet,
        char_alphabet,
        pos_alphabet,
        chunk_alphabet,
        ner_alphabet,
        use_gpu=False,
        volatile=True)

    # Collect sentences that are already annotated so they are never queried twice.
    annotated = set()
    with open('temp/annotated.conll', 'r') as f:
        sent_buffer = []
        for line in f:
            if len(line) > 1:
                _, word, _, _, _ = line.strip().split()
                sent_buffer.append(word)
            else:
                annotated.add(' '.join(sent_buffer))
                sent_buffer = []
    print('total annotated data: {}'.format(len(annotated)))

    uncertain = []
    max_sents = 100  # annotation budget: sentences per round
    max_words = 500  # annotation budget: words per round

    writer = CoNLL03Writer(word_alphabet, char_alphabet, pos_alphabet,
                           chunk_alphabet, ner_alphabet)
    writer.start('temp/output.txt')
    network.eval()
    tiebreaker = count()
    for batch in conll03_data.iterate_batch_variable(unannotated_data, 32):
        word, char, pos, chunk, labels, masks, lengths, raws = batch
        preds, _, confidence = network.decode(
            word,
            char,
            target=labels,
            mask=masks,
            leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS)
        writer.write(word.data.cpu().numpy(),
                     pos.data.cpu().numpy(),
                     chunk.data.cpu().numpy(),
                     preds.cpu().numpy(),
                     labels.data.cpu().numpy(),
                     lengths.cpu().numpy())
        for i in range(confidence.size()[0]):
            # Fix: Python 3 iterators have no .next() method -- use the
            # builtin next().  The counter breaks confidence ties so heapq
            # never falls back to comparing numpy arrays.
            heapq.heappush(uncertain,
                           (confidence[i].numpy()[0] / lengths[i],
                            next(tiebreaker), word[i].data.numpy(), raws[i]))
    writer.close()

    cost_sents = 0
    cost_words = 0
    with open('temp/query.conll', 'w') as q:
        while cost_sents < max_sents and cost_words < max_words and uncertain:
            sample = heapq.heappop(uncertain)
            if len(sample[3]) <= 5:
                continue
            print(sample[3])
            to_write = []
            for word in sample[3]:
                if is_url(word):
                    word = '<_URL>'
                # Fix: round-trip through encode/decode to strip non-ASCII
                # characters while keeping str -- .encode() alone yields
                # bytes, which breaks ' '.join() and .format() on Python 3.
                to_write.append(word.encode('ascii', 'ignore').decode('ascii'))
            if ' '.join(to_write) in annotated:
                continue
            for wn, word in enumerate(to_write):
                q.write('{0} {1} -- -- O\n'.format(wn + 1, word))
            q.write('\n')
            cost_sents += 1
            cost_words += len(sample[3])
def retrain(train_path, dev_path):
    """Fine-tune a saved NER model on newly annotated data.

    Grows the word alphabet (and the word-embedding matrix, with zero rows)
    for words unseen at original training time, trains for a fixed number
    of epochs with SGD, saves the model and alphabets, then evaluates on
    dev_path and returns (acc, precision, recall, f1).
    """
    network = torch.load('temp/ner_tuned.pt')
    word_alphabet, char_alphabet, pos_alphabet, \
        chunk_alphabet, ner_alphabet = conll03_data.create_alphabets("ner_alphabet/", None)

    # Extend the word alphabet with words not seen during original training
    # (get_index == 0 is treated here as "unknown word").
    num_new_word = 0
    with open(train_path, 'r') as f:
        sents = []
        sent_buffer = []
        for line in f:
            if len(line) <= 1:
                sents.append(sent_buffer)
                sent_buffer = []
            else:
                # Renamed the first field from 'id' to '_' to avoid
                # shadowing the builtin; it is never used.
                _, word, _, _, ner = line.strip().split()
                if word_alphabet.get_index(word) == 0:
                    word_alphabet.add(word)
                    num_new_word += 1
                sent_buffer.append((word_alphabet.get_index(word),
                                    ner_alphabet.get_index(ner)))
    print('{} new words.'.format(num_new_word))

    # Pad the embedding table with zero rows for the new words.
    # Fix: go through numpy explicitly and keep the original dtype --
    # concatenating a torch tensor with default-float64 np.zeros upcasts
    # the whole table to float64 and clashes with the float32 network.
    init_embed = network.word_embedd.weight.data.cpu().numpy()
    embedd_dim = init_embed.shape[1]
    init_embed = np.concatenate(
        (init_embed, np.zeros((num_new_word, embedd_dim),
                              dtype=init_embed.dtype)),
        axis=0)
    network.word_embedd = Embedding(word_alphabet.size(), embedd_dim,
                                    torch.from_numpy(init_embed))

    target_train_data = conll03_data.read_data_to_variable(train_path,
                                                           word_alphabet,
                                                           char_alphabet,
                                                           pos_alphabet,
                                                           chunk_alphabet,
                                                           ner_alphabet,
                                                           use_gpu=False,
                                                           volatile=False)

    num_epoch = 50
    batch_size = 20
    num_data = sum(target_train_data[1])
    # Fix: floor division -- '/' yields a float on Python 3, which makes
    # range(1, num_batches + 1) below raise a TypeError.
    num_batches = num_data // batch_size + 1
    unk_replace = 0.0
    optim = SGD(network.parameters(),
                lr=0.01,
                momentum=0.9,
                weight_decay=0.0,
                nesterov=True)

    for epoch in range(num_epoch):
        train_err = 0.
        train_total = 0.
        start_time = time.time()
        network.train()

        for batch in range(1, num_batches + 1):
            word, char, _, _, labels, masks, lengths = conll03_data.get_batch_variable(
                target_train_data, batch_size, unk_replace=unk_replace)

            optim.zero_grad()
            loss = network.loss(word, char, labels, mask=masks)
            loss.backward()
            optim.step()

            num_inst = word.size(0)
            # Fix: loss.data[0] fails on 0-dim tensors in modern PyTorch;
            # .item() is the supported scalar accessor.
            train_err += loss.item() * num_inst
            train_total += num_inst

            print('train: %d loss: %.4f, time: %.2fs' %
                  (num_batches, train_err / train_total,
                   time.time() - start_time))

    torch.save(network, 'temp/ner_active.pt')
    alphabet_directory = 'active_alphabet/'
    word_alphabet.save(alphabet_directory)
    char_alphabet.save(alphabet_directory)
    pos_alphabet.save(alphabet_directory)
    chunk_alphabet.save(alphabet_directory)
    ner_alphabet.save(alphabet_directory)

    target_dev_data = conll03_data.read_data_to_variable(dev_path,
                                                         word_alphabet,
                                                         char_alphabet,
                                                         pos_alphabet,
                                                         chunk_alphabet,
                                                         ner_alphabet,
                                                         use_gpu=False,
                                                         volatile=False)
    writer = CoNLL03Writer(word_alphabet, char_alphabet, pos_alphabet,
                           chunk_alphabet, ner_alphabet)
    os.system('rm output.txt')
    writer.start('output.txt')
    network.eval()
    for batch in conll03_data.iterate_batch_variable(target_dev_data,
                                                     batch_size):
        word, char, pos, chunk, labels, masks, lengths, _ = batch
        preds, _, _ = network.decode(
            word,
            char,
            target=labels,
            mask=masks,
            leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS)
        writer.write(word.data.cpu().numpy(),
                     pos.data.cpu().numpy(),
                     chunk.data.cpu().numpy(),
                     preds.cpu().numpy(),
                     labels.data.cpu().numpy(),
                     lengths.cpu().numpy())
    writer.close()

    acc, precision, recall, f1 = evaluate('output.txt')
    print(acc, precision, recall, f1)
    return acc, precision, recall, f1