def decode(options):
    device = get_device(options)
    output_dir = options.output_dir

    os.makedirs(output_dir, exist_ok=True)

    # Derive an output-file prefix from the test file name
    prefix = options.test.split('_')[0] if '_' in options.test else options.test.split('.')[0]
    # Load the trained model
    model, model_args = load_model_state(options.model, device)
    model = model.to(device)
    model.eval()

    ds = TextDataSet.create(
        df_path=os.path.join(options.data_dir, options.test),
        idx2labels_path=options.idx2labels,
        model_name=model_args.model_name,
        model_type=model_args.model_type,
        markup='BIO',
        max_sequence_length=model_args.max_seq_len
    )
    dl = TextDataLoader(ds, device=device, batch_size=options.batch_size, shuffle=False)
    pad_id = 0

    with open(f'{output_dir}/{prefix}_label_bert.txt', 'w', encoding='utf8') as t, \
            open(f'{output_dir}/{prefix}_predict_bert.txt', 'w', encoding='utf8') as p, \
            open(f'{output_dir}/{prefix}_text_bert.txt', 'w', encoding='utf8') as textf:
        with torch.no_grad():
            preds = predict(dl, model, ds.idx2label, pad_id=pad_id)
            pred_tokens, pred_labels = bert_labels2tokens(dl, preds)
            true_tokens, true_labels = bert_labels2tokens(dl, [x.bert_labels for x in dl.dataset])

            assert pred_tokens == true_tokens
            tokens_report = flat_classification_report(true_labels, pred_labels,
                                                       labels=ds.idx2label[4:], digits=4)
            print(tokens_report)
            t.write('\n'.join(' '.join(t_label) for t_label in true_labels) + '\n')
            p.write('\n'.join(' '.join(p_label) for p_label in pred_labels) + '\n')
            textf.write('\n'.join(' '.join(t_token) for t_token in true_tokens) + '\n')
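
A minimal driver for the decode() routine above. It only reads attribute-style
options, so any argparse Namespace with matching fields works; the flag names
below mirror the attributes the function reads but are assumptions, not the
project's actual CLI:

import argparse

if __name__ == '__main__':
    # Hypothetical flags; decode() only requires that the resulting Namespace
    # exposes model, test, data_dir, idx2labels, output_dir and batch_size.
    # get_device() may additionally require a device/cuda field not shown here.
    parser = argparse.ArgumentParser(description='Decode with a trained tagger')
    parser.add_argument('--model', required=True)
    parser.add_argument('--test', default='test.csv')
    parser.add_argument('--data_dir', default='data')
    parser.add_argument('--idx2labels', default='idx2labels.txt')
    parser.add_argument('--output_dir', default='out')
    parser.add_argument('--batch_size', type=int, default=32)
    decode(parser.parse_args())
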
def train(args):
    vocab_path = os.path.join(args.data_dir, args.vocab)
    tag_path = os.path.join(args.data_dir, args.tag_set)
    word_to_idx, idx_to_word, tag_to_idx, idx_to_tag = load_vocabs(vocab_path, tag_path)
    train_sentences, train_labels, test_sentences, test_labels = prepare_text(args, tag_to_idx)

    device = get_device(args)
    start = time.time()
    bert_embedding1 = TransformerWordEmbeddings('distilbert-base-multilingual-cased',
                                                layers='-1',
                                                batch_size=args.batch_size,
                                                pooling_operation=args.pooling_operation,
                                                )

    bert_embedding2 = TransformerWordEmbeddings('distilroberta-base',
                                                layers='-1',
                                                batch_size=args.batch_size,
                                                pooling_operation=args.pooling_operation,
                                                )

    bert_embedding3 = TransformerWordEmbeddings('sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens',
                                                layers='-1',
                                                batch_size=args.batch_size,
                                                pooling_operation=args.pooling_operation
                                                )

    encoder = StackTransformerEmbeddings([bert_embedding1, bert_embedding2, bert_embedding3])

    train_sentences_encoded = encoder.encode(train_sentences)
    test_sentences_encoded = encoder.encode(test_sentences)

    print(f'Encoding time: {time.time() - start}')

    # Update the Namespace
    args.vocab_size = len(idx_to_word)
    args.number_of_tags = len(idx_to_tag)

    # Update the embedding dim
    args.embedding_dim = encoder.embedding_length

    model = build_model(args, device)
    print(model)
    model = model.to(device)

    # optimizer = torch.optim.Adam(model.parameters())
    betas = (0.9, 0.999)
    eps = 1e-8
    optimizer = BertAdam(model, lr=args.learning_rate, b1=betas[0], b2=betas[1], e=eps)

    pad_id = word_to_idx['PAD']
    pad_id_labels = tag_to_idx['PAD']

    batcher = SamplingBatcherStackedTransformers(np.asarray(train_sentences_encoded, dtype=object),
                                                 np.asarray(train_labels, dtype=object),
                                                 batch_size=args.batch_size,
                                                 pad_id=pad_id,
                                                 pad_id_labels=pad_id_labels,
                                                 embedding_length=encoder.embedding_length,
                                                 device=device)

    updates = 1
    total_loss = 0
    best_loss = float('inf')
    stop_training = False
    output_dir = args.output_dir
    os.makedirs(output_dir, exist_ok=True)

    prefix = args.train_text.split('_')[0] if '_' in args.train_text \
        else args.train_text.split('.')[0]

    start_time = time.time()
    for epoch in range(args.epochs):
        for batch in batcher:
            updates += 1
            input_, labels, labels_mask = batch
            optimizer.zero_grad()
            loss = model.score(batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            if updates % args.patience == 0:
                print(f'Epoch: {epoch}, Updates:{updates}, Loss: {total_loss}')
                if best_loss > total_loss:
                    save_state(f'{output_dir}/{prefix}_best_model.pt', model, loss_fn, optimizer,
                               updates, args=args)
                    best_loss = total_loss
                total_loss = 0
            if updates % args.max_steps == 0:
                stop_training = True
                break

        if stop_training:
            break

    print(f'Training time: {time.time() - start_time}')

    def get_idx_to_tag(label_ids):
        return [idx_to_tag.get(idx) for idx in label_ids]

    def get_idx_to_word(words_ids):
        return [idx_to_word.get(idx) for idx in words_ids]

    model, model_args = load_model_state(f'{output_dir}/{prefix}_best_model.pt', device)
    model = model.to(device)
    batcher_test = SamplingBatcherStackedTransformers(np.asarray(test_sentences_encoded, dtype=object),
                                                      np.asarray(test_labels, dtype=object),
                                                      batch_size=args.batch_size,
                                                      pad_id=pad_id,
                                                      pad_id_labels=pad_id_labels,
                                                      embedding_length=encoder.embedding_length,
                                                      device=device)
    ne_class_list = set()
    true_labels_for_testing = []
    results_of_prediction = []
    with open(f'{output_dir}/{prefix}_label.txt', 'w', encoding='utf8') as t, \
            open(f'{output_dir}/{prefix}_predict.txt', 'w', encoding='utf8') as p, \
            open(f'{output_dir}/{prefix}_text.txt', 'w', encoding='utf8') as textf:
        with torch.no_grad():
            # predict_no_attn() returns final label strings, not label ids
            preds = predict_no_attn(batcher_test, model, idx_to_tag)
            cnt = 0
            for text, labels, predict_labels in zip(test_sentences, test_labels, preds):
                cnt += 1
                tag_labels_true = get_idx_to_tag(labels)
                text_ = text

                tag_labels_predicted = ' '.join(predict_labels)
                tag_labels_true = ' '.join(tag_labels_true)

                p.write(tag_labels_predicted + '\n')
                t.write(tag_labels_true + '\n')
                textf.write(text_ + '\n')

                tag_labels_true = tag_labels_true.strip().replace('_', '-').split()
                tag_labels_predicted = tag_labels_predicted.strip().replace('_', '-').split()
                biluo_tags_true = get_biluo(tag_labels_true)
                biluo_tags_predicted = get_biluo(tag_labels_predicted)

                doc = Doc(text_)
                offset_true_labels = offset_from_biluo(doc, biluo_tags_true)
                offset_predicted_labels = offset_from_biluo(doc, biluo_tags_predicted)

                ent_labels = dict()
                for ent in offset_true_labels:
                    start, stop, ent_type = ent
                    ent_type = ent_type.replace('_', '')
                    ne_class_list.add(ent_type)
                    if ent_type in ent_labels:
                        ent_labels[ent_type].append((start, stop))
                    else:
                        ent_labels[ent_type] = [(start, stop)]
                true_labels_for_testing.append(ent_labels)

                ent_labels = dict()
                for ent in offset_predicted_labels:
                    start, stop, ent_type = ent
                    ent_type = ent_type.replace('_', '')
                    if ent_type in ent_labels:
                        ent_labels[ent_type].append((start, stop))
                    else:
                        ent_labels[ent_type] = [(start, stop)]
                results_of_prediction.append(ent_labels)

    from eval.quality import calculate_prediction_quality
    f1, precision, recall, results = \
        calculate_prediction_quality(true_labels_for_testing,
                                     results_of_prediction,
                                     tuple(ne_class_list))
    print(f1, precision, recall, results)
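
The evaluation path above converts BIO tags to BILUO spans via get_biluo()
before computing character offsets. For reference, a self-contained sketch of
the standard BIO-to-BILUO conversion (an assumption about what get_biluo()
implements; the repo's version may handle edge cases differently):

def bio_to_biluo(tags):
    # 'B-X' becomes 'U-X' and 'I-X' becomes 'L-X' when the entity ends there
    biluo = []
    for i, tag in enumerate(tags):
        if tag == 'O':
            biluo.append(tag)
            continue
        prefix, label = tag.split('-', 1)
        nxt = tags[i + 1] if i + 1 < len(tags) else 'O'
        entity_continues = nxt == f'I-{label}'
        if prefix == 'B':
            biluo.append(f'B-{label}' if entity_continues else f'U-{label}')
        else:
            biluo.append(f'I-{label}' if entity_continues else f'L-{label}')
    return biluo

# bio_to_biluo(['B-PER', 'I-PER', 'O', 'B-LOC'])
# -> ['B-PER', 'L-PER', 'O', 'U-LOC']
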
def decode(options):
    prefix = options.test_text.split('_')[0] if '_' in options.test_text \
        else options.test_text.split('.')[0]

    device = get_device(options)
    output_dir = options.output_dir
    os.makedirs(output_dir, exist_ok=True)
    model, model_args = load_model_state(options.model, device)
    model = model.to(device)

    vocab_path = os.path.join(model_args.data_dir, model_args.vocab)
    tag_path = os.path.join(model_args.data_dir, model_args.tag_set)
    word_to_idx, idx_to_word, tag_to_idx, idx_to_tag = load_vocabs(
        vocab_path, tag_path)

    *_, test_sentences, test_labels = prepare(options, word_to_idx, tag_to_idx)

    def get_idx_to_tag(label_ids):
        return [idx_to_tag.get(idx) for idx in label_ids]

    def get_idx_to_word(words_ids):
        return [idx_to_word.get(idx) for idx in words_ids]

    pad_id = word_to_idx['PAD']
    pad_id_labels = tag_to_idx['PAD']
    batcher_test = SamplingBatcher(np.asarray(test_sentences, dtype=object),
                                   np.asarray(test_labels, dtype=object),
                                   batch_size=options.batch_size,
                                   pad_id=pad_id,
                                   pad_id_labels=pad_id_labels)
    ne_class_list = set()
    true_labels_for_testing = []
    results_of_prediction = []
    with open(f'{output_dir}/{prefix}_label.txt', 'w', encoding='utf8') as t, \
            open(f'{output_dir}/{prefix}_predict.txt', 'w', encoding='utf8') as p, \
            open(f'{output_dir}/{prefix}_text.txt', 'w', encoding='utf8') as textf:
        with torch.no_grad():
            preds = predict(batcher_test, model, idx_to_tag, pad_id=pad_id)
            cnt = 0
            for text, labels, predict_labels in zip(test_sentences,
                                                    test_labels, preds):
                cnt += 1
                tag_labels_true = get_idx_to_tag(labels)
                text_ = get_idx_to_word(text)

                tag_labels_predicted = ' '.join(predict_labels)
                tag_labels_true = ' '.join(tag_labels_true)
                text_ = ' '.join(text_)
                p.write(tag_labels_predicted + '\n')
                t.write(tag_labels_true + '\n')
                textf.write(text_ + '\n')

                tag_labels_true = tag_labels_true.strip().replace('_',
                                                                  '-').split()
                tag_labels_predicted = tag_labels_predicted.strip().replace(
                    '_', '-').split()
                biluo_tags_true = get_biluo(tag_labels_true)
                biluo_tags_predicted = get_biluo(tag_labels_predicted)

                doc = Doc(text_)
                offset_true_labels = offset_from_biluo(doc, biluo_tags_true)
                offset_predicted_labels = offset_from_biluo(
                    doc, biluo_tags_predicted)

                ent_labels = dict()
                for ent in offset_true_labels:
                    start, stop, ent_type = ent
                    ent_type = ent_type.replace('_', '')
                    ne_class_list.add(ent_type)
                    if ent_type in ent_labels:
                        ent_labels[ent_type].append((start, stop))
                    else:
                        ent_labels[ent_type] = [(start, stop)]
                true_labels_for_testing.append(ent_labels)

                ent_labels = dict()
                for ent in offset_predicted_labels:
                    start, stop, ent_type = ent
                    ent_type = ent_type.replace('_', '')
                    if ent_type in ent_labels:
                        ent_labels[ent_type].append((start, stop))
                    else:
                        ent_labels[ent_type] = [(start, stop)]
                results_of_prediction.append(ent_labels)

    from eval.quality import calculate_prediction_quality
    f1, precision, recall, results = \
        calculate_prediction_quality(true_labels_for_testing,
                                     results_of_prediction,
                                     tuple(ne_class_list))
    print(f1, precision, recall, results)
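
The two grouping loops above (one for gold spans, one for predictions) build
the same {entity_type: [(start, stop), ...]} mapping. A behavior-equivalent
sketch with collections.defaultdict, for comparison:

from collections import defaultdict

def group_spans(offset_labels):
    # offset_labels is a list of (start, stop, ent_type) triples
    ent_labels = defaultdict(list)
    for start, stop, ent_type in offset_labels:
        ent_labels[ent_type.replace('_', '')].append((start, stop))
    return dict(ent_labels)
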
Example #4
def train(args):
    device = get_device(args)

    if args.cache_features:
        args.shuffle = False

    data = bert_data.LearnData.create(
        train_df_path=os.path.join(args.data_dir, args.train),
        valid_df_path=os.path.join(args.data_dir, args.test),
        idx2labels_path=os.path.join(args.data_dir, args.idx2labels),
        clear_cache=True,
        model_name=args.model_name,
        batch_size=args.batch_size,
        device=device,
        markup='BIO',
        max_sequence_length=args.max_seq_len,
        shuffle=args.shuffle,
    )

    args.number_of_tags = len(data.train_ds.idx2label)
    model = build_model(args, device)
    model = model.to(device)
    model.train()

    betas = (0.9, 0.999)
    eps = 1e-8
    optimizer = BertAdam(model,
                         lr=args.learning_rate,
                         b1=betas[0],
                         b2=betas[1],
                         e=eps)

    pad_id = 0  # pad token id of the BERT vocabulary
    updates = 1
    total_loss = 0
    best_loss = float('inf')
    stop_training = False
    output_dir = args.output_dir
    os.makedirs(output_dir, exist_ok=True)

    prefix = args.train.split('_')[0] if '_' in args.train \
        else args.train.split('.')[0]

    start = time.time()
    for epoch in range(args.epochs):
        for batch in data.train_dl:
            updates += 1
            optimizer.zero_grad()
            input_, labels_mask, input_type_ids, labels = batch
            # Build the attention padding mask (a reference sketch of this
            # helper follows this example)
            attn_mask = get_attn_pad_mask(input_, input_, pad_id)
            loss = model.score(batch, attn_mask=attn_mask)

            loss.backward()
            optimizer.step()
            total_loss += loss.item()

            if updates % args.patience == 0:
                print(f'Epoch: {epoch}, Updates:{updates}, Loss: {total_loss}')
                if best_loss > total_loss:
                    save_state(f'{output_dir}/{prefix}_best_model_bert.pt',
                               model,
                               loss_fn,
                               optimizer,
                               updates,
                               args=args)
                    best_loss = total_loss
                total_loss = 0

            if updates % args.max_steps == 0:
                stop_training = True
                break

        if stop_training:
            break
    print(f'Training time: {time.time() - start}')

    model, model_args = load_model_state(
        f'{output_dir}/{prefix}_best_model_bert.pt', device)
    model = model.to(device)
    dl = get_data_loader_for_predict(data,
                                     df_path=os.path.join(
                                         args.data_dir, args.test))

    with open(f'{output_dir}/{prefix}_label_bert.txt', 'w', encoding='utf8') as t, \
            open(f'{output_dir}/{prefix}_predict_bert.txt', 'w', encoding='utf8') as p, \
            open(f'{output_dir}/{prefix}_text_bert.txt', 'w', encoding='utf8') as textf:
        with torch.no_grad():
            preds = predict(dl, model, data.train_ds.idx2label, pad_id=pad_id)
            pred_tokens, pred_labels = bert_labels2tokens(dl, preds)
            true_tokens, true_labels = bert_labels2tokens(
                dl, [x.bert_labels for x in dl.dataset])
            assert pred_tokens == true_tokens
            tokens_report = flat_classification_report(
                true_labels,
                pred_labels,
                labels=data.train_ds.idx2label[4:],
                digits=4)
            print(tokens_report)
            t.write('\n'.join(' '.join(t_label) for t_label in true_labels) + '\n')
            p.write('\n'.join(' '.join(p_label) for p_label in pred_labels) + '\n')
            textf.write('\n'.join(' '.join(t_token) for t_token in true_tokens) + '\n')
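
The training loop above builds an attention mask with get_attn_pad_mask()
before scoring. A typical implementation of such a padding mask (an assumption
about what the helper does; the repo's version may differ):

import torch

def pad_attn_mask(seq_q, seq_k, pad_id):
    # True where the *key* position is padding; those positions are then
    # excluded from attention for every query position.
    batch_size, len_q = seq_q.size()
    _, len_k = seq_k.size()
    mask = seq_k.eq(pad_id).unsqueeze(1)          # (batch, 1, len_k)
    return mask.expand(batch_size, len_q, len_k)  # (batch, len_q, len_k)
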
Example #5
def train(args):
    vocab_path = os.path.join(args.data_dir, args.vocab)
    tag_path = os.path.join(args.data_dir, args.tag_set)
    word_to_idx, idx_to_word, tag_to_idx, idx_to_tag = load_vocabs(
        vocab_path, tag_path)
    train_sentences, train_labels, test_sentences, test_labels = prepare_flair(
        args, tag_to_idx)

    device = get_device(args)
    flair.device = device

    start = time.time()
    # flair_forward_embedding = FlairEmbeddings('multi-forward')
    # flair_backward_embedding = FlairEmbeddings('multi-backward')
    # init multilingual BERT
    bert_embedding = TransformerWordEmbeddings(
        'distilbert-base-multilingual-cased',
        layers='-1',
        batch_size=args.batch_size)
    # bert_embedding1 = TransformerWordEmbeddings('sentence-transformers/'
    #                                             'distilbert-multilingual-nli-stsb-quora-ranking',
    #                                             layers='-1',
    #                                             batch_size=args.batch_size)
    # bert_embedding2 = TransformerWordEmbeddings('sentence-transformers/quora-distilbert-multilingual',
    #                                             layers='-1',
    #                                             batch_size=args.batch_size)
    # now create the StackedEmbedding object that combines all embeddings
    embeddings = StackedEmbeddings(embeddings=[bert_embedding])

    # Embed the train and test sentences batch by batch. (The previous while
    # loops ran past the end of the data, scheduling empty batches.)
    for start_idx in range(0, len(train_sentences), args.batch_size):
        embeddings.embed(train_sentences[start_idx:start_idx + args.batch_size])

    for start_idx in range(0, len(test_sentences), args.batch_size):
        embeddings.embed(test_sentences[start_idx:start_idx + args.batch_size])

    print(f'Encoding time: {time.time() - start}')

    # Update the Namespace
    args.vocab_size = len(idx_to_word)
    args.number_of_tags = len(idx_to_tag)

    model = build_model(args, device)
    print(model)
    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters())

    pad_id = word_to_idx['PAD']
    pad_id_labels = tag_to_idx['PAD']

    batcher = SamplingBatcherFlair(
        np.asarray(train_sentences, dtype=object),
        np.asarray(train_labels, dtype=object),
        batch_size=args.batch_size,
        pad_id=pad_id,
        pad_id_labels=pad_id_labels,
        embedding_length=embeddings.embedding_length)

    updates = 1
    total_loss = 0
    best_loss = float('inf')
    stop_training = False
    output_dir = args.output_dir
    os.makedirs(output_dir, exist_ok=True)

    prefix = args.train_text.split('_')[0] if '_' in args.train_text \
        else args.train_text.split('.')[0]

    start_time = time.time()
    for epoch in range(args.epochs):
        for batch in batcher:
            updates += 1
            input_, labels, labels_mask = batch
            optimizer.zero_grad()
            loss = model.score(batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            if updates % args.patience == 0:
                print(f'Epoch: {epoch}, Updates:{updates}, Loss: {total_loss}')
                if best_loss > total_loss:
                    save_state(f'{output_dir}/{prefix}_best_model.pt',
                               model,
                               loss_fn,
                               optimizer,
                               updates,
                               args=args)
                    best_loss = total_loss
                total_loss = 0
            if updates % args.max_steps == 0:
                stop_training = True
                break

        if stop_training:
            break

    print(f'Training time: {time.time() - start_time}')

    def get_idx_to_tag(label_ids):
        return [idx_to_tag.get(idx) for idx in label_ids]

    def get_idx_to_word(words_ids):
        return [idx_to_word.get(idx) for idx in words_ids]

    model, model_args = load_model_state(
        f'{output_dir}/{prefix}_best_model.pt', device)
    model = model.to(device)
    batcher_test = SamplingBatcherFlair(
        np.asarray(test_sentences, dtype=object),
        np.asarray(test_labels, dtype=object),
        batch_size=args.batch_size,
        pad_id=pad_id,
        pad_id_labels=pad_id_labels,
        embedding_length=embeddings.embedding_length)
    ne_class_list = set()
    true_labels_for_testing = []
    results_of_prediction = []
    with open(f'{output_dir}/{prefix}_label.txt', 'w', encoding='utf8') as t, \
            open(f'{output_dir}/{prefix}_predict.txt', 'w', encoding='utf8') as p, \
            open(f'{output_dir}/{prefix}_text.txt', 'w', encoding='utf8') as textf:
        with torch.no_grad():
            # predict_no_attn() returns final label strings, not label ids
            preds = predict_no_attn(batcher_test, model, idx_to_tag)
            cnt = 0
            for text, labels, predict_labels in zip(test_sentences,
                                                    test_labels, preds):
                cnt += 1
                tag_labels_true = get_idx_to_tag(labels)
                text_ = text.to_original_text()

                tag_labels_predicted = ' '.join(predict_labels)
                tag_labels_true = ' '.join(tag_labels_true)

                p.write(tag_labels_predicted + '\n')
                t.write(tag_labels_true + '\n')
                textf.write(text_ + '\n')

                tag_labels_true = tag_labels_true.strip().replace('_',
                                                                  '-').split()
                tag_labels_predicted = tag_labels_predicted.strip().replace(
                    '_', '-').split()
                biluo_tags_true = get_biluo(tag_labels_true)
                biluo_tags_predicted = get_biluo(tag_labels_predicted)

                doc = Doc(text_)
                offset_true_labels = offset_from_biluo(doc, biluo_tags_true)
                offset_predicted_labels = offset_from_biluo(
                    doc, biluo_tags_predicted)

                ent_labels = dict()
                for ent in offset_true_labels:
                    start, stop, ent_type = ent
                    ent_type = ent_type.replace('_', '')
                    ne_class_list.add(ent_type)
                    if ent_type in ent_labels:
                        ent_labels[ent_type].append((start, stop))
                    else:
                        ent_labels[ent_type] = [(start, stop)]
                true_labels_for_testing.append(ent_labels)

                ent_labels = dict()
                for ent in offset_predicted_labels:
                    start, stop, ent_type = ent
                    ent_type = ent_type.replace('_', '')
                    if ent_type in ent_labels:
                        ent_labels[ent_type].append((start, stop))
                    else:
                        ent_labels[ent_type] = [(start, stop)]
                results_of_prediction.append(ent_labels)

    from eval.quality import calculate_prediction_quality
    f1, precision, recall, results = \
        calculate_prediction_quality(true_labels_for_testing,
                                     results_of_prediction,
                                     tuple(ne_class_list))
    print(f1, precision, recall, results)
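
All three evaluation paths end by calling calculate_prediction_quality() on
the per-document span dictionaries. A self-contained sketch of the usual
span-level micro-averaged scoring it presumably performs (not the repo's
implementation):

def span_f1(true_docs, pred_docs, classes):
    # Each doc is a dict mapping entity type -> list of (start, stop) spans
    tp = fp = fn = 0
    for true_ents, pred_ents in zip(true_docs, pred_docs):
        for cls in classes:
            t = set(true_ents.get(cls, []))
            p = set(pred_ents.get(cls, []))
            tp += len(t & p)
            fp += len(p - t)
            fn += len(t - p)
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return f1, precision, recall
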