Example #1
    def test_shapes(self, batch_size=1):
        models = utils.build_all_models()
        vocab = utils.build_sample_vocab()
        tag_vocab = utils.build_sample_tag_vocab()
        train_dataset = utils.construct_sample_dataloader()

        data_loader = conlldataloader.get_data_loader(
            vocab,
            tag_vocab,
            train_dataset,
            batch_size=batch_size,
            shuffle=False,
            num_workers=0,
        )

        def _convert(item: object) -> torch.Tensor:
            # normalize model outputs: some models return plain sequences rather than tensors
            if isinstance(item, torch.Tensor):
                return item
            else:
                return torch.Tensor(item)

        for batch_i, (s_ids, x, x_chars, y, weight) in enumerate(data_loader):
            outs = [_convert(model(x, x_chars)) for model in models]
            for i in range(1, len(outs)):
                # all models should output the same dimensions
                assert outs[i].shape == outs[i - 1].shape
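
The _convert helper suggests that some models return plain Python sequences rather than tensors. Below is a minimal self-contained sketch of that normalization, using toy data in place of the project's models:

    import torch

    def _convert(item: object) -> torch.Tensor:
        # mirror the helper above: pass tensors through, wrap anything else
        return item if isinstance(item, torch.Tensor) else torch.tensor(item)

    outs = [
        _convert(torch.zeros(1, 7, 5)),   # a model that already returns a tensor
        _convert([[[0.0] * 5] * 7]),      # a model that returns nested lists
    ]
    assert outs[0].shape == outs[1].shape  # both normalize to shape (1, 7, 5)
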
Example #2
    def _test_single_model_eval(self, model: nn.Module):
        vocab = utils.build_sample_vocab()
        tag_vocab = utils.build_sample_tag_vocab()
        train_dataset = utils.construct_sample_dataloader()
        data_loader = conlldataloader.get_data_loader(
            vocab,
            tag_vocab,
            train_dataset,
            batch_size=1,
            shuffle=False,
            num_workers=0,
        )
        f1_data, _ = ner_utils.compute_f1_dataloader(
            model=model,
            data_loader=data_loader,
            tag_vocab=tag_vocab,
        )

        ner_utils.estimate_f1(
            model=model,
            data_loader=data_loader,
            tag_vocab=tag_vocab,
            threshold=1,
        )

        # compute average
        avg = ner_utils.compute_avg_f1(f1_data)
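
Example #8 below indexes f1_data as f1_data[tag]['f1' | 'precision' | 'recall'], so f1_data is presumably a per-tag dictionary of scores. A toy sketch of a macro average over such a structure (the numbers are hypothetical, and the exact aggregation compute_avg_f1 uses is not shown in these examples):

    f1_data = {
        'B-Method': {'f1': 0.50, 'precision': 0.60, 'recall': 0.43},
        'I-Method': {'f1': 0.40, 'precision': 0.50, 'recall': 0.33},
    }
    avg_f1 = sum(scores['f1'] for scores in f1_data.values()) / len(f1_data)
    print(avg_f1)  # 0.45
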
Example #3
    def _test_single_model_train(self, model: nn.Module):
        vocab = utils.build_sample_vocab()
        tag_vocab = utils.build_sample_tag_vocab()
        train_dataset = utils.construct_sample_dataloader()

        data_loader = conlldataloader.get_data_loader(
            vocab,
            tag_vocab,
            train_dataset,
            batch_size=4,
            shuffle=False,
            num_workers=0,
        )

        if not isinstance(model, dictionary_model.DictionaryModel):
            optim = torch.optim.SGD(model.parameters(),
                                    lr=0.01,
                                    weight_decay=1e-9)
        else:
            optim = None

        # iterate over epochs
        for e in range(1):
            loss_sum = 0

            with tqdm(data_loader) as pbar:
                for i, (s_ids, x, x_chars, y, weight) in enumerate(pbar):
                    if isinstance(model, dictionary_model.DictionaryModel):
                        model.add_example(x.long(), y.long())
                        continue
                    model.zero_grad()
                    model_loss = model.compute_mle(x, x_chars, y)
                    loss = torch.mean(model_loss)
                    loss.backward()  # backpropagate
                    optim.step()  # update parameters
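
The loss_sum accumulator declared at the top of the epoch loop above is never updated in this smoke test; Example #9 shows the intended bookkeeping. Here is a self-contained sketch of that running-average pattern, with plain tensors standing in for per-batch losses:

    import torch

    losses = [torch.tensor(0.50), torch.tensor(0.25), torch.tensor(0.75)]  # stand-in batch losses
    loss_sum = 0.0
    for i, loss in enumerate(losses):
        loss_sum += loss.item()           # accumulate the scalar loss
        running_avg = loss_sum / (i + 1)  # shown in the tqdm postfix in Example #9
    print(loss_sum / len(losses))         # epoch-average loss: 0.5
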
Example #4
    def force_eval(self):
        f1_data, _ = ner.utils.compute_f1_dataloader(
            self.model,
            conlldataloader.get_data_loader(
                self.vocab,
                self.tag_vocab,
                self.test_data,
                batch_size=1,
                shuffle=False,
                num_workers=0,
                label_fn=lambda data, index: (data[index][0], data[index][1]),
            ),
            tag_vocab=self.tag_vocab,
        )

        return f1_data
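
A hedged usage sketch: force_eval presumably lives on an object that already carries the model, vocab, tag_vocab, and test_data attributes used above; the `evaluator` name is a placeholder and its construction is not shown here. print_f1_summary is the same helper called in Example #8:

    f1_data = evaluator.force_eval()
    ner.utils.print_f1_summary(f1_data)
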
Example #5
    def test_one_hot_iteration(self):
        '''
        Ensure that iterating the data loader with one-hot encoding runs without errors.
        '''
        vocab = utils.build_sample_vocab()
        tag_vocab = utils.build_sample_tag_vocab()
        train_dataset = utils.construct_sample_dataloader()


        data_loader = conlldataloader.get_data_loader(
            vocab,
            tag_vocab,
            train_dataset,
            batch_size=4,
            shuffle=False,
            num_workers=0,
            one_hot=True
        )

        for i, entry in enumerate(data_loader):
            # ensure iteration works
            pass
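
If the test should also guarantee that the loader yields at least one batch, a count assertion could be appended (a small sketch; it assumes the sample dataset is non-empty):

        batch_count = sum(1 for _ in data_loader)
        assert batch_count > 0  # the one-hot loader produced at least one batch
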
Example #6
    def test_elmo_from_raw_sentence(self):
        vocab = utils.build_sample_vocab()
        tag_vocab = utils.build_sample_tag_vocab()
        train_dataset = utils.construct_sample_dataloader()
        embedding_dim = 4
        hidden_dim = 4
        batch_size = 4

        elmo_model = elmo_bilstm_crf.ELMo_BiLSTM_CRF(vocab, tag_vocab,
                                                     hidden_dim, batch_size)

        data_loader = conlldataloader.get_data_loader(
            vocab,
            tag_vocab,
            train_dataset,
            batch_size=batch_size,
            shuffle=False,
            num_workers=0,
        )

        for batch_i, (s_ids, x, x_chars, y, weight) in enumerate(data_loader):
            # ensure the ELMo model reconstructs raw sentences from token ids without errors
            elmo_model(x, None)
            break
Example #7
    def test_analyze(self):
        models = utils.build_all_models()
        vocab = utils.build_sample_vocab()
        tag_vocab = utils.build_sample_tag_vocab()
        train_dataset = utils.construct_sample_dataloader()

        data_loader = conlldataloader.get_data_loader(
            vocab,
            tag_vocab,
            train_dataset,
            batch_size=1,
            shuffle=False,
            num_workers=0,
        )

        for model in models:
            ner_utils.analyze_predictions(
                model=model,
                data_loader=data_loader,
                vocab=vocab,
                tag_vocab=tag_vocab,
                log_file=utils.MockFile(),
                csv_file=None,  # no need to output to csv
            )
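
utils.MockFile() presumably just mimics a writable file object; under that assumption, the same call could direct the analysis to a real log file instead (the path is a placeholder):

        with open('analysis.log', 'w') as log_f:
            ner_utils.analyze_predictions(
                model=models[0],
                data_loader=data_loader,
                vocab=vocab,
                tag_vocab=tag_vocab,
                log_file=log_f,
                csv_file=None,
            )
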
Example #8
def load_session_data(session_dir: str, ner_class: str):
    tag_vocab = ner_vocab.build_output_vocab(
        [f'B-{ner_class}', f'I-{ner_class}', 'O'])
    with open(os.path.join(session_dir, "vocab.pkl"), 'rb') as f:
        vocab = pickle.load(f)
    with open(os.path.join(session_dir, "entry_to_sentences.pkl"), 'rb') as f:
        entry_to_sentences = pickle.load(f)
    with open(os.path.join(session_dir, "database.pkl"), 'rb') as f:
        database = pickle.load(f)

    users = get_users(session_dir)

    gold_data = SCIERCDataset(os.path.join(session_dir, "gold_set.txt"))
    gold_data.parse_file()

    user_data = {}

    with open('output.csv', 'w') as f:
        csv_writer = csv.writer(f, delimiter=',')

        row_header = [
            'user_name',
            'train_f1',
            'train_prec',
            'train_rec',
            'train_acc',  # train metrics
            'valid_f1',
            'valid_prec',
            'valid_rec',
            'valid_acc',  # valid metrics
            'gold_f1',
            'gold_prec',
            'gold_rec',
            'gold_acc',  # gold metrics
        ]

        csv_writer.writerow(row_header)

        for user_name in users:
            model = load_model(
                vocab=vocab,
                tag_vocab=tag_vocab,
                file_name=os.path.join(session_dir, user_name, "model.ckpt"),
                session_dir=session_dir,
            )

            if model is None:
                continue

            train_dataset, valid_dataset = get_data_loaders(
                session_dir=session_dir, user_name=user_name)

            row_data = []
            for dataset, dataset_name in zip(
                [train_dataset, valid_dataset, gold_data],
                ['train', 'valid', 'gold']):
                data_loader = conlldataloader.get_data_loader(
                    vocab,
                    tag_vocab,
                    dataset,
                    batch_size=1,
                    shuffle=False,
                    num_workers=0,
                )

                f1_data, acc = compute_f1_data(model, data_loader, tag_vocab)
                # overwritten on each dataset pass, so the entry ends up holding the gold-set results
                user_data[user_name] = {
                    'model': model,
                    'f1_data': f1_data,
                    'acc': acc,
                }

                row_data.extend([
                    f1_data[ner_class]['f1'],
                    f1_data[ner_class]['precision'],
                    f1_data[ner_class]['recall'],
                    acc,
                ])

            csv_writer.writerow(row_data)

            ner.utils.print_f1_summary(f1_data)

    return user_data
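
A hedged call sketch; the session directory and NER class below are placeholders. Each returned entry holds the gold-set scores (see the comment in the dataset loop above):

    user_data = load_session_data('sessions/demo', 'Method')  # hypothetical directory and class
    for user_name, entry in user_data.items():
        print(user_name, entry['acc'])  # accuracy on the gold set for each user
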
Example #9
def train(model, train_data, vocab, tag_vocab):
    trainer = ner.trainer.Trainer(
        model=model,
        learning_rate=0.01,
        weight_decay=1e-4,
        momentum=0,
        optimizer_type='SGD',
        vocab=vocab,
        tags=tag_vocab,
        batch_size=1,
        shuffle=True,
        num_workers=0,
        train_dataset=train_data,
        logger=None,
        device='cpu',
        verbose_print=True,
        verbose_log=False,
        test_dataset=[],
        train_label_fn=lambda data, index:
        (data[index][0], data[index][1][0], data[index][1][1]),
        test_label_fn=lambda data, index:
        (data[index][0], data[index][1][0], data[index][1][1]),
        epoch_comparator=None,
    )

    train_data_loader = conlldataloader.get_data_loader(
        vocab,
        tag_vocab,
        train_data,
        batch_size=1,
        shuffle=False,
        num_workers=0,
        label_fn=lambda data, index:
        (data[index][0], data[index][1][0], data[index][1][1]),
    )
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=0.01,
                                weight_decay=1e-9,
                                momentum=0)
    loss_sum = 0.0
    # model.train()
    print('starting epoch')
    with tqdm(train_data_loader) as pbar:
        for i, (s_ids, x, x_chars, y) in enumerate(pbar):

            print(f'starting iteration: {i}')
            model.zero_grad()
            print(f'running forward pass: {i}')
            model_loss = model.compute_mle(x, x_chars, y, s_ids=s_ids)
            print(f'computed loss at iteration: {i}')
            loss = torch.mean(model_loss)
            loss.backward()  # backpropagate
            optimizer.step()  # update parameters
            loss_sum += loss.item()

            # update TQDM bar
            pbar.set_postfix(loss_avg=loss_sum / (i + 1),
                             epoch="{}/{}".format(0 + 1, 1))

            pbar.refresh()
            print(f'finishing iteration: {i}')

        # model.eval()
        # model.train()
        loss_sum /= len(train_data_loader)
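
The label_fn lambdas in this function imply that each training record is a pair whose second element is itself a 2-tuple. A toy illustration of that access pattern (the field meanings are guesses, not the project's actual schema):

    data = [('sent-0', (['Barack', 'Obama'], ['B-PER', 'I-PER']))]  # hypothetical record layout
    label_fn = lambda data, index: (data[index][0], data[index][1][0], data[index][1][1])
    print(label_fn(data, 0))  # ('sent-0', ['Barack', 'Obama'], ['B-PER', 'I-PER'])
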