Esempio n. 1
0
    def setUp(self):
        # Define S_matrix
        data_points = [SimpleNamespace(num=num) for num in DATA]
        applier = SFApplier([f, g])
        self.S = applier.apply(data_points, progress_bar=False)

        # Define base architecture
        self.hidden_dim = 10
        self.mlp = nn.Sequential(
            nn.Linear(2, self.hidden_dim),
            nn.Linear(self.hidden_dim, self.hidden_dim),
            nn.ReLU(),
        )

        # Define model parameters
        self.data_name = "test_data"
        self.task_name = "test_task"

        # Define datasets
        # Repeated data value for [N x 2] dim Tensor
        self.X = torch.FloatTensor([(x, x) for x in DATA])
        # Alternating labels
        self.Y = torch.LongTensor([int(i % 2 == 0) for i in range(len(DATA))])

        dataset_name = "test_dataset"
        splits = ["train", "valid"]
        self.datasets = [
            create_dataset(self.X, self.Y, split, dataset_name, self.data_name,
                           self.task_name) for split in splits
        ]

        self.slice_model = SliceAwareClassifier(
            base_architecture=self.mlp,
            head_dim=self.hidden_dim,
            slice_names=[sf.name for sf in sfs],
            input_data_key=self.data_name,
            task_name=self.task_name,
            scorer=Scorer(metrics=["f1"]),
        )
Esempio n. 2
0
# * `head_dim`: identifies the final output feature dimension of the `base_architecture`
# * `slice_names`: Specify the slices that we plan to train on with this classifier.

# %%
from snorkel.slicing import SliceAwareClassifier
from utils import get_pytorch_mlp

# Define model architecture
bow_dim = X_train.shape[1]
hidden_dim = bow_dim
mlp = get_pytorch_mlp(hidden_dim=hidden_dim, num_layers=2)

# Initialize slice model
slice_model = SliceAwareClassifier(
    base_architecture=mlp,
    head_dim=hidden_dim,
    slice_names=[sf.name for sf in sfs],
    scorer=scorer,
)

# %% [markdown]
# Next, we'll generate the remaining `S` matrixes with the new set of slicing functions.

# %% {"tags": ["md-exclude-output"]}
applier = PandasSFApplier(sfs)
S_train = applier.apply(df_train)
S_test = applier.apply(df_test)

# %% [markdown]
# In order to train using slice information, we'd like to initialize a **slice-aware dataloader**.
# To do this, we can use [`slice_model.make_slice_dataloader`](https://snorkel.readthedocs.io/en/v0.9.3/packages/_autosummary/slicing/snorkel.slicing.SliceAwareClassifier.html#snorkel.slicing.SliceAwareClassifier.predict) to add slice labels to an existing dataloader.
#
Esempio n. 3
0
def slicing_evaluation(df_train, df_test, train_model=None):
    if train_model is None:
        train_model = "mlp"

    sfs = [
        SlicingFunction.short_comment, SlicingFunction.ind_keyword,
        SlicingFunction.cmp_re, SlicingFunction.industry_keyword
    ]

    slice_names = [sf.name for sf in sfs]
    scorer = Scorer(metrics=["f1"])

    ft = FT.load(f"{WORK_PATH}/snorkel_flow/sources/fasttext_name_model.bin")

    def get_ftr(text):
        return ft.get_sentence_vector(' '.join(
            [w for w in jieba.lcut(text.strip())]))

    X_train = np.array(list(df_train.text.apply(get_ftr).values))
    X_test = np.array(list(df_test.text.apply(get_ftr).values))
    Y_train = df_train.label.values
    Y_test = df_test.label.values

    if train_model == "lr":
        sklearn_model = LogisticRegression(C=0.001, solver="liblinear")
        sklearn_model.fit(X=X_train, y=Y_train)
        preds_test = sklearn_model.predict(X_test)
        probs_test = preds_to_probs(
            preds_test,
            len([c for c in dir(Polarity) if not c.startswith("__")]))
        print(f"Test set F1: {100 * f1_score(Y_test, preds_test):.1f}%")
        applier = PandasSFApplier(sfs)
        S_test = applier.apply(df_test)
        analysis = scorer.score_slices(S=S_test,
                                       golds=Y_test,
                                       preds=preds_test,
                                       probs=probs_test,
                                       as_dataframe=True)
        return analysis

    if train_model == "mlp":
        # Define model architecture
        bow_dim = X_train.shape[1]
        hidden_dim = bow_dim
        mlp = get_pytorch_mlp(hidden_dim=hidden_dim, num_layers=2)

        # Initialize slice model
        slice_model = SliceAwareClassifier(
            base_architecture=mlp,
            head_dim=hidden_dim,
            slice_names=slice_names,
            scorer=scorer,
        )

        # generate the remaining S matrices with the new set of slicing functions
        applier = PandasSFApplier(sfs)
        S_train = applier.apply(df_train)
        S_test = applier.apply(df_test)

        # add slice labels to an existing dataloader
        BATCH_SIZE = 64

        train_dl = create_dict_dataloader(X_train, Y_train, "train")
        train_dl_slice = slice_model.make_slice_dataloader(
            train_dl.dataset, S_train, shuffle=True, batch_size=BATCH_SIZE)
        test_dl = create_dict_dataloader(X_test, Y_test, "train")
        test_dl_slice = slice_model.make_slice_dataloader(
            test_dl.dataset, S_test, shuffle=False, batch_size=BATCH_SIZE)

        #  fit our classifier with the training set dataloader
        trainer = Trainer(n_epochs=2, lr=1e-4, progress_bar=True)
        trainer.fit(slice_model, [train_dl_slice])

        analysis = slice_model.score_slices([test_dl_slice], as_dataframe=True)
        return analysis
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = range(int(args.num_train_epochs))
    set_seed(
        args)  # Added here for reproductibility (even between python 2 and 3)

    if args.model_type == 'bert-slice-aware' or args.model_type == 'bert-slice-aware-random-slices':
        if args.model_type == 'bert-slice-aware':
            sfs = slicing_functions[args.task_name]
        elif args.model_type == 'bert-slice-aware-random-slices':
            if args.number_random_slices is None or args.size_random_slices is None:
                sfs = random_slicing_functions[args.task_name]
            else:
                sfs = args.sfs
        processor = slicing_processors[args.task_name]()
        examples_train = processor.get_train_examples(args.data_dir)

        snorkel_sf_applier = SFApplier(sfs)

        if os.path.isfile(args.data_dir + "/snorkel_slices_train.pickle"):
            with open(args.data_dir + "/snorkel_slices_train.pickle",
                      "rb") as f:
                logger.info("loaded cached pickle for sliced train.")
                snorkel_slices_train = pickle.load(f)
        else:
            snorkel_slices_train = snorkel_sf_applier.apply(examples_train)
            with open(args.data_dir + "/snorkel_slices_train.pickle",
                      "wb") as f:
                pickle.dump(snorkel_slices_train, f)
                logger.info("dumped pickle with sliced train.")

        snorkel_slices_with_ns = []
        for i, example in enumerate(examples_train):
            for _ in range(len(example.documents)):
                snorkel_slices_with_ns.append(snorkel_slices_train[i])

        snorkel_slices_with_ns_np = np.array(snorkel_slices_with_ns,
                                             dtype=snorkel_slices_train.dtype)

        slice_model = SliceAwareClassifier(
            task_name='labels',
            input_data_key='input_ids',
            base_architecture=model,
            head_dim=768,  #* args.max_seq_length,
            slice_names=[sf.name for sf in sfs])

        X_dict = {
            'input_ids': train_dataset.tensors[0],
            'attention_mask': train_dataset.tensors[1],
            'token_type_ids': train_dataset.tensors[2]
        }
        Y_dict = {'labels': train_dataset.tensors[3]}

        ds = DictDataset(name='labels',
                         split='train',
                         X_dict=X_dict,
                         Y_dict=Y_dict)
        train_dl_slice = slice_model.make_slice_dataloader(
            ds,
            snorkel_slices_with_ns_np,
            shuffle=True,
            batch_size=args.train_batch_size)

        trainer = Trainer(lr=args.learning_rate,
                          n_epochs=int(args.num_train_epochs),
                          l2=args.weight_decay,
                          optimizer="adamax",
                          max_steps=args.max_steps,
                          seed=args.seed)

        trainer.fit(slice_model, [train_dl_slice])
        model = slice_model
    else:
        for _ in train_iterator:
            epoch_iterator = train_dataloader
            for step, batch in enumerate(epoch_iterator):
                model.train()
                batch = tuple(t.to(args.device) for t in batch)
                inputs = {
                    'input_ids': batch[0],
                    'attention_mask': batch[1],
                    'labels': batch[3]
                }
                if args.model_type != 'distilbert':
                    inputs['token_type_ids'] = batch[2] if args.model_type in [
                        'bert', 'xlnet'
                    ] else None  # XLM, DistilBERT and RoBERTa don't use segment_ids
                if args.model_type == 'bert-mtl':
                    inputs["clf_head"] = 0
                outputs = model(**inputs)
                loss = outputs[
                    0]  # model outputs are always tuple in transformers (see doc)

                if args.n_gpu > 1:
                    loss = loss.mean(
                    )  # mean() to average on multi-gpu parallel training
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                tr_loss += loss.item()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

                    if args.local_rank in [
                            -1, 0
                    ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                        # Log metrics
                        if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                            results = evaluate(args,
                                               model,
                                               tokenizer,
                                               sample_percentage=0.01)
                            for key, value in results.items():
                                tb_writer.add_scalar('eval_{}'.format(key),
                                                     value, global_step)
                                ex.log_scalar('eval_{}'.format(key), value,
                                              global_step)
                                logger.info('eval_{}'.format(key) + ": " +
                                            str(value) + ", step: " +
                                            str(global_step))
                        tb_writer.add_scalar('lr',
                                             scheduler.get_lr()[0],
                                             global_step)
                        tb_writer.add_scalar('loss', (tr_loss - logging_loss) /
                                             args.logging_steps, global_step)
                        ex.log_scalar("lr", scheduler.get_lr()[0], global_step)
                        ex.log_scalar("loss", (tr_loss - logging_loss) /
                                      args.logging_steps, global_step)
                        logging_loss = tr_loss

                    if args.local_rank in [
                            -1, 0
                    ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                        # Save model checkpoint
                        output_dir = os.path.join(
                            args.output_dir,
                            'checkpoint-{}'.format(global_step))
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        model_to_save = model.module if hasattr(
                            model, 'module'
                        ) else model  # Take care of distributed/parallel training
                        model_to_save.save_pretrained(output_dir)
                        torch.save(
                            args, os.path.join(output_dir,
                                               'training_args.bin'))
                        logger.info("Saving model checkpoint to %s",
                                    output_dir)

                if args.max_steps > 0 and global_step > args.max_steps:
                    break
                    # epoch_iterator.close()
            if args.max_steps > 0 and global_step > args.max_steps:
                break
                # train_iterator.close()

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return model
Esempio n. 5
0
class SliceCombinerTest(unittest.TestCase):
    def setUp(self):
        # Define S_matrix
        data_points = [SimpleNamespace(num=num) for num in DATA]
        applier = SFApplier([f, g])
        self.S = applier.apply(data_points, progress_bar=False)

        # Define base architecture
        self.hidden_dim = 10
        self.mlp = nn.Sequential(
            nn.Linear(2, self.hidden_dim),
            nn.Linear(self.hidden_dim, self.hidden_dim),
            nn.ReLU(),
        )

        # Define model parameters
        self.data_name = "test_data"
        self.task_name = "test_task"

        # Define datasets
        # Repeated data value for [N x 2] dim Tensor
        self.X = torch.FloatTensor([(x, x) for x in DATA])
        # Alternating labels
        self.Y = torch.LongTensor([int(i % 2 == 0) for i in range(len(DATA))])

        dataset_name = "test_dataset"
        splits = ["train", "valid"]
        self.datasets = [
            create_dataset(self.X, self.Y, split, dataset_name, self.data_name,
                           self.task_name) for split in splits
        ]

        self.slice_model = SliceAwareClassifier(
            base_architecture=self.mlp,
            head_dim=self.hidden_dim,
            slice_names=[sf.name for sf in sfs],
            input_data_key=self.data_name,
            task_name=self.task_name,
            scorer=Scorer(metrics=["f1"]),
        )

    def test_slice_tasks(self):
        """Ensure that all the desired slice tasks are initialized."""

        expected_tasks = {
            # Base task
            "test_task",
            # Slice tasks for default base slice
            "test_task_slice:base_pred",
            "test_task_slice:base_ind",
            # Slice Tasks
            "test_task_slice:f_pred",
            "test_task_slice:f_ind",
            "test_task_slice:g_pred",
            "test_task_slice:g_ind",
        }
        self.assertEqual(self.slice_model.task_names, expected_tasks)

    def test_make_slice_dataloader(self):
        # Test correct construction
        dataloader = self.slice_model.make_slice_dataloader(
            dataset=self.datasets[0], S=self.S)
        Y_dict = dataloader.dataset.Y_dict
        self.assertEqual(len(Y_dict), 7)
        self.assertIn("test_task", Y_dict)
        self.assertIn("test_task_slice:base_pred", Y_dict)
        self.assertIn("test_task_slice:base_ind", Y_dict)
        self.assertIn("test_task_slice:f_pred", Y_dict)
        self.assertIn("test_task_slice:f_ind", Y_dict)
        self.assertIn("test_task_slice:g_pred", Y_dict)
        self.assertIn("test_task_slice:g_ind", Y_dict)

        # Test bad data input
        bad_data_dataset = DictDataset(
            name="test_data",
            split="train",
            X_dict={self.data_name: self.X},
            Y_dict={"bad_labels": self.Y},
        )
        with self.assertRaisesRegex(ValueError, "labels missing"):
            self.slice_model.make_slice_dataloader(dataset=bad_data_dataset,
                                                   S=self.S)

    def test_scores_pipeline(self):
        """Ensure that the appropriate scores are returned with .score and .score_slices."""
        # Make valid dataloader
        valid_dl = self.slice_model.make_slice_dataloader(
            dataset=self.datasets[1], S=self.S, batch_size=4)

        # Eval overall
        scores = self.slice_model.score([valid_dl])
        # All labels should appears in .score() output
        self.assertIn("test_task/test_dataset/valid/f1", scores)
        self.assertIn("test_task_slice:f_pred/test_dataset/valid/f1", scores)
        self.assertIn("test_task_slice:f_pred/test_dataset/valid/f1", scores)
        self.assertIn("test_task_slice:g_ind/test_dataset/valid/f1", scores)
        self.assertIn("test_task_slice:g_ind/test_dataset/valid/f1", scores)

        # Eval on slices
        slice_scores = self.slice_model.score_slices([valid_dl])
        # Check that we eval on 'pred' labels in .score_slices() output
        self.assertIn("test_task/test_dataset/valid/f1", slice_scores)
        self.assertIn("test_task_slice:f_pred/test_dataset/valid/f1",
                      slice_scores)
        self.assertIn("test_task_slice:g_pred/test_dataset/valid/f1",
                      slice_scores)

        # No 'ind' labels!
        self.assertNotIn("test_task_slice:f_ind/test_dataset/valid/f1",
                         slice_scores)
        self.assertNotIn("test_task_slice:g_ind/test_dataset/valid/f1",
                         slice_scores)