# * `head_dim`: identifies the final output feature dimension of the `base_architecture`
# * `slice_names`: specifies the slices that we plan to train on with this classifier

# %%
from snorkel.slicing import SliceAwareClassifier
from utils import get_pytorch_mlp

# Define model architecture
bow_dim = X_train.shape[1]
hidden_dim = bow_dim
mlp = get_pytorch_mlp(hidden_dim=hidden_dim, num_layers=2)

# Initialize slice model
slice_model = SliceAwareClassifier(
    base_architecture=mlp,
    head_dim=hidden_dim,
    slice_names=[sf.name for sf in sfs],
    scorer=scorer,
)

# %% [markdown]
# Next, we'll generate the remaining `S` matrices with the new set of slicing functions.

# %% {"tags": ["md-exclude-output"]}
applier = PandasSFApplier(sfs)
S_train = applier.apply(df_train)
S_test = applier.apply(df_test)

# %% [markdown]
# In order to train using slice information, we'd like to initialize a **slice-aware dataloader**.
# To do this, we can use [`slice_model.make_slice_dataloader`](https://snorkel.readthedocs.io/en/v0.9.3/packages/_autosummary/slicing/snorkel.slicing.SliceAwareClassifier.html#snorkel.slicing.SliceAwareClassifier.make_slice_dataloader) to add slice labels to an existing dataloader.
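# %% [markdown]
# A minimal sketch of that dataloader step, assuming the `create_dict_dataloader`
# helper from the tutorial's `utils` module and an assumed `BATCH_SIZE` (the same
# pattern appears in the `slicing_evaluation` function below):

# %%
from utils import create_dict_dataloader

BATCH_SIZE = 64

train_dl = create_dict_dataloader(X_train, Y_train, "train")
train_dl_slice = slice_model.make_slice_dataloader(
    train_dl.dataset, S_train, shuffle=True, batch_size=BATCH_SIZE
)

test_dl = create_dict_dataloader(X_test, Y_test, "test")
test_dl_slice = slice_model.make_slice_dataloader(
    test_dl.dataset, S_test, shuffle=False, batch_size=BATCH_SIZE
)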
def slicing_evaluation(df_train, df_test, train_model=None):
    if train_model is None:
        train_model = "mlp"

    sfs = [
        SlicingFunction.short_comment,
        SlicingFunction.ind_keyword,
        SlicingFunction.cmp_re,
        SlicingFunction.industry_keyword,
    ]
    slice_names = [sf.name for sf in sfs]
    scorer = Scorer(metrics=["f1"])

    ft = FT.load(f"{WORK_PATH}/snorkel_flow/sources/fasttext_name_model.bin")

    def get_ftr(text):
        # Tokenize with jieba, then embed with the fastText sentence vector
        return ft.get_sentence_vector(" ".join(w for w in jieba.lcut(text.strip())))

    X_train = np.array(list(df_train.text.apply(get_ftr).values))
    X_test = np.array(list(df_test.text.apply(get_ftr).values))
    Y_train = df_train.label.values
    Y_test = df_test.label.values

    if train_model == "lr":
        sklearn_model = LogisticRegression(C=0.001, solver="liblinear")
        sklearn_model.fit(X=X_train, y=Y_train)
        preds_test = sklearn_model.predict(X_test)
        # Number of classes = number of (non-dunder) Polarity members
        probs_test = preds_to_probs(
            preds_test,
            len([c for c in dir(Polarity) if not c.startswith("__")]),
        )
        print(f"Test set F1: {100 * f1_score(Y_test, preds_test):.1f}%")
        applier = PandasSFApplier(sfs)
        S_test = applier.apply(df_test)
        analysis = scorer.score_slices(
            S=S_test, golds=Y_test, preds=preds_test, probs=probs_test, as_dataframe=True
        )
        return analysis

    if train_model == "mlp":
        # Define model architecture
        bow_dim = X_train.shape[1]
        hidden_dim = bow_dim
        mlp = get_pytorch_mlp(hidden_dim=hidden_dim, num_layers=2)

        # Initialize slice model
        slice_model = SliceAwareClassifier(
            base_architecture=mlp,
            head_dim=hidden_dim,
            slice_names=slice_names,
            scorer=scorer,
        )

        # Generate the remaining S matrices with the new set of slicing functions
        applier = PandasSFApplier(sfs)
        S_train = applier.apply(df_train)
        S_test = applier.apply(df_test)

        # Add slice labels to existing dataloaders
        BATCH_SIZE = 64
        train_dl = create_dict_dataloader(X_train, Y_train, "train")
        train_dl_slice = slice_model.make_slice_dataloader(
            train_dl.dataset, S_train, shuffle=True, batch_size=BATCH_SIZE)
        # Use the "test" split for the held-out data so scores are keyed correctly
        test_dl = create_dict_dataloader(X_test, Y_test, "test")
        test_dl_slice = slice_model.make_slice_dataloader(
            test_dl.dataset, S_test, shuffle=False, batch_size=BATCH_SIZE)

        # Fit the classifier with the training set dataloader
        trainer = Trainer(n_epochs=2, lr=1e-4, progress_bar=True)
        trainer.fit(slice_model, [train_dl_slice])

        analysis = slice_model.score_slices([test_dl_slice], as_dataframe=True)
        return analysis
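# A hedged usage sketch for `slicing_evaluation`: it assumes `df_train` and
# `df_test` are pandas DataFrames with `text` and `label` columns (the names
# the function body above reads from).
analysis = slicing_evaluation(df_train, df_test, train_model="mlp")
print(analysis)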
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = (RandomSampler(train_dataset) if args.local_rank == -1
                     else DistributedSampler(train_dataset))
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': args.weight_decay
        },
        {
            'params': [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=t_total)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # Multi-GPU training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = range(int(args.num_train_epochs))
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)

    if args.model_type in ('bert-slice-aware', 'bert-slice-aware-random-slices'):
        if args.model_type == 'bert-slice-aware':
            sfs = slicing_functions[args.task_name]
        elif args.model_type == 'bert-slice-aware-random-slices':
            if args.number_random_slices is None or args.size_random_slices is None:
                sfs = random_slicing_functions[args.task_name]
            else:
                sfs = args.sfs

        processor = slicing_processors[args.task_name]()
        examples_train = processor.get_train_examples(args.data_dir)

        # Applying SFs over the train set is expensive, so cache the S matrix
        snorkel_sf_applier = SFApplier(sfs)
        if os.path.isfile(args.data_dir + "/snorkel_slices_train.pickle"):
            with open(args.data_dir + "/snorkel_slices_train.pickle", "rb") as f:
                logger.info("loaded cached pickle for sliced train.")
                snorkel_slices_train = pickle.load(f)
        else:
            snorkel_slices_train = snorkel_sf_applier.apply(examples_train)
            with open(args.data_dir + "/snorkel_slices_train.pickle", "wb") as f:
                pickle.dump(snorkel_slices_train, f)
                logger.info("dumped pickle with sliced train.")

        # Repeat each example's slice row once per candidate document so the
        # S matrix lines up with the flattened training instances
        snorkel_slices_with_ns = []
        for i, example in enumerate(examples_train):
            for _ in range(len(example.documents)):
                snorkel_slices_with_ns.append(snorkel_slices_train[i])
        snorkel_slices_with_ns_np = np.array(snorkel_slices_with_ns,
                                             dtype=snorkel_slices_train.dtype)

        slice_model = SliceAwareClassifier(
            task_name='labels',
            input_data_key='input_ids',
            base_architecture=model,
            head_dim=768,  # * args.max_seq_length
            slice_names=[sf.name for sf in sfs])

        X_dict = {
            'input_ids': train_dataset.tensors[0],
            'attention_mask': train_dataset.tensors[1],
            'token_type_ids': train_dataset.tensors[2]
        }
        Y_dict = {'labels': train_dataset.tensors[3]}
        ds = DictDataset(name='labels', split='train', X_dict=X_dict, Y_dict=Y_dict)

        train_dl_slice = slice_model.make_slice_dataloader(
            ds,
            snorkel_slices_with_ns_np,
            shuffle=True,
            batch_size=args.train_batch_size)

        trainer = Trainer(lr=args.learning_rate,
                          n_epochs=int(args.num_train_epochs),
                          l2=args.weight_decay,
                          optimizer="adamax",
                          max_steps=args.max_steps,
                          seed=args.seed)
        trainer.fit(slice_model, [train_dl_slice])
        model = slice_model
    else:
        for _ in train_iterator:
            epoch_iterator = train_dataloader
            for step, batch in enumerate(epoch_iterator):
                model.train()
                batch = tuple(t.to(args.device) for t in batch)
                inputs = {
                    'input_ids': batch[0],
                    'attention_mask': batch[1],
                    'labels': batch[3]
                }
                if args.model_type != 'distilbert':
                    # XLM, DistilBERT and RoBERTa don't use segment_ids
                    inputs['token_type_ids'] = (batch[2] if args.model_type in ['bert', 'xlnet']
                                                else None)
                if args.model_type == 'bert-mtl':
                    inputs["clf_head"] = 0

                outputs = model(**inputs)
                loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

                if args.n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu parallel training
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                                   args.max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                tr_loss += loss.item()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

                    if (args.local_rank in [-1, 0] and args.logging_steps > 0
                            and global_step % args.logging_steps == 0):
                        # Log metrics
                        if args.local_rank == -1 and args.evaluate_during_training:
                            # Only evaluate when single GPU otherwise metrics may not average well
                            results = evaluate(args, model, tokenizer, sample_percentage=0.01)
                            for key, value in results.items():
                                tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                                ex.log_scalar('eval_{}'.format(key), value, global_step)
                                logger.info('eval_{}'.format(key) + ": " + str(value) +
                                            ", step: " + str(global_step))
                        tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                        tb_writer.add_scalar('loss',
                                             (tr_loss - logging_loss) / args.logging_steps,
                                             global_step)
                        ex.log_scalar("lr", scheduler.get_lr()[0], global_step)
                        ex.log_scalar("loss", (tr_loss - logging_loss) / args.logging_steps,
                                      global_step)
                        logging_loss = tr_loss

                    if (args.local_rank in [-1, 0] and args.save_steps > 0
                            and global_step % args.save_steps == 0):
                        # Save model checkpoint
                        output_dir = os.path.join(args.output_dir,
                                                  'checkpoint-{}'.format(global_step))
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        # Take care of distributed/parallel training
                        model_to_save = model.module if hasattr(model, 'module') else model
                        model_to_save.save_pretrained(output_dir)
                        torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                        logger.info("Saving model checkpoint to %s", output_dir)

                if args.max_steps > 0 and global_step > args.max_steps:
                    break  # epoch_iterator.close()
            if args.max_steps > 0 and global_step > args.max_steps:
                break  # train_iterator.close()

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return model
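# The `set_seed(args)` call in `train` is not defined in this snippet. A minimal
# sketch, assuming the usual transformers-style helper and an `args` namespace
# with `seed` and `n_gpu` fields:
import random

import numpy as np
import torch


def set_seed(args):
    # Seed every RNG the training loop touches so runs are reproducible
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)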
class SliceCombinerTest(unittest.TestCase):
    def setUp(self):
        # Define S_matrix
        data_points = [SimpleNamespace(num=num) for num in DATA]
        applier = SFApplier([f, g])
        self.S = applier.apply(data_points, progress_bar=False)

        # Define base architecture
        self.hidden_dim = 10
        self.mlp = nn.Sequential(
            nn.Linear(2, self.hidden_dim),
            nn.Linear(self.hidden_dim, self.hidden_dim),
            nn.ReLU(),
        )

        # Define model parameters
        self.data_name = "test_data"
        self.task_name = "test_task"

        # Define datasets
        # Repeated data value for [N x 2] dim Tensor
        self.X = torch.FloatTensor([(x, x) for x in DATA])
        # Alternating labels
        self.Y = torch.LongTensor([int(i % 2 == 0) for i in range(len(DATA))])

        dataset_name = "test_dataset"
        splits = ["train", "valid"]
        self.datasets = [
            create_dataset(self.X, self.Y, split, dataset_name, self.data_name, self.task_name)
            for split in splits
        ]

        self.slice_model = SliceAwareClassifier(
            base_architecture=self.mlp,
            head_dim=self.hidden_dim,
            slice_names=[sf.name for sf in sfs],
            input_data_key=self.data_name,
            task_name=self.task_name,
            scorer=Scorer(metrics=["f1"]),
        )

    def test_slice_tasks(self):
        """Ensure that all the desired slice tasks are initialized."""
        expected_tasks = {
            # Base task
            "test_task",
            # Slice tasks for default base slice
            "test_task_slice:base_pred",
            "test_task_slice:base_ind",
            # Slice tasks
            "test_task_slice:f_pred",
            "test_task_slice:f_ind",
            "test_task_slice:g_pred",
            "test_task_slice:g_ind",
        }
        self.assertEqual(self.slice_model.task_names, expected_tasks)

    def test_make_slice_dataloader(self):
        # Test correct construction
        dataloader = self.slice_model.make_slice_dataloader(
            dataset=self.datasets[0], S=self.S)
        Y_dict = dataloader.dataset.Y_dict
        self.assertEqual(len(Y_dict), 7)
        self.assertIn("test_task", Y_dict)
        self.assertIn("test_task_slice:base_pred", Y_dict)
        self.assertIn("test_task_slice:base_ind", Y_dict)
        self.assertIn("test_task_slice:f_pred", Y_dict)
        self.assertIn("test_task_slice:f_ind", Y_dict)
        self.assertIn("test_task_slice:g_pred", Y_dict)
        self.assertIn("test_task_slice:g_ind", Y_dict)

        # Test bad data input
        bad_data_dataset = DictDataset(
            name="test_data",
            split="train",
            X_dict={self.data_name: self.X},
            Y_dict={"bad_labels": self.Y},
        )
        with self.assertRaisesRegex(ValueError, "labels missing"):
            self.slice_model.make_slice_dataloader(dataset=bad_data_dataset, S=self.S)

    def test_scores_pipeline(self):
        """Ensure that the appropriate scores are returned with .score and .score_slices."""
        # Make valid dataloader
        valid_dl = self.slice_model.make_slice_dataloader(
            dataset=self.datasets[1], S=self.S, batch_size=4)

        # Eval overall
        scores = self.slice_model.score([valid_dl])

        # All labels should appear in .score() output
        self.assertIn("test_task/test_dataset/valid/f1", scores)
        self.assertIn("test_task_slice:f_pred/test_dataset/valid/f1", scores)
        self.assertIn("test_task_slice:f_ind/test_dataset/valid/f1", scores)
        self.assertIn("test_task_slice:g_pred/test_dataset/valid/f1", scores)
        self.assertIn("test_task_slice:g_ind/test_dataset/valid/f1", scores)

        # Eval on slices
        slice_scores = self.slice_model.score_slices([valid_dl])

        # Check that we eval on 'pred' labels in .score_slices() output
        self.assertIn("test_task/test_dataset/valid/f1", slice_scores)
        self.assertIn("test_task_slice:f_pred/test_dataset/valid/f1", slice_scores)
        self.assertIn("test_task_slice:g_pred/test_dataset/valid/f1", slice_scores)

        # No 'ind' labels!
        self.assertNotIn("test_task_slice:f_ind/test_dataset/valid/f1", slice_scores)
        self.assertNotIn("test_task_slice:g_ind/test_dataset/valid/f1", slice_scores)
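# The test class above relies on module-level fixtures (`DATA`, `f`, `g`, `sfs`,
# `create_dataset`) defined elsewhere in its test module. A minimal sketch of
# what they could look like; the specific values and predicates here are
# assumptions for illustration, not the originals:
from snorkel.classification import DictDataset
from snorkel.slicing import slicing_function

DATA = [5, 10, 19, 22, 25]  # assumed sample values


@slicing_function()
def f(x):
    # Slice membership: small numbers (assumed predicate)
    return x.num < 20


@slicing_function()
def g(x):
    # Slice membership: large numbers (assumed predicate)
    return x.num > 10


sfs = [f, g]


def create_dataset(X, Y, split, dataset_name, input_name, task_name):
    # Wrap tensors in the DictDataset format the classifier expects
    return DictDataset(
        name=dataset_name,
        split=split,
        X_dict={input_name: X},
        Y_dict={task_name: Y},
    )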