# Assumed module-level imports for the methods below. Repo-local helpers
# (ProgressBar, AverageMeter, InputExample, InputFeature, logger) come from
# this project's own modules; `amp` is apex's mixed-precision module and is
# only needed when fp16 training is enabled.
import numpy as np
import torch
from sklearn.metrics import f1_score
from torch.nn.utils import clip_grad_norm_


def predict(self, data, thresh):
    pbar = ProgressBar(n_total=len(data))
    all_logits = None
    y_true = None
    self.model.eval()
    with torch.no_grad():
        for step, batch in enumerate(data):
            batch = tuple(t.to(self.device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            # Accumulate the gold labels on the CPU as one numpy array.
            if y_true is None:
                y_true = label_ids.detach().cpu().numpy()
            else:
                y_true = np.concatenate(
                    [y_true, label_ids.detach().cpu().numpy()], axis=0)
            # Argument order aligned with the other call sites in this file:
            # (input_ids, input_mask, segment_ids).
            logits = self.model(input_ids, input_mask, segment_ids)
            logits = logits.sigmoid()
            if all_logits is None:
                all_logits = logits.detach().cpu().numpy()
            else:
                all_logits = np.concatenate(
                    [all_logits, logits.detach().cpu().numpy()], axis=0)
            pbar.batch_step(step=step, info={}, bar_type='Testing')
    # Binarize the sigmoid outputs at `thresh` and report the mean of
    # micro and macro F1.
    y_pred = (all_logits > thresh) * 1
    micro = f1_score(y_true, y_pred, average='micro')
    macro = f1_score(y_true, y_pred, average='macro')
    score = (micro + macro) / 2
    self.logger.info("\nScore: micro {}, macro {} Average {}".format(
        micro, macro, score))
    if 'cuda' in str(self.device):
        torch.cuda.empty_cache()
    return all_logits, y_pred
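
# --- Illustration (not part of the class) ---------------------------------
# A minimal, self-contained sketch of the thresholding-and-scoring logic used
# in `predict` above, on toy arrays. `score_multilabel` is a hypothetical
# name introduced here for illustration only.
def score_multilabel(logits, y_true, thresh=0.5):
    """Binarize sigmoid outputs at `thresh`, then average micro/macro F1."""
    y_pred = (np.asarray(logits) > thresh).astype(int)
    micro = f1_score(y_true, y_pred, average='micro')
    macro = f1_score(y_true, y_pred, average='macro')
    return micro, macro, (micro + macro) / 2


# Example: two samples, three labels each.
# score_multilabel([[0.9, 0.2, 0.6], [0.1, 0.8, 0.4]],
#                  [[1, 0, 1], [0, 1, 0]])
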
def valid_epoch(self, data):
    pbar = ProgressBar(n_total=len(data))
    self.epoch_reset()
    self.model.eval()
    with torch.no_grad():
        for step, batch in enumerate(data):
            batch = tuple(t.to(self.device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            logits = self.model(input_ids, input_mask, segment_ids)
            self.outputs.append(logits.cpu().detach())
            self.targets.append(label_ids.cpu().detach())
            pbar.batch_step(step=step, info={}, bar_type='Evaluating')
    self.outputs = torch.cat(self.outputs, dim=0).cpu().detach()
    self.targets = torch.cat(self.targets, dim=0).cpu().detach()
    loss = self.criterion(target=self.targets, output=self.outputs)
    self.result['valid_loss'] = loss.item()
    print("------------- valid result --------------")
    if self.epoch_metrics:
        for metric in self.epoch_metrics:
            metric(logits=self.outputs, target=self.targets)
            value = metric.value()
            if value:
                self.result[f'valid_{metric.name()}'] = value
    if 'cuda' in str(self.device):
        torch.cuda.empty_cache()
    return self.result
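
# --- Illustration (not part of the class) ---------------------------------
# `valid_epoch` and `train_epoch` assume a small metric protocol: each metric
# is called with logits/target, then exposes name() and value(). A minimal
# sketch of a class satisfying that protocol (hypothetical, not necessarily
# this repo's actual metric implementation):
class AccuracyThresh:
    def __init__(self, thresh=0.5):
        self.thresh = thresh
        self._value = None

    def __call__(self, logits, target):
        # Binarize sigmoid outputs and compare element-wise with the targets.
        y_pred = (logits.sigmoid() > self.thresh).float()
        self._value = (y_pred == target).float().mean().item()

    def name(self):
        return 'accuracy'

    def value(self):
        return self._value
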
def create_examples(self, lines, example_type, cached_examples_file):
    '''Creates examples for the data set.'''
    pbar = ProgressBar(n_total=len(lines))
    if cached_examples_file.exists():
        logger.info("Loading examples from cached file %s",
                    cached_examples_file)
        examples = torch.load(cached_examples_file)
    else:
        examples = []
        for i, line in enumerate(lines):
            guid = '%s-%d' % (example_type, i)
            text_a = line[0]
            label = line[1]
            # Labels may arrive either as a comma-separated string ("1,0,0")
            # or as an iterable of per-class flags. (Note: the builtin float
            # replaces the deprecated np.float.)
            if isinstance(label, str):
                label = [float(x) for x in label.split(",")]
            else:
                label = [float(x) for x in list(label)]
            text_b = None
            example = InputExample(guid=guid,
                                   text_a=text_a,
                                   text_b=text_b,
                                   label=label)
            examples.append(example)
            pbar.batch_step(step=i, info={}, bar_type='create examples')
        logger.info("Saving examples into cached file %s",
                    cached_examples_file)
        torch.save(examples, cached_examples_file)
    return examples
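
# --- Illustration (not part of the class) ---------------------------------
# The `lines` format `create_examples` expects, shown with hypothetical toy
# data: each row is (text, labels), where labels is either a comma-separated
# string or an iterable of 0/1 flags, one per class.
toy_lines = [
    ("this movie was great", "1,0,0"),
    ("terrible acting", [0.0, 1.0, 0.0]),
]
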
def train_epoch(self, data):
    pbar = ProgressBar(n_total=len(data))
    tr_loss = AverageMeter()
    self.epoch_reset()
    for step, batch in enumerate(data):
        self.batch_reset()
        self.model.train()
        batch = tuple(t.to(self.device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch
        logits = self.model(input_ids, input_mask, segment_ids)
        loss = self.criterion(output=logits, target=label_ids)
        # `n_gpu` is assumed to be a sequence of device ids here;
        # DataParallel returns one loss per GPU, so reduce to a scalar.
        if len(self.n_gpu) >= 2:
            loss = loss.mean()
        if self.gradient_accumulation_steps > 1:
            loss = loss / self.gradient_accumulation_steps
        if self.fp16:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
            clip_grad_norm_(amp.master_params(self.optimizer), self.grad_clip)
        else:
            loss.backward()
            clip_grad_norm_(self.model.parameters(), self.grad_clip)
        if (step + 1) % self.gradient_accumulation_steps == 0:
            # Step the optimizer before the scheduler; the reverse order
            # triggers a warning on PyTorch >= 1.1.
            self.optimizer.step()
            self.lr_scheduler.step()
            self.optimizer.zero_grad()
            self.global_step += 1
        if self.batch_metrics:
            for metric in self.batch_metrics:
                metric(logits=logits, target=label_ids)
                self.info[metric.name()] = metric.value()
        self.info['loss'] = loss.item()
        tr_loss.update(loss.item(), n=1)
        if self.verbose >= 1:
            pbar.batch_step(step=step, info=self.info, bar_type='Training')
        self.outputs.append(logits.cpu().detach())
        self.targets.append(label_ids.cpu().detach())
    print("\n------------- train result --------------")
    # Epoch-level metrics are computed over the concatenated batches.
    self.outputs = torch.cat(self.outputs, dim=0).cpu().detach()
    self.targets = torch.cat(self.targets, dim=0).cpu().detach()
    self.result['loss'] = tr_loss.avg
    if self.epoch_metrics:
        for metric in self.epoch_metrics:
            metric(logits=self.outputs, target=self.targets)
            value = metric.value()
            if value:
                self.result[f'{metric.name()}'] = value
    if "cuda" in str(self.device):
        torch.cuda.empty_cache()
    return self.result
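
# --- Illustration (not part of the class) ---------------------------------
# The gradient-accumulation pattern used in `train_epoch`, reduced to its
# core as a hypothetical standalone sketch: the loss is scaled down by the
# accumulation factor, gradients pile up in .grad across batches, and the
# optimizer is stepped only every `accum_steps` batches.
def accumulation_demo(accum_steps=4, n_batches=8):
    model = torch.nn.Linear(8, 3)
    opt = torch.optim.SGD(model.parameters(), lr=0.1)
    criterion = torch.nn.BCEWithLogitsLoss()
    for step in range(n_batches):
        x = torch.randn(2, 8)
        y = torch.randint(0, 2, (2, 3)).float()
        loss = criterion(model(x), y) / accum_steps  # scale so grads average
        loss.backward()                              # grads accumulate in .grad
        if (step + 1) % accum_steps == 0:
            opt.step()       # one optimizer update per accum_steps batches
            opt.zero_grad()
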
def create_features(self, examples, max_seq_len, cached_features_file):
    pbar = ProgressBar(n_total=len(examples))
    if cached_features_file.exists():
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features = torch.load(cached_features_file)
    else:
        features = []
        for ex_id, example in enumerate(examples):
            tokens = self.tokenizer.tokenize(example.text)
            label_ids = example.labels
            if len(tokens) > max_seq_len:
                tokens = tokens[:max_seq_len]
            input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
            padding = [self.pad_id] * (max_seq_len - len(input_ids))
            # Record the unpadded length before appending the padding.
            input_len = len(input_ids)
            input_ids += padding
            assert len(input_ids) == max_seq_len
            if ex_id < 2:
                logger.info("*** Example ***")
                logger.info(f"guid: {example.guid}")
                logger.info(f"tokens: {' '.join([str(x) for x in tokens])}")
                logger.info(
                    f"input_ids: {' '.join([str(x) for x in input_ids])}")
            feature = InputFeature(input_ids=input_ids,
                                   label_ids=label_ids,
                                   input_len=input_len)
            features.append(feature)
            pbar.batch_step(step=ex_id, info={}, bar_type='create features')
        logger.info("Saving features into cached file %s",
                    cached_features_file)
        torch.save(features, cached_features_file)
    return features
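
# --- Illustration (not part of the class) ---------------------------------
# The pad-or-truncate invariant enforced above, as a hypothetical standalone
# sketch: every feature ends up exactly max_seq_len ids long, and input_len
# records how many of those ids are real tokens.
def pad_or_truncate(ids, max_seq_len, pad_id=0):
    ids = ids[:max_seq_len]
    input_len = len(ids)
    return ids + [pad_id] * (max_seq_len - input_len), input_len


assert pad_or_truncate([5, 6, 7], 5) == ([5, 6, 7, 0, 0], 3)
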
def create_features(self, examples, max_seq_len, cached_features_file):
    '''
    The convention in BERT is:
    (a) For sequence pairs:
        tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        type_ids:   0   0    0    0    0      0    0   0    1  1  1   1  1   1
    (b) For single sequences:
        tokens:   [CLS] the dog is hairy . [SEP]
        type_ids:   0    0   0   0   0   0   0
    '''
    pbar = ProgressBar(n_total=len(examples))
    if cached_features_file.exists():
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features = torch.load(cached_features_file)
    else:
        features = []
        for ex_id, example in enumerate(examples):
            tokens_a = self.tokenizer.tokenize(example.text_a)
            tokens_b = None
            label_id = example.label
            if example.text_b:
                tokens_b = self.tokenizer.tokenize(example.text_b)
                # Modifies `tokens_a` and `tokens_b` in place so that the
                # total length is less than the specified length.
                # Account for [CLS], [SEP], [SEP] with "- 3".
                self.truncate_seq_pair(tokens_a, tokens_b,
                                       max_length=max_seq_len - 3)
            else:
                # Account for [CLS] and [SEP] with "- 2".
                if len(tokens_a) > max_seq_len - 2:
                    tokens_a = tokens_a[:max_seq_len - 2]
            tokens = ['[CLS]'] + tokens_a + ['[SEP]']
            segment_ids = [0] * len(tokens)
            if tokens_b:
                tokens += tokens_b + ['[SEP]']
                segment_ids += [1] * (len(tokens_b) + 1)
            input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
            input_mask = [1] * len(input_ids)
            padding = [0] * (max_seq_len - len(input_ids))
            # Record the unpadded length before appending the padding.
            input_len = len(input_ids)
            input_ids += padding
            input_mask += padding
            segment_ids += padding
            assert len(input_ids) == max_seq_len
            assert len(input_mask) == max_seq_len
            assert len(segment_ids) == max_seq_len
            if ex_id < 2:
                logger.info("*** Example ***")
                logger.info(f"guid: {example.guid}")
                logger.info(f"tokens: {' '.join([str(x) for x in tokens])}")
                logger.info(
                    f"input_ids: {' '.join([str(x) for x in input_ids])}")
                logger.info(
                    f"input_mask: {' '.join([str(x) for x in input_mask])}")
                logger.info(
                    f"segment_ids: {' '.join([str(x) for x in segment_ids])}")
            feature = InputFeature(input_ids=input_ids,
                                   input_mask=input_mask,
                                   segment_ids=segment_ids,
                                   label_id=label_id,
                                   input_len=input_len)
            features.append(feature)
            pbar.batch_step(step=ex_id, info={}, bar_type='create features')
        logger.info("Saving features into cached file %s",
                    cached_features_file)
        torch.save(features, cached_features_file)
    return features
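
# --- Illustration (not part of the class) ---------------------------------
# `truncate_seq_pair` is referenced above but not shown in this section. A
# sketch matching the standard BERT reference implementation (this repo's own
# method may differ): trim the longer sequence one token at a time until the
# pair fits, so both sequences keep a proportionate share of the budget.
def truncate_seq_pair(tokens_a, tokens_b, max_length):
    while len(tokens_a) + len(tokens_b) > max_length:
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()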