def predict(self, data, thresh):
     """Run the model over ``data`` and score thresholded predictions.

     Every batch is moved to ``self.device`` and passed through the model
     with gradients disabled; the sigmoid of the model output is collected
     and thresholded, and micro/macro F1 against the labels is logged.

     Args:
         data: Sized iterable of batches. Each batch is an iterable of
             four tensors, unpacked as ``(input_ids, input_mask,
             segment_ids, label_ids)``.
         thresh: Outputs strictly greater than this become 1, all
             others 0.

     Returns:
         ``(all_logits, y_pred)`` as NumPy arrays: the sigmoid outputs of
         all batches concatenated along axis 0, and the 0/1 predictions.

     Raises:
         ValueError: If ``data`` yields no batches.
     """
     pbar = ProgressBar(n_total=len(data))
     # Collect per-batch arrays and concatenate once after the loop;
     # concatenating inside the loop re-copies all earlier batches each time.
     logit_chunks = []
     label_chunks = []
     self.model.eval()
     with torch.no_grad():
         for step, batch in enumerate(data):
             batch = tuple(t.to(self.device) for t in batch)
             input_ids, input_mask, segment_ids, label_ids = batch
             label_chunks.append(label_ids.detach().cpu().numpy())
             # NOTE(review): positional order here is
             # (input_ids, segment_ids, input_mask) -- confirm it matches
             # the model's forward() signature.
             logits = self.model(input_ids, segment_ids, input_mask)
             logit_chunks.append(logits.sigmoid().detach().cpu().numpy())
             pbar.batch_step(step=step, info={}, bar_type='Testing')
     if not logit_chunks:
         raise ValueError("predict() received no batches to evaluate")
     all_logits = np.concatenate(logit_chunks, axis=0)
     y_true = np.concatenate(label_chunks, axis=0)
     y_pred = (all_logits > thresh) * 1
     micro = f1_score(y_true, y_pred, average='micro')
     macro = f1_score(y_true, y_pred, average='macro')
     score = (micro + macro) / 2
     self.logger.info("\nScore: micro {}, macro {} Average {}".format(
         micro, macro, score))
     if 'cuda' in str(self.device):
         torch.cuda.empty_cache()
     return all_logits, y_pred
コード例 #2
0
 def valid_epoch(self, data):
     """Evaluate the model on ``data`` and record loss and epoch metrics.

     The whole evaluation (forward passes, loss and metric computation)
     runs with gradients disabled. Per-batch outputs and labels are
     appended to ``self.outputs`` / ``self.targets`` and then replaced by
     their concatenation along dim 0.

     Args:
         data: Sized iterable of batches; each batch holds four tensors
             unpacked as ``(input_ids, input_mask, segment_ids,
             label_ids)``.

     Returns:
         ``self.result``, updated with ``'valid_loss'`` and one
         ``'valid_<metric name>'`` entry per epoch metric whose value is
         truthy.
     """
     progress = ProgressBar(n_total=len(data))
     # presumably clears self.outputs / self.targets -- confirm in epoch_reset
     self.epoch_reset()
     self.model.eval()
     with torch.no_grad():
         for batch_idx, raw_batch in enumerate(data):
             on_device = tuple(
                 tensor.to(self.device) for tensor in raw_batch)
             input_ids, input_mask, segment_ids, label_ids = on_device
             batch_logits = self.model(input_ids, input_mask, segment_ids)
             self.outputs.append(batch_logits.cpu().detach())
             self.targets.append(label_ids.cpu().detach())
             progress.batch_step(
                 step=batch_idx, info={}, bar_type='Evaluating')

         stacked_outputs = torch.cat(self.outputs, dim=0)
         stacked_targets = torch.cat(self.targets, dim=0)
         self.outputs = stacked_outputs.cpu().detach()
         self.targets = stacked_targets.cpu().detach()

         valid_loss = self.criterion(
             target=self.targets, output=self.outputs)
         self.result['valid_loss'] = valid_loss.item()
         print("------------- valid result --------------")

         # A falsy epoch_metrics means there is nothing to evaluate.
         for metric in (self.epoch_metrics or ()):
             metric(logits=self.outputs, target=self.targets)
             metric_value = metric.value()
             if not metric_value:
                 # Metrics with a falsy value are not stored.
                 continue
             self.result[f'valid_{metric.name()}'] = metric_value

         if 'cuda' in str(self.device):
             torch.cuda.empty_cache()
         return self.result
コード例 #3
0
 def create_examples(self, lines, example_type, cached_examples_file):
     """Build ``InputExample`` objects from ``lines``, cached on disk.

     If ``cached_examples_file`` exists it is loaded with ``torch.load``
     and returned as-is; otherwise examples are created, saved to that
     file with ``torch.save`` and returned.

     Args:
         lines: Sized iterable of records; ``record[0]`` is used as the
             text and ``record[1]`` as the label. A ``str`` label is
             split on ``","``; any other label is iterated element-wise.
             Every label element is converted with ``float``.
         example_type: Prefix for each guid, formatted as
             ``"<example_type>-<index>"``.
         cached_examples_file: Path-like object providing ``exists()``.

     Returns:
         List of ``InputExample`` (or whatever the cache file contains).
     """
     if cached_examples_file.exists():
         logger.info("Loading examples from cached file %s",
                     cached_examples_file)
         # NOTE: torch.load unpickles the file -- only point this at cache
         # files produced by this code, never at untrusted input.
         return torch.load(cached_examples_file)

     # Only needed when examples are actually being built.
     pbar = ProgressBar(n_total=len(lines))
     examples = []
     for i, line in enumerate(lines):
         guid = '%s-%d' % (example_type, i)
         text_a = line[0]
         label = line[1]
         raw_values = label.split(",") if isinstance(label, str) else label
         # Builtin float replaces the ``np.float`` alias, which was
         # deprecated in NumPy 1.20 and removed in 1.24.
         label = [float(x) for x in raw_values]
         example = InputExample(guid=guid,
                                text_a=text_a,
                                text_b=None,
                                label=label)
         examples.append(example)
         pbar.batch_step(step=i, info={}, bar_type='create examples')
     logger.info("Saving examples into cached file %s",
                 cached_examples_file)
     torch.save(examples, cached_examples_file)
     return examples
コード例 #4
0
 def train_epoch(self, data):
     """Train the model for one pass over ``data``.

     For each batch: forward pass, loss via ``self.criterion``, backward
     pass (through ``amp`` loss scaling when ``self.fp16`` is set) and
     gradient clipping. Every ``self.gradient_accumulation_steps``
     batches the optimizer is stepped, then the LR scheduler, and the
     gradients are zeroed.

     Args:
         data: Sized iterable of batches; each batch holds four tensors
             unpacked as ``(input_ids, input_mask, segment_ids,
             label_ids)``.

     Returns:
         ``self.result``, updated with ``'loss'`` (``tr_loss.avg`` of the
         per-batch loss meter) and one entry per epoch metric whose value
         is truthy.
     """
     pbar = ProgressBar(n_total=len(data))
     tr_loss = AverageMeter()
     self.epoch_reset()
     for step, batch in enumerate(data):
         self.batch_reset()
         self.model.train()
         batch = tuple(t.to(self.device) for t in batch)
         input_ids, input_mask, segment_ids, label_ids = batch
         # NOTE(review): the per-batch shape dumps below look like leftover
         # debugging output; consider removing or gating on self.verbose.
         print("input_ids, input_mask, segment_ids, label_ids SIZE: \n")
         print(input_ids.size(), input_mask.size(), segment_ids.size(),
               label_ids.size())
         logits = self.model(input_ids, input_mask, segment_ids)
         print("logits and label ids size: ", logits.size(),
               label_ids.size())
         loss = self.criterion(output=logits, target=label_ids)
         if len(self.n_gpu) >= 2:
             # Reduce to a scalar when the loss has one entry per device.
             loss = loss.mean()
         if self.gradient_accumulation_steps > 1:
             # Scale so the accumulated gradient matches one large batch.
             loss = loss / self.gradient_accumulation_steps
         if self.fp16:
             with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                 scaled_loss.backward()
             clip_grad_norm_(amp.master_params(self.optimizer),
                             self.grad_clip)
         else:
             loss.backward()
             clip_grad_norm_(self.model.parameters(), self.grad_clip)
         if (step + 1) % self.gradient_accumulation_steps == 0:
             # The optimizer must step before the scheduler: since
             # PyTorch 1.1 the reverse order skips the first value of the
             # learning-rate schedule.
             self.optimizer.step()
             self.lr_scheduler.step()
             self.optimizer.zero_grad()
             self.global_step += 1
         if self.batch_metrics:
             for metric in self.batch_metrics:
                 metric(logits=logits, target=label_ids)
                 self.info[metric.name()] = metric.value()
         self.info['loss'] = loss.item()
         tr_loss.update(loss.item(), n=1)
         if self.verbose >= 1:
             pbar.batch_step(step=step, info=self.info, bar_type='Training')
         self.outputs.append(logits.cpu().detach())
         self.targets.append(label_ids.cpu().detach())
     print("\n------------- train result --------------")
     # Epoch-level metrics are computed over all batches at once.
     self.outputs = torch.cat(self.outputs, dim=0).cpu().detach()
     self.targets = torch.cat(self.targets, dim=0).cpu().detach()
     self.result['loss'] = tr_loss.avg
     if self.epoch_metrics:
         for metric in self.epoch_metrics:
             metric(logits=self.outputs, target=self.targets)
             value = metric.value()
             if value:
                 self.result[f'{metric.name()}'] = value
     if "cuda" in str(self.device):
         torch.cuda.empty_cache()
     return self.result
    def create_features(self, examples, max_seq_len, cached_features_file):
        """Convert examples into fixed-length ``InputFeature`` objects.

        If ``cached_features_file`` exists it is loaded with ``torch.load``
        and returned as-is; otherwise features are built, saved to that
        file with ``torch.save`` and returned.

        Each example's ``text`` is tokenized, truncated to ``max_seq_len``
        tokens, converted to ids and right-padded with ``self.pad_id``.

        Args:
            examples: Sized iterable of objects exposing ``text``,
                ``labels`` and ``guid``.
            max_seq_len: Exact length of every ``input_ids`` list.
            cached_features_file: Path-like object providing ``exists()``.

        Returns:
            List of ``InputFeature`` (or whatever the cache file contains).
        """
        if cached_features_file.exists():
            logger.info("Loading features from cached file %s",
                        cached_features_file)
            # NOTE: torch.load unpickles the file -- only point this at
            # cache files produced by this code.
            return torch.load(cached_features_file)

        # Only needed when features are actually being built.
        pbar = ProgressBar(n_total=len(examples))
        features = []
        for ex_id, example in enumerate(examples):
            # Slicing is a no-op when the text is already short enough.
            tokens = self.tokenizer.tokenize(example.text)[:max_seq_len]
            label_ids = example.labels

            input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
            # Length before padding is stored on the feature.
            input_len = len(input_ids)
            input_ids += [self.pad_id] * (max_seq_len - input_len)

            assert len(input_ids) == max_seq_len

            if ex_id < 2:
                # Log the first two examples as a sanity check. Values are
                # passed as logging arguments so a '%' inside a guid or
                # token is never treated as a format directive.
                logger.info("*** Example ***")
                logger.info("guid: %s", example.guid)
                logger.info("tokens: %s",
                            ' '.join([str(x) for x in tokens]))
                logger.info("input_ids: %s",
                            ' '.join([str(x) for x in input_ids]))

            feature = InputFeature(input_ids=input_ids,
                                   label_ids=label_ids,
                                   input_len=input_len)
            features.append(feature)
            pbar.batch_step(step=ex_id,
                            info={},
                            bar_type='create features')
        logger.info("Saving features into cached file %s",
                    cached_features_file)
        torch.save(features, cached_features_file)
        return features
コード例 #6
0
    def create_features(self, examples, max_seq_len, cached_features_file):
        """Convert examples into padded BERT-style ``InputFeature`` objects.

        If ``cached_features_file`` exists it is loaded with ``torch.load``
        and returned as-is; otherwise features are built, saved to that
        file with ``torch.save`` and returned.

        The convention in BERT is:
        (a) For sequence pairs:
         tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
         type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
        (b) For single sequences:
         tokens:   [CLS] the dog is hairy . [SEP]
         type_ids:   0   0   0   0  0     0   0

        Args:
            examples: Sized iterable of objects exposing ``text_a``,
                ``text_b``, ``label`` and ``guid``.
            max_seq_len: Exact length of ``input_ids``, ``input_mask`` and
                ``segment_ids`` after padding with zeros.
            cached_features_file: Path-like object providing ``exists()``.

        Returns:
            List of ``InputFeature`` (or whatever the cache file contains).
        """
        if cached_features_file.exists():
            logger.info("Loading features from cached file %s",
                        cached_features_file)
            # NOTE: torch.load unpickles the file -- only point this at
            # cache files produced by this code.
            return torch.load(cached_features_file)

        # Only needed when features are actually being built.
        pbar = ProgressBar(n_total=len(examples))
        features = []
        for ex_id, example in enumerate(examples):
            tokens_a = self.tokenizer.tokenize(example.text_a)
            tokens_b = None
            label_id = example.label

            if example.text_b:
                tokens_b = self.tokenizer.tokenize(example.text_b)
                # Modifies `tokens_a` and `tokens_b` in place so that the
                # total length is less than the specified length.
                # Account for [CLS], [SEP], [SEP] with "- 3".
                self.truncate_seq_pair(tokens_a,
                                       tokens_b,
                                       max_length=max_seq_len - 3)
            else:
                # Account for [CLS] and [SEP] with "- 2"; slicing is a
                # no-op when the sequence is already short enough.
                tokens_a = tokens_a[:max_seq_len - 2]

            tokens = ['[CLS]'] + tokens_a + ['[SEP]']
            segment_ids = [0] * len(tokens)
            if tokens_b:
                tokens += tokens_b + ['[SEP]']
                segment_ids += [1] * (len(tokens_b) + 1)

            input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
            # Mask is 1 for real tokens and 0 for padding.
            input_mask = [1] * len(input_ids)
            # Length before padding is stored on the feature.
            input_len = len(input_ids)
            padding = [0] * (max_seq_len - input_len)

            input_ids += padding
            input_mask += padding
            segment_ids += padding

            assert len(input_ids) == max_seq_len
            assert len(input_mask) == max_seq_len
            assert len(segment_ids) == max_seq_len

            if ex_id < 2:
                # Log the first two examples as a sanity check. Values are
                # passed as logging arguments so a '%' inside a guid or
                # token is never treated as a format directive.
                logger.info("*** Example ***")
                logger.info("guid: %s", example.guid)
                logger.info("tokens: %s",
                            ' '.join([str(x) for x in tokens]))
                logger.info("input_ids: %s",
                            ' '.join([str(x) for x in input_ids]))
                logger.info("input_mask: %s",
                            ' '.join([str(x) for x in input_mask]))
                logger.info("segment_ids: %s",
                            ' '.join([str(x) for x in segment_ids]))

            feature = InputFeature(input_ids=input_ids,
                                   input_mask=input_mask,
                                   segment_ids=segment_ids,
                                   label_id=label_id,
                                   input_len=input_len)
            features.append(feature)
            pbar.batch_step(step=ex_id,
                            info={},
                            bar_type='create features')
        logger.info("Saving features into cached file %s",
                    cached_features_file)
        torch.save(features, cached_features_file)
        return features