Beispiel #1
0
    def evaluate_dataloader(self,
                            data: DataLoader,
                            criterion: Callable,
                            metric: MetricDict = None,
                            output=False,
                            logger=None,
                            ratio_width=None,
                            **kwargs):
        """Evaluate the model on every batch of ``data``.

        Args:
            data: Iterable of batches; must support ``len()``.
            criterion: Not used here; the loss is read from the ``'loss'``
                entry of the output returned by ``feed_batch``.
            metric: Metric object, reset up front and updated per batch.
                Despite the ``None`` default it is dereferenced
                unconditionally, so callers have to supply one.
            output: Not used by this method.
            logger: Forwarded to the progress timer.
            ratio_width: Forwarded to the progress timer.
            **kwargs: Ignored.

        Returns:
            A tuple ``(mean loss per batch, metric)``.
        """
        metric.reset()
        self.model.eval()
        progress = CountdownTimer(len(data))
        loss_sum = 0
        for seen, batch in enumerate(data, start=1):
            out, mask = self.feed_batch(batch)
            loss_sum += out['loss'].item()
            self.decode_output(out, mask, batch)
            self.update_metrics(metric, batch, out, mask)
            progress.log(f'loss: {loss_sum / seen:.4f} {metric.cstr()}',
                         logger=logger,
                         ratio_percentage=False,
                         ratio_width=ratio_width)
            # Drop the per-batch references before the next batch is fed.
            del out, mask
        return loss_sum / len(data), metric
Beispiel #2
0
 def build_dataloader(self,
                      data,
                      transform: Callable = None,
                      training=False,
                      device=None,
                      logger: logging.Logger = None,
                      cache=False,
                      gradient_accumulation=1,
                      **kwargs) -> DataLoader:
     """Build a padded dataloader for constituency parsing data.

     Args:
         data: Input data; when it is a ``str`` the dataset cache is purged.
         transform: Passed to ``CRFConstituencyParsing.build_dataset``.
         training: Used as the sampler's ``shuffle`` flag and forwarded to
             ``cache_dataset``.
         device: Forwarded to ``PadSequenceDataLoader``.
         logger: Forwarded to vocab building and dataset caching.
         cache: Not used by this method.
         gradient_accumulation: Forwarded to the sampler builder.
         **kwargs: Ignored.

     Returns:
         A ``PadSequenceDataLoader`` over the built dataset.
     """
     dataset = CRFConstituencyParsing.build_dataset(self, data, transform)
     if isinstance(data, str):
         dataset.purge_cache()
     # Vocabularies are only (re)built while they are still mutable.
     if self.vocabs.mutable:
         CRFConstituencyParsing.build_vocabs(self, dataset, logger)
     if dataset.cache:
         progress = CountdownTimer(len(dataset))
         # noinspection PyCallByClass
         BiaffineDependencyParser.cache_dataset(self, dataset, progress,
                                                training, logger)
     batch_sampler = self.sampler_builder.build(
         self.compute_lens(data, dataset),
         shuffle=training,
         gradient_accumulation=gradient_accumulation)
     return PadSequenceDataLoader(batch_sampler=batch_sampler,
                                  device=device,
                                  dataset=dataset)
Beispiel #3
0
 def evaluate_dataloader(self,
                         data: DataLoader,
                         criterion: Callable,
                         metric,
                         logger,
                         ratio_width=None,
                         output=False,
                         official=False,
                         confusion_matrix=False,
                         **kwargs):
     """Evaluate on ``data`` and optionally compute scores via ``compute_srl_f1``.

     Args:
         data: Iterable of batches; must support ``len()``.
         criterion: Not used here; the loss comes from ``feed_batch``.
         metric: Metric object reset via ``reset_metrics`` and updated per batch.
         logger: Receives progress and, when truthy, the score tables.
         ratio_width: Forwarded to the progress timer.
         output: Not used by this method.
         official: When true, tokens, gold labels and predictions of all
             batches are collected and scored with ``compute_srl_f1``.
         confusion_matrix: When true (and ``official`` and ``logger`` are
             set), a label confusion matrix is logged as a markdown table.
         **kwargs: Ignored.

     Returns:
         ``(total loss / timer.total, scores)`` where ``scores`` is the result
         of ``compute_srl_f1`` if ``official`` else ``metric``.
     """
     self.model.eval()
     self.reset_metrics(metric)
     timer = CountdownTimer(len(data))
     loss_sum = 0
     if official:
         sentences, gold_srl, pred_srl = [], [], []
     for batch in data:
         output_dict = self.feed_batch(batch)
         if official:
             sentences.extend(batch['token'])
             gold_srl.extend(batch['srl'])
             pred_srl.extend(output_dict['prediction'])
         self.update_metrics(batch, output_dict, metric)
         loss_sum += output_dict['loss'].item()
         running_loss = loss_sum / (timer.current + 1)
         timer.log(self.report_metrics(running_loss, metric), ratio_percentage=None,
                   logger=logger,
                   ratio_width=ratio_width)

     if not official:
         return loss_sum / timer.total, metric

     scores = compute_srl_f1(sentences, gold_srl, pred_srl)
     if logger:
         if confusion_matrix:
             # Every label that occurs on either side of a confusion pair.
             labels = sorted(set(label for pair in scores.label_confusions.keys() for label in pair))
             rows = [[gold_label] + [scores.label_confusions.get((gold_label, pred_label), 0)
                                     for pred_label in labels]
                     for gold_label in labels]
             rendered = markdown_table(['GOLD↓PRED→'] + labels, rows)
             width = len(rendered.splitlines()[0])
             logger.info(f'{"Confusion Matrix": ^{width}}')
             logger.info(rendered)
         settings = [
             ('Unlabeled', (scores.unlabeled_precision, scores.unlabeled_recall, scores.unlabeled_f1)),
             ('Labeled', (scores.precision, scores.recall, scores.f1)),
             ('Official', (scores.conll_precision, scores.conll_recall, scores.conll_f1)),
         ]
         score_rows = [[name] + [f'{value:.2%}' for value in prf] for name, prf in settings]
         table = markdown_table(['Settings', 'Precision', 'Recall', 'F1'], score_rows)
         width = len(table.splitlines()[0])
         logger.info(f'{"Scores": ^{width}}')
         logger.info(table)
     return loss_sum / timer.total, scores
Beispiel #4
0
 def fit_dataloader(self,
                    trn: DataLoader,
                    criterion,
                    optimizer,
                    metric,
                    logger: logging.Logger,
                    history: History,
                    gradient_accumulation=1,
                    grad_norm=None,
                    ratio_width=None,
                    eval_trn=False,
                    **kwargs):
     """Train for one pass over ``trn`` with optional gradient accumulation.

     Args:
         trn: Iterable of training batches; must support ``len()``.
         criterion: Forwarded to ``compute_loss``.
         optimizer: A pair that is unpacked into ``(optimizer, scheduler)``.
         metric: Updated on training predictions when ``eval_trn`` is true.
         logger: Forwarded to the progress timer.
         history: Decides via ``history.step`` when an optimizer step is due
             and supplies the number of training steps for the timer.
         gradient_accumulation: Divisor applied to the loss when greater
             than 1; also forwarded to ``history``.
         grad_norm: Forwarded to ``self._step``.
         ratio_width: Forwarded to the progress timer.
         eval_trn: Whether to decode and score the training batches.
         **kwargs: Ignored.
     """
     optimizer, scheduler = optimizer
     self.model.train()
     num_steps = history.num_training_steps(len(trn), gradient_accumulation=gradient_accumulation)
     timer = CountdownTimer(num_steps)
     loss_sum = 0
     for seen, batch in enumerate(trn, start=1):
         pred, mask = self.feed_batch(batch)
         loss = self.compute_loss(criterion, pred, batch['srl_id'], mask)
         if gradient_accumulation and gradient_accumulation > 1:
             loss /= gradient_accumulation
         loss.backward()
         loss_sum += loss.item()
         if eval_trn:
             self.update_metrics(metric, self.decode_output(pred, mask, batch), batch)
         if history.step(gradient_accumulation):
             self._step(optimizer, scheduler, grad_norm)
             report = f'loss: {loss_sum / seen:.4f}'
             if eval_trn:
                 report += f' {metric}'
             timer.log(report, logger=logger, ratio_percentage=False, ratio_width=ratio_width)
         # Release per-batch references before the next forward pass.
         del loss, pred, mask
Beispiel #5
0
 def fit_dataloader(self,
                    trn: DataLoader,
                    criterion,
                    optimizer,
                    metric,
                    logger: logging.Logger,
                    linear_scheduler=None,
                    gradient_accumulation=1,
                    **kwargs):
     """Train for one pass over ``trn`` with gradient accumulation.

     Args:
         trn: Iterable of training batches; must support ``len()``.
         criterion: Not used here; the loss comes from ``feed_batch``.
         optimizer: Forwarded to ``self._step``.
         metric: Reset up front and updated on every batch.
         logger: Forwarded to the progress timer.
         linear_scheduler: Forwarded to ``self._step``.
         gradient_accumulation: Number of batches whose gradients are
             accumulated before ``self._step`` is called.
         **kwargs: Ignored.

     Returns:
         The accumulated (scaled) loss divided by ``timer.total``.
     """
     self.model.train()
     timer = CountdownTimer(len(trn) // gradient_accumulation)
     total_loss = 0
     self.reset_metrics(metric)
     for idx, batch in enumerate(trn):
         output_dict = self.feed_batch(batch)
         self.update_metrics(batch, output_dict, metric)
         loss = output_dict['loss']
         loss = loss.sum()  # For data parallel
         # Scale BEFORE backward so the accumulated gradient is the average over
         # the accumulated batches; scaling afterwards only changes the reported number.
         if gradient_accumulation and gradient_accumulation > 1:
             loss = loss / gradient_accumulation
         if torch.isnan(loss):  # w/ gold pred, some batches do not have PAs at all, resulting in empty scores
             loss = torch.zeros((1,), device=loss.device)
         else:
             loss.backward()
         if self.config.grad_norm:
             torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.grad_norm)
         if (idx + 1) % gradient_accumulation == 0:
             self._step(optimizer, linear_scheduler)
             timer.log(self.report_metrics(total_loss / (timer.current + 1), metric), ratio_percentage=None,
                       logger=logger)
         total_loss += loss.item()
         del loss
     # Flush gradients left over from an incomplete final accumulation window.
     if len(trn) % gradient_accumulation:
         self._step(optimizer, linear_scheduler)
     return total_loss / timer.total
 def evaluate_dataloader(self,
                         data,
                         criterion,
                         logger=None,
                         ratio_width=None,
                         metric=None,
                         output=None,
                         **kwargs):
     """Evaluate the model on every batch of ``data``.

     Args:
         data: Iterable of batches; must support ``len()``.
         criterion: Not used here; ``compute_loss`` is called without it.
         logger: Forwarded to the progress timer.
         ratio_width: Forwarded to the progress timer.
         metric: Metric to reset and update; a fresh one is created with
             ``build_metric`` when this is falsy.
         output: Nothing is written to it here. If it is truthy and exposes a
             callable ``close`` it is closed at the end; other truthy values
             (e.g. ``True`` or a path string) are left alone.
         **kwargs: Ignored.

     Returns:
         A tuple ``(mean loss per batch, metric)``.
     """
     self.model.eval()
     total_loss = 0
     if not metric:
         metric = self.build_metric()
     else:
         metric.reset()
     timer = CountdownTimer(len(data))
     for idx, batch in enumerate(data):
         out, mask = self.feed_batch(batch)
         y = batch['chart_id']
         loss, span_probs = self.compute_loss(out, y, mask)
         total_loss += loss.item()
         prediction = self.decode_output(out, mask, batch, span_probs)
         self.update_metrics(metric, batch, prediction)
         timer.log(f'loss: {total_loss / (idx + 1):.4f} {metric}',
                   ratio_percentage=False,
                   logger=logger,
                   ratio_width=ratio_width)
     total_loss /= len(data)
     # This method never opens ``output`` itself, so only close objects that
     # can actually be closed instead of failing after the whole evaluation.
     close = getattr(output, 'close', None) if output else None
     if callable(close):
         close()
     return total_loss, metric
Beispiel #7
0
    def build_dataloader(self, data, batch_size,
                         gradient_accumulation=1,
                         shuffle=False,
                         sampler_builder: SamplerBuilder = None,
                         device=None,
                         logger: logging.Logger = None,
                         **kwargs) -> DataLoader:
        """Build a dataloader, pre-processing the dataset when loaded from a path.

        Args:
            data: Input data; a ``str`` triggers cache purging and a full
                pass over the dataset.
            batch_size: Forwarded to ``_create_dataloader``.
            gradient_accumulation: Forwarded to the sampler builder, but only
                when ``dataset.cache`` is truthy (otherwise ``1`` is used).
            shuffle: Forwarded to the sampler builder and
                ``_create_dataloader``; its negation is the second argument
                of ``build_dataset``.
            sampler_builder: Defaults to
                ``SortingSamplerBuilder(batch_max_tokens=500)`` when falsy.
            device: Forwarded to ``_create_dataloader``.
            logger: Forwarded to vocab building, finalisation and summary.
            **kwargs: Ignored.

        Returns:
            Whatever ``_create_dataloader`` returns.
        """
        dataset = self.build_dataset(data, not shuffle)
        if self.vocabs.mutable:
            self.build_vocabs(dataset, logger)
        self.finalize_dataset(dataset, logger)

        if isinstance(data, str):
            dataset.purge_cache()
            progress = CountdownTimer(len(dataset))
            longest = 0
            for sample in dataset:
                num_tokens = len(sample['text_token_ids'])
                if num_tokens > longest:
                    longest = num_tokens
                progress.log(f'Preprocessing and caching samples (longest sequence {longest})'
                             f'[blink][yellow]...[/yellow][/blink]')
            if self.vocabs.mutable:
                self.vocabs.lock()
                self.vocabs.summary(logger)

        sampler_builder = sampler_builder or SortingSamplerBuilder(batch_max_tokens=500)
        lengths = [len(sample['text_token_ids']) for sample in dataset]
        accumulation = gradient_accumulation if dataset.cache else 1
        sampler = sampler_builder.build(lengths, shuffle, accumulation)
        return self._create_dataloader(dataset, batch_size, device, sampler, shuffle)
Beispiel #8
0
 def _build_cache(self, dataset, verbose=HANLP_VERBOSE):
     """Serialise every batch of ``dataset`` into ``self.filename``.

     Args:
         dataset: Iterable of batches; each one is written with ``torch.save``.
         verbose: When truthy, progress is logged after each batch.
     """
     progress = CountdownTimer(self.size)
     with open(self.filename, "wb") as cache_file:
         for batch in dataset:
             torch.save(batch, cache_file, _use_new_zipfile_serialization=False)
             if not verbose:
                 continue
             progress.log(f'Caching {self.filename} [blink][yellow]...[/yellow][/blink]')
Beispiel #9
0
 def fit_dataloader(self,
                    trn: DataLoader,
                    criterion,
                    optimizer,
                    metric,
                    logger: logging.Logger,
                    linear_scheduler=None,
                    **kwargs):
     """Train for one pass over ``trn``, stepping the optimizer on every batch.

     Args:
         trn: Iterable of training batches; must support ``len()``.
         criterion: Not used here; the loss comes from ``feed_batch``.
         optimizer: Zeroed before and stepped after each backward pass.
         metric: Not used here; ``reset_metrics``/``report_metrics`` are
             called without it.
         logger: Forwarded to the progress timer.
         linear_scheduler: Stepped after each optimizer step when truthy.
         **kwargs: Ignored.

     Returns:
         The summed batch loss divided by ``timer.total``.
     """
     self.model.train()
     timer = CountdownTimer(len(trn))
     loss_sum = 0
     self.reset_metrics()
     for batch in trn:
         optimizer.zero_grad()
         loss = self.feed_batch(batch)['loss']
         loss.backward()
         max_norm = self.config.grad_norm
         if max_norm:
             clip_grad_norm(self.model, max_norm)
         optimizer.step()
         if linear_scheduler:
             linear_scheduler.step()
         loss_sum += loss.item()
         running_loss = loss_sum / (timer.current + 1)
         timer.log(self.report_metrics(running_loss),
                   ratio_percentage=None,
                   logger=logger)
         del loss
     return loss_sum / timer.total
Beispiel #10
0
    def evaluate_dataloader(self,
                            data,
                            criterion,
                            logger=None,
                            ratio_width=None,
                            metric=None,
                            output=None,
                            **kwargs):
        """Evaluate the model on ``data``, optionally writing predictions.

        Args:
            data: Iterable of batches; must support ``len()``.
            criterion: Forwarded to ``compute_loss``.
            logger: Forwarded to the progress timer.
            ratio_width: Forwarded to the progress timer.
            metric: Metric to reset and update; a fresh one is created with
                ``build_metric`` when this is falsy.
            output: A path (opened for writing here) or an already opened
                writable object. When truthy, each batch's predictions are
                passed to ``write_prediction`` and the object is closed
                before returning, including when evaluation raises.
            **kwargs: Ignored.

        Returns:
            A tuple ``(mean loss per batch as float, metric)``.
        """
        self.model.eval()
        if isinstance(output, str):
            output = open(output, 'w')

        # try/finally so the file opened above is not leaked when a batch fails.
        try:
            loss = 0
            if not metric:
                metric = self.build_metric()
            else:
                metric.reset()
            timer = CountdownTimer(len(data))
            for idx, batch in enumerate(data):
                logits, mask = self.feed_batch(batch)
                y = batch['tag_id']
                loss += self.compute_loss(criterion, logits, y, mask).item()
                prediction = self.decode_output(logits, mask, batch)
                self.update_metrics(metric, logits, y, mask, batch, prediction)
                if output:
                    self.write_prediction(prediction, batch, output)
                timer.log(f'loss: {loss / (idx + 1):.4f} {metric}',
                          ratio_percentage=False,
                          logger=logger,
                          ratio_width=ratio_width)
            loss /= len(data)
        finally:
            if output:
                output.close()
        return float(loss), metric
Beispiel #11
0
 def execute_training_loop(self,
                           trn: DataLoader,
                           dev: DataLoader,
                           epochs,
                           criterion,
                           optimizer,
                           metric,
                           save_dir,
                           logger: logging.Logger,
                           devices,
                           patience=0.5,
                           **kwargs):
     """Run the epoch loop with checkpointing on improvement and early stopping.

     Args:
         trn: Training dataloader.
         dev: Development dataloader; evaluation is skipped when falsy.
         epochs: Maximum number of epochs.
         criterion: Forwarded to fitting and evaluation.
         optimizer: Forwarded to ``fit_dataloader``.
         metric: Shared metric object; ``metric.score`` is read after each
             epoch as the model-selection score.
         save_dir: Where weights are saved to and restored from.
         logger: Receives the per-epoch messages.
         devices: Not used by this method.
         patience: Number of epochs without improvement before stopping. A
             ``float`` is interpreted as a fraction of ``epochs``.
         **kwargs: Ignored.

     Returns:
         The best score observed.
     """
     if isinstance(patience, float):
         patience = int(patience * epochs)
     best_epoch, best_metric = 0, -1
     timer = CountdownTimer(epochs)
     ratio_width = len(f'{len(trn)}/{len(trn)}')
     epoch = 0  # keeps ``epoch`` defined for the restore check if the loop never runs
     history = History()
     for epoch in range(1, epochs + 1):
         logger.info(f"[yellow]Epoch {epoch} / {epochs}:[/yellow]")
         self.fit_dataloader(trn,
                             criterion,
                             optimizer,
                             metric,
                             logger,
                             history,
                             ratio_width=ratio_width,
                             **self.config)
         if dev:
             self.evaluate_dataloader(dev,
                                      criterion,
                                      metric,
                                      logger,
                                      ratio_width=ratio_width,
                                      input='dev')
         report = f'{timer.elapsed_human}/{timer.total_time_human}'
         dev_score = metric.score
         stop_early = False
         if dev_score > best_metric:
             self.save_weights(save_dir)
             best_metric = dev_score
             best_epoch = epoch
             report += ' [red]saved[/red]'
         else:
             epochs_since_best = epoch - best_epoch
             report += f' ({epochs_since_best})'
             if epochs_since_best >= patience:
                 report += ' early stop'
                 stop_early = True
         # Log first, then leave the loop, so the "early stop" report is
         # actually emitted instead of being discarded by the break.
         timer.log(report,
                   ratio_percentage=False,
                   newline=True,
                   ratio=False)
         if stop_early:
             break
     for d in [trn, dev]:
         self._close_dataloader(d)
     if best_epoch != epoch:
         logger.info(
             f'Restoring best model saved [red]{epoch - best_epoch}[/red] epochs ago'
         )
         self.load_weights(save_dir)
     return best_metric
Beispiel #12
0
 def execute_training_loop(self,
                           trn: DataLoader,
                           dev: DataLoader,
                           epochs,
                           criterion,
                           optimizer,
                           metric,
                           save_dir,
                           logger: logging.Logger,
                           devices,
                           **kwargs):
     """Run the epoch loop, saving weights whenever the dev score improves.

     Args:
         trn: Training dataloader.
         dev: Development dataloader; evaluation is skipped when falsy.
         epochs: Number of epochs to run.
         criterion: Forwarded to fitting and evaluation.
         optimizer: A pair that is unpacked into ``(optimizer, scheduler)``.
         metric: Collection of metrics; the selection score is the mean of
             their ``score`` attributes. It is replaced by the return value
             of ``evaluate_dataloader`` when ``dev`` is given.
         save_dir: Where weights are saved.
         logger: Receives the per-epoch messages.
         devices: Not used by this method.
         **kwargs: Forwarded to ``fit_dataloader``.
     """
     top_score = -1
     optimizer, scheduler = optimizer
     timer = CountdownTimer(epochs)
     steps_per_epoch = len(trn) // self.config.gradient_accumulation
     ratio_width = len(f'{steps_per_epoch}/{steps_per_epoch}')
     history = History()
     for epoch in range(1, epochs + 1):
         logger.info(f"[yellow]Epoch {epoch} / {epochs}:[/yellow]")
         # Only the transformer setup steps the scheduler inside fit_dataloader.
         per_step_scheduler = scheduler if self.use_transformer else None
         self.fit_dataloader(trn, criterion, optimizer, metric, logger, history,
                             linear_scheduler=per_step_scheduler, **kwargs)
         if dev:
             metric = self.evaluate_dataloader(dev, criterion, metric, logger, ratio_width=ratio_width)
         report = f'{timer.elapsed_human}/{timer.total_time_human}'
         dev_score = sum(each.score for each in metric) / len(metric)
         if not self.use_transformer:
             # Otherwise the scheduler is stepped once per epoch with the dev score.
             scheduler.step(dev_score)
         if dev_score > top_score:
             self.save_weights(save_dir)
             top_score = dev_score
             report += ' [red]saved[/red]'
         timer.log(report, ratio_percentage=False, newline=True, ratio=False)
Beispiel #13
0
 def build_dataloader(self,
                      data,
                      transform: TransformList = None,
                      training=False,
                      device=None,
                      logger: logging.Logger = None,
                      gradient_accumulation=1,
                      **kwargs) -> DataLoader:
     """Build a padded dataloader whose transforms start with ``append_bos``.

     Args:
         data: Input data; when it is a ``str`` the dataset cache is purged
             and over-long samples may be pruned.
         transform: Transform list; ``append_bos`` is inserted at index 0 of
             this very object.
         training: Used as the sampler's ``shuffle`` flag and forwarded to
             ``cache_dataset``.
         device: Forwarded to ``PadSequenceDataLoader``.
         logger: Forwarded to vocab building, caching and pruning.
         gradient_accumulation: Forwarded to the sampler builder.
         **kwargs: Ignored.

     Returns:
         A ``PadSequenceDataLoader`` over the built dataset.
     """
     # NOTE(review): ``transform`` defaults to None but is mutated in place
     # unconditionally, so callers must pass a list and will see it modified.
     transform.insert(0, append_bos)
     dataset = BiaffineDependencyParser.build_dataset(self, data, transform)
     from_path = isinstance(data, str)
     if from_path:
         dataset.purge_cache()
     if self.vocabs.mutable:
         BiaffineDependencyParser.build_vocabs(self,
                                               dataset,
                                               logger,
                                               transformer=True)
     if dataset.cache:
         progress = CountdownTimer(len(dataset))
         BiaffineDependencyParser.cache_dataset(self, dataset, progress,
                                                training, logger)

     def exceeds_limit(sample):
         return len(sample['token_input_ids']) > 510

     # NOTE(review): ``max_seq_len`` only switches pruning on; the threshold
     # itself is the literal 510 — confirm this is intended.
     if self.config.get('max_seq_len', None) and from_path:
         dataset.prune(exceeds_limit, logger)
     batch_sampler = self.sampler_builder.build(
         self.compute_lens(data, dataset, length_field='FORM'),
         shuffle=training,
         gradient_accumulation=gradient_accumulation)
     return PadSequenceDataLoader(batch_sampler=batch_sampler,
                                  device=device,
                                  dataset=dataset,
                                  pad=self.get_pad_dict())
Beispiel #14
0
 def fit_dataloader(self,
                    trn: DataLoader,
                    criterion,
                    optimizer,
                    metric,
                    logger: logging.Logger,
                    ratio_width=None,
                    **kwargs):
     """Train for one pass over ``trn``, stepping the optimizer on every batch.

     Args:
         trn: Iterable of training batches; must support ``len()``.
         criterion: Forwarded to ``compute_loss``.
         optimizer: Zeroed before and stepped after each backward pass.
         metric: Updated with the decoded predictions of every batch.
         logger: Forwarded to the progress timer.
         ratio_width: Forwarded to the progress timer.
         **kwargs: Ignored.
     """
     self.model.train()
     timer = CountdownTimer(len(trn))
     total_loss = 0
     for idx, batch in enumerate(trn):
         optimizer.zero_grad()
         out, mask = self.feed_batch(batch)
         y = batch['tag_id']
         loss = self.compute_loss(criterion, out, y, mask)
         loss.backward()
         # Gradients are clipped to a fixed max norm of 5.0.
         nn.utils.clip_grad_norm_(self.model.parameters(), 5.0)
         optimizer.step()
         total_loss += loss.item()
         prediction = self.decode_output(out, mask, batch)
         self.update_metrics(metric, out, y, mask, batch, prediction)
         # Report the running mean of the accumulated loss, not the current
         # batch's loss divided by the batch count.
         timer.log(f'loss: {total_loss / (idx + 1):.4f} {metric}',
                   ratio_percentage=False,
                   logger=logger,
                   ratio_width=ratio_width)
         del loss
         del out
         del mask
Beispiel #15
0
    def build_vocabs(self, dataset, logger=None, transformer=False):
        """Create the relation (and optionally token) vocabularies and lock them.

        Args:
            dataset: Dataset that is iterated once in full. Presumably its
                transforms fill the vocabularies as a side effect — nothing
                is added to them explicitly here; verify against the dataset.
            logger: Forwarded to ``self.vocabs.summary``.
            transformer: When true no token vocabulary is created, and the
                token-related config entries are left untouched.
        """
        rel_vocab = self.vocabs.get('rel', None)
        if rel_vocab is None:
            rel_vocab = Vocab(unk_token=None,
                              pad_token=self.config.get('pad_rel', None))
            self.vocabs.put(rel=rel_vocab)

        timer = CountdownTimer(len(dataset))
        if transformer:
            token_vocab = None
        else:
            self.vocabs.token = token_vocab = VocabCounter(
                unk_token=self.config.get('unk', UNK))
        for _ in dataset:
            timer.log('Building vocab [blink][yellow]...[/yellow][/blink]',
                      ratio_percentage=True)
        min_freq = self.config.get('min_freq', None)
        # There is no token vocab to trim in transformer mode; without this
        # guard a configured ``min_freq`` would call ``None.trim``.
        if min_freq and token_vocab is not None:
            token_vocab.trim(min_freq)
        rel_vocab.set_unk_as_safe_unk()  # Some relation in dev set is OOV
        self.vocabs.lock()
        self.vocabs.summary(logger=logger)
        if token_vocab:
            self.config.n_words = len(self.vocabs['token'])
        self.config.n_rels = len(self.vocabs['rel'])
        if token_vocab:
            self.config.pad_index = self.vocabs['token'].pad_idx
            self.config.unk_index = self.vocabs['token'].unk_idx
Beispiel #16
0
 def fit_dataloader(self,
                    trn: DataLoader,
                    criterion,
                    optimizer,
                    metric,
                    logger: logging.Logger,
                    history: History,
                    ratio_width=None,
                    gradient_accumulation=1,
                    encoder_grad_norm=None,
                    decoder_grad_norm=None,
                    patience=0.5,
                    eval_trn=False,
                    **kwargs):
     """Train for one pass over a multi-task dataloader.

     Args:
         trn: Iterable yielding ``(task_name, batch)`` pairs; must support
             ``len()``.
         criterion: Mapping from task name to the criterion for that task.
         optimizer: A triple unpacked into ``(encoder_optimizer,
             encoder_scheduler, decoder_optimizers)``; the last one is looked
             up by task name and each entry may be an optimizer or an
             ``(optimizer, scheduler)`` tuple.
         metric: Reset up front; updated per batch when ``eval_trn`` is true.
         logger: Forwarded to the progress timer.
         history: Decides via ``history.step`` when optimizer steps are due.
         ratio_width: Forwarded to the progress timer.
         gradient_accumulation: Divisor applied to the loss when greater
             than 1; also forwarded to ``history.step``.
         encoder_grad_norm: Max gradient norm for the encoder parameters.
         decoder_grad_norm: Max gradient norm for the decoder parameters.
         patience: Not used by this method.
         eval_trn: Whether to decode and score the training batches.
         **kwargs: Ignored.

     Returns:
         The accumulated loss divided by ``timer.total``.
     """
     self.model.train()
     encoder_optimizer, encoder_scheduler, decoder_optimizers = optimizer
     timer = CountdownTimer(len(trn))
     total_loss = 0
     self.reset_metrics(metric)
     model = self.model_
     # Materialise the parameters once: they are reused on every step, and a
     # one-shot iterator would be empty after the first clipping call, which
     # would silently turn every later clip into a no-op.
     encoder_parameters = list(model.encoder.parameters())
     decoder_parameters = list(model.decoders.parameters())
     for idx, (task_name, batch) in enumerate(trn):
         decoder_optimizer = decoder_optimizers.get(task_name, None)
         output_dict, _ = self.feed_batch(batch, task_name)
         loss = self.compute_loss(batch, output_dict[task_name]['output'], criterion[task_name],
                                  self.tasks[task_name])
         if gradient_accumulation and gradient_accumulation > 1:
             loss /= gradient_accumulation
         loss.backward()
         total_loss += float(loss.item())
         if history.step(gradient_accumulation):
             if self.config.get('grad_norm', None):
                 clip_grad_norm(model, self.config.grad_norm)
             if encoder_grad_norm:
                 torch.nn.utils.clip_grad_norm_(encoder_parameters, encoder_grad_norm)
             if decoder_grad_norm:
                 torch.nn.utils.clip_grad_norm_(decoder_parameters, decoder_grad_norm)
             encoder_optimizer.step()
             encoder_optimizer.zero_grad()
             encoder_scheduler.step()
             # Only the decoder optimizer of the current batch's task is stepped.
             if decoder_optimizer:
                 if isinstance(decoder_optimizer, tuple):
                     decoder_optimizer, decoder_scheduler = decoder_optimizer
                 else:
                     decoder_scheduler = None
                 decoder_optimizer.step()
                 decoder_optimizer.zero_grad()
                 if decoder_scheduler:
                     decoder_scheduler.step()
         if eval_trn:
             self.decode_output(output_dict, batch, task_name)
             self.update_metrics(batch, output_dict, metric, task_name)
         timer.log(self.report_metrics(total_loss / (timer.current + 1), metric if eval_trn else None),
                   ratio_percentage=None,
                   ratio_width=ratio_width,
                   logger=logger)
         del loss
         del output_dict
     return total_loss / timer.total
Beispiel #17
0
 def load_file(self, filepath: str):
     """Yield one sample per sentence from a file or a directory of files.

     Args:
         filepath: Resolved through ``get_resource``. Either a single file or
             a directory that is searched recursively for paths matching
             ``*gold_conll``.

     Yields:
         Dicts with ``'token'`` (column 3 of each row) and ``'srl'``: one
         label sequence per token, holding the labels of the predicate at
         that position or all ``'O'`` when there is none.
     """
     filepath = get_resource(filepath)
     if os.path.isfile(filepath):
         files = [filepath]
     else:
         assert os.path.isdir(
             filepath), f'{filepath} has to be a directory of CoNLL 2012'
         pattern = f'{filepath}/**/*gold_conll'
         files = sorted(glob.glob(pattern, recursive=True))
     timer = CountdownTimer(len(files))
     for path in files:
         timer.log('files loading[blink][yellow]...[/yellow][/blink]')
         # 0:DOCUMENT 1:PART 2:INDEX 3:WORD 4:POS 5:PARSE 6:LEMMA 7:FRAME 8:SENSE 9:SPEAKER 10:NE 11-N:ARGS N:COREF
         for sent in read_tsv_as_sents(path, ignore_prefix='#'):
             frames = [row[7] for row in sent]
             arg_rows = [row[11:-1] for row in sent]
             # Transpose: one column of argument cells per predicate.
             arg_columns = zip(*arg_rows)
             bio_per_predicate = [
                 self._make_bio_labels(column) for column in arg_columns
             ]
             bio_per_predicate = [self._remove_B_V(tags) for tags in bio_per_predicate]
             predicate_positions = [pos for pos, frame in enumerate(frames) if frame != '-']
             token = [row[3] for row in sent]
             srl = [None] * len(token)
             for pos, tags in zip(predicate_positions, bio_per_predicate):
                 srl[pos] = tags
             # Positions without (non-empty) labels get an all-'O' sequence.
             srl = [tags or ['O'] * len(token) for tags in srl]
             yield {'token': token, 'srl': srl}
Beispiel #18
0
 def build_dataloader(self,
                      data,
                      batch_size,
                      shuffle=False,
                      device=None,
                      logger: logging.Logger = None,
                      verbose=False,
                      **kwargs) -> DataLoader:
     """Wrap raw inputs into a length-sorted, padded dataloader.

     Args:
         data: Iterable of inputs; each becomes a sample ``{'token': x}``.
         batch_size: Forwarded to ``SortingSampler``.
         shuffle: Not used by this method.
         device: Forwarded to ``PadSequenceDataLoader``.
         logger: Not used by this method.
         verbose: When truthy, progress is logged while samples are processed.
         **kwargs: Ignored.

     Returns:
         A ``PadSequenceDataLoader`` batching by ``SortingSampler``.
     """
     samples = [{'token': each} for each in data]
     tokenize = TransformerTextTokenizer(self.tokenizer, text_a_key='token')
     dataset = MaskedLanguageModelDataset(samples,
                                          generate_idx=True,
                                          transform=tokenize)
     progress = CountdownTimer(len(dataset)) if verbose else None
     lengths = []
     for sample in dataset:
         lengths.append(len(sample['token_input_ids']))
         if progress:
             progress.log(
                 'Preprocessing and caching samples [blink][yellow]...[/yellow][/blink]'
             )
     sampler = SortingSampler(lengths, batch_size=batch_size)
     return PadSequenceDataLoader(dataset,
                                  batch_sampler=sampler,
                                  device=device)
Beispiel #19
0
 def build_dataloader(self,
                      data,
                      transform: TransformList = None,
                      training=False,
                      device=None,
                      logger: logging.Logger = None,
                      gradient_accumulation=1,
                      **kwargs) -> DataLoader:
     """Build a padded dataloader for semantic dependency parsing data.

     Args:
         data: A ``list`` (first converted with ``build_samples``), a ``str``
             (the dataset cache is purged) or anything else accepted by
             ``build_dataset``.
         transform: Forwarded to ``build_dataset``.
         training: Used as the sampler's ``shuffle`` flag and forwarded to
             ``cache_dataset``.
         device: Forwarded to ``PadSequenceDataLoader``.
         logger: Forwarded to vocab building and dataset caching.
         gradient_accumulation: Forwarded to the sampler builder.
         **kwargs: Ignored.

     Returns:
         A ``PadSequenceDataLoader`` over the built dataset.
     """
     parser = BiaffineSemanticDependencyParser
     if isinstance(data, list):
         data = parser.build_samples(self, data, self.config.use_pos)
     dataset = parser.build_dataset(self, data, transform)
     if isinstance(data, str):
         dataset.purge_cache()
     if self.vocabs.mutable:
         parser.build_vocabs(self, dataset, logger, transformer=True)
     if dataset.cache:
         progress = CountdownTimer(len(dataset))
         parser.cache_dataset(self, dataset, progress, training, logger)
     batch_sampler = self.sampler_builder.build(
         self.compute_lens(data, dataset),
         shuffle=training,
         gradient_accumulation=gradient_accumulation)
     return PadSequenceDataLoader(batch_sampler=batch_sampler,
                                  device=device,
                                  dataset=dataset,
                                  pad=self.get_pad_dict())
 def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs,
                           criterion, optimizer, metric, save_dir,
                           logger: logging.Logger, devices, **kwargs):
     """Run the epoch loop, saving weights whenever the score improves.

     Args:
         trn: Training dataloader.
         dev: Development dataloader; evaluation is skipped when falsy.
         epochs: Number of epochs to run.
         criterion: Forwarded to fitting and evaluation.
         optimizer: Forwarded to ``fit_dataloader``.
         metric: Shared metric object; ``metric.get_metric()`` is read after
             each epoch as the model-selection score.
         save_dir: Where weights are saved.
         logger: Receives the per-epoch messages.
         devices: Not used by this method.
         **kwargs: Ignored.
     """
     top_score = -1
     timer = CountdownTimer(epochs)
     num_batches = len(trn)
     ratio_width = len(f'{num_batches}/{num_batches}')
     for epoch in range(1, epochs + 1):
         logger.info(f"[yellow]Epoch {epoch} / {epochs}:[/yellow]")
         self.fit_dataloader(trn, criterion, optimizer, metric, logger)
         if dev:
             self.evaluate_dataloader(dev, criterion, metric, logger,
                                      ratio_width=ratio_width)
         report = f'{timer.elapsed_human}/{timer.total_time_human}'
         score = metric.get_metric()
         if score > top_score:
             self.save_weights(save_dir)
             top_score = score
             report += ' [red]saved[/red]'
         timer.log(report, ratio_percentage=False, newline=True, ratio=False)
Beispiel #21
0
    def compute_lens(self,
                     data: Union[List[Dict[str, Any]], str],
                     dataset: TransformableDataset,
                     input_ids='token_input_ids'):
        """Measure the length of every sample in ``dataset``.

        A warning is issued when ``dataset.cache`` is falsy. When ``data`` is
        a ``str`` the dataset is additionally iterated once up front with a
        progress message before the lengths are collected.

        Args:
            data: Samples to be measured, or a path to the dataset.
            dataset: The dataset whose samples are measured.
            input_ids: Name of the field whose length is taken.

        Returns:
            A list with ``len(sample[input_ids])`` for each sample.

        """
        if not dataset.cache:
            warnings.warn(
                f'Caching for the dataset is not enabled, '
                f'try `dataset.purge_cache()` if possible. The dataset is {dataset}.'
            )
        if isinstance(data, str):
            progress = CountdownTimer(len(dataset))
            for _ in dataset:
                progress.log(
                    'Preprocessing and caching samples [blink][yellow]...[/yellow][/blink]'
                )
            progress.erase()
        lengths = []
        for sample in dataset:
            lengths.append(len(sample[input_ids]))
        return lengths
Beispiel #22
0
    def evaluate_dataloader(self,
                            loader: PadSequenceDataLoader,
                            criterion,
                            logger=None,
                            filename=None,
                            output=False,
                            ratio_width=None,
                            metric=None,
                            **kwargs):
        """Evaluate arc, sibling and relation predictions on ``loader``.

        Args:
            loader: Iterable of batches; must support ``len()``.
            criterion: Not used here; ``compute_loss`` is called without it.
            logger: Forwarded to the progress timer.
            filename: When truthy, its basename prefixes every progress report.
            output: Not used by this method.
            ratio_width: Forwarded to the progress timer.
            metric: Metric to update; a fresh one is created with
                ``build_metric`` when this is falsy. It is not reset here.
            **kwargs: Ignored.

        Returns:
            A tuple ``(mean loss per batch, metric)``.
        """
        self.model.eval()

        loss_sum = 0
        metric = metric or self.build_metric()

        timer = CountdownTimer(len(loader))
        for batch in loader:
            scores, mask, puncts = self.feed_batch(batch)
            s_arc, s_sib, s_rel = scores
            arcs = batch['arc']
            sibs = batch['sib_id']
            rels = batch['rel_id']
            # compute_loss also returns arc scores, which replace ``s_arc`` for decoding.
            loss, s_arc = self.compute_loss(s_arc, s_sib, s_rel, arcs, sibs,
                                            rels, mask)
            loss_sum += float(loss)
            arc_preds, rel_preds = self.decode(s_arc, s_sib, s_rel, mask)
            self.update_metric(arc_preds, rel_preds, arcs, rels, mask, puncts,
                               metric)
            report = self._report(loss_sum / (timer.current + 1), metric)
            if filename:
                report = f'{os.path.basename(filename)} ' + report
            timer.log(report,
                      ratio_percentage=False,
                      logger=logger,
                      ratio_width=ratio_width)

        return loss_sum / len(loader), metric
Beispiel #23
0
 def fit_dataloader(self,
                    trn: DataLoader,
                    criterion,
                    optimizer,
                    metric,
                    logger: logging.Logger,
                    history: History,
                    linear_scheduler=None,
                    gradient_accumulation=1,
                    **kwargs):
     """Train for one pass over ``trn`` with gradient accumulation.

     Args:
         trn: Iterable of training batches; must support ``len()``.
         criterion: Not used here; the loss comes from ``feed_batch``.
         optimizer: Forwarded to ``self._step``.
         metric: Reset up front and updated on every batch.
         logger: Forwarded to the progress timer.
         history: Decides via ``history.step`` when an optimizer step is due
             and supplies the number of training steps for the timer.
         linear_scheduler: Forwarded to ``self._step``.
         gradient_accumulation: Divisor applied to the loss when greater
             than 1; also forwarded to ``history``.
         **kwargs: Ignored.

     Returns:
         The accumulated (scaled) loss divided by ``timer.total``.
     """
     self.model.train()
     num_steps = history.num_training_steps(len(trn), gradient_accumulation)
     timer = CountdownTimer(num_steps)
     loss_sum = 0
     self.reset_metrics(metric)
     for batch in trn:
         output_dict = self.feed_batch(batch)
         self.update_metrics(batch, output_dict, metric)
         loss = output_dict['loss']
         if gradient_accumulation > 1:
             loss /= gradient_accumulation
         loss.backward()
         loss_sum += loss.item()
         step_due = history.step(gradient_accumulation)
         if step_due:
             self._step(optimizer, linear_scheduler)
             running_loss = loss_sum / (timer.current + 1)
             timer.log(self.report_metrics(running_loss, metric), ratio_percentage=None,
                       logger=logger)
         del loss
     return loss_sum / timer.total
Beispiel #24
0
    def evaluate_dataloader(self,
                            data: MultiTaskDataLoader,
                            criterion,
                            metric: MetricDict,
                            logger,
                            ratio_width=None,
                            input: str = None,
                            **kwargs):
        """Evaluate all tasks of ``data`` and return the averaged loss.

        Tasks listed under ``'tasks_need_custom_eval'`` in ``self.config`` are
        taken out of ``data.dataloaders`` first, evaluated afterwards through
        their own ``task.evaluate_dataloader`` and then put back into
        ``data.dataloaders``.

        Args:
            data: Multi-task loader yielding ``(task_name, batch)`` pairs.
            criterion: Indexed by task name to obtain each task's criterion.
            metric: Reset at the start, updated per batch, and indexed by
                task name for custom-evaluated tasks.
            logger: Forwarded to the progress timer.
            ratio_width: Forwarded to the progress timer.
            input: Split name; ``'dev'`` selects ``task.dev``, anything else
                selects ``task.tst`` for custom-evaluated tasks.
            **kwargs: Ignored.

        Returns:
            A ``(loss, metric, data)`` tuple, where ``loss`` is the summed
            loss of the shared loop divided by ``timer.total`` (the timer is
            sized as ``len(data)`` plus the number of custom tasks).
        """
        self.model.eval()
        self.reset_metrics(metric)
        custom_names = self.config.get('tasks_need_custom_eval', None) or {}
        # Detach the custom-evaluated loaders before len(data) is read and
        # before iterating; presumably so that neither includes them - confirm
        # against MultiTaskDataLoader.
        custom_loaders = {
            name: data.dataloaders.pop(name)
            for name in dict.fromkeys(custom_names)
        }
        progress = CountdownTimer(len(data) + len(custom_loaders))
        loss_sum = 0
        for task_name, batch in data:
            outputs, _ = self.feed_batch(batch, task_name)
            task_output = outputs[task_name]['output']
            loss = self.compute_loss(batch, task_output, criterion[task_name],
                                     self.tasks[task_name])
            loss_sum += loss.item()
            self.decode_output(outputs, batch, task_name)
            self.update_metrics(batch, outputs, metric, task_name)
            message = self.report_metrics(loss_sum / (progress.current + 1),
                                          metric)
            progress.log(message,
                         ratio_percentage=None,
                         logger=logger,
                         ratio_width=ratio_width)
            del loss
            del outputs

        for task_name, loader in custom_loaders.items():
            task = self.tasks[task_name]
            decoder = self.model_.decoders[task_name]
            encode = functools.partial(self._encode,
                                       task_name=task_name,
                                       cls_is_bos=task.cls_is_bos,
                                       sep_is_eos=task.sep_is_eos)
            task.evaluate_dataloader(
                loader,
                task.build_criterion(decoder=decoder),
                metric=metric[task_name],
                input=task.dev if input == 'dev' else task.tst,
                split=input,
                decoder=decoder,
                h=encode)
            # Re-attach the loader that was detached above.
            data.dataloaders[task_name] = loader
            message = self.report_metrics(loss_sum / (progress.current + 1),
                                          metric)
            progress.log(message,
                         ratio_percentage=None,
                         logger=logger,
                         ratio_width=ratio_width)

        return loss_sum / progress.total, metric, data
Beispiel #25
0
 def fit_dataloader(self,
                    trn: DataLoader,
                    criterion,
                    optimizer,
                    metric,
                    logger: logging.Logger,
                    history: History,
                    gradient_accumulation=1,
                    grad_norm=None,
                    transformer_grad_norm=None,
                    teacher: Tagger = None,
                    kd_criterion=None,
                    temperature_scheduler=None,
                    ratio_width=None,
                    eval_trn=True,
                    **kwargs):
     """Train for one pass over ``trn``, optionally distilling from ``teacher``.

     Args:
         trn: Training batches; each batch provides ``'tag_id'``.
         criterion: Passed to ``self.compute_loss``.
         optimizer: An ``(optimizer, scheduler)`` pair. When ``teacher`` is
             truthy, the scheduler element is itself a
             ``(scheduler, lambda_scheduler)`` pair.
         metric: Updated per batch when ``eval_trn`` is truthy.
         logger: Forwarded to the progress timer.
         history: Decides, via ``history.step``, when an optimisation step
             is due, and supplies the number of training steps.
         gradient_accumulation: When truthy and greater than 1, the
             supervised loss is divided by this value.
         grad_norm: Passed through to ``self._step``.
         transformer_grad_norm: Passed through to ``self._step``.
         teacher: Optional teacher model whose ``feed_batch`` output feeds
             the distillation loss.
         kd_criterion: Passed to ``self.compute_distill_loss``.
         temperature_scheduler: Passed to ``self.compute_distill_loss``.
         ratio_width: Forwarded to the progress timer.
         eval_trn: Whether to decode and update ``metric`` on training data.
         **kwargs: Ignored.

     Returns:
         None.
     """
     optimizer, scheduler = optimizer
     lambda_scheduler = None
     if teacher:
         scheduler, lambda_scheduler = scheduler
     self.model.train()
     num_steps = history.num_training_steps(
         len(trn), gradient_accumulation=gradient_accumulation)
     progress = CountdownTimer(num_steps)
     running_loss = 0
     for seen, batch in enumerate(trn, start=1):
         out, mask = self.feed_batch(batch)
         gold = batch['tag_id']
         loss = self.compute_loss(criterion, out, gold, mask)
         if gradient_accumulation and gradient_accumulation > 1:
             loss /= gradient_accumulation
         if teacher:
             # The teacher only provides targets, so no gradients are tracked.
             with torch.no_grad():
                 teacher_out, _ = teacher.feed_batch(batch)
             # NOTE(review): only the supervised term above is divided by
             # gradient_accumulation; kd_loss is mixed in unscaled - confirm
             # this is intended.
             # noinspection PyNoneFunctionAssignment
             kd_loss = self.compute_distill_loss(kd_criterion, out,
                                                 teacher_out, mask,
                                                 temperature_scheduler)
             weight = float(lambda_scheduler)
             loss = weight * loss + (1 - weight) * kd_loss
         loss.backward()
         running_loss += loss.item()
         if eval_trn:
             prediction = self.decode_output(out, mask, batch)
             self.update_metrics(metric, out, gold, mask, batch, prediction)
         if history.step(gradient_accumulation):
             self._step(optimizer, scheduler, grad_norm,
                        transformer_grad_norm, lambda_scheduler)
             metric_text = metric if eval_trn else ""
             progress.log(f'loss: {running_loss / seen:.4f} {metric_text}',
                          logger=logger,
                          ratio_percentage=False,
                          ratio_width=ratio_width)
         # Drop per-batch references before the next iteration.
         del loss, out, mask
Beispiel #26
0
 def execute_training_loop(self, trn, dev, devices, epochs, logger,
                           patience, save_dir, optimizer,
                           gradient_accumulation, **kwargs):
     """Train for up to ``epochs`` epochs and keep the best weights on ``dev``.

     After every epoch the model is evaluated on ``dev``; whenever the dev
     metric beats the best one seen so far the weights are saved to
     ``save_dir``. Training stops early once ``patience`` epochs have passed
     without improvement.

     Args:
         trn: Training dataloader.
         dev: Development dataloader used for model selection.
         devices: Part of the shared signature; not used in this method.
         epochs: Maximum number of epochs.
         logger: Receives the per-epoch and summary reports.
         patience: Number of non-improving epochs tolerated before stopping.
             ``None`` disables early stopping.
         save_dir: Where the best weights are saved to and reloaded from.
         optimizer: A 4-tuple ``(optimizer, scheduler, transformer_optimizer,
             transformer_scheduler)``.
         gradient_accumulation: Forwarded to ``fit_dataloader`` and used to
             size the progress ratio column.
         **kwargs: Ignored.

     Returns:
         None.
     """
     optimizer, scheduler, transformer_optimizer, transformer_scheduler = optimizer
     criterion = self.build_criterion()
     best_e, best_metric = 0, self.build_metric()
     timer = CountdownTimer(epochs)
     history = History()
     steps_per_epoch = len(trn) // gradient_accumulation
     ratio_width = len(f'{steps_per_epoch}/{steps_per_epoch}')
     for epoch in range(1, epochs + 1):
         # train one epoch and update the parameters
         logger.info(f"[yellow]Epoch {epoch} / {epochs}:[/yellow]")
         self.fit_dataloader(trn,
                             optimizer,
                             scheduler,
                             criterion,
                             epoch,
                             logger,
                             history,
                             transformer_optimizer,
                             transformer_scheduler,
                             gradient_accumulation=gradient_accumulation,
                             eval_trn=self.config.eval_trn)
         loss, dev_metric = self.evaluate_dataloader(
             dev, criterion, ratio_width=ratio_width, logger=logger)
         timer.update()
         # save the model if it is the best so far
         report = f"{timer.elapsed_human} / {timer.total_time_human} ETA: {timer.eta_human}"
         if dev_metric > best_metric:
             best_e, best_metric = epoch, dev_metric
             self.save_weights(save_dir)
             report += ' ([red]saved[/red])'
         elif patience is not None and patience != epochs:
             report += f' ({epoch - best_e}/{patience})'
         else:
             # No effective early stopping (patience is None or equals the
             # epoch budget): show only the stall count instead of "k/None".
             report += f' ({epoch - best_e})'
         logger.info(report)
         if patience is not None and epoch - best_e >= patience:
             logger.info(
                 f'LAS has stopped improving for {patience} epochs, early stop.'
             )
             break
     timer.stop()
     if not best_e:
         # The dev metric never improved: persist the final weights.
         self.save_weights(save_dir)
     elif best_e != epoch:
         # The last epoch was not the best one: restore the best weights.
         self.load_weights(save_dir)
     logger.info(
         f"Max score of dev is {best_metric.score:.2%} at epoch {best_e}")
     logger.info(
         f"Average time of each epoch is {timer.elapsed_average_human}")
     logger.info(f"{timer.elapsed_human} elapsed")
Beispiel #27
0
 def build(self, del_dataloader_in_memory=True, verbose=HANLP_VERBOSE):
     """Write every batch of ``self.dataset`` to its own file with ``torch.save``.

     Args:
         del_dataloader_in_memory: When truthy, the ``dataset`` attribute is
             deleted from this object once all batches have been written.
         verbose: When truthy, progress is logged after each batch.
     """
     progress = CountdownTimer(self.size)
     for index, batch in enumerate(self.dataset):
         path = self._filename(index)
         torch.save(batch, path)
         if not verbose:
             continue
         progress.log(
             f'Caching {path} [blink][yellow]...[/yellow][/blink]')
     if del_dataloader_in_memory:
         del self.dataset
Beispiel #28
0
 def execute_training_loop(self,
                           trn: DataLoader,
                           dev: DataLoader,
                           epochs,
                           criterion,
                           optimizer,
                           metric,
                           save_dir,
                           logger: logging.Logger,
                           devices,
                           ratio_width=None,
                           patience=5,
                           teacher=None,
                           kd_criterion=None,
                           eval_trn=True,
                           **kwargs):
     """Train for up to ``epochs`` epochs and keep the best weights on ``dev``.

     After every epoch the model is evaluated on ``dev``; whenever the dev
     metric beats the best one seen so far the weights are saved to
     ``save_dir``. Training stops early once ``patience`` epochs have passed
     without improvement.

     Args:
         trn: Training dataloader.
         dev: Development dataloader used for model selection.
         epochs: Maximum number of epochs.
         criterion: Forwarded to ``fit_dataloader`` and
             ``evaluate_dataloader``.
         optimizer: Forwarded to ``fit_dataloader``.
         metric: Forwarded to ``fit_dataloader``.
         save_dir: Where the best weights are saved to and reloaded from.
         logger: Receives the per-epoch and summary reports.
         devices: Part of the shared signature; not used in this method.
         ratio_width: Forwarded to the train and evaluation loops.
         patience: Number of non-improving epochs tolerated before stopping.
             ``None`` disables early stopping.
         teacher: Part of the shared signature; not used directly here.
         kd_criterion: Part of the shared signature; not used directly here.
         eval_trn: Forwarded to ``fit_dataloader``.
         **kwargs: Ignored.

     Returns:
         The best dev metric observed (``-1`` if none was ever recorded).
     """
     best_epoch, best_metric = 0, -1
     timer = CountdownTimer(epochs)
     history = History()
     for epoch in range(1, epochs + 1):
         logger.info(f"[yellow]Epoch {epoch} / {epochs}:[/yellow]")
         self.fit_dataloader(trn,
                             criterion,
                             optimizer,
                             metric,
                             logger,
                             history=history,
                             ratio_width=ratio_width,
                             eval_trn=eval_trn,
                             **self.config)
         loss, dev_metric = self.evaluate_dataloader(
             dev, criterion, logger=logger, ratio_width=ratio_width)
         timer.update()
         report = f"{timer.elapsed_human} / {timer.total_time_human} ETA: {timer.eta_human}"
         if dev_metric > best_metric:
             best_epoch, best_metric = epoch, dev_metric
             self.save_weights(save_dir)
             report += ' [red](saved)[/red]'
         else:
             report += f' ({epoch - best_epoch})'
             # Comparing an int with None raises TypeError, so a missing
             # patience is treated as "never stop early".
             if patience is not None and epoch - best_epoch >= patience:
                 report += ' early stop'
         logger.info(report)
         if patience is not None and epoch - best_epoch >= patience:
             break
     if not best_epoch:
         # The dev metric never improved: persist the final weights.
         self.save_weights(save_dir)
     elif best_epoch != epoch:
         # The last epoch was not the best one: restore the best weights.
         self.load_weights(save_dir)
     logger.info(f"Max score of dev is {best_metric} at epoch {best_epoch}")
     logger.info(
         f"Average time of each epoch is {timer.elapsed_average_human}")
     logger.info(f"{timer.elapsed_human} elapsed")
     return best_metric
Beispiel #29
0
 def build_vocabs(self, trn, logger, **kwargs):
     """Create the tag vocabulary, walk ``trn`` once, then lock the vocabs.

     Args:
         trn: Training samples; each one is indexed with
             ``self.config.token_key`` to measure its length.
         logger: Passed to ``self.vocabs.summary``.
         **kwargs: Ignored.
     """
     self.vocabs.tag = Vocab(pad_token=None, unk_token=None)
     progress = CountdownTimer(len(trn))
     key = self.config.token_key
     longest = 0
     # The loop body only tracks the longest sequence; presumably iterating
     # ``trn`` is also what fills the vocabulary - confirm against the dataset
     # transforms.
     for sample in trn:
         length = len(sample[key])
         if length > longest:
             longest = length
         progress.log(f'Building vocab [blink][yellow]...[/yellow][/blink] (longest sequence: {longest})')
     self.vocabs.tag.set_unk_as_safe_unk()
     self.vocabs.lock()
     self.vocabs.summary(logger)
Beispiel #30
0
def clean_ctb_bracketed(ctb_root, out_root):
    """Copy the bracketed treebank files into ``out_root`` without ``<`` lines.

    Every file listed by ``_list_treebank_root`` under
    ``<ctb_root>/bracketed`` is rewritten as ``<out_root>/<name>.txt``,
    dropping each line whose stripped text starts with ``'<'``.

    Args:
        ctb_root: Directory that contains the ``bracketed`` sub-directory.
        out_root: Output directory; created if it does not exist.
    """
    os.makedirs(out_root, exist_ok=True)
    bracketed_dir = join(ctb_root, 'bracketed')
    names = _list_treebank_root(bracketed_dir)
    progress = CountdownTimer(len(names))
    for name in names:
        src_path = join(bracketed_dir, name)
        dst_path = join(out_root, name + '.txt')
        with open(src_path, encoding='utf-8') as src, \
                open(dst_path, 'w', encoding='utf-8') as out:
            out.writelines(
                line for line in src if not line.strip().startswith('<'))
        progress.log('Cleaning up CTB [blink][yellow]...[/yellow][/blink]', erase=False)