Example #1
    def evaluate(self, gold, test=None, feats=None, label=None,
                 use_cdict_coef=False, batch_size=BATCH_SIZE, split=None,
                 clone_ds=False, log_file=LOG_FILE):
        """Evaluate the tagger model.

        Args:

        **gold**: the corpus of sentences with actual target values to score
        the tagger on. May be either the name of the file in *CoNLL-U* format
        or the `list`/`iterator` of sentences in *Parsed CoNLL-U*.

        **test** (default is `None`): the corpus of sentences with predicted
        target values. If `None` (default), the **gold** corpus will be
        retagged on-the-fly, and the result will be used as the **test**.

        **feats** (`str | list([str])`; default is `None`): one or several
        subfields of the key-value type fields like `FEATS` or `MISC` to be
        evaluated separately.

        **label** (`str`; default is `None`): the specific label of the target
        field to be evaluated separately, e.g. `field='UPOS', label='VERB'`
        or `field='FEATS:Animacy', label='Inan'`.

        **use_cdict_coef** (`bool` | `float`; default is `False`): if `False`,
        we use our prediction only. If `True`, we replace our prediction with
        the value returned by the `corpuscula.CorpusDict.predict_<field>()`
        method if its `coef` >= `.99`. You can also specify your own threshold
        as the value of the param.

        **batch_size** (`int`; default is `64`): the number of sentences per
        batch.

        **split** (`int`; default is `None`): the number of lines in each
        split. Allows processing a large dataset in pieces ("splits"). If
        **split** is `None` (default), the whole dataset is processed without
        splits.

        **clone_ds** (`bool`; default is `False`): if `True`, the dataset is
        cloned and transformed. If `False`, `transform_collate` is used
        without cloning the dataset. There is no big difference between the
        variants; both should produce identical results.

        **log_file** (`file`; default is `sys.stdout`): the stream for info
        messages.

        The method prints metrics and returns evaluation accuracy.
        """
        assert not label or feats, \
            'ERROR: To evaluate the exact label you must specify its ' \
            'feat, too'
        assert not label or not feats \
                         or isinstance(feats, str) or len(feats) == 1, \
            'ERROR: To evaluate the exact label you must specify its own ' \
            'feat only'
        args, kwargs = get_func_params(FeatsSeparateTagger.evaluate, locals())
        field = self._field
        if label:
            del kwargs['feats']
            field += ':' + (feats if isinstance(feats, str) else feats[0])
        return super().evaluate(field, *args, **kwargs)
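A minimal usage sketch for the method above. The `mordl` import path, the
model name 'feats_model', and the corpus file name are assumptions for
illustration only.

from mordl import FeatsSeparateTagger  # import path is an assumption

tagger = FeatsSeparateTagger()
tagger.load('feats_model')                     # hypothetical saved state
acc = tagger.evaluate('gold.conllu')           # overall accuracy
# evaluating a single label requires naming its feat, as documented above:
acc_inan = tagger.evaluate('gold.conllu', feats='Animacy', label='Inan')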
Example #2
    def load(self,
             name,
             device=None,
             dataset_emb_path=None,
             dataset_device=None,
             log_file=LOG_FILE):
        """Loads tagger's internal state saved by its `.save()` method.

        Args:

        **name** (`str`): name of the previously saved internal state.

        **device**: the device for the loaded model if you want to override
        the value from the config.

        **dataset_emb_path**: the path to load the dataset's embeddings from
        if you want to override the value from the config.

        **dataset_device**: the device for the loaded dataset if you want to
        override the value from the config.

        **log_file**: a stream for info messages. Default is `sys.stdout`.
        """
        args, kwargs = get_func_params(DeprelTagger.load, locals())
        super(self.__class__.__base__, self).load(DeprelTaggerModel, *args,
                                                  **kwargs)
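A hedged sketch of restoring a saved state with the method above; the import
path, the name 'deprel_model', and the device strings are placeholders.

from mordl import DeprelTagger  # import path is an assumption

tagger = DeprelTagger()
tagger.load('deprel_model',             # name passed earlier to .save()
            device='cuda:0',            # override the model device
            dataset_device='cuda:0')    # override the dataset device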
Example #3
    def __init__(self, num_labels, labels_pad_idx=-100, vec_emb_dim=None,
                 alphabet_size=0, char_pad_idx=0, rnn_emb_dim=None,
                 cnn_emb_dim=None, cnn_kernels=[1, 2, 3, 4, 5, 6],
                 emb_bn=True, emb_do=.2,
                 final_emb_dim=512, pre_bn=True, pre_do=.5,
                 lstm_layers=1, lstm_do=0, tran_layers=None, tran_heads=8,
                 post_bn=True, post_do=.4):
        if isinstance(cnn_kernels, Iterable):
            cnn_kernels = list(cnn_kernels)
        args, kwargs = get_func_params(UposTaggerModel.__init__, locals())
        super().__init__(*args, **kwargs)
        setattr(self, CONFIG_ATTR, (args, kwargs))
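An illustrative call of the constructor above, assuming `UposTaggerModel` is
already importable from this package; the label count and embedding sizes are
placeholder values, not library defaults.

model = UposTaggerModel(
    num_labels=18,        # e.g. the size of the UPOS tag set (placeholder)
    vec_emb_dim=768,      # e.g. the word-embedding width (placeholder)
    alphabet_size=120, char_pad_idx=0,
    rnn_emb_dim=32, cnn_emb_dim=200,
    lstm_layers=1, tran_layers=None,
)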
Example #4
    def predict(self, corpus, use_cdict_coef=False, with_orig=False,
                batch_size=BATCH_SIZE, split=None, clone_ds=False,
                save_to=None, log_file=LOG_FILE):
        """Predicts tags in the UPOS field of the corpus.

        Args:

        **corpus**: the corpus which will be used for the feature extraction
        and predictions. May be either the name of the file in *CoNLL-U*
        format or the `list`/`iterator` of sentences in *Parsed CoNLL-U*.

        **use_cdict_coef** (`bool` | `float`; default is `False`): if `False`,
        we use our prediction only. If `True`, we replace our prediction with
        the value returned by the `corpuscula.CorpusDict.predict_<field>()`
        method if its `coef` >= `.99`. You can also specify your own threshold
        as the value of the param.

        **with_orig** (`bool`; default is `False`): if `True`, instead of just
        the sequence with predicted labels, return the sequence of tuples
        where the first element is the sentence with predicted labels and the
        second element is the original sentence. **with_orig** can be `True`
        only if **save_to** is `None`.

        **batch_size** (`int`; default is `64`): the number of sentences per
        batch.

        **split** (`int`; default is `None`): the number of lines in each
        split. Allows processing a large dataset in pieces ("splits"). If
        **split** is `None` (default), the whole dataset is processed without
        splits.

        **clone_ds** (`bool`; default is `False`): if `True`, the dataset is
        cloned and transformed. If `False`, `transform_collate` is used
        without cloning the dataset. There is no big difference between the
        variants; both should produce identical results.

        **save_to** (`str`; default is `None`): the file name where the
        predictions will be saved.

        **log_file** (`file`; default is `sys.stdout`): the stream for info
        messages.

        Returns the corpus with tags predicted in the UPOS field.
        """
        assert self._field == 'UPOS' or use_cdict_coef in [None, False], \
            'ERROR: "use_cdict_coef" param may be used only with UPOS field'
        args, kwargs = get_func_params(UposTagger.predict, locals())
        return super().predict(self._field, None, *args, **kwargs)
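A hedged end-to-end sketch of tagging a corpus with the method above; the
import path, the model name 'upos_model', and the file names are placeholders.

from mordl import UposTagger  # import path is an assumption

tagger = UposTagger()
tagger.load('upos_model')
corpus = tagger.predict('test.conllu', save_to='test_upos.conllu')
# with save_to set, the returned corpus is re-read from the saved file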
Example #5
    def __init__(self, num_labels, labels_pad_idx=-100, vec_emb_dim=None,
                 alphabet_size=0, char_pad_idx=0, rnn_emb_dim=None,
                 cnn_emb_dim=200, cnn_kernels=[1, 2, 3, 4, 5, 6],
                 upos_emb_dim=200, upos_num=0, upos_pad_idx=0,
                 emb_bn=True, emb_do=.2,
                 final_emb_dim=512, pre_bn=True, pre_do=.5,
                 lstm_layers=1, lstm_do=0, tran_layers=None, tran_heads=8,
                 post_bn=True, post_do=.4):
        if isinstance(cnn_kernels, Iterable):
            cnn_kernels = list(cnn_kernels)
        args, kwargs = get_func_params(FeatTaggerModel.__init__, locals())
        kwargs_ = {x: y for x, y in kwargs.items() if x not in [
            'upos_emb_dim', 'upos_num', 'upos_pad_idx'
        ]}
        if upos_emb_dim:
            kwargs_['tag_emb_params'] = {
                'dim': upos_emb_dim, 'num': upos_num, 'pad_idx': upos_pad_idx
            }
        super().__init__(*args, **kwargs_)
        setattr(self, CONFIG_ATTR, (args, kwargs))
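An illustrative call of the constructor above, showing how the `upos_emb_dim`
/ `upos_num` / `upos_pad_idx` triple is repacked into `tag_emb_params` for the
parent class. All sizes are placeholders and the class is assumed to be in
scope.

model = FeatTaggerModel(
    num_labels=4,         # e.g. values of one FEATS key (placeholder)
    vec_emb_dim=768, alphabet_size=120,
    upos_emb_dim=200,     # becomes tag_emb_params={'dim': 200, ...}
    upos_num=18, upos_pad_idx=0,
)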
Example #6
    def load(self, name, device=None,
             dataset_emb_path=None, dataset_device=None, log_file=LOG_FILE):
        """Loads tagger's internal state saved by its `.save()` method.

        Args:

        **name** (`str`): the name of the previously saved internal state.

        **device** (`str`; default is `None`): the device for the loaded model
        if you want to override the value from the config.

        **dataset_emb_path** (`str`; default is `None`): the path to load the
        dataset's embeddings from if you want to override the value from the
        config.

        **dataset_device** (`str`; default is `None`): the device for the
        loaded dataset if you want to override the value from the config.

        **log_file** (`file`; default is `sys.stdout`): the stream for info
        messages.
        """
        args, kwargs = get_func_params(LemmaTagger.load, locals())
        super().load(FeatTaggerModel, *args, **kwargs)
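A hedged sketch of loading a saved lemmatizer state with the method above; the
import path and the name 'lemma_model' are placeholders.

from mordl import LemmaTagger  # import path is an assumption

tagger = LemmaTagger()
tagger.load('lemma_model', device='cpu', dataset_device='cpu')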
Example #7
    def __init__(self, labels_num, vec_emb_dim=None,
                 alphabet_size=0, char_pad_idx=0, rnn_emb_dim=None,
                 cnn_emb_dim=None, cnn_kernels=[1, 2, 3, 4, 5, 6],
                 tag_emb_params=None, emb_out_dim=512,
                 lstm_bidirectional=True, lstm_hidden_dim=256, lstm_layers=1,
                 lstm_do=0, bn1=True, do1=.2, bn2=True, do2=.5,
                 bn3=True, do3=.4):
        if isinstance(cnn_kernels, Iterable):
            cnn_kernels = list(cnn_kernels)
        args, kwargs = \
            get_func_params(BaseTaggerSequenceModel.__init__, locals())
        super().__init__(*args, **kwargs)

        self.vec_emb_dim = vec_emb_dim

        if rnn_emb_dim:
            self._rnn_emb_l = \
                CharEmbeddingRNN(alphabet_size=alphabet_size,
                                 emb_dim=rnn_emb_dim,
                                 pad_idx=char_pad_idx)
        else:
            self._rnn_emb_l = None
            rnn_emb_dim = 0

        if cnn_emb_dim:
            self._cnn_emb_l = \
                CharEmbeddingCNN(alphabet_size=alphabet_size,
                                 emb_dim=cnn_emb_dim,
                                 pad_idx=char_pad_idx,
                                 kernels=cnn_kernels)
        else:
            self._cnn_emb_l = None
            cnn_emb_dim = 0

        self._tag_emb_l = self._tag_emb_ls = None
        tag_emb_dim = 0
        if tag_emb_params:
            if isinstance(tag_emb_params, dict):
                tag_emb_dim = tag_emb_params['dim'] or 0
                if tag_emb_dim:
                    self._tag_emb_l = \
                        nn.Embedding(tag_emb_params['num'], tag_emb_dim,
                                     padding_idx=tag_emb_params['pad_idx'])
            else:
                self._tag_emb_ls = nn.ModuleList()
                for emb_params in tag_emb_params:
                    tag_emb_dim_ = emb_params['dim']
                    if tag_emb_dim_:
                        tag_emb_dim += tag_emb_dim_
                        self._tag_emb_ls.append(
                            nn.Embedding(emb_params['num'], tag_emb_dim_,
                                         padding_idx=emb_params['pad_idx'])
                        )
                    else:
                        self._tag_emb_ls.append(None)

        self._bn1 = \
            nn.BatchNorm1d(num_features=vec_emb_dim
                                      + rnn_emb_dim
                                      + cnn_emb_dim
                                      + tag_emb_dim) if bn1 else None
        self._do1 = nn.Dropout(p=do1) if do1 else None

        self._emb_fc_l = nn.Linear(
            in_features=vec_emb_dim + rnn_emb_dim + cnn_emb_dim
                                    + tag_emb_dim,
            out_features=emb_out_dim
        )
        self._bn2 = \
            nn.BatchNorm1d(num_features=emb_out_dim) if bn2 else None
        self._do2 = nn.Dropout(p=do2) if do2 else None

        self._lstm_l = nn.LSTM(input_size=emb_out_dim,
                               hidden_size=lstm_hidden_dim,
                               num_layers=lstm_layers, batch_first=True,
                               dropout=lstm_do,
                               bidirectional=lstm_bidirectional)
        if lstm_bidirectional:
            lstm_hidden_dim *= 2
        self._T = nn.Linear(emb_out_dim, lstm_hidden_dim)
        nn.init.constant_(self._T.bias, -1)

        self._bn3 = \
            nn.BatchNorm1d(num_features=lstm_hidden_dim) if bn3 else None
        self._do3 = nn.Dropout(p=do3) if do3 else None

        self._out_l = nn.Linear(in_features=lstm_hidden_dim,
                                out_features=labels_num)

        setattr(self, CONFIG_ATTR, (args, kwargs))
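An illustrative instantiation of the sequence model defined above; the class
is assumed to be importable from this package, and every size below is a
placeholder rather than a recommended setting.

model = BaseTaggerSequenceModel(
    labels_num=40,       # output label count (placeholder)
    vec_emb_dim=768,     # external word-vector width (placeholder)
    alphabet_size=120, rnn_emb_dim=32, cnn_emb_dim=200,
    lstm_hidden_dim=256, lstm_layers=1,
)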
Example #8
    def train(self, save_as,
              device=None, control_metric='accuracy', max_epochs=None,
              min_epochs=0, bad_epochs=5, batch_size=TRAIN_BATCH_SIZE,
              max_grad_norm=None, tags_to_remove=None, word_emb_type='bert',
              word_emb_path='xlm-roberta-base', word_transform_kwargs=None,
                  # BertDataset.transform() (for BERT-descendant models)
                  # params:
                  # {'max_len': 0, 'batch_size': 64, 'hidden_ids': '10',
                  #  'aggregate_hiddens_op': 'cat',
                  #  'aggregate_subtokens_op': 'absmax', 'to': junky.CPU,
                  #  'loglevel': 1}
                  # WordDataset.transform() (for other models) params:
                  # {'check_lower': True}
              stage1_params=None,
                  # {'lr': .0001, 'betas': (0.9, 0.999), 'eps': 1e-8,
                  #  'weight_decay': 0, 'amsgrad': False,
                  #  'max_epochs': None, 'min_epochs': None,
                  #  'bad_epochs': None, 'batch_size': None,
                  #  'max_grad_norm': None}
              stage2_params=None,
                  # {'lr': .001, 'momentum': .9, 'weight_decay': 0,
                  #  'dampening': 0, 'nesterov': False,
                  #  'max_epochs': None, 'min_epochs': None,
                  #  'bad_epochs': None, 'batch_size': None,
                  #  'max_grad_norm': None}
              stage3_params={'save_as': None},
                  # {'save_as': None, 'epochs': 3, 'batch_size': 8,
                  #  'lr': 2e-5, 'betas': (0.9, 0.999), 'eps': 1e-8,
                  #  'weight_decay': .01, 'amsgrad': False,
                  #  'num_warmup_steps': 3, 'max_grad_norm': 1.}
              stages=[1, 2, 3, 1, 2], save_stages=False, load_from=None,
              learn_on_padding=True, remove_padding_intent=False,
              seed=None, start_time=None, keep_embs=False, log_file=LOG_FILE,
              rnn_emb_dim=None, cnn_emb_dim=None, cnn_kernels=range(1, 7),
              emb_bn=True, emb_do=.2,
              final_emb_dim=512, pre_bn=True, pre_do=.5,
              lstm_layers=1, lstm_do=0, tran_layers=0, tran_heads=8,
              post_bn=True, post_do=.4):
        """Creates and trains the UPOS tagger model.

        During training, the best model is saved after each successful epoch.

        *Training's args*:

        **save_as** (`str`): the name used to save the model's head. Refer to
        the `.save()` method's help for the broad definition (see the **name**
        arg there).

        **device** (`str`, default is `None`): the device for the model. E.g.:
        'cuda:0'. If `None`, we don't move the model to any device (it is
        placed right where it's created).

        **control_metric** (`str`; default is `'accuracy'`): the metric that
        controls training. Any metric supported by the `junky.train()` method
        may be used. At the moment, these are: 'accuracy', 'f1', 'loss',
        'precision', and 'recall'.

        **max_epochs** (`int`; default is `None`): the maximal number of
        epochs for the model's head training (stages types `1` and `2`). If
        `None` (default), the training continues until **bad_epochs** is
        reached, but no fewer than **min_epochs** epochs are run.

        **min_epochs** (`int`; default is `0`): the minimal number of training
        epochs for the model's head training (stages types `1` and `2`).

        **bad_epochs** (`int`; default is `5`): the maximal allowed number of
        bad epochs (epochs in which the chosen **control_metric** does not
        improve) in a row for the model's head training (stages types `1` and
        `2`).

        **batch_size** (`int`; default is `32`): the number of sentences per
        batch for the model's head training (stages types `1` and `2`).

        **max_grad_norm** (`float`; default is `None`): the gradient clipping
        parameter for the model's head training (stages types `1` and `2`).

        **tags_to_remove** (`dict({str: str}) | dict({str: list([str])})`;
        default is `None`): tags whose tokens must be removed from the corpus.
        It's a `dict` with field names as keys and the values you want to
        remove. Applied only to fields with atomic values (like UPOS). This
        argument may be used, for example, to remove some infrequent or just
        excess tags from the corpus. Note that we remove the tokens from the
        train corpus completely, not just replace those tags with `None`.

        *Word embedding params*:

        **word_emb_type** (`str`; default is `'bert'`): one of (`'bert'` |
        `'glove'` | `'ft'` | `'w2v'`) embedding types.

        **word_emb_path** (`str`): the path to the word embeddings storage.

        **word_transform_kwargs** (`dict`; default is `None`): keyword
        arguments for the `.transform()` method of the dataset created to
        convert sentences to word embeddings. See the `.transform()` method of
        either `junky.datasets.BertDataset` (if **word_emb_type** is `'bert'`)
        or `junky.datasets.WordDataset` (otherwise) if you want to learn the
        allowed values for the parameter. If `None`, the `.transform()` method
        uses its defaults.

        *Training stages params*:

        **stage1_params** (`dict`; default is `None`): keyword arguments for
        the `BaseModel.adjust_model_for_train()` method. If `None`, the
        defaults are used. Also, you can specify here new values for the
        arguments **max_epochs**, **min_epochs**, **bad_epochs**,
        **batch_size**, and **max_grad_norm** that will be used only on stages
        of type `1`.

        **stage2_params** (`dict`; default is `None`): keyword arguments for
        the `BaseModel.adjust_model_for_tune()` method. If `None`, the
        defaults are used. Also, you can specify here new values for the
        arguments **max_epochs**, **min_epochs**, **bad_epochs**,
        **batch_size**, and **max_grad_norm** that will be used only on stages
        of type `2`.

        **stage3_params** (`dict`; default is `None`): keyword arguments for
        the `WordEmbeddings.full_tune()` method. If `None`, the defaults are
        used.

        **stages** (`list([int])`; default is `[1, 2, 3, 1, 2]`): which stages
        we should use during training and in which order. On a stage of type
        `1` the model head is trained with the *Adam* optimizer; a stage of
        type `2` is similar, but the optimizer is *SGD*; a stage of type `3`
        is only relevant when **word_emb_type** is `'bert'` and we want to
        tune the whole model. Stage type `0` defines a skip-stage, i.e. there
        is no real training on it. It is used when you need reproducibility
        and want to continue training the model from some particular stage.
        In this case, you specify the name of the model saved on that stage
        in the **load_from** parameter and put zeros into the **stages** list
        in the places of the already finished stages. To reiterate: it is
        used for reproducibility only, i.e. when you put some particular
        value into the **seed** param and want the data order in batches to
        be equivalent to the data order on the stages of past trainings.

        **save_stages** (`bool`; default is `False`): whether to keep the best
        model of each stage besides the overall best model. The names of these
        models have the suffix `_<idx>(stage<stage_type>)` where `<idx>` is
        the ordinal number of the stage. We can then use them to continue
        training from any particular stage number (changing the next stages
        or their parameters) using the **load_from** parameter. Note that we
        save only stages of the head model. The embedding model, as a part of
        the full model, is usually tuned only once, so we don't make a copy
        of it.

        **load_from** (`str`; default is `None`): if you want to continue
        training from one of the previously saved stages, you can specify the
        name of the model from that stage. Note that if your model has
        already been trained on a stage of type `3`, you should set the
        **word_emb_path** param to `None`; otherwise, you'll load the wrong
        embedding model. Any other params of the model may be overwritten
        (and most likely, this would cause an error), but they are equivalent
        whether the training is just starting or is being continued. The
        **word_emb_path**, however, is different if you have already passed a
        stage of type `3`, so don't forget to set it to `None` in that case.
        (Example: you want to repeat training on stage no. `5`, so you
        specify something like `'model_4(stage1)'` in the **load_from**
        param, set **word_emb_path** to `None`, and set the **stages** param
        to `[0, 0, 0, 0, 2]`; or, if you don't care about reproducibility,
        you could just specify `[2]` here.)

        *Other options*:

        **learn_on_padding** (`bool`; default is `True`): while training, we
        can calculate the loss either taking into account predictions made
        for padding tokens or ignoring them. The common practice is not to
        use padding when calculating the loss. However, we note that using
        padding sometimes makes the resulting model's performance slightly
        better.

        **remove_padding_intent** (`bool`; default is `False`): if you set
        the **learn_on_padding** param to `False`, you may want not to use
        the padding intent during training at all. I.e. padding tokens would
        be tagged with some of the real tags and would just be ignored when
        computing the loss. As a result, the output dimensionality of the
        model's final layer would be reduced by one. At first sight, such an
        approach could increase the performance, but in our experiments this
        effect did not always appear.

        **seed** (`int`; default is `None`): the init value for the random
        number generator if you need reproducibility. Note that each stage
        will have its own seed value, and the **seed** param is used to
        calculate these values.

        **start_time** (`float`; default is `None`): the result of
        `time.time()` to start with. If `None`, the arg will be initialized
        anew.

        **keep_embs** (`bool`; default is `False`): by default, after creating
        `Dataset` objects, we remove word embedding models to free memory.
        With `keep_embs=True` this operation is omitted, and you can use the
        `.embs` attribute to share embedding models with other objects.

        **log_file** (`file`; default is `sys.stdout`): the stream for info
        messages.

        *The model hyperparameters*:

        **rnn_emb_dim** (`int`; default is `None`): the internal character RNN
        (LSTM) embedding dimensionality. If `None`, the layer is skipped.

        **cnn_emb_dim** (`int`; default is `None`): the internal character CNN
        embedding dimensionality. If `None`, the layer is skipped.

        **cnn_kernels** (`list([int])`; default is `[1, 2, 3, 4, 5, 6]`): CNN
        kernel sizes of the internal CNN embedding layer. Relevant if
        **cnn_emb_dim** is not `None`.

        **emb_bn** (`bool`; default is `True`): whether a batch normalization
        layer should be applied after the embedding concatenation.

        **emb_do** (`float`; default is `.2`): the dropout rate after the
        embedding concatenation.

        **final_emb_dim** (`int`; default is `512`): the output
        dimensionality of the linear transformation applied to the
        concatenated embeddings.

        **pre_bn** (`bool`; default is `True`): whether a batch normalization
        layer should be applied before the main part of the algorithm.

        **pre_do** (`float`; default is `.5`): the dropout rate before the
        main part of the algorithm.

        **lstm_layers** (`int`; default is `1`): the number of Bidirectional
        LSTM layers. If `None`, they are not created.

        **lstm_do** (`float`; default is `0`): the dropout between LSTM
        layers. Only relevant if `lstm_layers` > `1`.

        **tran_layers** (`int`; default is `None`): the number of Transformer
        Encoder layers. If `None`, they are not created.

        **tran_heads** (`int`; default is `8`): the number of attention heads
        of the Transformer Encoder layers. Only relevant if **tran_layers**
        is not `None`.

        **post_bn** (`bool`; default is `True`): whether a batch normalization
        layer should be applied after the main part of the algorithm.

        **post_do** (`float`; default is `.4`): the dropout rate after the
        main part of the algorithm.

        The method returns the train statistics.
        """
        if not start_time:
            start_time = time.time()
        args, kwargs = get_func_params(UposTagger.train, locals())

        self._cdict = CorpusDict(
            corpus=(x for x in [self._train_corpus,
                                self._test_corpus if self._test_corpus else
                                []]
                      for x in x),
            format='conllu_parsed', log_file=log_file
        )

        return super().train(self._field, None, UposTaggerModel, None,
                             *args, **kwargs)
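A hedged sketch of the typical call sequence around the method above. The
import path, the corpus file names, the saved-model name 'upos_model', and the
`load_train_corpus()` / `load_test_corpus()` helpers are assumptions drawn
from the rest of this API, not guaranteed names.

from mordl import UposTagger  # import path is an assumption

tagger = UposTagger()
tagger.load_train_corpus('train.conllu')   # assumed corpus-loading helper
tagger.load_test_corpus('dev.conllu')      # assumed corpus-loading helper
stat = tagger.train('upos_model',
                    device='cuda:0',
                    word_emb_type='bert',
                    word_emb_path='xlm-roberta-base',
                    stages=[1, 2, 3, 1, 2],
                    seed=42)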
Example #9
    def predict(self,
                corpus,
                use_cdict_coef=False,
                with_orig=False,
                batch_size=BATCH_SIZE,
                split=None,
                clone_ds=False,
                save_to=None,
                log_file=LOG_FILE):
        """Predicts tags in the DEPREL field of the corpus.

        Args:

        **corpus**: the corpus which will be used for the feature extraction
        and predictions. May be either the name of the file in *CoNLL-U*
        format or the `list`/`iterator` of sentences in *Parsed CoNLL-U*.

        **use_cdict_coef** (`bool` | `float`; default is `False`): if `False`,
        we use our prediction only. If `True`, we replace our prediction with
        the value returned by the `corpuscula.CorpusDict.predict_<field>()`
        method if its `coef` >= `.99`. You can also specify your own threshold
        as the value of the param.

        **with_orig** (`bool`; default is `False`): if `True`, instead of just
        the sequence with predicted labels, return the sequence of tuples
        where the first element is the sentence with predicted labels and the
        second element is the original sentence. **with_orig** can be `True`
        only if **save_to** is `None`.

        **batch_size** (`int`; default is `64`): the number of sentences per
        batch.

        **split** (`int`; default is `None`): the number of lines in each
        split. Allows processing a large dataset in pieces ("splits"). If
        **split** is `None` (default), the whole dataset is processed without
        splits.

        **clone_ds** (`bool`; default is `False`): if `True`, the dataset is
        cloned and transformed. If `False`, `transform_collate` is used
        without cloning the dataset. There is no big difference between the
        variants; both should produce identical results.

        **save_to** (`str`; default is `None`): the file name where the
        predictions will be saved.

        **log_file** (`file`; default is `sys.stdout`): the stream for info
        messages.

        Returns the corpus with tags predicted in the DEPREL field.
        """
        assert self._ds is not None, \
               "ERROR: The tagger doesn't have a dataset. Call the train() " \
               'method first'
        assert not with_orig or save_to is None, \
               'ERROR: `with_orig` can be True only if save_to is None'
        args, kwargs = get_func_params(FeatTagger.predict, locals())

        kwargs['save_to'] = None

        corpus2 = None
        if self._supp_tagger:
            kwargs2 = kwargs.copy()
            kwargs2['with_orig'] = True
            corpus2 = self._supp_tagger.predict(*args, **kwargs2)

        corpus = super().predict(*args, **kwargs)

        def add_root(corpus):
            for sent in corpus:
                sent0 = sent[0] if isinstance(sent, tuple) \
                               and not isinstance(sent[0], tuple) \
                               and not isinstance(sent[1], tuple) else \
                        sent
                if isinstance(sent0, tuple):
                    sent0 = sent0[0]
                if corpus2:
                    sent2 = next(corpus2)[1]
                    if isinstance(sent2, tuple):
                        sent2 = sent2[0]
                for i, tok in enumerate(sent0):
                    if tok['HEAD'] == '0':
                        tok['DEPREL'] = 'root'
                    elif corpus2 and tok['DEPREL'] == 'root':
                        tok['DEPREL'] = sent2[i]['DEPREL']
                yield sent

        corpus = add_root(corpus)

        if save_to:
            self.save_conllu(corpus, save_to, log_file=None)
            corpus = self._get_corpus(save_to, asis=True, log_file=log_file)

        return corpus
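A standalone toy illustration of the root-assignment rule applied by
`add_root()` above: any token whose HEAD is '0' gets the DEPREL 'root'. The
two-token sentence below is made-up data in the Parsed CoNLL-U dict form used
by this code, not real tagger output.

sent = [{'ID': '1', 'FORM': 'Time', 'HEAD': '2', 'DEPREL': 'nsubj'},
        {'ID': '2', 'FORM': 'flies', 'HEAD': '0', 'DEPREL': '_'}]
for tok in sent:
    if tok['HEAD'] == '0':
        tok['DEPREL'] = 'root'
print([tok['DEPREL'] for tok in sent])  # ['nsubj', 'root']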
Example #10
    def predict(self,
                corpus,
                with_orig=False,
                batch_size=BATCH_SIZE,
                split=None,
                clone_ds=False,
                save_to=None,
                log_file=LOG_FILE):
        """Predicts tags in the DEPREL field of the corpus.

        Args:

        **corpus**: a corpus which will be used for feature extraction and
        predictions. May be either a name of the file in *CoNLL-U* format or a
        list/iterator of sentences in *Parsed CoNLL-U*.

        **with_orig** (`bool`): if `True`, instead of only a sequence with
        predicted labels, returns a sequence of tuples where the first element
        is a sentence with predicted labels and the second element is the
        original sentence. `with_orig` can be `True` only if `save_to` is
        `None`. Default `with_orig=False`.

        **batch_size** (`int`): number of sentences per batch. Default
        `batch_size=64`.

        **split** (`int`): the number of lines in each split. Allows
        processing a large dataset in pieces ("splits"). Default `split=None`,
        i.e. process the full dataset without splits.

        **clone_ds** (`bool`): if `True`, the dataset is cloned and
        transformed. If `False`, `transform_collate` is used without cloning
        the dataset. There is no big difference between the variants; both
        should produce identical results.

        **save_to**: file name where the predictions will be saved.

        **log_file**: a stream for info messages. Default is `sys.stdout`.

        Returns corpus with tags predicted in the DEPREL field.
        """
        assert self._ds is not None, \
               "ERROR: The tagger doesn't have a dataset. Call the train() " \
               'method first'
        assert self._model, \
               "ERROR: The tagger doesn't have a model. Call the train() " \
               'method first'
        assert not with_orig or save_to is None, \
               'ERROR: `with_orig` can be True only if save_to is None'
        args, kwargs = get_func_params(DeprelTagger.predict, locals())

        if self._feats_prune_coef != 0:
            key_vals = self._ds.get_dataset('t_0').transform_dict
            corpus = self._get_corpus(corpus, asis=True, log_file=log_file)
            corpus = self._transform_upos(corpus, key_vals=key_vals)

        field = 'DEPREL'
        add_fields = self._normalize_field_names('UPOS')

        def process(corpus):

            corpus = self._get_corpus(corpus, asis=True, log_file=log_file)
            device = next(self._model.parameters()).device or junky.CPU

            ds_y = self._ds.get_dataset('y')
            if clone_ds:
                ds = self._ds.clone()
                ds.remove('y')

            for start in itertools.count(step=split if split else 1):
                if isinstance(corpus, Iterator):
                    if split:
                        corpus_ = []
                        for i, sentence in enumerate(corpus, start=1):
                            corpus_.append(sentence)
                            if i == split:
                                break
                    else:
                        corpus_ = list(corpus)
                else:
                    if split:
                        corpus_ = corpus[start:start + split]
                    else:
                        corpus_ = corpus
                if not corpus_:
                    break

                orig_corpus = corpus_
                corpus_, _, restore_data = self._preprocess_corpus(corpus_)

                res = \
                    junky.extract_conllu_fields(
                        corpus_, fields=add_fields,
                        with_empty=True, return_nones=True
                    )
                sentences, tags, empties, nones = \
                    res[0], res[1:-2], res[-2], res[-1]
                if clone_ds:
                    self._transform(sentences,
                                    tags=tags,
                                    batch_size=batch_size,
                                    ds=ds,
                                    log_file=log_file)
                    loader = ds.create_loader(batch_size=batch_size,
                                              shuffle=False)
                else:
                    loader = self._transform_collate(sentences,
                                                     tags=tags,
                                                     batch_size=batch_size,
                                                     log_file=log_file)
                preds = []
                for batch in loader:
                    batch = junky.to_device(batch, device)
                    with torch.no_grad():
                        pred = self._model(*batch)
                    pred_indices = pred.argmax(-1)
                    preds.extend(pred_indices.cpu().numpy().tolist())
                values = ds_y.reconstruct(preds)
                if with_orig:
                    res_corpus_ = deepcopy(orig_corpus)
                    res_corpus_ = \
                        self._postprocess_corpus(res_corpus_,
                                                 values, restore_data)
                    for orig_sentence, sentence in zip(orig_corpus,
                                                       res_corpus_):
                        yield sentence, orig_sentence
                else:
                    res_corpus_ = \
                        self._postprocess_corpus(orig_corpus,
                                                 values, restore_data)
                    for sentence in res_corpus_:
                        yield sentence

        corpus = process(corpus)

        if self._feats_prune_coef != 0:
            corpus = self._restore_upos(corpus)

        if save_to:
            self.save_conllu(corpus, save_to, log_file=None)
            corpus = self._get_corpus(save_to, asis=True, log_file=log_file)
        return corpus
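A hedged usage sketch for the method above; the import path, the model name
'deprel_model', and the file name are placeholders.

from mordl import DeprelTagger  # import path is an assumption

tagger = DeprelTagger()
tagger.load('deprel_model')
# with_orig pairs each tagged sentence with its original, so save_to must
# stay None, as documented above
for tagged, orig in tagger.predict('test.conllu', with_orig=True):
    pass  # compare tagged vs. orig here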
Example #11
    def predict(self,
                corpus,
                use_cdict_coef=False,
                with_orig=False,
                batch_size=BATCH_SIZE,
                split=None,
                clone_ds=False,
                save_to=None,
                log_file=LOG_FILE,
                **_):
        """Predicts values in the certain feature of the key-value type field
        of the specified **corpus**.

        Args:

        **corpus**: a corpus which will be used for feature extraction and
        predictions. May be either a name of the file in *CoNLL-U* format or a
        list/iterator of sentences in *Parsed CoNLL-U*.

        **use_cdict_coef** (`bool` | `float`; default is `False`): if `False`,
        we use our prediction only. If `True`, we replace our prediction with
        the value returned by the `corpuscula.CorpusDict.predict_<field>()`
        method if its `coef` >= `.99`. You can also specify your own threshold
        as the value of the param.

        **with_orig** (`bool`): if `True`, instead of only a sequence with
        predicted labels, returns a sequence of tuples where the first element
        is a sentence with predicted labels and the second element is the
        original sentence. `with_orig` can be `True` only if `save_to` is
        `None`. Default `with_orig=False`.

        **batch_size** (`int`): number of sentences per batch. Default
        `batch_size=64`.

        **split** (`int`): the number of lines in each split. Allows
        processing a large dataset in pieces ("splits"). Default `split=None`,
        i.e. process the full dataset without splits.

        **clone_ds** (`bool`): if `True`, the dataset is cloned and
        transformed. If `False`, `transform_collate` is used without cloning
        the dataset. There is no big difference between the variants; both
        should produce identical results.

        **save_to**: file name where the predictions will be saved.

        **log_file**: a stream for info messages. Default is `sys.stdout`.

        Returns the corpus with the values of the certain feature predicted
        in the key-value type field.
        """
        assert self._ds is not None, \
               "ERROR: The tagger doesn't have a dataset. Call the train() " \
               'method first'
        assert not with_orig or save_to is None, \
               'ERROR: `with_orig` can be True only if save_to is None'
        args, kwargs = get_func_params(FeatTagger.predict, locals())

        if self._feats_prune_coef != 0:
            kwargs['save_to'] = None
            key_vals = self._ds.get_dataset('t_0').transform_dict
            corpus = self._get_corpus(corpus, asis=True, log_file=log_file)
            corpus = self._transform_upos(corpus, key_vals=key_vals)

        corpus = super().predict(self._field, 'UPOS', corpus, **kwargs)

        if self._feats_prune_coef != 0:
            corpus = self._restore_upos(corpus)

            if save_to:
                self.save_conllu(corpus, save_to, log_file=None)
                corpus = self._get_corpus(save_to,
                                          asis=True,
                                          log_file=log_file)

        return corpus
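A hedged usage sketch for the method above; the import path, the feat name
'FEATS:Case' passed to the constructor, the model name, and the file names are
placeholders.

from mordl import FeatTagger  # import path is an assumption

tagger = FeatTagger('FEATS:Case')          # one key of the FEATS field
tagger.load('feats-case_model')
corpus = tagger.predict('test.conllu', save_to='test_case.conllu')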
Example #12
    def __init__(self,
                 num_labels,
                 labels_pad_idx=-100,
                 vec_emb_dim=None,
                 alphabet_size=0,
                 char_pad_idx=0,
                 rnn_emb_dim=None,
                 cnn_emb_dim=None,
                 cnn_kernels=[1, 2, 3, 4, 5, 6],
                 tag_emb_params=None,
                 emb_bn=True,
                 emb_do=.2,
                 final_emb_dim=512,
                 pre_bn=True,
                 pre_do=.5,
                 lstm_layers=1,
                 lstm_do=0,
                 tran_layers=None,
                 tran_heads=8,
                 post_bn=True,
                 post_do=.4):
        if isinstance(cnn_kernels, Iterable):
            cnn_kernels = list(cnn_kernels)
        args, kwargs = get_func_params(BaseTaggerModel.__init__, locals())
        super().__init__(*args, **kwargs)

        assert final_emb_dim % 2 == 0, \
            'ERROR: `final_emb_dim` must be even ' \
           f"(now it's `{final_emb_dim}`)."

        self.num_labels = num_labels
        self.labels_pad_idx = labels_pad_idx
        if vec_emb_dim is None:
            vec_emb_dim = 0
        self.vec_emb_dim = vec_emb_dim

        if rnn_emb_dim:
            self.rnn_emb_l = \
                CharEmbeddingRNN(alphabet_size=alphabet_size,
                                 emb_dim=rnn_emb_dim,
                                 pad_idx=char_pad_idx)
        else:
            self.rnn_emb_l = None
            rnn_emb_dim = 0

        if cnn_emb_dim:
            self.cnn_emb_l = \
                CharEmbeddingCNN(alphabet_size=alphabet_size,
                                 emb_dim=cnn_emb_dim,
                                 pad_idx=char_pad_idx,
                                 kernels=cnn_kernels)
        else:
            self.cnn_emb_l = None
            cnn_emb_dim = 0

        self.tag_emb_l = self.tag_emb_ls = None
        tag_emb_dim = 0
        if tag_emb_params:
            if isinstance(tag_emb_params, dict):
                tag_emb_dim = tag_emb_params['dim'] or 0
                if tag_emb_dim:
                    self.tag_emb_l = \
                        nn.Embedding(tag_emb_params['num'], tag_emb_dim,
                                     padding_idx=tag_emb_params['pad_idx'])
            else:
                self.tag_emb_ls = nn.ModuleList()
                for emb_params in tag_emb_params:
                    tag_emb_dim_ = emb_params['dim']
                    if tag_emb_dim_:
                        tag_emb_dim += tag_emb_dim_
                        self.tag_emb_ls.append(
                            nn.Embedding(emb_params['num'],
                                         tag_emb_dim_,
                                         padding_idx=emb_params['pad_idx']))
                    else:
                        self.tag_emb_ls.append(None)

        joint_emb_dim = vec_emb_dim + rnn_emb_dim + cnn_emb_dim + tag_emb_dim
        assert joint_emb_dim, \
            'ERROR: At least one of `*_emb_dim` must be specified.'

        # PREPROCESS #########################################################
        modules = OrderedDict()
        if emb_bn:
            modules['emb_bn'] = BatchNorm(num_features=joint_emb_dim)
        if emb_do:
            modules['emb_do'] = nn.Dropout(p=emb_do)

        layers = []

        def add_layers(dim, new_dim):
            ls = []
            ls.append(
                ('pre_fc{}_l', nn.Linear(in_features=new_dim,
                                         out_features=dim)))
            if pre_bn:
                ls.append(('pre_bn{}', BatchNorm(num_features=dim)))
            ls.append(('pre_nl{}', nn.ReLU()))
            if pre_do:
                ls.append(('pre_do{}', nn.Dropout(p=pre_do)))
            layers.append(ls)

        dim = final_emb_dim
        while joint_emb_dim / dim > 2:
            new_dim = int(dim * 1.5)
            add_layers(dim, new_dim)
            dim = new_dim
        add_layers(dim, joint_emb_dim)
        for idx, layer in enumerate(reversed(layers)):
            for name, module in layer:
                modules[name.format(idx)] = module

        #modules['pre_fc_l'] = nn.Linear(in_features=dim,
        #                                out_features=final_emb_dim)
        #if pre_bn:
        #    modules['pre_bn'] = BatchNorm(num_features=final_emb_dim)
        #modules['pre_nl'] = nn.ReLU()
        #if pre_do:
        #    modules['pre_do'] = nn.Dropout(p=pre_do)
        self.pre_seq_l = nn.Sequential(modules)
        ######################################################################

        if lstm_layers:
            self.lstm_l = nn.LSTM(input_size=final_emb_dim,
                                  hidden_size=final_emb_dim // 2,
                                  num_layers=lstm_layers,
                                  batch_first=True,
                                  dropout=lstm_do,
                                  bidirectional=True)
            self.T = nn.Linear(final_emb_dim, final_emb_dim)
            nn.init.constant_(self.T.bias, -1)
        else:
            self.lstm_l = None

        if tran_layers:
            tran_enc_l = nn.TransformerEncoderLayer(
                final_emb_dim,
                tran_heads,
                dim_feedforward=2048,
                dropout=0.1,
                activation='relu'  #, layer_norm_eps=1e-5
            )
            tran_norm_l = nn.LayerNorm(normalized_shape=final_emb_dim,
                                       eps=1e-6,
                                       elementwise_affine=True)
            self.tran_l = nn.TransformerEncoder(tran_enc_l,
                                                tran_layers,
                                                norm=tran_norm_l)
        else:
            self.tran_l = None

        # POSTPROCESS ########################################################
        modules = OrderedDict()
        if post_bn:
            modules['post_bn'] = BatchNorm(num_features=final_emb_dim)
        if post_do:
            modules['post_do'] = nn.Dropout(p=post_do)

        modules['out_fc_l'] = nn.Linear(in_features=final_emb_dim,
                                        out_features=num_labels)
        self.post_seq_l = nn.Sequential(modules)
        ######################################################################

        setattr(self, CONFIG_ATTR, (args, kwargs))
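An illustrative instantiation of the model defined above, showing the two
accepted forms of `tag_emb_params` (a single dict, or a list of dicts for
several tag embeddings). The class is assumed to be importable from this
package, and all sizes are placeholders.

model = BaseTaggerModel(
    num_labels=40, vec_emb_dim=768,
    alphabet_size=120, cnn_emb_dim=200,
    tag_emb_params={'dim': 200, 'num': 18, 'pad_idx': 0},   # single embedding
)
multi = BaseTaggerModel(
    num_labels=40, vec_emb_dim=768,
    tag_emb_params=[{'dim': 200, 'num': 18, 'pad_idx': 0},
                    {'dim': 30, 'num': 4, 'pad_idx': 0}],   # several embeddings
)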
Example #13
    def predict(self, corpus, use_cdict_coef=False, with_orig=False,
                batch_size=BATCH_SIZE, split=None, clone_ds=False,
                save_to=None, log_file=LOG_FILE):
        """Predicts tags in the LEMMA field of the corpus.

        Args:

        **corpus**: the corpus which will be used for the feature extraction
        and predictions. May be either the name of the file in *CoNLL-U*
        format or the `list`/`iterator` of sentences in *Parsed CoNLL-U*.

        **use_cdict_coef** (`bool` | `float`; default is `False`): if `False`,
        we use our prediction only. If `True`, we replace our prediction with
        the value returned by the `corpuscula.CorpusDict.predict_<field>()`
        method if its `coef` >= `.99`. You can also specify your own threshold
        as the value of the param.

        **with_orig** (`bool`; default is `False`): if `True`, instead of just
        the sequence with predicted labels, return the sequence of tuples
        where the first element is the sentence with predicted labels and the
        second element is the original sentence. **with_orig** can be `True`
        only if **save_to** is `None`.

        **batch_size** (`int`; default is `64`): the number of sentences per
        batch.

        **split** (`int`; default is `None`): the number of lines in each
        split. Allows processing a large dataset in pieces ("splits"). If
        **split** is `None` (default), the whole dataset is processed without
        splits.

        **clone_ds** (`bool`; default is `False`): if `True`, the dataset is
        cloned and transformed. If `False`, `transform_collate` is used
        without cloning the dataset. There is no big difference between the
        variants; both should produce identical results.

        **save_to** (`str`; default is `None`): the file name where the
        predictions will be saved.

        **log_file** (`file`; default is `sys.stdout`): the stream for info
        messages.

        Returns the corpus with lemmata predicted.
        """
        assert self._ds is not None, \
               "ERROR: The tagger doesn't have a dataset. Call the train() " \
               'method first'
        assert not with_orig or save_to is None, \
               'ERROR: `with_orig` can be True only if save_to is None'
        args, kwargs = get_func_params(LemmaTagger.predict, locals())
        del kwargs['use_cdict_coef']
        kwargs['save_to'] = None

        cdict = self._cdict
        yof = len([x for x in cdict._wforms if 'ё' in x or 'Ё' in x])
        yol = len([x for x in cdict._lemmata if 'ё' in x or 'Ё' in x])
        remove_yo = yol / yof < 10

        def apply_editops(str_from, upos, ops_t, isfirst):
            if str_from and ops_t not in [None, (None,)]:
                str_from_ = None
                if use_cdict_coef not in [None, False]:
                    str_from_, coef = \
                        cdict.predict_lemma(str_from, upos, isfirst=isfirst)
                    if coef is not None \
                   and coef >= (CDICT_COEF_THRESH \
                                    if use_cdict_coef is True else \
                                use_cdict_coef):
                        str_from = str_from_
                    else:
                        str_from_ = None
                if str_from_ is None:
                    try:
                        ops_p, ops_s, ops_c = ops_t
                        str_from_ = ''.join(reversed(
                            self.apply_editops(reversed(
                                self.apply_editops(str_from, ops_p)
                            ), ops_s)
                        ))
                        if str_from_:
                            str_from = str_from_
                            if 'ё' in str_from or 'Ё' in str_from:
                                if not cdict.lemma_isknown(str_from, upos):
                                    str_from_ = str_from.replace('ё', 'е') \
                                                        .replace('Ё', 'Е')
                                    if remove_yo \
                                    or cdict.lemma_isknown(str_from_, upos):
                                        str_from = str_from_
                    except IndexError:
                        pass
                    if ops_c == _OP_C_LOWER:
                        str_from = str_from.lower()
                    elif ops_c == _OP_C_TITLE:
                        str_from = str_from.title()
            return str_from

        def process(corpus):
            for sentence in corpus:
                sentence_ = sentence[0] if with_orig else sentence
                if isinstance(sentence_, tuple):
                    sentence_ = sentence_[0]
                isfirst = True
                for token in sentence_:
                    id_, form = token['ID'], token['FORM']
                    if form and '-' not in id_:
                        token[self._field] = \
                            apply_editops(form, token['UPOS'],
                                          token[self._field], isfirst=isfirst)
                        isfirst = False
                yield sentence

        key_vals = self._ds.get_dataset('t_0').transform_dict
        corpus = self._get_corpus(corpus, asis=True, log_file=log_file)
        corpus = self._transform_upos(corpus, key_vals=key_vals)
        corpus = super().predict(self._field, 'UPOS', corpus, **kwargs)
        corpus = self._restore_upos(corpus)
        corpus = process(corpus)

        if save_to:
            self.save_conllu(corpus, save_to, log_file=None)
            corpus = self._get_corpus(save_to, asis=True, log_file=log_file)
        return corpus
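A hedged usage sketch for the method above; the import path, the model name
'lemma_model', and the file names are placeholders.

from mordl import LemmaTagger  # import path is an assumption

tagger = LemmaTagger()
tagger.load('lemma_model')
# use_cdict_coef=True lets high-confidence dictionary lemmas override the
# model's edit-script predictions, as documented above
corpus = tagger.predict('test.conllu', use_cdict_coef=True,
                        save_to='test_lemma.conllu')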
Example #14
    def train(self, save_as, feats=None,
              device=None, control_metric='accuracy', max_epochs=None,
              min_epochs=0, bad_epochs=5, batch_size=TRAIN_BATCH_SIZE,
              max_grad_norm=None, tags_to_remove=None, word_emb_type='bert',
              word_emb_path=None, word_transform_kwargs=None,
                  # BertDataset.transform() (for BERT-descendant models)
                  # params:
                  # {'max_len': 0, 'batch_size': 64, 'hidden_ids': '10',
                  #  'aggregate_hiddens_op': 'cat',
                  #  'aggregate_subtokens_op': 'absmax', 'to': junky.CPU,
                  #  'loglevel': 1}
                  # WordDataset.transform() (for other models) params:
                  # {'check_lower': True}
              stage1_params=None,
                  # {'lr': .0001, 'betas': (0.9, 0.999), 'eps': 1e-8,
                  #  'weight_decay': 0, 'amsgrad': False,
                  #  'max_epochs': None, 'min_epochs': None,
                  #  'bad_epochs': None, 'batch_size': None,
                  #  'max_grad_norm': None}
              stage2_params=None,
                  # {'lr': .001, 'momentum': .9, 'weight_decay': 0,
                  #  'dampening': 0, 'nesterov': False,
                  #  'max_epochs': None, 'min_epochs': None,
                  #  'bad_epochs': None, 'batch_size': None,
                  #  'max_grad_norm': None}
              stage3_params={'save_as': None},
                  # {'save_as': None, 'epochs': 3, 'batch_size': 8,
                  #  'lr': 2e-5, 'betas': (0.9, 0.999), 'eps': 1e-8,
                  #  'weight_decay': .01, 'amsgrad': False,
                  #  'num_warmup_steps': 3, 'max_grad_norm': 1.}
              stages=[1, 2, 3, 1, 2], save_stages=False,
              learn_on_padding=True, remove_padding_intent=False,
              seed=None, keep_embs=False, log_file=LOG_FILE,
              rnn_emb_dim=None, cnn_emb_dim=200, cnn_kernels=range(1, 7),
              upos_emb_dim=200, emb_bn=True, emb_do=.2,
              final_emb_dim=512, pre_bn=True, pre_do=.5,
              lstm_layers=1, lstm_do=0, tran_layers=0, tran_heads=8,
              post_bn=True, post_do=.4):
        """Creates and trains the separate feature tagger model.

        *Training's args*:

        **save_as** (`str`): the name used to save the model's head. Refer to
        the `.save()` method's help for the broad definition (see the **name**
        arg there).

        **feats** (`str | list([str])`; default is `None`): one or several
        subfields of the key-value type fields like `FEATS` or `MISC` to be
        predicted separately.

        **device** (`str`; default is `None`): the device for the model. E.g.:
        'cuda:0'. If `None`, we don't move the model to any device (it is
        placed right where it's created).

        **control_metric** (`str`; default is `'accuracy'`): the metric that
        controls training. Any metric supported by the `junky.train()` method
        may be used. At the moment, these are: 'accuracy', 'f1', 'loss',
        'precision', and 'recall'.

        **max_epochs** (`int`; default is `None`): the maximal number of
        epochs for the model's head training (stages types `1` and `2`). If
        `None` (default), the training continues until **bad_epochs** is
        reached, but no fewer than **min_epochs** epochs are run.

        **min_epochs** (`int`; default is `0`): the minimal number of training
        epochs for the model's head training (stages types `1` and `2`).

        **bad_epochs** (`int`; default is `5`): the maximal allowed number of
        bad epochs (epochs in which the chosen **control_metric** does not
        improve) in a row for the model's head training (stages types `1` and
        `2`).

        **batch_size** (`int`; default is `32`): the number of sentences per
        batch for the model's head training (stages types `1` and `2`).

        **max_grad_norm** (`float`; default is `None`): the gradient clipping
        parameter for the model's head training (stages types `1` and `2`).

        **tags_to_remove** (`dict({str: str}) | dict({str: list([str])})`;
        default is `None`): tags whose tokens must be removed from the corpus.
        It's a `dict` with field names as keys and the values you want to
        remove. Applied only to fields with atomic values (like *UPOS*). This
        argument may be used, for example, to remove some infrequent or just
        excess tags from the corpus. Note that we remove the tokens from the
        train corpus completely, not just replace those tags with `None`.

        *Word embedding params*:

        **word_emb_type** (`str`; default is `'bert'`): one of (`'bert'` |
        `'glove'` | `'ft'` | `'w2v'`) embedding types.

        **word_emb_path** (`str`): the path to the word embeddings storage.

        **word_transform_kwargs** (`dict`; default is `None`): keyword
        arguments for the `.transform()` method of the dataset created to
        convert sentences to word embeddings. See the `.transform()` method of
        either `junky.datasets.BertDataset` (if **word_emb_type** is `'bert'`)
        or `junky.datasets.WordDataset` (otherwise) if you want to learn the
        allowed values for the parameter. If `None`, the `.transform()` method
        uses its defaults.

        *Training stages params*:

        **stage1_params** (`dict`; default is `None`): keyword arguments for
        the `BaseModel.adjust_model_for_train()` method. If `None`, the
        defaults are used. Also, you can specify here new values for the
        arguments **max_epochs**, **min_epochs**, **bad_epochs**,
        **batch_size**, and **max_grad_norm** that will be used only on stages
        of type `1`.

        **stage2_params** (`dict`; default is `None`): keyword arguments for
        the `BaseModel.adjust_model_for_tune()` method. If `None`, the
        defaults are used. Also, you can specify here new values for the
        arguments **max_epochs**, **min_epochs**, **bad_epochs**,
        **batch_size**, and **max_grad_norm** that will be used only on stages
        of type `2`.

        **stage3_params** (`dict`; default is `None`): keyword arguments for
        the `WordEmbeddings.full_tune()` method. If `None`, the defaults are
        used.

        **stages** (`list([int])`; default is `[1, 2, 3, 1, 2]`): which stages
        we should use during training and in which order. On stage type `1`
        the model head is trained with the *Adam* optimizer; stage type `2`
        is similar, but the optimizer is *SGD*; stage type `3` is only
        relevant when **word_emb_type** is `'bert'` and we want to tune the
        whole model. Stage type `0` defines a skip-stage, i.e. no real
        training happens on it. It is used when you need reproducibility and
        want to continue training the model from some particular stage: in
        this case, you specify the name of the model saved on that stage in
        the **load_from** parameter and put zeros into the **stages** list in
        the places of the already finished stages (e.g., `[0, 0, 0, 1, 2]` to
        redo only the last two stages). Once more: it is used for
        reproducibility only, i.e. when you set some particular value for the
        **seed** param and want the data order in batches to be equivalent to
        the data order on the stages of past trainings.

        **save_stages** (`bool`; default is `False`): whether we need to keep
        the best model of each stage besides the overall best model. The
        names of these models would have the suffix `_<idx>(stage<stage_type>)`
        where `<idx>` is the ordinal number of the stage. We can then use
        them to continue training from any particular stage number (changing
        the next stages or their parameters) using the **load_from**
        parameter (with the separate tagger, the tuning is more complicated:
        one has to load and tune each model separately as a standalone
        instance of the `FeatTagger` class; the `FeatsSeparateTagger.train()`
        method doesn't even have a **load_from** argument). Note that we save
        only the stages of the head model. The embedding model, being a part
        of the full model, is usually tuned only once, so we don't make
        copies of it.

        *Other options*:

        **learn_on_padding** (`bool`; default is `True`): while training, we
        can calculate the loss either taking into account predictions made
        for padding tokens or ignoring them. The common practice is not to
        use padding when calculating the loss. However, we note that using
        padding sometimes makes the resulting model perform slightly better.

        **remove_padding_intent** (`bool`; default is `False`): if you set the
        **learn_on_padding** param to `False`, you may want not to use the
        padding intent during training at all. I.e. padding tokens would be
        tagged with some of the real tags and simply ignored when computing
        the loss. As a result, the output dimensionality of the model's final
        layer is reduced by one. At first sight, such an approach could
        increase performance, but in our experiments this effect did not
        always appear.

        **seed** (`int`; default is `None`): the initial value for the random number
        generator if you need reproducibility. Note that each stage will have
        its own seed value, and the **seed** param is used to calculate these
        values.

        **keep_embs** (`bool`; default is `False`): by default, after creating
        `Dataset` objects, we remove the word embedding models to free memory.
        With `keep_embs=True` this operation is omitted, and you can use the
        `.embs` attribute to share the embedding models with other objects.

        **log_file** (`file`; default is `sys.stdout`): the stream for info
        messages.

        *The model hyperparameters*:

        **rnn_emb_dim** (`int`; default is `None`): the internal character RNN
        (LSTM) embedding dimensionality. If `None`, the layer is skipped.

        **cnn_emb_dim** (`int`; default is `200`): the internal character CNN
        embedding dimensionality. If `None`, the layer is skipped.

        **cnn_kernels** (`list([int])`; default is `[1, 2, 3, 4, 5, 6]`): CNN
        kernel sizes of the internal CNN embedding layer. Relevant if
        **cnn_emb_dim** is not `None`.

        **upos_emb_dim** (`int`; default is `200`): the auxiliary UPOS label
        embedding dimensionality.

        **emb_bn** (`bool`; default is `True`): whether a batch normalization
        layer should be applied after the embedding concatenation.

        **emb_do** (`float`; default is `.2`): the dropout rate after the
        embedding concatenation.

        **final_emb_dim** (`int`; default is `512`): the output dimensionality
        of the linear transformation applied to the concatenated embeddings.

        **pre_bn** (`bool`; default is `True`): whether a batch normalization
        layer should be applied before the main part of the algorithm.

        **pre_do** (`float`; default is `.5`): the dropout rate before the
        main part of the algorithm.

        **lstm_layers** (`int`; default is `1`): the number of Bidirectional
        LSTM layers. If `None`, they are not created.

        **lstm_do** (`float`; default is `0`): the dropout between LSTM
        layers. Only relevant if `lstm_layers` > `1`.

        **tran_layers** (`int`; default is `0`): the number of Transformer
        Encoder layers. If `0` or `None`, they are not created.

        **tran_heads** (`int`; default is `8`): the number of attention heads
        of the Transformer Encoder layers. Only relevant if `tran_layers` >
        `0`.

        **post_bn** (`bool`; default is `True`): whether a batch normalization
        layer should be applied after the main part of the algorithm.

        **post_do** (`float`; default is `.4`): the dropout rate after the
        main part of the algorithm.

        The method returns the train statistics.
        """
        assert self._train_corpus, 'ERROR: Train corpus is not loaded yet'

        start_time = time.time()
        args, kwargs = get_func_params(FeatsSeparateTagger.train, locals())
        del kwargs['feats']
        if 'stage3_params' in kwargs and 'save_as' in kwargs['stage3_params']:
            word_emb_path_suffix = kwargs['stage3_params']['save_as']
            del kwargs['stage3_params']['save_as']
        else:
            word_emb_path_suffix = None

        if log_file:
            print('###### {} TAGGER TRAINING PIPELINE ######'
                      .format(self._field), file=log_file)
            print("\nWe're gonna train separate models for {} {} in train "
                      .format('the requested' if feats else 'all',
                              self._field)
                + 'corpus. Feats are:\n', file=log_file)
        if not feats:
            # Collect every feat key that occurs in the train corpus.
            feats = sorted(set(feat for sent in self._train_corpus
                                    for tok in sent
                                    for feat in tok[self._field].keys()))
        elif isinstance(feats, str):
            feats = [feats]
        if log_file:
            print(', '.join(feats), file=log_file)

        res = {}
        for feat in feats:
            start_time_ = time.time()
            if log_file:
                print(file=log_file)
                clear_tqdm()

            save_as_ = '{}-{}'.format(save_as, feat.lower())
            self._feats[feat] = save_as_

            tagger = FeatTagger(self._field + ':' + feat,
                                feats_prune_coef=self._feats_prune_coef,
                                embs=self.embs)
            tagger._train_corpus, tagger._test_corpus = \
                self._train_corpus, self._test_corpus
            if word_emb_path_suffix:
                kwargs['stage3_params']['save_as'] = \
                    '{}-{}_{}'.format(self._field.lower(), feat.lower(),
                                      word_emb_path_suffix)
            res[feat] = tagger.train(save_as_, **kwargs,
                                     start_time=start_time_)
            del tagger

        self.save(save_as, log_file=log_file)
        if log_file:
            print('\n###### {} TAGGER TRAINING HAS FINISHED ### '
                      .format(self._field)
                + 'Total time: {} ######\n'
                      .format(seconds_to_strtime(time.time() - start_time)),
                  file=log_file)
            print(("Now, check the separate {} models' and datasets' "
                   'config files and consider to change some device names '
                   'to be able to load all the models jointly. You can find '
                   'the separate models\' list in the "{}" config file. '
                   "Then, use the `.load('{}')` method to start working "
                   'with the {} tagger.')
                       .format(self._field, save_as + CONFIG_EXT, save_as,
                               self._field),
                      file=log_file)
        return res
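A minimal usage sketch of the training pipeline documented above. It assumes
a mordl-style API around the `FeatsSeparateTagger` class shown in these
examples; the package name, corpus-loading helpers, file names, feats, and
hyperparameter values are illustrative assumptions, not part of the original
example.

# Package name and corpus-loading helpers are assumptions; adjust to your setup.
from mordl import FeatsSeparateTagger

tagger = FeatsSeparateTagger()              # the default field is assumed to be 'FEATS'
tagger.load_train_corpus('train.conllu')    # hypothetical train corpus in CoNLL-U format
tagger.load_test_corpus('dev.conllu')       # hypothetical dev corpus for control_metric
stat = tagger.train('feats-model',          # save_as: name prefix for the per-feat heads
                    feats=['Case', 'Gender'],
                    device='cuda:0',
                    stages=[1, 2, 3, 1, 2], save_stages=True,
                    seed=42)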
Example #15
0
    def predict(self, corpus, use_cdict_coef=False, with_orig=False,
                batch_size=BATCH_SIZE, split=None, clone_ds=False,
                save_to=None, log_file=LOG_FILE, **_):
        """Predicts feature keys and values in the key-value type field of the
        corpus.

        Args:

        **corpus**: the corpus which will be used for the feature extraction
        and predictions. May be either the name of the file in *CoNLL-U*
        format or the `list`/`iterator` of sentences in *Parsed CoNLL-U*.

        **use_cdict_coef** (`bool` | `float`; default is `False`): if `False`,
        we use our prediction only. If `True`, we replace our prediction with
        the value returned by the `corpuscula.CorpusDict.predict_<field>()`
        method if its `coef` is >= `.99`. Also, you can specify your own
        threshold as the value of the param.

        **with_orig** (`bool`; default is `False`): if `True`, instead of just
        the sequence with predicted labels, return the sequence of tuples
        where the first element is the sentence with predicted labels and the
        second element is the original sentence. **with_orig** can be `True`
        only if **save_to** is `None`.

        **batch_size** (`int`; default is `64`): the number of sentences per
        batch.

        **split** (`int`; default is `None`): the number of sentences per
        split. Allows processing a large dataset in pieces ("splits"). If
        **split** is `None` (default), the whole dataset is processed without
        splits.

        **clone_ds** (`bool`; default is `False`): if `True`, the dataset is
        cloned and transformed. If `False`, `transform_collate` is used
        without cloning the dataset. There are no big differences between the
        variants. Both should produce identical results.

        **save_to** (`str`; default is `None`): the file name where the
        predictions will be saved.

        **log_file** (`file`; default is `sys.stdout`): the stream for info
        messages.

        Returns the corpus with feature keys and values predicted in the FEATS
        field.
        """
        assert not with_orig or save_to is None, \
               'ERROR: `with_orig` can be True only if save_to is None'
        args, kwargs = get_func_params(FeatsJointTagger.predict, locals())
        kwargs['save_to'] = None

        def process(corpus):
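            # Restore the structured form of the field: the prediction comes
            # back as a single 'Key1=Val1|Key2=Val2' string per token, so
            # split it into an OrderedDict (empty if the field is empty).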
            for sentence in corpus:
                sentence_ = sentence[0] if with_orig else sentence
                if isinstance(sentence_, tuple):
                    sentence_ = sentence_[0]
                for token in sentence_:
                    token[self._field] = OrderedDict(
                        [(x, y) for x, y in [
                            x.split('=')
                                for x in token[self._field].split('|')
                        ]]
                    ) if token[self._field] else OrderedDict()
                yield sentence

        corpus = process(
            super().predict(self._field, 'UPOS', *args, **kwargs)
        )
        if save_to:
            self.save_conllu(corpus, save_to, log_file=None)
            corpus = self._get_corpus(save_to, asis=True, log_file=log_file)
        return corpus
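A minimal usage sketch for this joint `predict()` method, assuming a tagger
that was trained and saved beforehand; the package name, model name, and file
names are illustrative assumptions.

# Package and model names are assumptions; adjust to your setup.
from mordl import FeatsJointTagger

tagger = FeatsJointTagger()
tagger.load('feats-joint-model')                  # hypothetical saved model name
corpus = tagger.predict('test.conllu',            # corpus in CoNLL-U format
                        save_to='test_tagged.conllu')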
Example #16
0
    def predict(self, corpus, feats=None, remove_excess_feats=True,
                use_cdict_coef=False, with_orig=False, batch_size=BATCH_SIZE,
                split=None, clone_ds=False, save_to=None, log_file=LOG_FILE):
        """Predicts feature keys and values in the FEATS field of the corpus.

        Args:

        **corpus**: the corpus which will be used for the feature extraction
        and predictions. May be either the name of the file in *CoNLL-U*
        format or the `list`/`iterator` of sentences in *Parsed CoNLL-U*.

        **feats** (`str | list([str])`; default is `None`): one or several
        subfields of the key-value type fields like `FEATS` or `MISC` to be
        predicted separately.

        **remove_excess_feats** (`bool`; default is `True`): if `True`, the
        tagger removes all irrelevant features from the predicted field
        ("irrelevant" means that the tagger doesn't have models for them). For
        example, if you trained the tagger only for the "Case" and "Gender"
        features, the tagger predicts only them (or only one of them, if you
        specify it in the **feats** param) and removes all the rest.
        Otherwise, if **remove_excess_feats** is `False`, all irrelevant feats
        are left intact.

        **use_cdict_coef** (`bool` | `float`; default is `False`): if `False`,
        we use our prediction only. If `True`, we replace our prediction with
        the value returned by the `corpuscula.CorpusDict.predict_<field>()`
        method if its `coef` is >= `.99`. Also, you can specify your own
        threshold as the value of the param.

        **with_orig** (`bool`; default is `False`): if `True`, instead of just
        the sequence with predicted labels, return the sequence of tuples
        where the first element is the sentence with predicted labels and the
        second element is the original sentence. **with_orig** can be `True`
        only if **save_to** is `None`.

        **batch_size** (`int`; default is `64`): the number of sentences per
        batch.

        **split** (`int`; default is `None`): the number of sentences per
        split. Allows processing a large dataset in pieces ("splits"). If
        **split** is `None` (default), the whole dataset is processed without
        splits.

        **clone_ds** (`bool`; default is `False`): if `True`, the dataset is
        cloned and transformed. If `False`, `transform_collate` is used
        without cloning the dataset. There are no big differences between the
        variants. Both should produce identical results.

        **save_to** (`str`; default is `None`): the file name where the
        predictions will be saved.

        **log_file** (`file`; default is `sys.stdout`): the stream for info
        messages.

        Returns the corpus with feature keys and values predicted in the FEATS
        field.
        """
        args, kwargs = get_func_params(FeatsSeparateTagger.predict, locals())
        del kwargs['feats']
        del kwargs['remove_excess_feats']

        if feats is None:
            feats = self._feats
        else:
            if isinstance(feats, str):
                feats = [feats]
            unknown_feats = []
            for feat in sorted(feats):
                if feat not in self._feats:
                    unknown_feats.append(feat)
            assert not unknown_feats, \
                'ERROR: feats {} are unknown for the tagger' \
                    .format(unknown_feats)

        kwargs['with_orig'] = False
        kwargs['save_to'] = None

        def process(corpus):
            corpus = self._get_corpus(corpus, asis=True,
                                      log_file=log_file)
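            # Consume the corpus in chunks of `split` sentences (or all at
            # once if `split` is None); each chunk is materialized, run
            # through the per-feat taggers, and yielded back sentence by
            # sentence.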
            for start in itertools.count(step=split if split else 1):
                if isinstance(corpus, Iterator):
                    if split:
                        corpus_ = []
                        for i, sentence in enumerate(corpus, start=1):
                            corpus_.append(sentence)
                            if i == split:
                                break
                    else:
                        corpus_ = list(corpus)
                else:
                    if split:
                        corpus_ = corpus[start:start + split]
                    else:
                        corpus_ = corpus
                if not corpus_:
                    break

                if remove_excess_feats:
                    for sentence in corpus_:
                        for token in sentence[0] \
                                         if isinstance(sentence, tuple) else \
                                     sentence:
                            token[self._field] = OrderedDict(
                                (x, y) for x, y in token[self._field].items()
                                           if x in feats
                            )

                res_corpus_ = deepcopy(corpus_) if with_orig else corpus_

                # Run the per-feat taggers only for the requested feats.
                for feat in feats:
                    attrs = self._feats[feat]
                    tagger = attrs[1] if isinstance(attrs, list) else None
                    assert isinstance(tagger, FeatTagger), \
                        'ERROR: Model is not loaded. Use the .load() ' \
                        'method prior'
                    res_corpus_ = tagger.predict(res_corpus_, **kwargs)

                if with_orig:
                    for orig_sentence, sentence in zip(corpus_,
                                                       res_corpus_):
                        yield sentence, orig_sentence
                else:
                    for sentence in res_corpus_:
                        yield sentence

        corpus = process(corpus)
        if save_to:
            self.save_conllu(corpus, save_to, log_file=None)
            corpus = self._get_corpus(save_to, asis=True, log_file=log_file)
        return corpus
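A minimal usage sketch for the separate tagger's `predict()`, assuming models
saved by the training pipeline shown earlier; the package name, model name,
and file names are illustrative assumptions.

# Package and model names are assumptions; adjust to your setup.
from mordl import FeatsSeparateTagger

tagger = FeatsSeparateTagger()
tagger.load('feats-model')                        # hypothetical name used with .train()/.save()
corpus = tagger.predict('test.conllu',
                        feats='Case',             # predict only the Case feat
                        remove_excess_feats=True, # drop feats we have no model for
                        save_to='test_tagged.conllu')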