def evaluate(self, gold, test=None, feats=None, label=None,
             use_cdict_coef=False, batch_size=BATCH_SIZE, split=None,
             clone_ds=False, log_file=LOG_FILE):
    """Score the tagger model against a gold corpus.

    Args:

    **gold**: the corpus with the actual target values. Either the name of
    a file in *CoNLL-U* format or a `list`/`iterator` of sentences in
    *Parsed CoNLL-U*.

    **test** (default is `None`): the corpus with the predicted target
    values. If `None`, the **gold** corpus is retagged on-the-fly and the
    result is used as the **test**.

    **feats** (`str | list([str])`; default is `None`): one or several
    subfields of a key-value type field (like `FEATS` or `MISC`) to be
    evaluated separately.

    **label** (`str`; default is `None`): a specific label of the target
    field to be evaluated separately, e.g. `field='UPOS', label='VERB'`
    or `field='FEATS:Animacy', label='Inan'`. Requires exactly one feat
    in **feats**.

    **use_cdict_coef** (`bool` | `float`; default is `False`): if `False`,
    only the model's prediction is used. If `True`, the prediction is
    replaced with the value returned by
    `corpuscula.CorpusDict.predict_<field>()` when its `coef` >= `.99`.
    A `float` value sets a custom threshold.

    **batch_size** (`int`; default is `BATCH_SIZE`): the number of
    sentences per batch.

    **split** (`int`; default is `None`): the number of lines in one
    piece ("split") of the dataset. Allows processing a large dataset
    piecewise. If `None`, the dataset is processed as a whole.

    **clone_ds** (`bool`; default is `False`): if `True`, the dataset is
    cloned and transformed. If `False`, `transform_collate` is used
    without cloning. Both variants should produce identical results.

    **log_file** (`file`; default is `LOG_FILE`): the stream for info
    messages.

    Returns whatever the parent's `evaluate()` returns (per the original
    documentation: metrics are printed and the accuracy is returned).
    """
    assert not label or feats, \
        'ERROR: To evaluate the exact label you must specify its ' \
        'feat, too'
    assert not (label and feats) \
        or isinstance(feats, str) or len(feats) == 1, \
        'ERROR: To evaluate the exact label you must specify its own ' \
        'feat only'

    # NB: `locals()` must hold nothing but the call parameters here.
    pos_params, kw_params = \
        get_func_params(FeatsSeparateTagger.evaluate, locals())

    target = self._field
    if label:
        # The single feat moves out of the keyword arguments and into
        # the field name: `<field>:<feat>`.
        kw_params.pop('feats')
        target += ':' + (feats if isinstance(feats, str) else feats[0])

    return super().evaluate(target, *pos_params, **kw_params)
def load(self, name, device=None, dataset_emb_path=None,
         dataset_device=None, log_file=LOG_FILE):
    """Loads tagger's internal state saved by its `.save()` method.

    Args:

    **name** (`str`): the name of the previously saved internal state.

    **device** (default is `None`): the device for the loaded model if
    you want to override the value from the config.

    **dataset_emb_path** (default is `None`): the path to load the
    dataset's embeddings from if you want to override the value from the
    config.

    **dataset_device** (default is `None`): the device for the loaded
    dataset if you want to override the value from the config.

    **log_file** (default is `LOG_FILE`): the stream for info messages.
    """
    args, kwargs = get_func_params(DeprelTagger.load, locals())
    # The method lookup deliberately starts *after* the direct base class
    # of `DeprelTagger`, i.e. that base's own `load()` is skipped.
    # The anchor must be `DeprelTagger.__base__`, which is fixed, and not
    # `self.__class__.__base__`, which follows the runtime type: for an
    # instance of a `DeprelTagger` subclass the latter resolves to
    # `DeprelTagger` itself and dispatches to a different `load()`.
    # For direct `DeprelTagger` instances both expressions are identical.
    super(DeprelTagger.__base__, self).load(DeprelTaggerModel,
                                            *args, **kwargs)
def __init__(self, num_labels, labels_pad_idx=-100, vec_emb_dim=None,
             alphabet_size=0, char_pad_idx=0, rnn_emb_dim=None,
             cnn_emb_dim=None, cnn_kernels=[1, 2, 3, 4, 5, 6],
             emb_bn=True, emb_do=.2, final_emb_dim=512, pre_bn=True,
             pre_do=.5, lstm_layers=1, lstm_do=0, tran_layers=None,
             tran_heads=8, post_bn=True, post_do=.4):
    """Creates the model by delegating to the parent constructor.

    All the arguments are forwarded to the parent's `__init__()` as they
    are, except **cnn_kernels**: any iterable passed there is converted
    to a `list` first. The parameters of the call are then stored in the
    attribute named by `CONFIG_ATTR` as the `(args, kwargs)` pair.
    """
    # Materialize the kernels (e.g. a `range`) before the parameters are
    # captured, so the stored config holds a plain list.
    cnn_kernels = list(cnn_kernels) \
        if isinstance(cnn_kernels, Iterable) else cnn_kernels

    init_args, init_kwargs = \
        get_func_params(UposTaggerModel.__init__, locals())
    super().__init__(*init_args, **init_kwargs)
    setattr(self, CONFIG_ATTR, (init_args, init_kwargs))
def predict(self, corpus, use_cdict_coef=False, with_orig=False,
            batch_size=BATCH_SIZE, split=None, clone_ds=False,
            save_to=None, log_file=LOG_FILE):
    """Predicts tags in the UPOS field of the corpus.

    Args:

    **corpus**: the corpus used for the feature extraction and
    predictions. Either the name of a file in *CoNLL-U* format or a
    `list`/`iterator` of sentences in *Parsed CoNLL-U*.

    **use_cdict_coef** (`bool` | `float`; default is `False`): if `False`,
    only the model's prediction is used. If `True`, the prediction is
    replaced with the value returned by
    `corpuscula.CorpusDict.predict_<field>()` when its `coef` >= `.99`.
    A `float` value sets a custom threshold. Any value other than `None`
    or `False` is allowed only when the tagger's field is `'UPOS'`.

    **with_orig** (`bool`; default is `False`): if `True`, the result is
    a sequence of tuples, each holding the sentence with the predicted
    labels and then the original sentence. May be `True` only if
    **save_to** is `None`.

    **batch_size** (`int`; default is `BATCH_SIZE`): the number of
    sentences per batch.

    **split** (`int`; default is `None`): the number of lines in one
    piece ("split") of the dataset. Allows processing a large dataset
    piecewise. If `None`, the dataset is processed as a whole.

    **clone_ds** (`bool`; default is `False`): if `True`, the dataset is
    cloned and transformed. If `False`, `transform_collate` is used
    without cloning. Both variants should produce identical results.

    **save_to** (`str`; default is `None`): the name of the file to save
    the predictions to.

    **log_file** (`file`; default is `LOG_FILE`): the stream for info
    messages.

    Returns the corpus with the tags predicted in the UPOS field.
    """
    assert self._field == 'UPOS' or use_cdict_coef in (None, False), \
        'ERROR: "use_cdict_coef" param may be used only with UPOS field'

    call_args, call_kwargs = get_func_params(UposTagger.predict, locals())
    # The second positional argument of the parent's `predict()` is
    # passed as `None` here.
    return super().predict(self._field, None, *call_args, **call_kwargs)
def __init__(self, num_labels, labels_pad_idx=-100, vec_emb_dim=None,
             alphabet_size=0, char_pad_idx=0, rnn_emb_dim=None,
             cnn_emb_dim=200, cnn_kernels=[1, 2, 3, 4, 5, 6],
             upos_emb_dim=200, upos_num=0, upos_pad_idx=0,
             emb_bn=True, emb_do=.2, final_emb_dim=512, pre_bn=True,
             pre_do=.5, lstm_layers=1, lstm_do=0, tran_layers=None,
             tran_heads=8, post_bn=True, post_do=.4):
    """Creates the model by delegating to the parent constructor.

    The arguments are forwarded to the parent's `__init__()` with two
    adjustments:

    * any iterable in **cnn_kernels** is converted to a `list`;
    * **upos_emb_dim**, **upos_num** and **upos_pad_idx** are not passed
      as they are. If **upos_emb_dim** is truthy, they are packed into
      the `tag_emb_params` keyword argument as the `dict` with the keys
      `'dim'`, `'num'` and `'pad_idx'`; otherwise, they are just dropped.

    The original (unadjusted) parameters of the call are stored in the
    attribute named by `CONFIG_ATTR` as the `(args, kwargs)` pair.
    """
    cnn_kernels = list(cnn_kernels) \
        if isinstance(cnn_kernels, Iterable) else cnn_kernels

    args, kwargs = get_func_params(FeatTaggerModel.__init__, locals())

    # The parent constructor doesn't receive the `upos_*` arguments.
    upos_keys = ('upos_emb_dim', 'upos_num', 'upos_pad_idx')
    parent_kwargs = {key: val for key, val in kwargs.items()
                         if key not in upos_keys}
    if upos_emb_dim:
        parent_kwargs['tag_emb_params'] = dict(dim=upos_emb_dim,
                                               num=upos_num,
                                               pad_idx=upos_pad_idx)

    super().__init__(*args, **parent_kwargs)
    setattr(self, CONFIG_ATTR, (args, kwargs))
def load(self, name, device=None, dataset_emb_path=None,
         dataset_device=None, log_file=LOG_FILE):
    """Restores the tagger's internal state written by its `.save()`
    method.

    Args:

    **name** (`str`): the name of the previously saved internal state.

    **device** (`str`; default is `None`): the device for the loaded
    model, to override the value from the config.

    **dataset_emb_path** (`str`; default is `None`): the path to load the
    dataset's embeddings from, to override the value from the config.

    **dataset_device** (`str`; default is `None`): the device for the
    loaded dataset, to override the value from the config.

    **log_file** (`file`; default is `LOG_FILE`): the stream for info
    messages.
    """
    call_args, call_kwargs = get_func_params(LemmaTagger.load, locals())
    # The parent's `load()` additionally takes the model class as its
    # first argument.
    super().load(FeatTaggerModel, *call_args, **call_kwargs)
def __init__(self, labels_num, vec_emb_dim=None, alphabet_size=0,
             char_pad_idx=0, rnn_emb_dim=None, cnn_emb_dim=None,
             cnn_kernels=[1, 2, 3, 4, 5, 6], tag_emb_params=None,
             emb_out_dim=512, lstm_bidirectional=True,
             lstm_hidden_dim=256, lstm_layers=1, lstm_do=0,
             bn1=True, do1=.2, bn2=True, do2=.5, bn3=True, do3=.4):
    """Creates the layers of the model.

    Args:

    **labels_num** (`int`): the number of output features of the final
    linear layer.

    **vec_emb_dim** (`int`; default is `None`): the dimensionality of the
    external input vectors that are joined with the internal embeddings.
    `None` is treated as `0`.

    **alphabet_size** (`int`; default is `0`) / **char_pad_idx** (`int`;
    default is `0`): the alphabet size and the padding index passed to
    the character embedding layers.

    **rnn_emb_dim** (`int`; default is `None`): the dimensionality of the
    `CharEmbeddingRNN` layer. If falsy, the layer is not created.

    **cnn_emb_dim** (`int`; default is `None`): the dimensionality of the
    `CharEmbeddingCNN` layer. If falsy, the layer is not created.

    **cnn_kernels** (`list([int])`; default is `[1, 2, 3, 4, 5, 6]`): the
    kernels for the `CharEmbeddingCNN` layer. Any iterable is converted
    to a `list`.

    **tag_emb_params** (`dict | list([dict])`; default is `None`): the
    params of the tag embedding layer(s). Each `dict` has the keys
    `'dim'`, `'num'` and `'pad_idx'`. For a single `dict`, one
    `nn.Embedding` is created (if `'dim'` is truthy); for a sequence, a
    `nn.ModuleList` is filled with one entry per item (`None` for items
    with falsy `'dim'`).

    **emb_out_dim** (`int`; default is `512`): the output dimensionality
    of the linear layer applied to the joint embeddings; also the input
    size of the LSTM.

    **lstm_bidirectional** (`bool`; default is `True`),
    **lstm_hidden_dim** (`int`; default is `256`), **lstm_layers** (`int`;
    default is `1`), **lstm_do** (`float`; default is `0`): the params of
    the `nn.LSTM` layer. With the bidirectional LSTM, the layers after it
    use the doubled **lstm_hidden_dim**.

    **bn1**, **bn2**, **bn3** (`bool`; default is `True`): whether the
    corresponding `nn.BatchNorm1d` layer is created.

    **do1**, **do2**, **do3** (`float`; defaults are `.2`, `.5`, `.4`):
    the rates of the corresponding `nn.Dropout` layers. If falsy, the
    layer is not created.

    The parameters of the call are stored in the attribute named by
    `CONFIG_ATTR` as the `(args, kwargs)` pair.
    """
    if isinstance(cnn_kernels, Iterable):
        cnn_kernels = list(cnn_kernels)
    args, kwargs = \
        get_func_params(BaseTaggerSequenceModel.__init__, locals())
    super().__init__(*args, **kwargs)

    # The default `None` can't take part in the dimension sums below (it
    # raised TypeError), so it's normalized to 0. This is done after the
    # params are captured: the stored config keeps the original value.
    if vec_emb_dim is None:
        vec_emb_dim = 0
    self.vec_emb_dim = vec_emb_dim

    if rnn_emb_dim:
        self._rnn_emb_l = \
            CharEmbeddingRNN(alphabet_size=alphabet_size,
                             emb_dim=rnn_emb_dim,
                             pad_idx=char_pad_idx)
    else:
        self._rnn_emb_l = None
        rnn_emb_dim = 0

    if cnn_emb_dim:
        self._cnn_emb_l = \
            CharEmbeddingCNN(alphabet_size=alphabet_size,
                             emb_dim=cnn_emb_dim,
                             pad_idx=char_pad_idx,
                             kernels=cnn_kernels)
    else:
        self._cnn_emb_l = None
        cnn_emb_dim = 0

    # Either the single tag embedding layer (`_tag_emb_l`) or the list
    # of them (`_tag_emb_ls`) is created, depending on the type of
    # `tag_emb_params`.
    self._tag_emb_l = self._tag_emb_ls = None
    tag_emb_dim = 0
    if tag_emb_params:
        if isinstance(tag_emb_params, dict):
            tag_emb_dim = tag_emb_params['dim'] or 0
            if tag_emb_dim:
                self._tag_emb_l = \
                    nn.Embedding(tag_emb_params['num'], tag_emb_dim,
                                 padding_idx=tag_emb_params['pad_idx'])
        else:
            self._tag_emb_ls = nn.ModuleList()
            for emb_params in tag_emb_params:
                tag_emb_dim_ = emb_params['dim']
                if tag_emb_dim_:
                    tag_emb_dim += tag_emb_dim_
                    self._tag_emb_ls.append(
                        nn.Embedding(emb_params['num'], tag_emb_dim_,
                                     padding_idx=emb_params['pad_idx'])
                    )
                else:
                    # keep the positions aligned with `tag_emb_params`
                    self._tag_emb_ls.append(None)

    # The total width of all the embeddings joined together.
    joint_emb_dim = vec_emb_dim + rnn_emb_dim + cnn_emb_dim + tag_emb_dim

    self._bn1 = \
        nn.BatchNorm1d(num_features=joint_emb_dim) if bn1 else None
    self._do1 = nn.Dropout(p=do1) if do1 else None

    self._emb_fc_l = nn.Linear(in_features=joint_emb_dim,
                               out_features=emb_out_dim)
    self._bn2 = \
        nn.BatchNorm1d(num_features=emb_out_dim) if bn2 else None
    self._do2 = nn.Dropout(p=do2) if do2 else None

    self._lstm_l = nn.LSTM(input_size=emb_out_dim,
                           hidden_size=lstm_hidden_dim,
                           num_layers=lstm_layers, batch_first=True,
                           dropout=lstm_do,
                           bidirectional=lstm_bidirectional)
    # The bidirectional LSTM outputs both directions concatenated.
    if lstm_bidirectional:
        lstm_hidden_dim *= 2

    # The linear layer from the LSTM input width to its output width;
    # its bias starts at -1.
    self._T = nn.Linear(emb_out_dim, lstm_hidden_dim)
    nn.init.constant_(self._T.bias, -1)

    self._bn3 = \
        nn.BatchNorm1d(num_features=lstm_hidden_dim) if bn3 else None
    self._do3 = nn.Dropout(p=do3) if do3 else None

    self._out_l = nn.Linear(in_features=lstm_hidden_dim,
                            out_features=labels_num)

    setattr(self, CONFIG_ATTR, (args, kwargs))
def train(self, save_as,
          device=None, control_metric='accuracy', max_epochs=None,
          min_epochs=0, bad_epochs=5, batch_size=TRAIN_BATCH_SIZE,
          max_grad_norm=None, tags_to_remove=None,
          word_emb_type='bert', word_emb_path='xlm-roberta-base',
          word_transform_kwargs=None,
              # BertDataset.transform() (for BERT-descendant models)
              # params:
              # {'max_len': 0, 'batch_size': 64, 'hidden_ids': '10',
              #  'aggregate_hiddens_op': 'cat',
              #  'aggregate_subtokens_op': 'absmax', 'to': junky.CPU,
              #  'loglevel': 1}
              # WordDataset.transform() (for other models) params:
              # {'check_lower': True}
          stage1_params=None,
              # {'lr': .0001, 'betas': (0.9, 0.999), 'eps': 1e-8,
              #  'weight_decay': 0, 'amsgrad': False,
              #  'max_epochs': None, 'min_epochs': None,
              #  'bad_epochs': None, 'batch_size': None,
              #  'max_grad_norm': None}
          stage2_params=None,
              # {'lr': .001, 'momentum': .9, 'weight_decay': 0,
              #  'dampening': 0, 'nesterov': False,
              #  'max_epochs': None, 'min_epochs': None,
              #  'bad_epochs': None, 'batch_size': None,
              #  'max_grad_norm': None}
          stage3_params={'save_as': None},
              # {'save_as': None, 'epochs': 3, 'batch_size': 8,
              #  'lr': 2e-5, 'betas': (0.9, 0.999), 'eps': 1e-8,
              #  'weight_decay': .01, 'amsgrad': False,
              #  'num_warmup_steps': 3, 'max_grad_norm': 1.}
          stages=[1, 2, 3, 1, 2], save_stages=False, load_from=None,
          learn_on_padding=True, remove_padding_intent=False,
          seed=None, start_time=None, keep_embs=False, log_file=LOG_FILE,
          rnn_emb_dim=None, cnn_emb_dim=None, cnn_kernels=range(1, 7),
          emb_bn=True, emb_do=.2,
          final_emb_dim=512, pre_bn=True, pre_do=.5,
          lstm_layers=1, lstm_do=0, tran_layers=0, tran_heads=8,
          post_bn=True, post_do=.4):
    """Creates and trains the UPOS tagger model.

    Before the training starts, a `CorpusDict` is built from the train
    and (if present) test corpora of the tagger. During training, the
    best model is saved after each successful epoch.

    *Training's args*:

    **save_as** (`str`): the name used for saving the model's head. See
    the **name** arg of the `.save()` method for the broad definition.

    **device** (`str`; default is `None`): the device for the model, e.g.
    'cuda:0'. If `None`, the model is not moved anywhere (it stays where
    it was created).

    **control_metric** (`str`; default is `'accuracy'`): the metric that
    controls the training. Any one supported by the `junky.train()`
    method; at the moment: 'accuracy', 'f1', 'loss', 'precision', and
    'recall'.

    **max_epochs** (`int`; default is `None`): the maximal number of
    epochs for the model's head training (stage types `1` and `2`). If
    `None`, the training lasts until **bad_epochs** is reached, but no
    less than **min_epochs**.

    **min_epochs** (`int`; default is `0`): the minimal number of epochs
    for the model's head training (stage types `1` and `2`).

    **bad_epochs** (`int`; default is `5`): the maximal allowed number of
    bad epochs (those where **control_metric** has not improved) in a row
    for the model's head training (stage types `1` and `2`).

    **batch_size** (`int`; default is `TRAIN_BATCH_SIZE`): the number of
    sentences per batch for the model's head training (stage types `1`
    and `2`).

    **max_grad_norm** (`float`; default is `None`): the gradient clipping
    parameter for the model's head training (stage types `1` and `2`).

    **tags_to_remove** (`dict({str: str}) | dict({str: list([str])})`;
    default is `None`): the tags, tokens with those must be removed from
    the corpus. A `dict` with field names as keys and the values to
    remove. Applied only to fields with atomic values (like UPOS). Useful,
    for example, to get rid of infrequent or excess tags. Note that the
    tokens are removed from the train corpus completely; the tags are not
    just replaced with `None`.

    *Word embedding params*:

    **word_emb_type** (`str`; default is `'bert'`): one of (`'bert'` |
    `'glove'` | `'ft'` | `'w2v'`) embedding types.

    **word_emb_path** (`str`; default is `'xlm-roberta-base'`): the path
    to the word embeddings storage.

    **word_transform_kwargs** (`dict`; default is `None`): keyword
    arguments for the `.transform()` method of the dataset that converts
    sentences to word embeddings. See the `.transform()` method of either
    `junky.datasets.BertDataset` (if **word_emb_type** is `'bert'`) or
    `junky.datasets.WordDataset` (otherwise) for the allowed values. If
    `None`, the `.transform()` method uses its defaults.

    *Training stages params*:

    **stage1_params** (`dict`; default is `None`): keyword arguments for
    the `BaseModel.adjust_model_for_train()` method. If `None`, the
    defaults are used. New values for **max_epochs**, **min_epochs**,
    **bad_epochs**, **batch_size**, and **max_grad_norm** may also be
    given here; they are used only on stages of type `1`.

    **stage2_params** (`dict`; default is `None`): keyword arguments for
    the `BaseModel.adjust_model_for_tune()` method. If `None`, the
    defaults are used. New values for **max_epochs**, **min_epochs**,
    **bad_epochs**, **batch_size**, and **max_grad_norm** may also be
    given here; they are used only on stages of type `2`.

    **stage3_params** (`dict`; default is `{'save_as': None}`): keyword
    arguments for the `WordEmbeddings.full_tune()` method.

    **stages** (`list([int])`; default is `[1, 2, 3, 1, 2]`): the stages
    to run and their order. On stage type `1`, the model head is trained
    with the *Adam* optimizer; stage type `2` is similar, but the
    optimizer is *SGD*; stage type `3` is only relevant when
    **word_emb_type** is `'bert'` and the whole model is to be tuned.
    Stage type `0` is the skip-stage: no real training happens on it. It
    serves reproducibility when the training is continued from some
    particular stage: put the name of the model saved on that stage into
    **load_from** and zeros into **stages** in place of the stages already
    finished. That matters only if **seed** is set and the data order in
    batches must match the one of the past training.

    **save_stages** (`bool`; default is `False`): whether to keep the
    best model of every stage beside the overall best model. The names of
    those models get the suffix `_<idx>(stage<stage_type>)` where `<idx>`
    is the ordinal number of the stage. Such a model can be passed to
    **load_from** to continue the training from that stage (with the
    following stages or their params changed). Only the stages of the
    head model are saved; the embedding model is usually tuned only once,
    so no copies of it are made.

    **load_from** (`str`; default is `None`): the name of the model of a
    previously saved stage to continue the training from. Note: if that
    model has already passed stage type `3`, set **word_emb_path** to
    `None`, or the wrong embedding model will be loaded. The other params
    of the model are the same when the training starts and when it
    continues, but **word_emb_path** is not. Example: to repeat the
    training on stage no `5`, set **load_from** to something like
    `'model_4(stage1)'`, **word_emb_path** to `None`, and **stages** to
    `[0, 0, 0, 0, 2]` (or just to `[2]` if reproducibility doesn't
    matter).

    *Other options*:

    **learn_on_padding** (`bool`; default is `True`): whether the loss
    takes into account the predictions made for padding tokens. The
    common practice is to ignore padding, but using it sometimes makes
    the resulting model slightly better.

    **remove_padding_intent** (`bool`; default is `False`): with
    **learn_on_padding** set to `False`, you may want not to use the
    padding intent at all. Then padding tokens get some of the real tags
    and are just ignored while the loss is computed, and the final layer
    of the model has one output less. At first sight, that could improve
    the performance, but in the authors' experiments it did not always.

    **seed** (`int`; default is `None`): the init value for the random
    number generator, for reproducibility. Every stage has its own seed
    value calculated from **seed**.

    **start_time** (`float`; default is `None`): the result of
    `time.time()` to start with. If falsy, the current time is taken.

    **keep_embs** (`bool`; default is `False`): by default, the word
    embedding models are removed after the `Dataset` objects are created,
    to free memory. With `keep_embs=True` they are kept, and the `.embs`
    attribute can be used to share them with other objects.

    **log_file** (`file`; default is `LOG_FILE`): the stream for info
    messages.

    *The model hyperparameters*:

    **rnn_emb_dim** (`int`; default is `None`): the internal character
    RNN (LSTM) embedding dimensionality. If `None`, the layer is skipped.

    **cnn_emb_dim** (`int`; default is `None`): the internal character
    CNN embedding dimensionality. If `None`, the layer is skipped.

    **cnn_kernels** (`list([int])`; default is `range(1, 7)`, i.e.
    `[1, 2, 3, 4, 5, 6]`): the kernel sizes of the internal CNN embedding
    layer. Relevant if **cnn_emb_dim** is not `None`.

    **emb_bn** (`bool`; default is `True`): whether batch normalization
    is applied after the embedding concatenation.

    **emb_do** (`float`; default is `.2`): the dropout rate after the
    embedding concatenation.

    **final_emb_dim** (`int`; default is `512`): the output
    dimensionality of the linear transformation applied to the
    concatenated embeddings.

    **pre_bn** (`bool`; default is `True`): whether batch normalization
    is applied before the main part of the algorithm.

    **pre_do** (`float`; default is `.5`): the dropout rate before the
    main part of the algorithm.

    **lstm_layers** (`int`; default is `1`): the number of Bidirectional
    LSTM layers. If `None`, they are not created.

    **lstm_do** (`float`; default is `0`): the dropout between LSTM
    layers. Only relevant if **lstm_layers** > `1`.

    **tran_layers** (`int`; default is `0`): the number of Transformer
    Encoder layers. If falsy, they are not created.

    **tran_heads** (`int`; default is `8`): the number of attention heads
    of Transformer Encoder layers. Only relevant if the Transformer
    Encoder layers are created.

    **post_bn** (`bool`; default is `True`): whether batch normalization
    is applied after the main part of the algorithm.

    **post_do** (`float`; default is `.4`): the dropout rate after the
    main part of the algorithm.

    The method returns the train statistics.
    """
    # Must precede the params capture: the parent gets the real time.
    start_time = start_time or time.time()
    args, kwargs = get_func_params(UposTagger.train, locals())

    # The dictionary is built on both corpora; a missing test corpus
    # contributes nothing.
    corpora = (self._train_corpus, self._test_corpus or [])
    self._cdict = CorpusDict(
        corpus=(sentence for part in corpora for sentence in part),
        format='conllu_parsed', log_file=log_file
    )

    return super().train(self._field, None, UposTaggerModel, None,
                         *args, **kwargs)
def predict(self, corpus, use_cdict_coef=False, with_orig=False,
            batch_size=BATCH_SIZE, split=None, clone_ds=False,
            save_to=None, log_file=LOG_FILE):
    """Predicts tags in the DEPREL field of the corpus.

    On top of the parent's prediction, every token with `HEAD` equal to
    `'0'` gets the `'root'` label. If the tagger has a supplementary
    tagger, any other token predicted as `'root'` gets the `DEPREL` value
    taken from the supplementary tagger's output for the same token.

    Args:

    **corpus**: the corpus used for the feature extraction and
    predictions. Either the name of a file in *CoNLL-U* format or a
    `list`/`iterator` of sentences in *Parsed CoNLL-U*.

    **use_cdict_coef** (`bool` | `float`; default is `False`): if `False`,
    only the model's prediction is used. If `True`, the prediction is
    replaced with the value returned by
    `corpuscula.CorpusDict.predict_<field>()` when its `coef` >= `.99`.
    A `float` value sets a custom threshold.

    **with_orig** (`bool`; default is `False`): if `True`, the result is
    a sequence of tuples, each holding the sentence with the predicted
    labels and then the original sentence. May be `True` only if
    **save_to** is `None`.

    **batch_size** (`int`; default is `BATCH_SIZE`): the number of
    sentences per batch.

    **split** (`int`; default is `None`): the number of lines in one
    piece ("split") of the dataset. Allows processing a large dataset
    piecewise. If `None`, the dataset is processed as a whole.

    **clone_ds** (`bool`; default is `False`): if `True`, the dataset is
    cloned and transformed. If `False`, `transform_collate` is used
    without cloning. Both variants should produce identical results.

    **save_to** (`str`; default is `None`): the name of the file to save
    the predictions to.

    **log_file** (`file`; default is `LOG_FILE`): the stream for info
    messages.

    Returns the corpus with the tags predicted in the DEPREL field.
    """
    assert self._ds is not None, \
        "ERROR: The tagger doesn't have a dataset. Call the train() " \
        'method first'
    assert not with_orig or save_to is None, \
        'ERROR: `with_orig` can be True only if save_to is None'

    args, kwargs = get_func_params(FeatTagger.predict, locals())
    # Saving is done at the very end of this method, after the
    # postprocessing, so the inner calls must not save anything.
    kwargs['save_to'] = None

    supp_corpus = None
    if self._supp_tagger:
        # The supplementary tagger always runs with `with_orig=True`.
        supp_corpus = \
            self._supp_tagger.predict(*args, **dict(kwargs, with_orig=True))

    predicted = super().predict(*args, **kwargs)

    def fix_roots(sentences):
        for item in sentences:
            # Get down to the list of tokens. NOTE(review): presumably,
            # `item` may be a (predicted, original) pair and a sentence
            # itself may be a (tokens, meta) tuple - confirm.
            if isinstance(item, tuple) \
                    and not isinstance(item[0], tuple) \
                    and not isinstance(item[1], tuple):
                tokens = item[0]
            else:
                tokens = item
            if isinstance(tokens, tuple):
                tokens = tokens[0]

            if supp_corpus:
                # The second element of the supplementary result is used.
                supp_tokens = next(supp_corpus)[1]
                if isinstance(supp_tokens, tuple):
                    supp_tokens = supp_tokens[0]

            for idx, token in enumerate(tokens):
                if token['HEAD'] == '0':
                    token['DEPREL'] = 'root'
                elif supp_corpus and token['DEPREL'] == 'root':
                    token['DEPREL'] = supp_tokens[idx]['DEPREL']

            yield item

    result = fix_roots(predicted)

    if save_to:
        self.save_conllu(result, save_to, log_file=None)
        result = self._get_corpus(save_to, asis=True, log_file=log_file)

    return result
def predict(self, corpus, with_orig=False, batch_size=BATCH_SIZE,
            split=None, clone_ds=False, save_to=None, log_file=LOG_FILE):
    """Predicts tags in the DEPREL field of the corpus.

    Args:

    **corpus**: the corpus used for the feature extraction and
    predictions. Either the name of a file in *CoNLL-U* format or a
    `list`/`iterator` of sentences in *Parsed CoNLL-U*.

    **with_orig** (`bool`; default is `False`): if `True`, the result is
    a sequence of tuples, each holding the sentence with the predicted
    labels and then the original sentence. May be `True` only if
    **save_to** is `None`.

    **batch_size** (`int`; default is `BATCH_SIZE`): the number of
    sentences per batch.

    **split** (`int`; default is `None`): the number of lines in each
    split. Allows processing a large dataset in pieces ("splits"). If
    `None`, the whole dataset is processed in one pass.

    **clone_ds** (`bool`; default is `False`): if `True`, the dataset is
    cloned and transformed. If `False`, `transform_collate` is used
    without cloning. Both variants should produce identical results.

    **save_to** (default is `None`): the name of the file to save the
    predictions to.

    **log_file** (default is `LOG_FILE`): the stream for info messages.

    Returns the corpus with the tags predicted in the DEPREL field. The
    result is lazy (a generator) unless **save_to** is specified.
    """
    assert self._ds is not None, \
        "ERROR: The tagger doesn't have a dataset. Call the train() " \
        'method first'
    assert self._model, \
        "ERROR: The tagger doesn't have a model. Call the train() " \
        'method first'
    assert not with_orig or save_to is None, \
        'ERROR: `with_orig` can be True only if save_to is None'
    args, kwargs = get_func_params(DeprelTagger.predict, locals())

    # With feats pruning on, the UPOS field is transformed before the
    # prediction and restored after it (see the end of the method).
    if self._feats_prune_coef != 0:
        key_vals = self._ds.get_dataset('t_0').transform_dict
        corpus = self._get_corpus(corpus, asis=True, log_file=log_file)
        corpus = self._transform_upos(corpus, key_vals=key_vals)

    field = 'DEPREL'
    add_fields = self._normalize_field_names('UPOS')

    def process(corpus):
        """Yield the sentences of *corpus* with predicted labels (or
        the (predicted, original) pairs if `with_orig` is set)."""
        corpus = self._get_corpus(corpus, asis=True, log_file=log_file)
        # The model is run on the device where its parameters live.
        device = next(self._model.parameters()).device or junky.CPU

        ds_y = self._ds.get_dataset('y')
        if clone_ds:
            ds = self._ds.clone()
            ds.remove('y')

        for start in itertools.count(step=split if split else 1):
            # Take the next piece of the corpus: `split` sentences, or
            # the whole corpus if `split` is not set.
            if isinstance(corpus, Iterator):
                if split:
                    corpus_ = []
                    for i, sentence in enumerate(corpus, start=1):
                        corpus_.append(sentence)
                        if i == split:
                            break
                else:
                    corpus_ = list(corpus)
            else:
                if split:
                    corpus_ = corpus[start:start + split]
                else:
                    corpus_ = corpus
            if not corpus_:
                break

            orig_corpus = corpus_
            corpus_, _, restore_data = self._preprocess_corpus(corpus_)

            res = \
                junky.extract_conllu_fields(
                    corpus_, fields=add_fields,
                    with_empty=True, return_nones=True
                )
            sentences, tags, empties, nones = \
                res[0], res[1:-2], res[-2], res[-1]

            if clone_ds:
                self._transform(sentences, tags=tags,
                                batch_size=batch_size, ds=ds,
                                log_file=log_file)
                loader = ds.create_loader(batch_size=batch_size,
                                          shuffle=False)
            else:
                loader = self._transform_collate(sentences, tags=tags,
                                                 batch_size=batch_size,
                                                 log_file=log_file)

            preds = []
            for batch in loader:
                batch = junky.to_device(batch, device)
                with torch.no_grad():
                    pred = self._model(*batch)
                pred_indices = pred.argmax(-1)
                preds.extend(pred_indices.cpu().numpy().tolist())
            values = ds_y.reconstruct(preds)

            if with_orig:
                # The originals must stay intact, so the labels are put
                # into a deep copy.
                res_corpus_ = deepcopy(orig_corpus)
                res_corpus_ = \
                    self._postprocess_corpus(res_corpus_, values,
                                             restore_data)
                for orig_sentence, sentence in zip(orig_corpus,
                                                   res_corpus_):
                    yield sentence, orig_sentence
            else:
                res_corpus_ = \
                    self._postprocess_corpus(orig_corpus, values,
                                             restore_data)
                for sentence in res_corpus_:
                    yield sentence

            # Without `split`, the single pass above has already covered
            # the whole corpus. A non-iterator corpus (e.g. a list) is
            # never exhausted, so without this exit the loop would
            # process and yield it again and again, endlessly.
            if not split:
                break

    corpus = process(corpus)

    if self._feats_prune_coef != 0:
        corpus = self._restore_upos(corpus)

    if save_to:
        self.save_conllu(corpus, save_to, log_file=None)
        corpus = self._get_corpus(save_to, asis=True, log_file=log_file)

    return corpus
def predict(self, corpus, use_cdict_coef=False, with_orig=False,
            batch_size=BATCH_SIZE, split=None, clone_ds=False,
            save_to=None, log_file=LOG_FILE, **_):
    """Predicts values of a certain feature in a key-value type field of
    the specified **corpus**.

    Args:

    **corpus**: the corpus used for the feature extraction and
    predictions. Either the name of a file in *CoNLL-U* format or a
    `list`/`iterator` of sentences in *Parsed CoNLL-U*.

    **use_cdict_coef** (`bool` | `float`; default is `False`): if `False`,
    only the model's prediction is used. If `True`, the prediction is
    replaced with the value returned by
    `corpuscula.CorpusDict.predict_<field>()` when its `coef` >= `.99`.
    A `float` value sets a custom threshold.

    **with_orig** (`bool`; default is `False`): if `True`, the result is
    a sequence of tuples, each holding the sentence with the predicted
    labels and then the original sentence. May be `True` only if
    **save_to** is `None`.

    **batch_size** (`int`; default is `BATCH_SIZE`): the number of
    sentences per batch.

    **split** (`int`; default is `None`): the number of lines in each
    split. Allows processing a large dataset in pieces ("splits"). If
    `None`, the whole dataset is processed without splits.

    **clone_ds** (`bool`; default is `False`): if `True`, the dataset is
    cloned and transformed. If `False`, `transform_collate` is used
    without cloning. Both variants should produce identical results.

    **save_to** (default is `None`): the name of the file to save the
    predictions to.

    **log_file** (default is `LOG_FILE`): the stream for info messages.

    Any extra keyword arguments are accepted and ignored.

    Returns the corpus with the predicted values of the certain feature
    in the key-value type field.
    """
    assert self._ds is not None, \
        "ERROR: The tagger doesn't have a dataset. Call the train() " \
        'method first'
    assert not with_orig or save_to is None, \
        'ERROR: `with_orig` can be True only if save_to is None'

    args, kwargs = get_func_params(FeatTagger.predict, locals())

    if self._feats_prune_coef != 0:
        # The UPOS field has to be restored before the result is saved,
        # so the parent must not save it; that is done below instead.
        kwargs['save_to'] = None
        upos_map = self._ds.get_dataset('t_0').transform_dict
        source = self._get_corpus(corpus, asis=True, log_file=log_file)
        corpus = self._transform_upos(source, key_vals=upos_map)

    result = super().predict(self._field, 'UPOS', corpus, **kwargs)

    # Without pruning, the parent has done everything (saving included).
    if self._feats_prune_coef == 0:
        return result

    result = self._restore_upos(result)
    if save_to:
        self.save_conllu(result, save_to, log_file=None)
        result = self._get_corpus(save_to, asis=True, log_file=log_file)
    return result
def __init__(self, num_labels, labels_pad_idx=-100, vec_emb_dim=None,
             alphabet_size=0, char_pad_idx=0, rnn_emb_dim=None,
             cnn_emb_dim=None, cnn_kernels=[1, 2, 3, 4, 5, 6],
             tag_emb_params=None, emb_bn=True, emb_do=.2,
             final_emb_dim=512, pre_bn=True, pre_do=.5,
             lstm_layers=1, lstm_do=0, tran_layers=None, tran_heads=8,
             post_bn=True, post_do=.4):
    """Build the layers of the tagger model.

    Args:

    **num_labels** (`int`): the output size of the final linear layer.

    **labels_pad_idx** (`int`; default is `-100`): kept in the
    `labels_pad_idx` attribute.

    **vec_emb_dim** (`int`; default is `None`): the size of the input
    vectors that are joined with the embeddings created here. `None` is
    treated as `0`.

    **alphabet_size** (`int`), **char_pad_idx** (`int`): passed to the
    character embedding layers.

    **rnn_emb_dim** (`int`; default is `None`): the size of the
    `CharEmbeddingRNN` output. If falsy, the layer is not created.

    **cnn_emb_dim** (`int`; default is `None`): the size of the
    `CharEmbeddingCNN` output. If falsy, the layer is not created.

    **cnn_kernels** (`list([int])`): kernel sizes for `CharEmbeddingCNN`.

    **tag_emb_params** (`dict | list([dict])`; default is `None`): params
    of the tag embedding layer(s). Each `dict` has the keys `'dim'`,
    `'num'` and `'pad_idx'`.

    **emb_bn** (`bool`), **emb_do** (`float`): whether to add batch
    normalization / the dropout rate right after the joined embeddings.

    **final_emb_dim** (`int`; default is `512`): the output size of the
    preprocess block. Must be even.

    **pre_bn** (`bool`), **pre_do** (`float`): batch normalization /
    dropout rate inside every stage of the preprocess block.

    **lstm_layers** (`int`), **lstm_do** (`float`): the number of
    bidirectional LSTM layers and the dropout between them. If
    **lstm_layers** is falsy, the LSTM is not created.

    **tran_layers** (`int`), **tran_heads** (`int`): the number of
    Transformer Encoder layers and of their attention heads. If
    **tran_layers** is falsy, the encoder is not created.

    **post_bn** (`bool`), **post_do** (`float`): batch normalization /
    dropout rate before the output linear layer.
    """
    # Must happen before the params are captured for the model config.
    if isinstance(cnn_kernels, Iterable):
        cnn_kernels = list(cnn_kernels)
    args, kwargs = get_func_params(BaseTaggerModel.__init__, locals())
    super().__init__(*args, **kwargs)

    assert final_emb_dim % 2 == 0, \
        'ERROR: `final_emb_dim` must be even ' \
        f"(now it's `{final_emb_dim}`)."

    self.num_labels = num_labels
    self.labels_pad_idx = labels_pad_idx
    vec_emb_dim = 0 if vec_emb_dim is None else vec_emb_dim
    self.vec_emb_dim = vec_emb_dim

    # Character level embeddings.
    if not rnn_emb_dim:
        self.rnn_emb_l = None
        rnn_emb_dim = 0
    else:
        self.rnn_emb_l = CharEmbeddingRNN(alphabet_size=alphabet_size,
                                          emb_dim=rnn_emb_dim,
                                          pad_idx=char_pad_idx)

    if not cnn_emb_dim:
        self.cnn_emb_l = None
        cnn_emb_dim = 0
    else:
        self.cnn_emb_l = CharEmbeddingCNN(alphabet_size=alphabet_size,
                                          emb_dim=cnn_emb_dim,
                                          pad_idx=char_pad_idx,
                                          kernels=cnn_kernels)

    # Tag embeddings: a single layer for a dict, a list of layers
    # otherwise.
    self.tag_emb_l = self.tag_emb_ls = None
    tag_emb_dim = 0
    if tag_emb_params and isinstance(tag_emb_params, dict):
        tag_emb_dim = tag_emb_params['dim'] or 0
        if tag_emb_dim:
            self.tag_emb_l = nn.Embedding(
                tag_emb_params['num'], tag_emb_dim,
                padding_idx=tag_emb_params['pad_idx']
            )
    elif tag_emb_params:
        self.tag_emb_ls = nn.ModuleList()
        for params in tag_emb_params:
            dim_ = params['dim']
            if not dim_:
                # Keep the position so that indices match the params.
                self.tag_emb_ls.append(None)
                continue
            tag_emb_dim += dim_
            self.tag_emb_ls.append(
                nn.Embedding(params['num'], dim_,
                             padding_idx=params['pad_idx'])
            )

    joint_emb_dim = vec_emb_dim + rnn_emb_dim + cnn_emb_dim + tag_emb_dim
    assert joint_emb_dim, \
        'ERROR: At least one of `*_emb_dim` must be specified.'

    # PREPROCESS #########################################################
    pre_modules = OrderedDict()
    if emb_bn:
        pre_modules['emb_bn'] = BatchNorm(num_features=joint_emb_dim)
    if emb_do:
        pre_modules['emb_do'] = nn.Dropout(p=emb_do)

    def make_stage(in_dim, out_dim):
        # One "linear -> [bn] -> relu -> [dropout]" stage; names are
        # templates that get the stage index later.
        stage = [('pre_fc{}_l', nn.Linear(in_features=in_dim,
                                          out_features=out_dim))]
        if pre_bn:
            stage.append(('pre_bn{}', BatchNorm(num_features=out_dim)))
        stage.append(('pre_nl{}', nn.ReLU()))
        if pre_do:
            stage.append(('pre_do{}', nn.Dropout(p=pre_do)))
        return stage

    # Stages are created from the output side (`final_emb_dim`) towards
    # the input, widening by 1.5 each step, until the remaining ratio to
    # `joint_emb_dim` is at most 2; then they are applied in reverse.
    stages = []
    out_dim = final_emb_dim
    while joint_emb_dim / out_dim > 2:
        in_dim = int(out_dim * 1.5)
        stages.append(make_stage(in_dim, out_dim))
        out_dim = in_dim
    stages.append(make_stage(joint_emb_dim, out_dim))

    for idx, stage in enumerate(stages[::-1]):
        for name_tmpl, module in stage:
            pre_modules[name_tmpl.format(idx)] = module
    self.pre_seq_l = nn.Sequential(pre_modules)
    ######################################################################

    if not lstm_layers:
        self.lstm_l = None
    else:
        self.lstm_l = nn.LSTM(input_size=final_emb_dim,
                              hidden_size=final_emb_dim // 2,
                              num_layers=lstm_layers, batch_first=True,
                              dropout=lstm_do, bidirectional=True)
        self.T = nn.Linear(final_emb_dim, final_emb_dim)
        nn.init.constant_(self.T.bias, -1)

    if not tran_layers:
        self.tran_l = None
    else:
        encoder_layer = nn.TransformerEncoderLayer(
            final_emb_dim, tran_heads, dim_feedforward=2048,
            dropout=0.1, activation='relu'
        )
        encoder_norm = nn.LayerNorm(normalized_shape=final_emb_dim,
                                    eps=1e-6, elementwise_affine=True)
        self.tran_l = nn.TransformerEncoder(encoder_layer, tran_layers,
                                            norm=encoder_norm)

    # POSTPROCESS ########################################################
    post_modules = OrderedDict()
    if post_bn:
        post_modules['post_bn'] = BatchNorm(num_features=final_emb_dim)
    if post_do:
        post_modules['post_do'] = nn.Dropout(p=post_do)
    post_modules['out_fc_l'] = nn.Linear(in_features=final_emb_dim,
                                         out_features=num_labels)
    self.post_seq_l = nn.Sequential(post_modules)
    ######################################################################

    setattr(self, CONFIG_ATTR, (args, kwargs))
def predict(self, corpus, use_cdict_coef=False, with_orig=False,
            batch_size=BATCH_SIZE, split=None, clone_ds=False, save_to=None,
            log_file=LOG_FILE):
    """Predicts tags in the LEMMA field of the corpus.

    Args:

    **corpus**: the corpus which will be used for the feature extraction
    and predictions. May be either the name of the file in *CoNLL-U*
    format or the `list`/`iterator` of sentences in *Parsed CoNLL-U*.

    **use_cdict_coef** (`bool` | `float`; default is `False`): if `False`,
    we use our prediction only. If `True`, we replace our prediction with
    the value returned by the `corpuscula.CorpusDict.predict_<field>()`
    method if its `coef` >= `.99`. Also, you can specify your own
    threshold as the value of the param.

    **with_orig** (`bool`; default is `False`): if `True`, instead of just
    the sequence with predicted labels, return the sequence of tuples
    where the first element is the sentence with predicted labels and the
    second element is the original sentence. **with_orig** can be `True`
    only if **save_to** is `None`.

    **batch_size** (`int`; default is `64`): the number of sentences per
    batch.

    **split** (`int`; default is `None`): the number of lines in sentences
    split. Allows to process a large dataset in pieces ("splits"). If
    **split** is `None` (default), all the dataset is processed without
    splits.

    **clone_ds** (`bool`; default is `False`): if `True`, the dataset is
    cloned and transformed. If `False`, `transform_collate` is used
    without cloning the dataset. There is no big difference between the
    variants. Both should produce identical results.

    **save_to** (`str`; default is `None`): the file name where the
    predictions will be saved.

    **log_file** (`file`; default is `sys.stdout`): the stream for info
    messages.

    Returns the corpus with lemmata predicted.
    """
    assert self._ds is not None, \
        "ERROR: The tagger doesn't have a dataset. Call the train() " \
        'method first'
    assert not with_orig or save_to is None, \
        'ERROR: `with_orig` can be True only if save_to is None'
    args, kwargs = get_func_params(LemmaTagger.predict, locals())
    # The dictionary correction and the saving are done here, after the
    # predicted edit operations are converted to lemmata.
    del kwargs['use_cdict_coef']
    kwargs['save_to'] = None

    cdict = self._cdict
    yof = sum(1 for x in cdict._wforms if 'ё' in x or 'Ё' in x)
    yol = sum(1 for x in cdict._lemmata if 'ё' in x or 'Ё' in x)
    # A dictionary without any word form containing "ё" used to raise
    # ZeroDivisionError here. In that case "ё" is stripped only if the
    # lemmata don't contain it either (the ratio would be unbounded
    # otherwise, i.e. not less than 10).
    remove_yo = yol / yof < 10 if yof else not yol

    def apply_editops(str_from, upos, ops_t, isfirst):
        # Convert the word form to its lemma, either with the dictionary
        # (when it is confident enough) or with the predicted edit
        # operations `ops_t` = (prefix ops, suffix ops, case op).
        if str_from and ops_t not in [None, (None,)]:
            str_from_ = None
            if use_cdict_coef not in [None, False]:
                str_from_, coef = \
                    cdict.predict_lemma(str_from, upos, isfirst=isfirst)
                if coef is not None \
               and coef >= (CDICT_COEF_THRESH
                                if use_cdict_coef is True else
                            use_cdict_coef):
                    str_from = str_from_
                else:
                    str_from_ = None
            if str_from_ is None:
                try:
                    ops_p, ops_s, ops_c = ops_t
                    # Suffix operations are applied to the reversed
                    # string.
                    str_from_ = ''.join(reversed(
                        self.apply_editops(reversed(
                            self.apply_editops(str_from, ops_p)
                        ), ops_s)
                    ))
                    if str_from_:
                        str_from = str_from_
                        if 'ё' in str_from or 'Ё' in str_from:
                            if not cdict.lemma_isknown(str_from, upos):
                                str_from_ = str_from.replace('ё', 'е') \
                                                    .replace('Ё', 'Е')
                                if remove_yo \
                                or cdict.lemma_isknown(str_from_, upos):
                                    str_from = str_from_
                except IndexError:
                    # Deliberate best effort: edit operations that don't
                    # fit the form leave it as is.
                    pass
                if ops_c == _OP_C_LOWER:
                    str_from = str_from.lower()
                elif ops_c == _OP_C_TITLE:
                    str_from = str_from.title()
        return str_from

    def process(corpus):
        for sentence in corpus:
            sentence_ = sentence[0] if with_orig else sentence
            if isinstance(sentence_, tuple):
                sentence_ = sentence_[0]
            isfirst = True
            for token in sentence_:
                id_, form = token['ID'], token['FORM']
                # Tokens with a range ID ("1-2") are skipped.
                if form and '-' not in id_:
                    token[self._field] = \
                        apply_editops(form, token['UPOS'],
                                      token[self._field], isfirst=isfirst)
                    isfirst = False
            yield sentence

    key_vals = self._ds.get_dataset('t_0').transform_dict
    corpus = self._get_corpus(corpus, asis=True, log_file=log_file)
    corpus = self._transform_upos(corpus, key_vals=key_vals)
    corpus = super().predict(self._field, 'UPOS', corpus, **kwargs)
    corpus = self._restore_upos(corpus)
    corpus = process(corpus)

    if save_to:
        self.save_conllu(corpus, save_to, log_file=None)
        corpus = self._get_corpus(save_to, asis=True, log_file=log_file)

    return corpus
def train(self, save_as, feats=None,
          device=None, control_metric='accuracy',
          max_epochs=None, min_epochs=0, bad_epochs=5,
          batch_size=TRAIN_BATCH_SIZE, max_grad_norm=None,
          tags_to_remove=None, word_emb_type='bert',
          word_emb_path=None, word_transform_kwargs=None,
              # BertDataset.transform() (for BERT-descendant models)
              # params:
              # {'max_len': 0, 'batch_size': 64, 'hidden_ids': '10',
              #  'aggregate_hiddens_op': 'cat',
              #  'aggregate_subtokens_op': 'absmax', 'to': junky.CPU,
              #  'loglevel': 1}
              # WordDataset.transform() (for other models) params:
              # {'check_lower': True}
          stage1_params=None,
              # {'lr': .0001, 'betas': (0.9, 0.999), 'eps': 1e-8,
              #  'weight_decay': 0, 'amsgrad': False,
              #  'max_epochs': None, 'min_epochs': None,
              #  'bad_epochs': None, 'batch_size': None,
              #  'max_grad_norm': None}
          stage2_params=None,
              # {'lr': .001, 'momentum': .9, 'weight_decay': 0,
              #  'dampening': 0, 'nesterov': False,
              #  'max_epochs': None, 'min_epochs': None,
              #  'bad_epochs': None, 'batch_size': None,
              #  'max_grad_norm': None}
          stage3_params={'save_as': None},
              # {'save_as': None, 'epochs': 3, 'batch_size': 8,
              #  'lr': 2e-5, 'betas': (0.9, 0.999), 'eps': 1e-8,
              #  'weight_decay': .01, 'amsgrad': False,
              #  'num_warmup_steps': 3, 'max_grad_norm': 1.}
          stages=[1, 2, 3, 1, 2], save_stages=False,
          learn_on_padding=True, remove_padding_intent=False,
          seed=None, keep_embs=False, log_file=LOG_FILE,
          rnn_emb_dim=None, cnn_emb_dim=200, cnn_kernels=range(1, 7),
          upos_emb_dim=200, emb_bn=True, emb_do=.2,
          final_emb_dim=512, pre_bn=True, pre_do=.5,
          lstm_layers=1, lstm_do=0, tran_layers=0, tran_heads=8,
          post_bn=True, post_do=.4):
    """Creates and trains the separate feature tagger model.

    One `FeatTagger` model is trained per feature; all the arguments but
    **save_as**, **feats** and the `'save_as'` item of **stage3_params**
    are passed to its `.train()` method unchanged.

    *Training's args*:

    **save_as** (`str`): the name used to save the model's head. Refer
    to the `.save()` method's help for the broad definition (see the
    **name** arg there). The model of every feature is saved as
    `'<save_as>-<feat in lower case>'`.

    **feats** (`str | list([str])`; default is `None`): one or several
    subfields of the key-value type fields like `FEATS` or `MISC` to be
    predicted separately. If `None`, the models are trained for all the
    subfields found in the train corpus.

    **device** (`str`; default is `None`): the device for the model.
    E.g.: 'cuda:0'. If `None`, we don't move the model to any device (it
    is placed right where it's created).

    **control_metric** (`str`; default is `accuracy`): the metric that
    controls training. Any that is supported by the `junky.train()`
    method. At the moment, it is: 'accuracy', 'f1', 'loss', 'precision',
    and 'recall'.

    **max_epochs** (`int`; default is `None`): the maximal number of
    epochs for the model's head training (stages types `1` and `2`). If
    `None` (default), the training would linger until **bad_epochs** has
    met, but no less than **min_epochs**.

    **min_epochs** (`int`; default is `0`): the minimal number of
    training epochs for the model's head training (stages types `1` and
    `2`).

    **bad_epochs** (`int`; default is `5`): the maximal allowed number of
    bad epochs (epochs when the chosen **control_metric** has not become
    better) in a row for the model's head training (stages types `1` and
    `2`).

    **batch_size** (`int`; default is `32`): the number of sentences per
    batch for the model's head training (stages types `1` and `2`).

    **max_grad_norm** (`float`; default is `None`): the gradient clipping
    parameter for the model's head training (stages types `1` and `2`).

    **tags_to_remove** (`dict({str: str}) | dict({str: list([str])})`;
    default is `None`): the tags, tokens with those must be removed from
    the corpus. It's the `dict` with field names as keys and values you
    want to remove. Applied only to fields with atomic values (like
    *UPOS*). This argument may be used, for example, to remove some
    infrequent or just excess tags from the corpus. Note, that we remove
    the tokens from the train corpus completely, not just replace those
    tags with `None`.

    *Word embedding params*:

    **word_emb_type** (`str`; default is `'bert'`): one of (`'bert'` |
    `'glove'` | `'ft'` | `'w2v'`) embedding types.

    **word_emb_path** (`str`): the path to the word embeddings storage.

    **word_transform_kwargs** (`dict`; default is `None`): keyword
    arguments for the `.transform()` method of the dataset created for
    sentences to word embeddings conversion. See the `.transform()`
    method of either `junky.datasets.BertDataset` (if **word_emb_type**
    is `'bert'`) or `junky.datasets.WordDataset` (otherwise) if you want
    to learn allowed values for the parameter. If `None`, the
    `.transform()` method uses its defaults.

    *Training stages params*:

    **stage1_params** (`dict`; default is `None`): keyword arguments for
    the `BaseModel.adjust_model_for_train()` method. If `None`, the
    defaults are used. Also, you can specify here new values for the
    arguments **max_epochs**, **min_epochs**, **bad_epochs**,
    **batch_size**, and **max_grad_norm** that will be used only on
    stages of type `1`.

    **stage2_params** (`dict`; default is `None`): keyword arguments for
    the `BaseModel.adjust_model_for_tune()` method. If `None`, the
    defaults are used. Also, you can specify here new values for the
    arguments **max_epochs**, **min_epochs**, **bad_epochs**,
    **batch_size**, and **max_grad_norm** that will be used only on
    stages of type `2`.

    **stage3_params** (`dict`; default is `{'save_as': None}`): keyword
    arguments for the `WordEmbeddings.full_tune()` method. If `None`, the
    defaults are used. A non-empty `'save_as'` item is treated as a
    suffix: every feature model gets its own value
    `'<field>-<feat>_<suffix>'` (the field and the feat are in lower
    case). The `dict` passed is not changed.

    **stages** (`list([int])`; default is `[1, 2, 3, 1, 2]`): what stages
    we should use during training and in which order. On the stage type
    `1` the model head is trained with *Adam* optimizer; the stage type
    `2` is similar, but the optimizer is *SGD*; the stage type `3` is
    only relevant when **word_emb_type** is `'bert'` and we want to tune
    the whole model. Stage type `0` defines the skip-stage, i.e. there
    would be no real training on it. It is used when you need
    reproducibility and want to continue training the model from some
    particular stage. In this case, you specify the name of the model
    saved on that stage in the parameter **load_from**, and put zeros
    into the **stages** list on the places of already finished ones. One
    more time: it is used for reproducibility only, i.e. when you put
    some particular value to the **seed** param and want the data order
    in batches be equivalent with data on the stages from the past
    trainings.

    **save_stages** (`bool`; default is `False`): if we need to keep the
    best model of each stage beside of the overall best model. The names
    of these models would have the suffix `_<idx>(stage<stage_type>)`
    where `<idx>` is an ordinal number of the stage. We can then use it
    to continue training from any particular stage number (changing next
    stages or their parameters) using the parameter **load_from** (with
    separate tagger the tuning is more complicated: one has to load and
    tune each model separately as a standalone instance of the
    `FeatTagger` class; the `FeatsSeparateTagger.predict()` method
    doesn't even have **load_from** argument). Note that we save only
    stages of the head model. The embedding model as a part of the full
    model is usually tuned only once, so we don't make its copy.

    *Other options*:

    **learn_on_padding** (`bool`; default is `True`): while training, we
    can calculate loss either taking into account predictions made for
    padding tokens or without it. The common practice is not to use
    padding when calculating loss. However, we note that using padding
    sometimes makes the resulting model performance slightly better.

    **remove_padding_intent** (`bool`; default is `False`): if you set
    **learn_on_padding** param to `False`, you may want not to use
    padding intent during training at all. I.e. padding tokens would be
    tagged with some of real tags, and they would be just ignored during
    computing loss. As a result, the model would have the output
    dimensionality of the final layer less by one. At first sight, such
    an approach could increase the performance, but in our experiments,
    such an effect appeared not always.

    **seed** (`int`; default is `None`): init value for the random number
    generator if you need reproducibility. Note that each stage will have
    its own seed value, and the **seed** param is used to calculate these
    values.

    **keep_embs** (`bool`; default is `False`): by default, after
    creating `Dataset` objects, we remove word embedding models to free
    memory. With `keep_embs=True` this operation is omitted, and you can
    use the `.embs` attribute to share embedding models with other
    objects.

    **log_file** (`file`; default is `sys.stdout`): the stream for info
    messages.

    *The model hyperparameters*:

    **rnn_emb_dim** (`int`; default is `None`): the internal character
    RNN (LSTM) embedding dimensionality. If `None`, the layer is skipped.

    **cnn_emb_dim** (`int`; default is `200`): the internal character CNN
    embedding dimensionality. If `None`, the layer is skipped.

    **cnn_kernels** (`list([int])`; default is `[1, 2, 3, 4, 5, 6]`): CNN
    kernel sizes of the internal CNN embedding layer. Relevant if
    **cnn_emb_dim** is not `None`.

    **upos_emb_dim** (`int`; default is `200`): the auxiliary UPOS label
    embedding dimensionality.

    **emb_bn** (`bool`; default is `True`): whether batch normalization
    layer should be applied after the embedding concatenation.

    **emb_do** (`float`; default is `.2`): the dropout rate after the
    embedding concatenation.

    **final_emb_dim** (`int`; default is `512`): the output
    dimensionality of the linear transformation applied to concatenated
    embeddings.

    **pre_bn** (`bool`; default is `True`): whether batch normalization
    layer should be applied before the main part of the algorithm.

    **pre_do** (`float`; default is `.5`): the dropout rate before the
    main part of the algorithm.

    **lstm_layers** (`int`; default is `1`): the number of Bidirectional
    LSTM layers. If `None`, they are not created.

    **lstm_do** (`float`; default is `0`): the dropout between LSTM
    layers. Only relevant, if `lstm_layers` > `1`.

    **tran_layers** (`int`; default is `0`): the number of Transformer
    Encoder layers. If `None` (or `0`), they are not created.

    **tran_heads** (`int`; default is `8`): the number of attention heads
    of Transformer Encoder layers. Only relevant, if `tran_layers` > `1`.

    **post_bn** (`bool`; default is `True`): whether batch normalization
    layer should be applied after the main part of the algorithm.

    **post_do** (`float`; default is `.4`): the dropout rate after the
    main part of the algorithm.

    The method returns the train statistics: the `dict` with feature
    names as keys and the results of `FeatTagger.train()` as values.
    """
    assert self._train_corpus, 'ERROR: Train corpus is not loaded yet'

    start_time = time.time()
    args, kwargs = get_func_params(FeatsSeparateTagger.train, locals())
    del kwargs['feats']

    # `kwargs['stage3_params']` is the very object the caller passed (or
    # the shared default of the signature), so we work with a private
    # copy: the 'save_as' item is removed here and rewritten for every
    # feature below. `None` is passed through as is.
    word_emb_path_suffix = None
    stage3_params_ = kwargs.get('stage3_params')
    if stage3_params_ is not None:
        stage3_params_ = dict(stage3_params_)
        word_emb_path_suffix = stage3_params_.pop('save_as', None)
        kwargs['stage3_params'] = stage3_params_

    if log_file:
        print('###### {} TAGGER TRAINING PIPELINE ######'
                  .format(self._field), file=log_file)
        print("\nWe're gonna train separate models for {} {} in train "
                  .format('the requested' if feats else 'all',
                          self._field)
            + 'corpus. Feats are:\n', file=log_file)

    if not feats:
        feats = sorted({
            feat
            for sentence in self._train_corpus
            for token in sentence
            for feat in token[self._field].keys()
        })
    elif isinstance(feats, str):
        feats = [feats]
    if log_file:
        print(', '.join(feats), file=log_file)

    res = {}
    for feat in feats:
        start_time_ = time.time()
        if log_file:
            print(file=log_file)
            clear_tqdm()

        save_as_ = '{}-{}'.format(save_as, feat.lower())
        self._feats[feat] = save_as_

        tagger = FeatTagger(self._field + ':' + feat,
                            feats_prune_coef=self._feats_prune_coef,
                            embs=self.embs)
        tagger._train_corpus, tagger._test_corpus = \
            self._train_corpus, self._test_corpus
        if word_emb_path_suffix:
            kwargs['stage3_params']['save_as'] = \
                '{}-{}_{}'.format(self._field.lower(), feat.lower(),
                                  word_emb_path_suffix)
        res[feat] = tagger.train(save_as_, **kwargs,
                                 start_time=start_time_)

        del tagger

    self.save(save_as, log_file=log_file)
    if log_file:
        print('\n###### {} TAGGER TRAINING HAS FINISHED ### '
                  .format(self._field)
            + 'Total time: {} ######\n'
                  .format(seconds_to_strtime(time.time() - start_time)),
              file=log_file)
        print(("Now, check the separate {} models' and datasets' "
               'config files and consider to change some device names '
               'to be able to load all the models jointly. You can find '
               'the separate models\' list in the "{}" config file. '
               "Then, use the `.load('{}')` method to start working "
               'with the {} tagger.')
                  .format(self._field, save_as + CONFIG_EXT,
                          save_as, self._field),
              file=log_file)

    return res
def predict(self, corpus, use_cdict_coef=False, with_orig=False,
            batch_size=BATCH_SIZE, split=None, clone_ds=False, save_to=None,
            log_file=LOG_FILE, **_):
    """Predicts feature keys and values in the key-value type field of
    the corpus.

    Args:

    **corpus**: the corpus which will be used for the feature extraction
    and predictions. May be either the name of the file in *CoNLL-U*
    format or the `list`/`iterator` of sentences in *Parsed CoNLL-U*.

    **use_cdict_coef** (`bool` | `float`; default is `False`): if `False`,
    we use our prediction only. If `True`, we replace our prediction with
    the value returned by the `corpuscula.CorpusDict.predict_<field>()`
    method if its `coef` >= `.99`. Also, you can specify your own
    threshold as the value of the param.

    **with_orig** (`bool`; default is `False`): if `True`, instead of just
    the sequence with predicted labels, return the sequence of tuples
    where the first element is the sentence with predicted labels and the
    second element is the original sentence. **with_orig** can be `True`
    only if **save_to** is `None`.

    **batch_size** (`int`; default is `64`): the number of sentences per
    batch.

    **split** (`int`; default is `None`): the number of lines in sentences
    split. Allows to process a large dataset in pieces ("splits"). If
    **split** is `None` (default), all the dataset is processed without
    splits.

    **clone_ds** (`bool`; default is `False`): if `True`, the dataset is
    cloned and transformed. If `False`, `transform_collate` is used
    without cloning the dataset. There is no big difference between the
    variants. Both should produce identical results.

    **save_to** (`str`; default is `None`): the file name where the
    predictions will be saved.

    **log_file** (`file`; default is `sys.stdout`): the stream for info
    messages.

    Returns the corpus with feature keys and values predicted in the
    FEATS field.
    """
    assert not with_orig or save_to is None, \
        'ERROR: `with_orig` can be True only if save_to is None'
    args, kwargs = get_func_params(FeatsJointTagger.predict, locals())
    # The parent predicts joint string labels; we save only after they
    # are converted back to dicts.
    kwargs['save_to'] = None

    def process(corpus):
        for sentence in corpus:
            sentence_ = sentence[0] if with_orig else sentence
            if isinstance(sentence_, tuple):
                sentence_ = sentence_[0]
            for token in sentence_:
                joint_label = token[self._field]
                # 'Key1=Val1|Key2=Val2' -> OrderedDict. Only the first
                # '=' separates the key from the value, so values that
                # contain '=' themselves don't break the unpacking.
                token[self._field] = OrderedDict(
                    pair.split('=', 1) for pair in joint_label.split('|')
                ) if joint_label else OrderedDict()
            yield sentence

    corpus = process(
        super().predict(self._field, 'UPOS', *args, **kwargs)
    )

    if save_to:
        self.save_conllu(corpus, save_to, log_file=None)
        corpus = self._get_corpus(save_to, asis=True, log_file=log_file)

    return corpus
def predict(self, corpus, feats=None, remove_excess_feats=True,
            use_cdict_coef=False, with_orig=False, batch_size=BATCH_SIZE,
            split=None, clone_ds=False, save_to=None, log_file=LOG_FILE):
    """Predicts feature keys and values in the FEATS field of the corpus.

    Args:

    **corpus**: the corpus which will be used for the feature extraction
    and predictions. May be either the name of the file in *CoNLL-U*
    format or the `list`/`iterator` of sentences in *Parsed CoNLL-U*.

    **feats** (`str | list([str])`; default is `None`): one or several
    subfields of the key-value type fields like `FEATS` or `MISC` to be
    predicted separately. If `None`, all the features the tagger has
    models for are predicted.

    **remove_excess_feats** (`bool`): if `True` (default), the tagger
    removes all irrelevant features from the predicted field
    ("irrelevant" means that the tagger doesn't have models for them).
    For example, if you trained the tagger only for "Case" and "Gender"
    features, the tagger predicts only them (or, only one of them, if you
    specify it in the **feats** field) and removes all the rest.
    Otherwise, if **remove_excess_feats** is `False`, all irrelevant
    feats stay intact.

    **use_cdict_coef** (`bool` | `float`; default is `False`): if `False`,
    we use our prediction only. If `True`, we replace our prediction with
    the value returned by the `corpuscula.CorpusDict.predict_<field>()`
    method if its `coef` >= `.99`. Also, you can specify your own
    threshold as the value of the param.

    **with_orig** (`bool`; default is `False`): if `True`, instead of just
    the sequence with predicted labels, return the sequence of tuples
    where the first element is the sentence with predicted labels and the
    second element is the original sentence. **with_orig** can be `True`
    only if **save_to** is `None`.

    **batch_size** (`int`; default is `64`): the number of sentences per
    batch.

    **split** (`int`; default is `None`): the number of lines in sentences
    split. Allows to process a large dataset in pieces ("splits"). If
    **split** is `None` (default), all the dataset is processed without
    splits.

    **clone_ds** (`bool`; default is `False`): if `True`, the dataset is
    cloned and transformed. If `False`, `transform_collate` is used
    without cloning the dataset. There is no big difference between the
    variants. Both should produce identical results.

    **save_to** (`str`; default is `None`): the file name where the
    predictions will be saved.

    **log_file** (`file`; default is `sys.stdout`): the stream for info
    messages.

    Returns the corpus with feature keys and values predicted in the
    FEATS field.
    """
    assert not with_orig or save_to is None, \
        'ERROR: `with_orig` can be True only if save_to is None'
    args, kwargs = get_func_params(FeatsSeparateTagger.predict, locals())
    del kwargs['feats']
    del kwargs['remove_excess_feats']

    if feats is None:
        feats = self._feats
    else:
        # Materialize: `feats` is iterated several times below.
        feats = [feats] if isinstance(feats, str) else list(feats)
        unknown_feats = [x for x in sorted(feats) if x not in self._feats]
        assert not unknown_feats, \
            'ERROR: feats {} are unknown for the tagger' \
                .format(unknown_feats)

    # Pairing with originals and saving are done here, not in the
    # per-feature taggers.
    kwargs['with_orig'] = False
    kwargs['save_to'] = None

    def iter_splits(corpus):
        # Yield non-empty pieces of the corpus: everything at once if
        # `split` is not set, `split` sentences per piece otherwise.
        if not split:
            corpus_ = list(corpus) if isinstance(corpus, Iterator) else \
                      corpus
            if corpus_:
                yield corpus_
        elif isinstance(corpus, Iterator):
            while True:
                corpus_ = list(itertools.islice(corpus, split))
                if not corpus_:
                    break
                yield corpus_
        else:
            for start in itertools.count(step=split):
                corpus_ = corpus[start:start + split]
                if not corpus_:
                    break
                yield corpus_

    def process(corpus):
        corpus = self._get_corpus(corpus, asis=True, log_file=log_file)

        for corpus_ in iter_splits(corpus):
            if remove_excess_feats:
                for sentence in corpus_:
                    for token in sentence[0] \
                                     if isinstance(sentence, tuple) else \
                                 sentence:
                        token[self._field] = OrderedDict(
                            (x, y) for x, y in token[self._field].items()
                                       if x in feats
                        )

            # The taggers change sentences in place, so the originals
            # need their own copy to stay comparable.
            res_corpus_ = deepcopy(corpus_) if with_orig else corpus_

            # Only the requested features are predicted; each tagger
            # receives the output of the previous one.
            for feat in feats:
                attrs = self._feats[feat]
                tagger = attrs[1] if isinstance(attrs, list) else None
                assert isinstance(tagger, FeatTagger), \
                    'ERROR: Model is not loaded. Use the .load() ' \
                    'method prior'
                res_corpus_ = tagger.predict(res_corpus_, **kwargs)

            if with_orig:
                for orig_sentence, sentence in zip(corpus_, res_corpus_):
                    yield sentence, orig_sentence
            else:
                for sentence in res_corpus_:
                    yield sentence

    corpus = process(corpus)

    if save_to:
        self.save_conllu(corpus, save_to, log_file=None)
        corpus = self._get_corpus(save_to, asis=True, log_file=log_file)

    return corpus