Example #1
    def load_train_corpus(self,
                          corpus,
                          append=False,
                          parse=False,
                          test=None,
                          seed=None):
        """Load train corpus and (possibly) create a CorpusDict for it

        :param corpus: a name of file in CoNLL-U format or list/iterator of
                       sentences in Parsed CoNLL-U
        :param append: add corpus to already loaded one(s)
        :param parse: extract corpus statistics to CorpusDict right after
                      loading
        :param test: if not None, then train corpus will be shuffled and
                     specified part of it stored as test corpus
        :type test: float
        :param seed: init value for the random number generator. Used only 
                     if test is not None
        :type seed: int
        """
        assert append or not self._train_corpus, \
               'ERROR: Train corpus is already loaded. Use append=True to ' \
               'append one more corpus'

        print('Train:', end=' ', file=LOG_FILE)
        if (isinstance(corpus, type) and issubclass(corpus, _AbstractCorpus)) \
        or isinstance(corpus, _AbstractCorpus):
            corpus = corpus.train()
        corpus = self._get_corpus(corpus)
        if test is not None:
            corpus = list(corpus)
            assert 0 <= test <= 1, \
                   'ERROR: The value of the "test" parameter must be ' \
                   'between 0 and 1'
            test_len = round(len(corpus) * test)
            if test_len == 0:
                print('WARNING: The whole corpus given will be saved as the '
                      'train corpus (the "test" parameter is too small)',
                      file=LOG_FILE)
            else:
                if test_len == len(corpus):
                    print('WARNING: The whole corpus given will be saved as '
                          'the test corpus (the "test" parameter is too '
                          'large)', file=LOG_FILE)
                    self.load_test_corpus(corpus, append=append)
                    corpus = None
                else:
                    random.seed(seed)
                    random.shuffle(corpus)
                    self.load_test_corpus(corpus[:test_len], append=append)
                    corpus = corpus[test_len:]
                print('stored.', file=LOG_FILE)
        if corpus:
            if not append:
                self._train_corpus = []
            self._train_corpus.extend(corpus)
            if parse:
                self._cdict = CorpusDict(corpus=self._train_corpus,
                                         format='conllu_parsed',
                                         log_file=LOG_FILE)
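
A minimal usage sketch for this method (assuming the `BaseParser` class from Example #5 is importable; the file names are placeholders, not files shipped with the library):

    parser = BaseParser()
    # Hold out 10% of the shuffled sentences as a test corpus; `seed` makes
    # the shuffle reproducible.
    parser.load_train_corpus('train.conllu', test=.1, seed=42)
    # Append one more corpus and build the CorpusDict right away.
    parser.load_train_corpus('extra_train.conllu', append=True, parse=True)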
Example #2
    def parse_train_corpus(self, cnt_thresh=None, ambiguity_thresh=None):
        """Create a CorpusDict for train corpus(es) loaded. For one instance it
        may be used only once. Use ``load_train_corpus()`` with append=True
        to append one more corpus to the CorpusDict after it's created.

        :type cnt_thresh: int
        :type ambiguity_thresh: float
        Params for ``CorpusDict.fit()`` method. If omitted then default values
        will be used
        """
        assert self._train_corpus, 'ERROR: Train corpus is not loaded yet'

        print('Train:', end=' ', file=LOG_FILE)
        kwargs = {}
        if cnt_thresh is not None:
            kwargs['cnt_thresh'] = cnt_thresh
        if ambiguity_thresh is not None:
            kwargs['ambiguity_thresh'] = ambiguity_thresh
        self._cdict = CorpusDict(corpus=self._train_corpus,
                                 format='conllu_parsed',
                                 **kwargs,
                                 log_file=LOG_FILE)
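
A short sketch of how the thresholds might be passed (both kwargs are simply forwarded to `CorpusDict`; the values below are illustrative, not library defaults):

    parser = BaseParser()                      # class from Example #5
    parser.load_train_corpus('train.conllu')   # placeholder file name
    # Build the CorpusDict; omit either threshold to fall back to its default.
    parser.parse_train_corpus(cnt_thresh=5, ambiguity_thresh=.9)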
Example #3
    def train(self, save_as,
              device=None, control_metric='accuracy', max_epochs=None,
              min_epochs=0, bad_epochs=5, batch_size=TRAIN_BATCH_SIZE,
              max_grad_norm=None, tags_to_remove=None, word_emb_type='bert',
              word_emb_path='xlm-roberta-base', word_transform_kwargs=None,
                  # BertDataset.transform() (for BERT-descendant models)
                  # params:
                  # {'max_len': 0, 'batch_size': 64, 'hidden_ids': '10',
                  #  'aggregate_hiddens_op': 'cat',
                  #  'aggregate_subtokens_op': 'absmax', 'to': junky.CPU,
                  #  'loglevel': 1}
                  # WordDataset.transform() (for other models) params:
                  # {'check_lower': True}
              stage1_params=None,
                  # {'lr': .0001, 'betas': (0.9, 0.999), 'eps': 1e-8,
                  #  'weight_decay': 0, 'amsgrad': False,
                  #  'max_epochs': None, 'min_epochs': None,
                  #  'bad_epochs': None, 'batch_size': None,
                  #  'max_grad_norm': None}
              stage2_params=None,
                  # {'lr': .001, 'momentum': .9, 'weight_decay': 0,
                  #  'dampening': 0, 'nesterov': False,
                  #  'max_epochs': None, 'min_epochs': None,
                  #  'bad_epochs': None, 'batch_size': None,
                  #  'max_grad_norm': None}
              stage3_params={'save_as': None},
                  # {'save_as': None, 'epochs': 3, 'batch_size': 8,
                  #  'lr': 2e-5, 'betas': (0.9, 0.999), 'eps': 1e-8,
                  #  'weight_decay': .01, 'amsgrad': False,
                  #  'num_warmup_steps': 3, 'max_grad_norm': 1.}
              stages=[1, 2, 3, 1, 2], save_stages=False, load_from=None,
              learn_on_padding=True, remove_padding_intent=False,
              seed=None, start_time=None, keep_embs=False, log_file=LOG_FILE,
              rnn_emb_dim=None, cnn_emb_dim=None, cnn_kernels=range(1, 7),
              emb_bn=True, emb_do=.2,
              final_emb_dim=512, pre_bn=True, pre_do=.5,
              lstm_layers=1, lstm_do=0, tran_layers=0, tran_heads=8,
              post_bn=True, post_do=.4):
        """Creates and trains the UPOS tagger model.

        During training, the best model is saved after each successful epoch.

        *Training's args*:

        **save_as** (`str`): the name used to save the model's head. Refer to
        the `.save()` method's help for the broader definition (see the
        **name** arg there).

        **device** (`str`, default is `None`): the device for the model. E.g.:
        'cuda:0'. If `None`, we don't move the model to any device (it is
        placed right where it's created).

        **control_metric** (`str`; default is `accuracy`): the metric that
        controls training. Any metric supported by the `junky.train()` method
        may be used. At the moment, these are: 'accuracy', 'f1', 'loss',
        'precision', and 'recall'.

        **max_epochs** (`int`; default is `None`): the maximum number of
        epochs for the model's head training (stage types `1` and `2`). If
        `None` (default), training continues until **bad_epochs** is reached,
        but for no fewer than **min_epochs** epochs.

        **min_epochs** (`int`; default is `0`): the minimum number of epochs
        for the model's head training (stage types `1` and `2`).

        **bad_epochs** (`int`; default is `5`): the maximum allowed number of
        bad epochs (epochs during which the chosen **control_metric** does not
        improve) in a row for the model's head training (stage types `1` and
        `2`).

        **batch_size** (`int`; default is `32`): the number of sentences per
        batch for the model's head training (stage types `1` and `2`).

        **max_grad_norm** (`float`; default is `None`): the gradient clipping
        parameter for the model's head training (stage types `1` and `2`).

        **tags_to_remove** (`dict({str: str}) | dict({str: list([str])})`;
        default is `None`): tags such that tokens carrying them must be
        removed from the corpus. It is a `dict` with field names as keys and
        the values you want to remove. Applied only to fields with atomic
        values (like UPOS). This argument may be used, for example, to remove
        some infrequent or simply excess tags from the corpus. Note that we
        remove such tokens from the train corpus completely, rather than just
        replacing those tags with `None`.

        *Word embedding params*:

        **word_emb_type** (`str`; default is `'bert'`): one of (`'bert'` |
        `'glove'` | `'ft'` | `'w2v'`) embedding types.

        **word_emb_path** (`str`): the path to the word embeddings storage.

        **word_transform_kwargs** (`dict`; default is `None`): keyword
        arguments for the `.transform()` method of the dataset created to
        convert sentences to word embeddings. See the `.transform()` method of
        either `junky.datasets.BertDataset` (if **word_emb_type** is `'bert'`)
        or `junky.datasets.WordDataset` (otherwise) if you want to learn the
        allowed values for the parameter. If `None`, the `.transform()` method
        uses its defaults.

        *Training stages params*:

        **stage1_params** (`dict`; default is `None`): keyword arguments for
        the `BaseModel.adjust_model_for_train()` method. If `None`, the
        defaults are used. Also, you can specify here new values for the
        arguments **max_epochs**, **min_epochs**, **bad_epochs**,
        **batch_size**, and **max_grad_norm** that will be used only on stages
        of type `1`.

        **stage2_params** (`dict`; default is `None`): keyword arguments for
        the `BaseModel.adjust_model_for_tune()` method. If `None`, the
        defaults are used. Also, you can specify here new values for the
        arguments **max_epochs**, **min_epochs**, **bad_epochs**,
        **batch_size**, and **max_grad_norm** that will be used only on stages
        of type `2`.

        **stage3_params** (`dict`; default is `None`): keyword arguments for
        the `WordEmbeddings.full_tune()` method. If `None`, the defaults are
        used.

        **stages** (`list([int])`; default is `[1, 2, 3, 1, 2]`): what stages
        we should use during training and in which order. On a stage of type
        `1` the model head is trained with the *Adam* optimizer; stage type
        `2` is similar, but the optimizer is *SGD*; stage type `3` is only
        relevant when **word_emb_type** is `'bert'` and we want to tune the
        whole model. Stage type `0` defines a skip-stage, i.e. no real
        training happens on it. It is used when you need reproducibility and
        want to continue training the model from some particular stage. In
        this case, you specify the name of the model saved on that stage in
        the **load_from** parameter and put zeros into the **stages** list in
        the places of the stages already finished. Once more: it is used for
        reproducibility only, i.e. when you put some particular value into the
        **seed** param and want the data order in batches to match the data
        order on those stages in past trainings.

        **save_stages** (`bool`; default is `False`): whether to keep the best
        model of each stage alongside the overall best model. The names of
        these models will have the suffix `_<idx>(stage<stage_type>)` where
        `<idx>` is the ordinal number of the stage. We can then use them to
        continue training from any particular stage (changing the next stages
        or their parameters) via the **load_from** parameter. Note that we
        save only stages of the head model. The embedding model, as part of
        the full model, is usually tuned only once, so we don't make a copy of
        it.

        **load_from** (`str`; default is `None`): if you want to continue
        training from one of the previously saved stages, you can specify the
        name of the model from that stage. Note that if your model was already
        trained on a stage of type `3`, then you should set the
        **word_emb_path** param to `None`. Otherwise, you'll load the wrong
        embedding model. Any other params of the model may be overwritten (and
        most likely, this would cause an error), but they are equivalent when
        the training just starts and when it continues. However,
        **word_emb_path** differs once you have already passed a stage of type
        `3`, so don't forget to set it to `None` in that case. Example: you
        want to repeat training on stage no. `5`, so you specify something
        like `'model_4(stage1)'` in the **load_from** param, set
        **word_emb_path** to `None`, and set **stages** to `[0, 0, 0, 0, 2]`
        (or, if you don't care about reproducibility, you could just specify
        `[2]` here).

        *Other options*:

        **learn_on_padding** (`bool`; default is `True`): while training, we
        can calculate loss either taking into account predictions made for
        padding tokens or ignoring them. The common practice is not to use
        padding when calculating loss. However, we note that using padding
        sometimes makes the resulting model perform slightly better.

        **remove_padding_intent** (`bool`; default is `False`): if you set the
        **learn_on_padding** param to `False`, you may want not to use a
        padding intent during training at all, i.e. padding tokens will be
        tagged with some of the real tags and simply ignored when computing
        loss. As a result, the output dimensionality of the model's final
        layer is reduced by one. At first sight, such an approach could
        increase performance, but in our experiments this effect did not
        always appear.

        **seed** (`int`; default is `None`): the initial value for the random
        number generator if you need reproducibility. Note that each stage
        will have its own seed value, and the **seed** param is used to
        calculate these values.

        **start_time** (`float`; default is `None`): the result of
        `time.time()` to start with. If `None`, it will be initialized anew.

        **keep_embs** (`bool`; default is `False`): by default, after creating
        `Dataset` objects, we remove the word embedding models to free memory.
        With `keep_embs=True` this operation is omitted, and you can use the
        `.embs` attribute to share embedding models with other objects.

        **log_file** (`file`; default is `sys.stdout`): the stream for info
        messages.

        *The model hyperparameters*:

        **rnn_emb_dim** (`int`; default is `None`): the internal character RNN
        (LSTM) embedding dimensionality. If `None`, the layer is skipped.

        **cnn_emb_dim** (`int`; default is `None`): the internal character CNN
        embedding dimensionality. If `None`, the layer is skipped.

        **cnn_kernels** (`list([int])`; default is `[1, 2, 3, 4, 5, 6]`): CNN
        kernel sizes of the internal CNN embedding layer. Relevant if
        **cnn_emb_dim** is not `None`.

        **emb_bn** (`bool`; default is `True`): whether a batch normalization
        layer should be applied after the embedding concatenation.

        **emb_do** (`float`; default is `.2`): the dropout rate after the
        embedding concatenation.

        **final_emb_dim** (`int`; default is `512`): the output dimensionality
        of the linear transformation applied to the concatenated embeddings.

        **pre_bn** (`bool`; default is `True`): whether a batch normalization
        layer should be applied before the main part of the algorithm.

        **pre_do** (`float`; default is `.5`): the dropout rate before the
        main part of the algorithm.

        **lstm_layers** (`int`; default is `1`): the number of Bidirectional
        LSTM layers. If `None`, they are not created.

        **lstm_do** (`float`; default is `0`): the dropout between LSTM
        layers. Only relevant if `lstm_layers` > `1`.

        **tran_layers** (`int`; default is `0`): the number of Transformer
        Encoder layers. If `0` or `None`, they are not created.

        **tran_heads** (`int`; default is `8`): the number of attention heads
        of the Transformer Encoder layers. Only relevant if `tran_layers` >
        `0`.

        **post_bn** (`bool`; default is `True`): whether a batch normalization
        layer should be applied after the main part of the algorithm.

        **post_do** (`float`; default is `.4`): the dropout rate after the
        main part of the algorithm.

        The method returns the train statistics.
        """
        if not start_time:
            start_time = time.time()
        args, kwargs = get_func_params(UposTagger.train, locals())

        self._cdict = CorpusDict(
            corpus=(sent for part in [self._train_corpus,
                                      self._test_corpus or []]
                         for sent in part),
            format='conllu_parsed', log_file=log_file
        )

        return super().train(self._field, None, UposTaggerModel, None,
                             *args, **kwargs)
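
A hedged usage sketch for the tagger above (it assumes `UposTagger` inherits the corpus-loading helpers shown in Example #5; the file names, the device, and the hyperparameter values are placeholders, not recommendations):

    tagger = UposTagger()
    tagger.load_train_corpus('train.conllu')   # placeholder corpus files
    tagger.load_test_corpus('dev.conllu')
    stat = tagger.train(
        'upos_model',                  # save_as: name for the model's head
        device='cuda:0',
        word_emb_type='bert',
        word_emb_path='xlm-roberta-base',
        stages=[1, 2, 3, 1, 2],
        save_stages=True,              # keep the best model of each stage
        seed=42,
    )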
Example #4
    def __init__(self):
        self._cdict = CorpusDict(log_file=LOG_FILE)

        self._train_corpus = []
        self._test_corpus = []
Example #5
class BaseParser:
    """Base class for all parsers of the project"""
    def __init__(self):
        self._cdict = CorpusDict(log_file=LOG_FILE)

        self._train_corpus = []
        self._test_corpus = []

    def backup(self):
        """Get current state"""
        return {'cdict_backup': self._cdict.backup()}

    def restore(self, o):
        """Restore current state from backup object"""
        cdict_backup = o.get('cdict_backup')
        if cdict_backup:
            self._cdict.restore(cdict_backup)

    def save(self, file_path):
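        """Save the current state to *file_path* with pickle."""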
        print('Saving model...', end=' ', file=LOG_FILE)
        LOG_FILE.flush()
        with open(file_path, 'wb') as f:
            pickle.dump(self.backup(), f, 2)
            print('done.', file=LOG_FILE)

    def load(self, file_path):
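        """Load a state saved by ``save()`` and restore it."""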
        print('Loading model...', end=' ', file=LOG_FILE)
        LOG_FILE.flush()
        with open(file_path, 'rb') as f:
            o = pickle.load(f)
            print('done.', file=LOG_FILE)
            self.restore(o)

    def _save_cdict(self, file_path):
        self._cdict.backup_to(file_path)

    def _load_cdict(self, file_path, log_file=LOG_FILE):
        self._cdict.restore_from(file_path, log_file=log_file)

    @staticmethod
    def load_conllu(*args, **kwargs):
        """Wrapper for ``Conllu.load()``"""
        silent = kwargs.pop('silent', None)
        if silent:
            kwargs['log_file'] = None
        elif 'log_file' not in kwargs:
            kwargs['log_file'] = LOG_FILE
        return Conllu.load(*args, **kwargs)

    @staticmethod
    def save_conllu(*args, **kwargs):
        """Wrapper for ``Conllu.save()``"""
        silent = kwargs.pop('silent', None)
        if silent:
            kwargs['log_file'] = None
        elif 'log_file' not in kwargs:
            kwargs['log_file'] = LOG_FILE
        return Conllu.save(*args, **kwargs)

    @staticmethod
    def split_corpus(corpus,
                     split=[.8, .1, .1],
                     save_split_to=None,
                     seed=None,
                     silent=False):
        """Split a *corpus* in the given proportion.

        :param corpus: a name of file in CoNLL-U format or list/iterator of
                       sentences in Parsed CoNLL-U
        :param split: list of sizes of the necessary *corpus* parts. If values
                      are of int type, they are interpreted as lengths of new
                      corpora in sentences; if values are float, they are
                      proportions of a given *corpus*. The types of the
                      *split* values can't be mixed: they are either all int,
                      or all float. The sum of float values must be less or
                      equals to 1; the sum of int values can't be greater than
                      the lentgh of the *corpus*
        :param save_split_to: list of file names to save the result of the
                              *corpus* splitting. Can be `None` (default;
                              don't save parts to files) or its length must be
                              equal to the length of *split*
        :param silent: if True, suppress output
        :return: a list of new corpora
        """
        assert save_split_to is None or len(save_split_to) == len(split), \
               'ERROR: lengths of split and save_split_to must be equal'
        isfloat = any(isinstance(x, float) for x in split)
        if isfloat:
            assert sum(split) <= 1, \
                   "ERROR: sum of split can't be greater than 1"
        corpus = list(
            Conllu.load(corpus, log_file=None if silent else LOG_FILE))
        corpus_len = len(corpus)
        if isfloat:
            split = list(map(lambda x: round(corpus_len * x), split))
            diff = corpus_len - sum(split)
            if abs(diff) == 1:
                split[-1] += diff
        assert sum(split) <= corpus_len, \
               "ERROR: sum of split can't be greater than the corpus length"
        random.seed(seed)
        random.shuffle(corpus)
        res = []
        pos_b = 0
        for i, sp in enumerate(split):
            pos_e = pos_b + sp
            corpus_ = corpus[pos_b:pos_e]
            pos_b = pos_e
            if save_split_to:
                Conllu.save(corpus_, save_split_to[i])
            res.append(corpus_)
        return res

    @classmethod
    def _get_corpus(cls, corpus, asis=False, silent=False):
        if isinstance(corpus, str):
            corpus = cls.load_conllu(corpus, silent=silent)
        return (s[0] if not asis and isinstance(s, tuple) else s
                for s in corpus)

    def _predict_sents(self, sentences, predict_method, save_to=None):
        silent_save = False
        if sentences is None:
            sentences = deepcopy(self._test_corpus)
            silent_save = True
        elif isinstance(sentences, type) and issubclass(
                sentences, _AbstractCorpus):
            sentences = sentences.test()
        assert sentences, 'ERROR: Sentences must not be empty'
        sentences = self._get_corpus(sentences, asis=True)
        sentences = predict_method(sentences)
        if save_to:
            self.save_conllu(sentences, save_to, silent=silent_save)
            sentences = self._get_corpus(save_to, asis=True)
        return sentences

    def parse_train_corpus(self, cnt_thresh=None, ambiguity_thresh=None):
        """Create a CorpusDict for train corpus(es) loaded. For one instance it
        may be used only once. Use ``load_train_corpus()`` with append=True
        to append one more corpus to the CorpusDict after it's created.

        :type cnt_thresh: int
        :type ambiguity_thresh: float
        Params for ``CorpusDict.fit()`` method. If omitted then default values
        will be used
        """
        assert self._train_corpus, 'ERROR: Train corpus is not loaded yet'

        print('Train:', end=' ', file=LOG_FILE)
        kwargs = {}
        if cnt_thresh is not None:
            kwargs['cnt_thresh'] = cnt_thresh
        if ambiguity_thresh is not None:
            kwargs['ambiguity_thresh'] = ambiguity_thresh
        self._cdict = CorpusDict(corpus=self._train_corpus,
                                 format='conllu_parsed',
                                 **kwargs,
                                 log_file=LOG_FILE)

    def load_train_corpus(self,
                          corpus,
                          append=False,
                          parse=False,
                          test=None,
                          seed=None):
        """Load train corpus and (possibly) create a CorpusDict for it

        :param corpus: a name of file in CoNLL-U format or list/iterator of
                       sentences in Parsed CoNLL-U
        :param append: add corpus to already loaded one(s)
        :param parse: extract corpus statistics to CorpusDict right after
                      loading
        :param test: if not None, then train corpus will be shuffled and
                     specified part of it stored as test corpus
        :type test: float
        :param seed: init value for the random number generator. Used only 
                     if test is not None
        :type seed: int
        """
        assert append or not self._train_corpus, \
               'ERROR: Train corpus is already loaded. Use append=True to ' \
               'append one more corpus'

        print('Train:', end=' ', file=LOG_FILE)
        if (isinstance(corpus, type) and issubclass(corpus, _AbstractCorpus)) \
        or isinstance(corpus, _AbstractCorpus):
            corpus = corpus.train()
        corpus = self._get_corpus(corpus)
        if test is not None:
            corpus = list(corpus)
            assert 0 <= test <= 1, \
                   'ERROR: The value of the "test" parameter must be ' \
                   'between 0 and 1'
            test_len = round(len(corpus) * test)
            if test_len == 0:
                print('WARNING: The whole corpus given will be saved as the '
                      'train corpus (the "test" parameter is too small)',
                      file=LOG_FILE)
            else:
                if test_len == len(corpus):
                    print('WARNING: The whole corpus given will be saved as '
                          'the test corpus (the "test" parameter is too '
                          'large)', file=LOG_FILE)
                    self.load_test_corpus(corpus, append=append)
                    corpus = None
                else:
                    random.seed(seed)
                    random.shuffle(corpus)
                    self.load_test_corpus(corpus[:test_len], append=append)
                    corpus = corpus[test_len:]
                print('stored.', file=LOG_FILE)
        if corpus:
            if not append:
                self._train_corpus = []
            self._train_corpus.extend(corpus)
            if parse:
                self._cdict = CorpusDict(corpus=self._train_corpus,
                                         format='conllu_parsed',
                                         log_file=LOG_FILE)

    def load_test_corpus(self, corpus, append=False):
        """Load development test corpus to validate on during training
        iterations.

        :param corpus: a name of file in CoNLL-U format or list/iterator of
                       sentences in Parsed CoNLL-U
        :param append: add corpus to already loaded one(s)
        """
        assert append or not self._test_corpus, \
               'ERROR: Test corpus is already loaded. Use append=True to ' \
               'append one more corpus'

        print('Test:', end=' ', file=LOG_FILE)
        if (isinstance(corpus, type) and issubclass(corpus, _AbstractCorpus)) \
        or isinstance(corpus, _AbstractCorpus):
            try:
                corpus = corpus.dev()
            except ValueError:
                try:
                    corpus = corpus.test()
                except ValueError:
                    raise ValueError(
                        ('ERROR: {} does not have a test part. '
                         'Use the "test" parameter of '
                         'load_train_corpus() instead').format(corpus.name))
        corpus = self._get_corpus(corpus)
        if not append:
            self._test_corpus = []
        self._test_corpus.extend(corpus)

    def remove_rare_feats(self,
                          abs_thresh=None,
                          rel_thresh=None,
                          full_rel_thresh=None):
        """Remove feats from train and test corpora, occurence of which
        in the train corpus is less then a threshold.

        :param abs_thresh: remove features if their count in the train corpus
                           is less than this value
        :type abs_thresh: int
        :param rel_thresh: remove features if their frequency with respect to
                           total feats count of the train corpus is less than
                           this value
        :type rel_thresh: float
        :param full_rel_thresh: remove features if their frequency with respect
                                to the full count of the tokens of the train
                                corpus is less than this value
        :type full_rell_thresh: float

        *rell_thresh* and *full_rell_thresh* must be between 0 and 1"""
        corpus_len = len(self._train_corpus)
        progress_step = max(int(corpus_len / 60), 1000)
        # max(..., 1) guards against a modulo-by-zero check for tiny corpora
        progress_check_step = max(min(int(corpus_len / 100), 1000), 1)

        print('Search rare feats...', file=LOG_FILE)
        feat_freq = {}
        token_cnt = 0
        sent_no = -1
        for sent_no, sentence in enumerate(self._train_corpus):
            if not sent_no % progress_check_step:
                print_progress(sent_no,
                               end_value=corpus_len,
                               step=progress_step)
            for token in sentence:
                token_cnt += 1
                feats = token['FEATS']
                for feat, _ in feats.items():
                    feat_freq[feat] = feat_freq.setdefault(feat, 0) + 1
        print_progress(sent_no + 1,
                       end_value=corpus_len,
                       step=progress_step,
                       file=LOG_FILE)
        total_cnt = sum(feat_freq.values())
        feats_to_remove = sorted(
            feat for feat, cnt in feat_freq.items()
            if (abs_thresh and cnt < abs_thresh) or (
                rel_thresh and cnt / total_cnt < rel_thresh) or (
                    full_rel_thresh and cnt / token_cnt < full_rel_thresh))

        if not feats_to_remove:
            print('Finished: Nothing to do.', file=LOG_FILE)
        else:
            print('Rare feats:', feats_to_remove, file=LOG_FILE)
            train_removes = test_removes = 0
            for i, corpus in enumerate([self._train_corpus,
                                        self._test_corpus]):
                corpus_len = len(corpus)
                progress_step = max(int(corpus_len / 60), 1000)
                progress_check_step = max(min(int(corpus_len / 100), 1000), 1)

                sent_no = -1
                for sent_no, sentence in enumerate(corpus):
                    if not sent_no % progress_check_step:
                        print_progress(sent_no,
                                       end_value=corpus_len,
                                       step=progress_step)
                    for token in sentence:
                        feats = token['FEATS']
                        for feat in feats_to_remove:
                            val = feats.pop(feat, None)
                            if val:
                                if i:
                                    test_removes += 1
                                else:
                                    train_removes += 1
                print_progress(sent_no + 1,
                               end_value=corpus_len,
                               step=progress_step,
                               file=LOG_FILE)
            print(('Finished: {} feats removed from the train corpus, '
                   '{} from the test corpus').format(train_removes,
                                                     test_removes),
                  file=LOG_FILE)

    def _train_init(self, epochs, seed, allow_empty_cdict=False):
        cdict = self._cdict
        assert self._train_corpus, 'ERROR: Train corpus is not loaded'
        if not allow_empty_cdict:
            assert not cdict.isempty(), \
                   'ERROR: Train corpus is not yet prepared'
        epochs, epochs_ = epochs if isinstance(epochs, tuple) else \
                          (epochs, 0) if epochs >= 0 else \
                          (1, epochs)
        if epochs == 0:
            epochs = 1
        if epochs_ > 0:
            epochs = -epochs_
        assert epochs_ != 0 or self._test_corpus, \
               'ERROR: epochs < 0 may be used only with a test corpus'
        corpus_len = len(self._train_corpus)
        progress_step = max(int(corpus_len / 60), 1000)
        progress_check_step = max(min(int(corpus_len / 100), 1000), 1)
        random.seed(seed)
        return cdict, corpus_len, progress_step, progress_check_step, \
                                                               epochs, epochs_

    def _train_eval(self, model, epoch, epochs, epochs_, best_epoch,
                    best_score, best_weights, eqs, bads, prev_score, td, fd,
                    td2, fd2, tp, fp, c, n, no_train_evals, f_evaluate,
                    f_evaluate_args):
        if td is not None:
            print('dict   : correct: {}, wrong: {}, accuracy: {}'.format(
                td, fd,
                round(td / (td + fd), 4) if td + fd > 0 else 1.),
                  file=LOG_FILE)
        if td2 is not None:
            print('dict_s : correct: {}, wrong: {}, accuracy: {}'.format(
                td2, fd2,
                round(td2 / (td2 + fd2), 4) if td2 + fd2 > 0 else 1.),
                  file=LOG_FILE)
        if tp is not None:
            print('predict: correct: {}, wrong: {}, accuracy: {}'.format(
                tp, fp,
                round(tp / (tp + fp), 4) if tp + fp > 0 else 1.),
                  file=LOG_FILE)
        sp1 = ' ' * (len(str(c)) + len(str(n)))
        sp2 = ' ' * (8 + len(str(tp)) + len(str(fp)) - len(sp1))
        print('accuracy: train during train:' + sp2 +
              '{}/{}={}'.format(c, n,
                                round(c / n, 4) if n > 0 else 1.),
              file=LOG_FILE)
        if not no_train_evals:
            print(' ' * 10 + 'train after train :  ' + sp1 + sp2 + str(
                round(
                    f_evaluate(gold=self._train_corpus,
                               silent=True,
                               **f_evaluate_args), 4)),
                  file=LOG_FILE)
        if self._test_corpus:
            weights = deepcopy(model.weights)
            model.average_weights()
            score = f_evaluate(silent=True, **f_evaluate_args)
            iseq = isclose(score, best_score)
            if score > best_score:
                bads = 0
            elif score <= prev_score:
                bads += 1
            print('Effective test accuracy: {} {}{}'.format(
                score, '==' if iseq else
                '>>' if score > best_score else '<< >' if score > prev_score
                else '<< =' if score == prev_score else '<< <',
                ' ({})'.format(bads) if bads else ''),
                  file=LOG_FILE)
            if iseq:
                eqs += 1
            else:
                eqs = 0
                if score > best_score:
                    best_epoch, best_score, best_weights = \
                        epoch, score, model.weights
            if epochs_ and epoch + 1 == epochs:
                epochs = epochs_
            if epoch + 1 == epochs or (epochs < 0 and bads >= -epochs
                                       ):  #best_epoch - epoch <= epochs):
                if eqs != epoch - best_epoch:
                    print('Search finished. Return to Epoch',
                          best_epoch,
                          file=LOG_FILE)
                    model.weights = best_weights
                else:
                    print('Search finished', file=LOG_FILE)
                eqs = -1
            else:
                model.weights = weights
        else:
            score = prev_score
        return epoch + 1, epochs, best_epoch, best_score, best_weights, \
               eqs, bads, score

    def _train_done(self, header, model, eqs, no_train_evals, f_evaluate,
                    f_evaluate_args):
        if eqs >= 0:
            model.average_weights()
        res = None
        if not no_train_evals or self._test_corpus:
            print('Final {} {}{}{} accuracy:'.format(
                header, 'train' if not no_train_evals else '',
                '/' if not no_train_evals and self._test_corpus else '',
                'test' if self._test_corpus else ''),
                  end=' ',
                  file=LOG_FILE)
            if not no_train_evals:
                print(round(
                    f_evaluate(gold=self._train_corpus,
                               silent=True,
                               **f_evaluate_args), 4),
                      end='',
                      file=LOG_FILE)
            if not no_train_evals and self._test_corpus:
                print(' / ', end='', file=LOG_FILE)
            if self._test_corpus:
                res = f_evaluate(silent=True, **f_evaluate_args)
                print(round(res, 4), end='', file=LOG_FILE)
            print(file=LOG_FILE)
        return res
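
A small end-to-end sketch of the helpers above: splitting a corpus, loading the parts, and pruning rare FEATS (all file names are placeholders and the threshold is illustrative):

    # Split a corpus 80/10/10, shuffling reproducibly, and write the parts
    # to disk.
    train_part, dev_part, test_part = BaseParser.split_corpus(
        'corpus.conllu',
        split=[.8, .1, .1],
        save_split_to=['train.conllu', 'dev.conllu', 'test.conllu'],
        seed=42,
    )

    parser = BaseParser()
    parser.load_train_corpus(train_part, parse=True)
    parser.load_test_corpus(dev_part)
    # Remove FEATS keys seen fewer than 3 times in the train corpus.
    parser.remove_rare_feats(abs_thresh=3)
    parser.save('parser.pickle')               # pickles the CorpusDict backup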