Ejemplo n.º 1
0
class Model(Template):
    def _model_definition(self) -> UnigramTagger:
        """Define and compile the model.

        Returns:
          A UnigramTagger seeded with a single punctuation example and a
          'NOUN' default backoff, so the tagger is usable before training.
        """
        t0 = DefaultTagger('NOUN')
        return UnigramTagger([[(".", "PUNCT")]], backoff=t0)

    def train(self,
              corpus: Corpus,
              evaluate: bool = True,
              config: dict = None) -> Union[None, Dict[str, Dict[str, float]]]:
        """Train method.

        Args:
          corpus: Corpus to train model.
          evaluate: Flag to return evaluation of the model.
          config: Training config dict (not used for this model).

        Returns:
          Model evaluation metrics when ``evaluate`` is True, otherwise None.
        """
        # Fix: the original discarded the result of _model_definition(),
        # so this branch had no effect. Assign it so self.model is always
        # initialized even if the construction below were to change.
        if self.model is None:
            self.model = self._model_definition()

        # NLTK's UnigramTagger is trained at construction time, so a fresh
        # instance is built from the training sentences on every call.
        self.model = UnigramTagger(corpus.train.sentences,
                                   backoff=DefaultTagger('NOUN'))

        if evaluate:
            return self.evaluate(corpus)
        return None

    def evaluate(
            self, corpus: Union[Corpus,
                                Dataset]) -> Dict[str, Dict[str, float]]:
        """Model metrics evaluation.

        Args:
          corpus: Corpus/Dataset to evaluate model.

        Returns:
          Model evaluation metrics. For a Corpus the result is keyed by
          split name ('train', plus 'dev'/'test' when present); for a
          Dataset the metrics dict is returned directly.
        """
        def _split_metrics(dataset: Dataset) -> Dict[str, float]:
            # Tag the split's tokens, keep only the predicted tag of each
            # (form, tag) tuple, and score against the gold tags.
            prediction_tags = [
                [token[1] for token in sentence]
                for sentence in self.predict(dataset.get_tokens())
            ]
            return model_performance(dataset.get_tags(), prediction_tags)

        if isinstance(corpus, Dataset):
            return _split_metrics(corpus)

        # Corpus: evaluate each available split with the shared helper
        # instead of repeating the predict-and-score logic per split.
        output = {"train": _split_metrics(corpus.train)}
        if corpus.dev:
            output['dev'] = _split_metrics(corpus.dev)
        if corpus.test:
            output['test'] = _split_metrics(corpus.test)
        return output

    def predict(
            self,
            sentences: List[List[str]]) -> Union[List[List[Tuple[str]]], None]:
        """Method to tag tokens from the list of sentences.

        Args:
          sentences: Sentences.

        Returns:
          List of lists with tuples of (form, tag), or None when the
          model has not been defined/trained yet.
        """
        if self.model is None:
            return None
        return self.model.tag_sents(sentences)

    def save(self, path: str) -> None:
        """Model saver method.

        Args:
          path: Path to save model into.

        Raises:
          IOError, pickle.PicklingError: Occurred on writing/pickling error.
        """
        save_obj_pkl(self.model, path)

    def load(self, path: str) -> UnigramTagger:
        """Model loader method.

        Args:
          path: Path to load model from.

        Returns:
          The deserialized UnigramTagger object.

        Raises:
          IOError, pickle.UnpicklingError: Occurred when loading/deserializing the obj.
        """
        return load_obj_pkl(path)
Ejemplo n.º 2
0
    # print unigram_affix_backoff.evaluate(test)
    # cutoffs = [x*0.1 for x in range(20)]
    # for c in cutoffs:
    # tagger = EntropyVotingTagger(taggers, c)
    # print "Accuracy of entropy voting = ", tagger.evaluate(test)


    affix_tagger = EntropyAffixTagger(train)
    unigram_tagger = EntropyUnigramTagger(train)
    taggers = [unigram_tagger, affix_tagger]
    tagger = EntropyVotingTagger(taggers, max_entropy=80)

    from nltk.tag import untag

    untagged_test = [untag(x) for x in dev]
    tagged_sents_uni_affix = unigram_affix_backoff.tag_sents(untagged_test)
    tagged_sents_entr = tagger.tag_sents(untagged_test)
    affix_mistake = 0
    unigram_mistake = 0
    overall_mistakes = 0
    print "len of dev: ", len(dev)
    for tagged_reference_sent, tagged_uni_affix_sent, tagged_entropy_sent in izip(dev, tagged_sents_uni_affix,
                                                                                  tagged_sents_entr):
        # import pdb;pdb.set_trace()
        for tagged_reference, tagged_uni_affix, tagged_entropy in izip(tagged_reference_sent, tagged_uni_affix_sent,
                                                                       tagged_entropy_sent):
            if tagged_uni_affix[1] != tagged_entropy[1]:
                overall_mistakes += 1

                print "WE GOT MATCH!"
                print "Word = ", tagged_reference[0]