Ejemplo n.º 1
0
 def test_latin_stemmer(self):
     """Stemming a lowercased Latin sentence yields the expected stems."""
     source = 'Est interdum praestare mercaturis rem quaerere, nisi tam periculosum sit, et item foenerari, si tam honestum.'  # pylint: disable=line-too-long
     expected = 'est interd praestar mercatur r quaerere, nisi tam periculos sit, et it foenerari, si tam honestum. '  # pylint: disable=line-too-long
     self.assertEqual(Stemmer().stem(source.lower()), expected)
Ejemplo n.º 2
0
 def test_latin_stemmer(self):
     """Verify the Latin stemmer output for a known sentence."""
     original = 'Est interdum praestare mercaturis rem quaerere, nisi tam periculosum sit, et item foenerari, si tam honestum.'  # pylint: disable=line-too-long
     want = 'est interd praestar mercatur r quaerere, nisi tam periculos sit, et it foenerari, si tam honestum. '  # pylint: disable=line-too-long
     got = Stemmer().stem(original.lower())
     self.assertEqual(got, want)
Ejemplo n.º 3
0
    def compare_sentences(self, str_a, str_b, language):
        """Tokenize two input strings on sentence boundary and return a
        matrix of Levenshtein distance ratios.

        :param str_a: str -- first text to compare
        :param str_b: str -- second text to compare
        :param language: str -- language name; 'latin' or 'greek'
        :return: list [[Comparison]], or None when *language* is unsupported
        """
        # Both supported languages use the same tokenizer class with the
        # language name as its argument, so a single guard + single
        # construction replaces the duplicated if/elif branches.
        if language not in ('latin', 'greek'):
            print("Language for sentence tokenization not recognized. "
                  "Accepted values are 'latin' and 'greek'.")
            return
        sent_tokenizer = TokenizeSentence(language)

        # If class instance is set to stem words, do so before tokenizing
        if self.stem_words:
            stemmer = Stemmer()
            str_a = stemmer.stem(str_a)
            str_b = stemmer.stem(str_b)

        # Tokenize input strings into sentences
        sents_a = sent_tokenizer.tokenize_sentences(str_a)
        sents_b = sent_tokenizer.tokenize_sentences(str_b)

        # Process sentences for comparison (taking into account sanitization settings)
        sents_a = self._process_sentences(sents_a)
        sents_b = self._process_sentences(sents_b)

        # Build matrix of edit distance ratios
        return self._calculate_ratios(sents_a, sents_b)
Ejemplo n.º 4
0
    def compare_sliding_window(self, str_a, str_b, window_length=50, curse_forward=20):
        """Compare two strings via sliding windows and return a matrix of
        Levenshtein distance ratios.

        :param str_a: str -- first text to compare
        :param str_b: str -- second text to compare
        :param window_length: int -- size of each comparison window
        :param curse_forward: int -- step between consecutive windows
        :return: list [[Comparison]]
        """
        # Optionally reduce both inputs to stems first
        if self.stem_words:
            latin_stemmer = Stemmer()
            str_a = latin_stemmer.stem(str_a)
            str_b = latin_stemmer.stem(str_b)

        # Slice each text into windows, then compare every pair of windows
        windows_a = self._str_to_windows(str_a, window_length, curse_forward)
        windows_b = self._str_to_windows(str_b, window_length, curse_forward)
        return self._calculate_ratios(windows_a, windows_b)
Ejemplo n.º 5
0
    def stemmify(self):
        """Returns text with only stems.

        An alternate method to lemmatization: instead of converting to
        lemmata (principi -> princeps) it converts to stemma
        (principi -> princp).

        Returns:
            :obj:`self.__class__` New text with stemma

        Example:
            >>> text = LatinText('Arma virumque cano, Troiae qui primus ab oris')
            >>> print(text.stemmify())
            arm vir cano, troi qui prim ab or
        """ # noqa
        from cltk.stem.latin.stem import Stemmer
        stemmed = Stemmer().stem(self.data.lower())
        return self.__class__(stemmed, self.options)
Ejemplo n.º 6
0
            # Accumulate per-lemma stats: every write below targets the same
            # entry, keyed by the lowercased lemma and the language.
            word_dict[word.lemmatization.lower()][language]\
             .occurrences.append(word)
            word_dict[word.lemmatization.lower()][language]\
             .variations.add(word.text.lower())
            word_dict[word.lemmatization.lower()][language]\
             .files.add(word.file_name)
            word_dict[word.lemmatization.lower()][language]\
             .language = word.language
            word_dict[word.lemmatization.lower()][language]\
             .lemma = word.lemmatization
            word_dict[word.lemmatization.lower()][language]\
             .regions.add(file_dict[word.file_name].region)
        # Re-check the just-updated entry (presumably flags suspicious
        # entries -- see check_suspicious for the actual criteria)
        check_suspicious(word_dict[word.lemmatization.lower()][word.language])
    #endloop

    # Second pass: compute relative frequencies and, for Latin entries,
    # attach a stem produced by the CLTK Latin stemmer.
    la_stemmer = Stemmer()
    for key in word_dict:
        for language in word_dict[key]:
            word = word_dict[key][language]
            # Translated texts are normalized against the translated corpus
            # size; everything else against the untranslated corpus size.
            if "transl" in language:
                word.frequency_total = \
                 len(word.occurrences) / translated_occurrence_count
            else:
                word.frequency_total = \
                 len(word.occurrences) / untranslated_occurrence_count
            # Frequency relative to this word's own language only
            word.frequency_language = \
             len(word.occurrences) / lang_count[word.language]
            # Only Latin ("la") entries get a stem
            if language == "la":
                word.stem = la_stemmer.stem(word.lemma)

    # Sort according to the given arguments before writing to file
Ejemplo n.º 7
0
def stem(form):
    """Stem *form* with the Latin stemmer and return the result as JSON."""
    stemmed = Stemmer().stem(form)
    return json.dumps(stemmed)
Ejemplo n.º 8
0
 def get(self, sentence):
     """Handle GET: return the stemmed form of the lowercased sentence."""
     lowered = sentence.lower()
     return {'stemmed_output': Stemmer().stem(lowered)}