Example #1
0
 def test_latin_stemmer(self):
     """Check the Stemmer output for a lowercased Latin sentence."""
     source_text = 'Est interdum praestare mercaturis rem quaerere, nisi tam periculosum sit, et item foenerari, si tam honestum.'  # pylint: disable=line-too-long
     # The expected value keeps the trailing space the stemmer emits.
     expected = 'est interd praestar mercatur r quaerere, nisi tam periculos sit, et it foenerari, si tam honestum. '  # pylint: disable=line-too-long
     actual = Stemmer().stem(source_text.lower())
     self.assertEqual(actual, expected)
Example #2
0
 def test_latin_stemmer(self):
     """Check the Stemmer output for a lowercased Latin sentence."""
     source_text = 'Est interdum praestare mercaturis rem quaerere, nisi tam periculosum sit, et item foenerari, si tam honestum.'  # pylint: disable=line-too-long
     # The expected value keeps the trailing space the stemmer emits.
     expected = 'est interd praestar mercatur r quaerere, nisi tam periculos sit, et it foenerari, si tam honestum. '  # pylint: disable=line-too-long
     actual = Stemmer().stem(source_text.lower())
     self.assertEqual(actual, expected)
Example #3
0
    def compare_sentences(self, str_a, str_b, language):
        """Tokenize two input strings on sentence boundaries and return a
        matrix of Levenshtein distance ratios.

        :param str_a: str
        :param str_b: str
        :param language: str (language name; 'latin' or 'greek')
        :return: list [[Comparison]], or None when ``language`` is not
            supported (a message is printed in that case)
        """
        # Reject unsupported languages before doing any work. This keeps the
        # existing best-effort contract: print a hint and return None rather
        # than raising.
        if language not in ("latin", "greek"):
            print("Language for sentence tokenization not recognized. "
                  "Accepted values are 'latin' and 'greek'.")
            return None

        sent_tokenizer = TokenizeSentence(language)

        # If the class instance is set to stem words, stem the raw strings
        # before they are split into sentences.
        if self.stem_words:
            stemmer = Stemmer()
            str_a = stemmer.stem(str_a)
            str_b = stemmer.stem(str_b)

        # Tokenize the input strings, then process the sentences for
        # comparison (taking into account sanitization settings).
        sents_a = self._process_sentences(
            sent_tokenizer.tokenize_sentences(str_a))
        sents_b = self._process_sentences(
            sent_tokenizer.tokenize_sentences(str_b))

        # Build the matrix of edit distance ratios.
        return self._calculate_ratios(sents_a, sents_b)
Example #4
0
    def compare_sliding_window(self, str_a, str_b, window_length=50, curse_forward=20):
        """
        Compare two strings window by window, using windows derived from the
        window_length and curse_forward values.
        :param str_a: str
        :param str_b: str
        :param window_length: int
        :param curse_forward: int
        :return: list [[Comparison]]
        """

        # Optionally stem both inputs before they are cut into windows
        if self.stem_words:
            word_stemmer = Stemmer()
            str_a, str_b = word_stemmer.stem(str_a), word_stemmer.stem(str_b)

        windows_a = self._str_to_windows(str_a, window_length, curse_forward)
        windows_b = self._str_to_windows(str_b, window_length, curse_forward)

        # Compare the two lists of windows
        return self._calculate_ratios(windows_a, windows_b)
Example #5
0
    #endloop

    # One Stemmer instance is created here and reused for every word below.
    la_stemmer = Stemmer()
    # word_dict is indexed as word_dict[key][language] and yields a word
    # object carrying occurrences, language and lemma attributes.
    for key in word_dict:
        for language in word_dict[key]:
            word = word_dict[key][language]
            # Overall frequency: entries whose language key contains "transl"
            # are divided by the translated occurrence count, all others by
            # the untranslated one.
            if "transl" in language:
                word.frequency_total = \
                 len(word.occurrences) / translated_occurrence_count
            else:
                word.frequency_total = \
                 len(word.occurrences) / untranslated_occurrence_count
            # Per-language frequency, looked up via the word's own language
            # attribute rather than the dictionary key.
            word.frequency_language = \
             len(word.occurrences) / lang_count[word.language]
            # Only entries stored under the "la" key get a stem, computed
            # from the lemma.
            if language == "la":
                word.stem = la_stemmer.stem(word.lemma)

    # Sort according to the given arguments before writing to file
    # This section only translates each one-character criterion in args.sort
    # into a field name; the sorting itself is not part of this excerpt.
    sort_order = []
    # NOTE(review): `is not None` is the idiomatic form of this check.
    if args.sort != None:
        for e in args.sort:
            if e == 'l':
                sort_order.append("language")
            # 't' and 'a' both select the "text" field.
            elif e == 't' or e == 'a':
                sort_order.append("text")
            elif e == 'f':
                sort_order.append("file_name")
            elif e == "e":
                sort_order.append("edition_type")
            else:
                # Unknown criteria are reported and otherwise ignored.
                print("Invalid sort criterion: '" + e + "'")
Example #6
0
def stem(form):
    """Stem ``form`` with a fresh Stemmer and return the result JSON-encoded."""
    return json.dumps(Stemmer().stem(form))
Example #7
0
 def get(self, sentence):
     """Return the stemmed, lowercased sentence under the 'stemmed_output' key."""
     stemmed = Stemmer().stem(sentence.lower())
     return {'stemmed_output': stemmed}