def test_latin_stemmer(self):
    """Check the Latin stemmer against a known sentence from Cato."""
    # pylint: disable=line-too-long
    source = ('Est interdum praestare mercaturis rem quaerere, nisi tam '
              'periculosum sit, et item foenerari, si tam honestum.')
    expected = ('est interd praestar mercatur r quaerere, nisi tam '
                'periculos sit, et it foenerari, si tam honestum. ')
    result = Stemmer().stem(source.lower())
    self.assertEqual(result, expected)
def compare_sentences(self, str_a, str_b, language):
    """Tokenize two input strings on sentence boundary and return a matrix
    of Levenshtein distance ratios.

    :param str_a: str -- first text to compare
    :param str_b: str -- second text to compare
    :param language: str -- language name; accepted values are
        'latin' and 'greek'
    :return: list [[Comparison]] -- matrix of edit-distance ratios, or
        None when the language is unsupported
    """
    # Only Latin and Greek sentence tokenizers are available.
    if language in ("latin", "greek"):
        sent_tokenizer = TokenizeSentence(language)
    else:
        # Unsupported language: report and bail out (returns None),
        # matching the original best-effort behavior.
        print("Language for sentence tokenization not recognized. "
              "Accepted values are 'latin' and 'greek'.")
        return

    # If class instance is set to stem words, do so before tokenizing.
    if self.stem_words:
        stemmer = Stemmer()
        str_a = stemmer.stem(str_a)
        str_b = stemmer.stem(str_b)

    # Tokenize the inputs, then sanitize each sentence according to the
    # instance's processing settings.
    sents_a = self._process_sentences(sent_tokenizer.tokenize_sentences(str_a))
    sents_b = self._process_sentences(sent_tokenizer.tokenize_sentences(str_b))

    # Build and return the matrix of edit-distance ratios.
    return self._calculate_ratios(sents_a, sents_b)
def compare_sliding_window(self, str_a, str_b, window_length=50, curse_forward=20):
    """Compare two strings with a sliding-window method.

    :param str_a: str -- first text to compare
    :param str_b: str -- second text to compare
    :param window_length: int -- length of each comparison window
    :param curse_forward: int -- step between consecutive windows
    :return: list [[Comparison]] -- matrix of edit-distance ratios
    """
    # If class instance is set to stem words, do so before windowing.
    if self.stem_words:
        stemmer = Stemmer()
        str_a = stemmer.stem(str_a)
        str_b = stemmer.stem(str_b)

    # Split each input into overlapping windows.
    substrs_a = self._str_to_windows(str_a, window_length, curse_forward)
    substrs_b = self._str_to_windows(str_b, window_length, curse_forward)

    # Build the matrix of edit-distance ratios between windows.
    return self._calculate_ratios(substrs_a, substrs_b)
#endloop
la_stemmer = Stemmer()

# Compute per-word frequency statistics; Latin words also get a stem.
for key in word_dict:
    for language in word_dict[key]:
        word = word_dict[key][language]
        occurrences = len(word.occurrences)  # hoist the repeated length
        # Translated and untranslated editions use separate totals.
        if "transl" in language:
            word.frequency_total = occurrences / translated_occurrence_count
        else:
            word.frequency_total = occurrences / untranslated_occurrence_count
        word.frequency_language = occurrences / lang_count[word.language]
        if language == "la":
            word.stem = la_stemmer.stem(word.lemma)

# Sort according to the given arguments before writing to file.
# Map single-letter CLI flags to attribute names ('t' and 'a' are synonyms).
sort_order = []
if args.sort is not None:
    criteria = {
        'l': "language",
        't': "text",
        'a': "text",
        'f': "file_name",
        'e': "edition_type",
    }
    for e in args.sort:
        if e in criteria:
            sort_order.append(criteria[e])
        else:
            print("Invalid sort criterion: '" + e + "'")
def stem(form):
    """Stem *form* and return the result serialized as a JSON string."""
    return json.dumps(Stemmer().stem(form))
def get(self, sentence):
    """Stem the lower-cased *sentence* and wrap it for the API response."""
    stemmed = Stemmer().stem(sentence.lower())
    return {'stemmed_output': stemmed}