def test_latin_stemmer(self):
    """Check the Latin stemmer output for one lowercased sample sentence."""
    source = 'Est interdum praestare mercaturis rem quaerere, nisi tam periculosum sit, et item foenerari, si tam honestum.'  # pylint: disable=line-too-long
    # The expected text ends with a trailing space; keep it exactly as is.
    expected = 'est interd praestar mercatur r quaerere, nisi tam periculos sit, et it foenerari, si tam honestum. '  # pylint: disable=line-too-long
    actual = Stemmer().stem(source.lower())
    self.assertEqual(actual, expected)
def compare_sentences(self, str_a, str_b, language):
    """Tokenize two input strings on sentence boundary and return a
    matrix of Levenshtein distance ratios.

    The strings are optionally stemmed (when ``self.stem_words`` is truthy),
    split into sentences with ``TokenizeSentence(language)``, passed through
    ``self._process_sentences`` and finally handed to
    ``self._calculate_ratios``, whose result is returned unchanged.

    :param str_a: str
    :param str_b: str
    :param language: str (language name, either 'latin' or 'greek')
    :return: list [[Comparison]], or None when ``language`` is not supported
        (a message is printed in that case)
    """
    # Validate the language first so unsupported values never reach the
    # tokenizer or the stemmer.
    if language not in ("latin", "greek"):
        print("Language for sentence tokenization not recognized. "
              "Accepted values are 'latin' and 'greek'.")
        return None

    sent_tokenizer = TokenizeSentence(language)

    # If class instance is set to stem words, do so
    if self.stem_words:
        # NOTE(review): the same Stemmer is applied whatever the language is;
        # confirm that this is intended for 'greek' input.
        stemmer = Stemmer()
        str_a = stemmer.stem(str_a)
        str_b = stemmer.stem(str_b)

    # Tokenize input strings
    sents_a = sent_tokenizer.tokenize_sentences(str_a)
    sents_b = sent_tokenizer.tokenize_sentences(str_b)

    # Process sentences for comparison (taking into account sanitization settings)
    sents_a = self._process_sentences(sents_a)
    sents_b = self._process_sentences(sents_b)

    # Build matrix of edit distance ratios
    return self._calculate_ratios(sents_a, sents_b)
def compare_sliding_window(self, str_a, str_b, window_length=50, curse_forward=20):
    """
    Compare two strings window by window.

    Each string is optionally stemmed (when ``self.stem_words`` is truthy),
    cut into windows by ``self._str_to_windows`` using ``window_length`` and
    ``curse_forward``, and the two window lists are then passed to
    ``self._calculate_ratios``.

    :param str_a: str
    :param str_b: str
    :param window_length: int
    :param curse_forward: int
    :return: list [[Comparison]]
    """
    texts = [str_a, str_b]

    if self.stem_words:
        stem = Stemmer().stem
        texts = [stem(text) for text in texts]

    # Window the first string, then the second, with identical settings.
    windows_a, windows_b = [
        self._str_to_windows(text, window_length, curse_forward)
        for text in texts
    ]

    return self._calculate_ratios(windows_a, windows_b)
def stemmify(self):
    """Build a new text object that contains stems instead of full words.

    Stemming is an alternative to lemmatization: rather than mapping a word
    to its lemma (principi -> princeps) it maps it to its stem
    (principi -> princp). The text in ``self.data`` is lowercased before it
    is stemmed, and ``self.options`` is passed on to the new object.

    Returns:
        :obj:`self.__class__` New text with stemma

    Example:
        >>> text = LatinText('Arma virumque cano, Troiae qui primus ab oris')
        >>> print(text.stemmify())
        arm vir cano, troi qui prim ab or
    """  # noqa
    # Imported here, as in the original, so cltk is only loaded on demand.
    from cltk.stem.latin.stem import Stemmer

    text_class = self.__class__
    stemmed_data = Stemmer().stem(self.data.lower())
    return text_class(stemmed_data, self.options)
word_dict[word.lemmatization.lower()][language]\ .occurrences.append(word) word_dict[word.lemmatization.lower()][language]\ .variations.add(word.text.lower()) word_dict[word.lemmatization.lower()][language]\ .files.add(word.file_name) word_dict[word.lemmatization.lower()][language]\ .language = word.language word_dict[word.lemmatization.lower()][language]\ .lemma = word.lemmatization word_dict[word.lemmatization.lower()][language]\ .regions.add(file_dict[word.file_name].region) check_suspicious(word_dict[word.lemmatization.lower()][word.language]) #endloop la_stemmer = Stemmer() for key in word_dict: for language in word_dict[key]: word = word_dict[key][language] if "transl" in language: word.frequency_total = \ len(word.occurrences) / translated_occurrence_count else: word.frequency_total = \ len(word.occurrences) / untranslated_occurrence_count word.frequency_language = \ len(word.occurrences) / lang_count[word.language] if language == "la": word.stem = la_stemmer.stem(word.lemma) # Sort according to the given arguments before writing to file
def stem(form):
    """Stem ``form`` with a fresh ``Stemmer`` and return the result as a JSON string."""
    return json.dumps(Stemmer().stem(form))
def get(self, sentence):
    """Return the stems of the lowercased ``sentence``.

    The result is a dict with the single key ``'stemmed_output'``.
    """
    stemmed = Stemmer().stem(sentence.lower())
    return {'stemmed_output': stemmed}