def identify_language(self, text):
    """Detect the language of *text* and configure stemming for it.

    Sets three attributes on ``self``:

    * ``self.lang`` -- ``lang_mapping[langid.classify(text)[0]]``.
    * ``self.stem`` -- a callable that stems a single word.
    * ``self.legal_token`` -- the current ``self.legal_token`` wrapped in a
      ``partial`` that supplies the language's stopwords as ``exclude_list``.

    Greek uses the project's own stemmer and stopword list, Turkish uses the
    ``snowballstemmer`` package with the project's stopword list, and every
    other language goes through NLTK.

    A detected code that is missing from ``lang_mapping`` is not handled
    here; the lookup error propagates to the caller.
    """
    self.lang = lang_mapping[langid.classify(text)[0]]

    if self.debug:
        # Single pre-formatted argument: prints "LANG <lang>" identically on
        # Python 2 and Python 3 (the old print statement was Python-2-only).
        print("LANG %s" % (self.lang,))

    # The stemmer modules are imported lazily so that only the backend for
    # the detected language gets loaded.
    if self.lang == "greek":
        from stemmers.greek import stem, stopwords
        stem_word = stem
        excluded = stopwords
    elif self.lang == "turkish":
        # unfortunately, the Turkish stemmer isn't included in nltk
        import snowballstemmer
        from stemmers.turkish import stopwords
        stem_word = snowballstemmer.stemmer("turkish").stemWord
        excluded = stopwords
    else:
        from nltk.stem import SnowballStemmer
        from nltk.corpus import stopwords
        stem_word = SnowballStemmer(self.lang).stem
        excluded = stopwords.words(self.lang)

    # Assign only after both pieces were obtained, so a failure above does
    # not leave a new stemmer paired with the previous stopword filter.
    self.stem = stem_word
    # NOTE(review): every call wraps the current self.legal_token again; the
    # newest exclude_list wins because later partial keywords override
    # earlier ones -- confirm repeated calls on one instance are intended.
    self.legal_token = partial(self.legal_token, exclude_list=excluded)
def identify_language(self, text):
    """Detect the language of *text* and configure stemming for it.

    Sets three attributes on ``self``:

    * ``self.lang`` -- the result of ``check_lang(text)``.
    * ``self.stem`` -- a callable that stems a single word.
    * ``self.legal_token`` -- the current ``self.legal_token`` wrapped in a
      ``partial`` that supplies the language's stopwords as ``exclude_list``.

    ``check_lang`` comes from ``sensitive_language_detection`` when the
    instance's class is named ``QueryHandler`` and from
    ``quick_language_detection`` otherwise. Greek uses the project's own
    stemmer and stopword list; every other language goes through NLTK.
    """
    # we need different language detection on indexing vs querying (for speed)
    # NOTE(review): this compares the exact class name, so a subclass of
    # QueryHandler would get the quick detector -- confirm that is intended.
    if self.__class__.__name__ == "QueryHandler":
        from sensitive_language_detection import check_lang
    else:
        from quick_language_detection import check_lang

    self.lang = check_lang(text)

    # The stemmer modules are imported lazily so that only the backend for
    # the detected language gets loaded.
    if self.lang == "greek":
        from stemmers.greek import stem, stopwords
        stem_word = stem
        excluded = stopwords
    else:
        from nltk.stem import SnowballStemmer
        from nltk.corpus import stopwords
        stem_word = SnowballStemmer(self.lang).stem
        excluded = stopwords.words(self.lang)

    # Assign only after both pieces were obtained, so a failure above does
    # not leave a new stemmer paired with the previous stopword filter.
    self.stem = stem_word
    # NOTE(review): every call wraps the current self.legal_token again; the
    # newest exclude_list wins because later partial keywords override
    # earlier ones -- confirm repeated calls on one instance are intended.
    self.legal_token = partial(self.legal_token, exclude_list=excluded)

    if self.debug:
        # Single pre-formatted argument: prints "LANG <lang>" identically on
        # Python 2 and Python 3 (the old print statement was Python-2-only).
        print("LANG %s" % (self.lang,))