コード例 #1
0
ファイル: index_base.py プロジェクト: ylwb/pyredise
 def identify_language(self, text):
     """Detect the language of ``text`` and configure stemming for it.

     The detected language name is stored in ``self.lang``. ``self.stem``
     is set to the stemmer callable for that language and
     ``self.legal_token`` is re-bound so that the language's stop words
     are passed as its ``exclude_list`` keyword argument.

     Greek uses the project's own stemmer and stop words, Turkish uses
     ``snowballstemmer`` with the project's stop words, and every other
     language is handed to NLTK's ``SnowballStemmer`` and stop-word corpus.

     The method may be called repeatedly on the same instance: the
     stop-word binding is replaced rather than stacked on top of the
     previous one.
     """
     # The first item of langid's result is translated through lang_mapping
     # into the language name used by the stemmers below.
     self.lang = lang_mapping[langid.classify(text)[0]]
     if self.debug:
         # Single-argument call form prints the same text on Python 2 and 3.
         print("LANG %s" % (self.lang,))

     if self.lang == "greek":
         from stemmers.greek import stem, stopwords as greek_stopwords
         self.stem = stem
         exclude = greek_stopwords
     elif self.lang == "turkish":
         # Unfortunately, a Turkish stemmer isn't included in nltk.
         import snowballstemmer
         from stemmers.turkish import stopwords as turkish_stopwords
         self.stem = snowballstemmer.stemmer("turkish").stemWord
         exclude = turkish_stopwords
     else:
         from nltk.stem import SnowballStemmer
         from nltk.corpus import stopwords as nltk_stopwords
         self.stem = SnowballStemmer(self.lang).stem
         exclude = nltk_stopwords.words(self.lang)

     token_filter = self.legal_token
     if isinstance(token_filter, partial):
         # Already wrapped (e.g. by an earlier call): rebuild from the
         # underlying callable, keeping any other bound arguments, so that
         # repeated calls do not nest partial inside partial.
         bound_keywords = dict(token_filter.keywords or {})
         bound_keywords["exclude_list"] = exclude
         self.legal_token = partial(
             token_filter.func, *token_filter.args, **bound_keywords
         )
     else:
         self.legal_token = partial(token_filter, exclude_list=exclude)
コード例 #2
0
ファイル: index_base.py プロジェクト: nathanwadhwani/pyredise
    def identify_language(self, text):
        """Detect the language of ``text`` and configure stemming for it.

        The detected language name is stored in ``self.lang``. ``self.stem``
        is set to the stemmer callable for that language and
        ``self.legal_token`` is re-bound so that the language's stop words
        are passed as its ``exclude_list`` keyword argument.

        Greek uses the project's own stemmer and stop words; every other
        language is handed to NLTK's ``SnowballStemmer`` and stop-word
        corpus.

        The method may be called repeatedly on the same instance: the
        stop-word binding is replaced rather than stacked on top of the
        previous one.
        """
        # We need different language detection when indexing vs. querying
        # (for speed); the detector is chosen by the concrete class name.
        if self.__class__.__name__ == "QueryHandler":
            from sensitive_language_detection import check_lang
        else:
            from quick_language_detection import check_lang

        self.lang = check_lang(text)

        if self.lang == "greek":
            from stemmers.greek import stem, stopwords as greek_stopwords

            self.stem = stem
            exclude = greek_stopwords
        else:
            from nltk.stem import SnowballStemmer
            from nltk.corpus import stopwords as nltk_stopwords

            self.stem = SnowballStemmer(self.lang).stem
            exclude = nltk_stopwords.words(self.lang)

        token_filter = self.legal_token
        if isinstance(token_filter, partial):
            # Already wrapped (e.g. by an earlier call): rebuild from the
            # underlying callable, keeping any other bound arguments, so
            # that repeated calls do not nest partial inside partial.
            bound_keywords = dict(token_filter.keywords or {})
            bound_keywords["exclude_list"] = exclude
            self.legal_token = partial(
                token_filter.func, *token_filter.args, **bound_keywords
            )
        else:
            self.legal_token = partial(token_filter, exclude_list=exclude)

        if self.debug:
            # Single-argument call form prints the same text on Python 2 and 3.
            print("LANG %s" % (self.lang,))