Code example #1
 def __get_keywords_from_text(text):
     """Return a ``{token: count}`` dict of keyword frequencies in *text*,
     ordered by descending count.

     The text is segmented with pyvi's ``ViTokenizer`` (Vietnamese word
     segmentation); tokens of length <= 1 are discarded.
     """
     segmented = ViTokenizer.tokenize(text)
     # spacy_tokenize returns (words, spaces); only the words are needed.
     words = ViTokenizer.spacy_tokenize(segmented)[0]
     words = [w for w in words if len(w) > 1]
     # most_common() yields (token, count) pairs sorted by descending count
     # (stable on first-seen order), so dict() preserves that ordering —
     # this replaces the original's redundant dict(dict(sorted(...))) dance.
     return dict(Counter(words).most_common())
Code example #2
 def make_doc(self, text):
     """Tokenize *text* and return a spaCy ``Doc``.

     When ``Defaults.use_pyvi`` is True, pyvi's ``ViTokenizer`` performs
     Vietnamese word segmentation. Otherwise, fall back to splitting each
     token from the base tokenizer into individual characters.

     Raises:
         ImportError: if pyvi is requested but not installed.
     """
     if self.Defaults.use_pyvi:
         try:
             from pyvi import ViTokenizer
         except ImportError:
             msg = ("Pyvi not installed. Either set Vietnamese.use_pyvi = False, "
                    "or install it https://pypi.python.org/pypi/pyvi")
             raise ImportError(msg)
         words, spaces = ViTokenizer.spacy_tokenize(text)
         return Doc(self.vocab, words=words, spaces=spaces)
     # Character-level fallback. Tokenize ONCE — the original called
     # self.tokenizer(text) twice and discarded the first result.
     words = []
     spaces = []
     for token in self.tokenizer(text):
         words.extend(list(token.text))
         spaces.extend([False] * len(token.text))
         # Only the final character of a token can carry trailing whitespace.
         spaces[-1] = bool(token.whitespace_)
     return Doc(self.vocab, words=words, spaces=spaces)
Code example #3
File: __init__.py  Project: IndicoDataSolutions/spaCy
 def make_doc(self, text):
     """Build a ``Doc`` object from *text*.

     With ``Defaults.use_pyvi`` enabled, delegate word segmentation to
     pyvi's ``ViTokenizer``; otherwise split every base-tokenizer token
     into its individual characters.

     Raises:
         ImportError: if ``use_pyvi`` is set but pyvi is not installed.
     """
     if self.Defaults.use_pyvi:
         try:
             from pyvi import ViTokenizer
         except ImportError:
             msg = ("Pyvi not installed. Either set Vietnamese.use_pyvi = False, "
                    "or install it https://pypi.python.org/pypi/pyvi")
             raise ImportError(msg)
         words, spaces = ViTokenizer.spacy_tokenize(text)
         return Doc(self.vocab, words=words, spaces=spaces)
     # Character-level fallback path.
     words = []
     spaces = []
     # Fix: the original tokenized the text twice (an unused `doc` binding
     # followed by a second self.tokenizer(text) call); tokenize only once.
     for token in self.tokenizer(text):
         for ch in token.text:
             words.append(ch)
             spaces.append(False)
         # The last character of the token inherits its trailing-space flag.
         spaces[-1] = bool(token.whitespace_)
     return Doc(self.vocab, words=words, spaces=spaces)