コード例 #1
0
ファイル: core.py プロジェクト: libindic/shingling
 def wshingling(self, text, window_size=4):
     """
     Break text into word shingles, keeping each distinct shingle once.

     :param text: Text to be split into shingles.
     :type text: str.
     :param window_size: the window size for splitting the shingles;
         it is passed through ``int()`` before use.
     :type window_size: int.
     :returns: the word n-grams produced by ``Ngram.wordNgram``, in
         order of first appearance, with repeats dropped.
     """
     width = int(window_size)
     candidates = Ngram().wordNgram(text, width)
     unique = []
     for gram in candidates:
         # Equality-based membership test: only first occurrences are kept.
         if gram in unique:
             continue
         unique.append(gram)
     return unique
コード例 #2
0
 def __init__(self):
     """
     Initialize necessary resources.

     Reads the root-word list ``data/ml_rootwords.txt`` (located next to
     this module) into a ``marisa_trie.Trie`` and instantiates the helper
     objects this class keeps as attributes.

     :raises IOError: if the word list cannot be opened.
     :raises UnicodeDecodeError: if the word list is not valid UTF-8.
     """
     # Function-scope import: io.open accepts an explicit encoding on both
     # Python 2 and Python 3, so the words are always decoded as UTF-8
     # instead of depending on the interpreter version or locale default.
     import io

     dictionary_path = os.path.join(
         os.path.dirname(__file__), 'data/ml_rootwords.txt')
     # The context manager closes the file even if reading fails.
     with io.open(dictionary_path, encoding='utf-8') as dictionary_file:
         rootwords = [line.strip() for line in dictionary_file]
     # Kept for backward compatibility: the attribute previously held the
     # (already closed) file object as well.
     self.dictionary_file = dictionary_file
     self.dictionary = marisa_trie.Trie(rootwords)
     self.stemmer = Stemmer()
     self.inflector = inflector.Inflector(lang='ml')
     self.soundex = Soundex()
     self.syllabalizer = Syllabifier()
     self.ngrammer = Ngram()