def wshingling(self, text, window_size=4):
    """
    Break *text* into word n-grams and drop repeated ones.

    The n-grams are produced by ``Ngram().wordNgram`` and returned in
    the order of their first appearance.

    :param text: Text to be split into shingles.
    :type text: str.
    :param window_size: the window size for splitting the shingles;
        coerced with ``int()`` before use.
    :type window_size: int.
    :returns: list of distinct shingles, first-seen order preserved.
    """
    width = int(window_size)
    unique_shingles = []
    for gram in Ngram().wordNgram(text, width):
        # List membership relies on equality only, so nothing is
        # assumed about whether the n-grams are hashable.
        if gram in unique_shingles:
            continue
        unique_shingles.append(gram)
    return unique_shingles
def __init__(self):
    """
    Initialize necessary resources.

    Reads ``data/ml_rootwords.txt`` (resolved relative to this module's
    directory), builds a ``marisa_trie.Trie`` from its stripped lines and
    stores it in ``self.dictionary``, then creates the helper objects
    (stemmer, inflector, soundex, syllabifier, n-gram generator).

    :raises IOError: if the root-word file cannot be opened or read.
    """
    dictionary_path = os.path.join(
        os.path.dirname(__file__), 'data/ml_rootwords.txt')

    # The context manager guarantees the handle is closed even when
    # reading fails part-way through.
    # NOTE(review): the file is opened with the platform default
    # encoding; confirm whether UTF-8 should be forced here.
    with open(dictionary_path) as dictionary_file:
        root_words = dictionary_file.readlines()

    # Kept as an attribute (a closed file object, as before) in case
    # external code still refers to it.
    self.dictionary_file = dictionary_file
    self.dictionary = root_words

    try:
        # Works when the lines are byte strings that can be decoded.
        self.dictionary = marisa_trie.Trie(
            [word.strip().decode('utf-8') for word in root_words])
    except Exception:
        # Text strings without ``decode`` (Python 3) land here; fall back
        # to the stripped lines as they are. ``Exception`` instead of a
        # bare ``except`` lets KeyboardInterrupt/SystemExit propagate.
        self.dictionary = marisa_trie.Trie(
            [word.strip() for word in root_words])

    self.stemmer = Stemmer()
    self.inflector = inflector.Inflector(lang='ml')
    self.soundex = Soundex()
    self.syllabalizer = Syllabifier()
    self.ngrammer = Ngram()