def tokenize(self, data, doc_no):
    # create token dictionary for the book
    book_dict = {}
    # split the data string on whitespace
    words = data.split()
    doclen = len(words)
    for w in words:
        # return processed word; 0 means the token is not qualified
        processed_word = TokenRules.apply(w)
        # filter out words rejected during tokenization
        if processed_word != 0:
            # apply stemming
            if self.applyStemming == 1:
                processed_word = Stemmer.stemming(processed_word)
            # apply lemmatization
            if self.applyLemma == 1:
                processed_word = Lemmatizer.lemmatizing(processed_word)
            # add new key to dictionary
            if processed_word not in book_dict:
                book_dict[processed_word] = 1
            # increment value by 1 if key exists
            else:
                book_dict[processed_word] += 1
    doc_max_tf = self.add_doc_info(book_dict)
    return doc_no, doclen, doc_max_tf, book_dict
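
# A minimal usage sketch for tokenize (assumptions: the enclosing class is
# instantiated as `indexer`, and applyStemming/applyLemma are plain int flags;
# the sample text and document number are illustrative only):
#
#   indexer.applyStemming = 1
#   indexer.applyLemma = 0
#   doc_no, doclen, doc_max_tf, book_dict = indexer.tokenize(
#       "The cats sat on the mat with other cats", 1)
#   # book_dict maps each surviving (stemmed) token to its term frequency,
#   # doclen is the raw whitespace-split word count, and doc_max_tf is the
#   # maximum frequency recorded by add_doc_info.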
def token_list_stemming(self, words):
    final_words = []
    for w in words:
        # return processed word; 0 means the token is not qualified
        processed_word = TokenRules.apply(w)
        # filter out words rejected during tokenization
        if processed_word != 0:
            # apply stemming
            processed_word = Stemmer.stemming(processed_word)
            final_words.append(processed_word)
    return final_words
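
# A minimal usage sketch for token_list_stemming (same assumed `indexer`
# instance as above; the input list is illustrative). Tokens that
# TokenRules.apply rejects (returning 0) are silently dropped:
#
#   stems = indexer.token_list_stemming(["Running", "cats", "!!"])
#   # e.g. roughly ["run", "cat"], depending on TokenRules and Stemmer.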