Example #1
    def tokenize(self, data, doc_no):
        # create token dictionary for the book
        book_dict = {}

        # split the data string by space
        words = data.split()
        doclen = len(words)
        for w in words:
            # TokenRules.apply returns the processed word, or 0 if the word is not qualified
            processed_word = TokenRules.apply(w)
            # skip words that were rejected by the token rules
            if processed_word != 0:
                # apply stemming
                if self.applyStemming == 1:
                    processed_word = Stemmer.stemming(processed_word)
                # apply lemmatization
                if self.applyLemma == 1:
                    processed_word = Lemmatizer.lemmatizing(processed_word)
                # add a new key to the dictionary with count 1,
                # or increment the count if the key already exists
                if processed_word not in book_dict:
                    book_dict[processed_word] = 1
                else:
                    book_dict[processed_word] += 1

        # collect per-document info (e.g. the maximum term frequency) once,
        # after all words have been counted
        doc_max_tf = self.add_doc_info(book_dict)
        return doc_no, doclen, doc_max_tf, book_dict
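The helper add_doc_info is referenced but not shown here; a minimal sketch, assuming it only has to report the largest raw term frequency in book_dict, might look like this:

    # Sketch of the missing helper (assumption, not the original implementation):
    # return the largest count in the token dictionary, or 0 for an empty document.
    def add_doc_info(self, book_dict):
        return max(book_dict.values(), default=0)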
Example #2
    def token_list_stemming(self, words):
        final_words = []
        for w in words:
            # TokenRules.apply returns the processed word, or 0 if the word is not qualified
            processed_word = TokenRules.apply(w)
            # skip words that were rejected by the token rules
            if processed_word != 0:
                # apply stemming and keep the stemmed word
                processed_word = Stemmer.stemming(processed_word)
                final_words.append(processed_word)
        return final_words
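A possible call site, assuming the method belongs to a tokenizer class (the class name Tokenizer and the sample sentence below are only illustrative):

tokenizer = Tokenizer()
stems = tokenizer.token_list_stemming("The quick brown foxes were running".split())
# output depends on TokenRules and the stemmer, e.g. ['the', 'quick', 'brown', 'fox', 'were', 'run']
print(stems)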