Beispiel #1
0
    def generate_hash_map(self):
        """
        Rebuild the word-frequency hash map from the current document content.

        The existing map is cleared first. Every line of
        ``self._document_content`` is UTF-8 encoded, passed through the
        ``PUNCTUATION_TRANS`` translation table (presumably stripping or
        replacing punctuation -- the table is defined outside this method)
        and split on whitespace. Each resulting token is stemmed with the
        Porter stemmer and then lower-cased. Tokens that are purely
        alphabetic and are not reported as stop words by
        ``self._is_stop_word`` are counted in ``self._hash_map``
        (stemmed word -> number of occurrences).

        :return: None; ``self._hash_map`` is updated in place.
        """

        # Start from an empty map so repeated calls do not accumulate counts.
        self._hash_map.clear()

        # A single stemmer serves the whole document; constructing a new
        # instance for every word is loop-invariant work.
        stemmer = PorterStemmer()

        for line in self._document_content:

            # NOTE(review): this encode -> translate -> decode round trip
            # looks like it relies on Python 2 string semantics (byte-string
            # translation table, ``str.decode``) -- confirm before running
            # under Python 3.
            line = line.encode('utf-8')
            line = str(line).translate(PUNCTUATION_TRANS)

            for word in line.split():

                # 'utf-8-sig' additionally drops a leading byte-order mark.
                word = word.decode('utf-8-sig')
                word = stemmer.stem(word)
                word = word.lower()

                # Skip tokens containing non-letters, and stop words. The
                # stop-word check is only reached for alphabetic tokens.
                if not word.isalpha() or self._is_stop_word(word):
                    continue

                # First occurrence creates the entry, later ones increment it.
                if word in self._hash_map:
                    self._hash_map[word] += 1
                else:
                    self._hash_map[word] = 1