Example #1
0
class Parser:
    res_dict = {}
    res_matrix = {}
    res_biword_index = {}
    stop_words = list(
        list(stopwords.words('english')) + list(string.printable) + list("'s"))

    def __init__(self):
        self.reader = Reader()

    def __create_inverted_index(self):
        for line in self.reader.generate_string_lines():
            words = word_tokenize(line[0])
            from_file = line[1]
            for word in words:
                if word.lower() not in self.stop_words:
                    if word.lower() in self.res_dict:
                        self.res_dict[word.lower()].add(
                            self.reader.file_list.index(from_file))
                    else:
                        self.res_dict[word.lower()] = {
                            self.reader.file_list.index(from_file)
                        }
        return self.res_dict

    def __create_incident_matrix(self):
        inv_index = self.create_or_load_dictionary()
        for k, v in inv_index.items():
            inv_index[k] = self.__to_matrix(v)
        return inv_index

    def __to_matrix(self, list_of_presented_files):
        res_list = []
        for i in enumerate(self.reader.file_list):
            if i[0] in list_of_presented_files:
                res_list.append(1)
            else:
                res_list.append(0)
        return res_list

    def create_or_load_dictionary(self):
        try:
            with open('dict.pickle', "rb") as f:
                foo = pickle.load(f)
        except Exception:
            foo = self.__create_inverted_index()
            with open('dict.pickle', "wb") as f:
                pickle.dump(foo, f)
        return foo

    def create_or_load_matrix(self):
        try:
            with open('matrix.pickle', "rb") as f:
                foo = pickle.load(f)
        except Exception:
            foo = self.__create_incident_matrix()
            with open('matrix.pickle', "wb") as f:
                pickle.dump(foo, f)
        return foo