Ejemplo n.º 1
0
    def init_inverted_index(self):
        """Populate ``self.map`` with, for every key of ``self.data``, the
        indices of the ``self.book_data`` entries whose dictionary has that key.

        Each key of ``self.data`` first receives a fresh empty list in
        ``self.map``.  Then every entry of ``self.book_data`` is loaded into a
        new ``S_Dictionary`` via ``load_from_json_file`` and its index is
        appended to the list of every key found in that dictionary's ``data``.
        """
        vocabulary = self.data

        # Start every known key with its own empty posting list.
        for term in vocabulary:
            self.map[term] = []

        for book_no, book_source in enumerate(self.book_data):
            book_dict = S_Dictionary()
            book_dict.load_from_json_file(book_source)

            for term in vocabulary:
                if term in book_dict.data:
                    self.map[term].append(book_no)
Ejemplo n.º 2
0
    def build_matrix(self):
        """Extend ``self.matrix`` into a 0/1 incidence table of keys vs. books.

        One empty row is appended to ``self.matrix`` per key of ``self.data``.
        Then, for every entry of ``self.book_data`` (loaded into a fresh
        ``S_Dictionary`` via ``load_from_json_file``), each key's row -- located
        through ``self.word_in_int[key]`` -- receives a ``1`` if the key is in
        that dictionary's ``data`` and a ``0`` otherwise.
        """
        vocabulary = self.data

        # One row per known key; columns are appended book by book below.
        for _ in vocabulary:
            self.matrix.append([])

        for book_source in self.book_data:
            book_dict = S_Dictionary()
            book_dict.load_from_json_file(book_source)

            for term in vocabulary:
                present = 1 if term in book_dict.data else 0
                self.matrix[self.word_in_int[term]].append(present)
Ejemplo n.º 3
0
    def parse_file(self, filename):
        """Split a file into words and record each word with its ordinal.

        A word is a maximal run of characters contained in ``self.symbols``;
        every other character (and end of file) terminates the current word.
        Each word is lower-cased and passed to ``f_dict.add`` together with
        its 1-based ordinal among the words found in this file.

        Args:
            filename: Path of the file to parse.  It is appended to
                ``self.book_names`` so the same file is not parsed twice.

        Returns:
            The populated ``S_Dictionary``, or ``None`` when
            ``self.get_file_encoding`` reports ``'unknown encoding'`` or the
            file is already listed in ``self.book_names``.
        """
        enc = self.get_file_encoding(filename)
        if enc == 'unknown encoding':
            return None
        if filename in self.book_names:
            print("book has already been parsed")
            return None
        self.book_names.append(filename)

        # Built once: set membership is O(1) per character read.
        word_chars = frozenset(self.symbols)

        f_dict = S_Dictionary()
        word = ''
        position = 1
        # Decode with the detected encoding rather than the platform default.
        # NOTE(review): assumes get_file_encoding returns a codec name that
        # open() accepts -- confirm against its implementation.
        with open(filename, encoding=enc) as f:
            while True:
                try:
                    symbol = f.read(1)
                except UnicodeError:
                    # Undecodable input is treated as a word delimiter.
                    symbol = ' '

                if symbol in word_chars:
                    word += symbol
                    continue

                # Any delimiter -- or end of file ('' is never a word
                # character) -- closes the pending word, so a one-letter
                # word is kept at EOF exactly as it is mid-file.
                if word:
                    f_dict.add(word.lower(), position)
                    position += 1
                    word = ''

                if not symbol:
                    break

        return f_dict
Ejemplo n.º 4
0
    def parse_file(self, filename):
        """Index the words of a file and save its per-file dictionary as JSON.

        A word is a maximal run of characters contained in ``self.symbols``;
        every other character (and end of file) terminates the current word.
        Each lower-cased word is added both to a per-file ``S_Dictionary`` and
        to this object via ``self.add``.  The per-file dictionary is then
        written with ``save_in_json_file`` and its path is appended to
        ``self.book_data``.

        Nothing happens when ``self.get_file_encoding`` reports
        ``'unknown encoding'``; a file already listed in ``self.book_names``
        only produces a message.

        Args:
            filename: Path of the file to parse.  The JSON path is built as
                ``"./data/" + filename[8:-3] + "json"``, i.e. the first eight
                characters and the last three are cut off.
                NOTE(review): presumably filename looks like
                ``./books/<name>.txt`` -- confirm against the caller.

        Returns:
            None.
        """
        enc = self.get_file_encoding(filename)
        if enc == 'unknown encoding':
            return
        if filename in self.book_names:
            print("book has already been parsed")
            return
        self.book_names.append(filename)

        # Built once: set membership is O(1) per character read.
        word_chars = frozenset(self.symbols)

        f_dict = S_Dictionary()
        word = ''
        # Decode with the detected encoding rather than the platform default.
        # NOTE(review): assumes get_file_encoding returns a codec name that
        # open() accepts -- confirm against its implementation.
        with open(filename, encoding=enc) as f:
            while True:
                try:
                    symbol = f.read(1)
                except UnicodeError:
                    # Undecodable input is treated as a word delimiter.
                    symbol = ' '

                if symbol in word_chars:
                    word += symbol
                    continue

                # Any delimiter -- or end of file ('' is never a word
                # character) -- closes the pending word, so a one-letter
                # word is kept at EOF exactly as it is mid-file.
                if word:
                    lowered = word.lower()
                    f_dict.add(lowered)
                    self.add(lowered)
                    word = ''

                if not symbol:
                    break

        json_name = "./data/" + filename[8:-3] + "json"
        print("saving in ", json_name, "\n")
        self.book_data.append(json_name)
        f_dict.save_in_json_file(json_name)
Ejemplo n.º 5
0
    def parse_file(self, filename):
        """Split a file into words, skipping markup tags in ``.fb2`` files.

        A word is a maximal run of characters contained in ``self.symbols``;
        every other character, a ``<`` and end of file all terminate the
        current word.  Each word is lower-cased and passed to ``f_dict.add``
        together with its 1-based ordinal among the words found in this file.
        For files whose name ends in ``.fb2`` everything from ``<`` up to the
        matching ``>`` is discarded, so tag names are not indexed.

        Args:
            filename: Path of the file to parse.  It is appended to
                ``self.book_names`` so the same file is not parsed twice.

        Returns:
            The populated ``S_Dictionary``, or ``None`` when
            ``self.get_file_encoding`` reports ``'unknown encoding'`` or the
            file is already listed in ``self.book_names``.
        """
        enc = self.get_file_encoding(filename)
        if enc == 'unknown encoding':
            return None
        if filename in self.book_names:
            print("book has already been parsed")
            return None
        self.book_names.append(filename)

        # Compare the whole 4-character suffix, dot included.
        has_tags = filename.lower().endswith('.fb2')
        # Built once: set membership is O(1) per character read.
        word_chars = frozenset(self.symbols)

        f_dict = S_Dictionary()
        word = ''
        position = 1
        with open(filename, encoding=enc) as f:
            while True:
                try:
                    symbol = f.read(1)
                except UnicodeError:
                    # Undecodable input is treated as a word delimiter.
                    symbol = ' '

                # '<' never becomes part of a word, whatever self.symbols holds.
                if symbol != '<' and symbol in word_chars:
                    word += symbol
                    continue

                # A delimiter, a tag opener or end of file ('' is never a
                # word character) closes the pending word.  Flushing here
                # keeps the words on either side of a tag from merging and
                # keeps a one-letter word at EOF exactly as it is mid-file.
                if word:
                    f_dict.add(word.lower(), position)
                    position += 1
                    word = ''

                if not symbol:
                    break

                if symbol == '<' and has_tags:
                    # Discard the tag.  read(1) returns '' at end of file, so
                    # an unterminated tag ends the scan instead of looping
                    # forever; the outer loop then sees EOF and stops.
                    while symbol and symbol != '>':
                        try:
                            symbol = f.read(1)
                        except UnicodeError:
                            break
        return f_dict
Ejemplo n.º 6
0
    def __init__(self):
        """Initialise the base dictionary and the parser's own state.

        Sets ``self.symbols`` (the characters accepted as part of a word) and
        the empty ``self.book_names`` / ``self.book_data`` lists, then prints
        a start-up message.
        """
        S_Dictionary.__init__(self)

        # A whitelist of word characters is used on purpose: an earlier
        # attempt that enumerated delimiters instead was judged a bad idea.
        # Digits are currently not treated as word characters.
        latin_lower = 'abcdefghijklmnopqrstuvwxyz'
        latin_upper = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
        cyrillic_lower = 'йцукенгшщзхъфывапролджэячсмитьбю'
        cyrillic_upper = 'ЙЦУКЕНГШЩЗХЪФЫВАПРОЛДЖЭЯЧСМИТЬБЮ'
        extra_letters = 'ЁёіІїЇ'
        self.symbols = list(
            latin_lower
            + latin_upper
            + cyrillic_lower
            + cyrillic_upper
            + extra_letters
        )

        self.book_names = []
        self.book_data = []
        print("Parser init")