Exemple #1
0
    def indexing(self,
                 files,
                 ignore_case=True,
                 ignore_stop_words=True,
                 stemming=True,
                 use_weights=True,
                 title_weight=5,
                 date_weight=2,
                 memory_limit=50,
                 use_vbytes=True):
        """
        Index a list of files and persist the resulting structures.

        Every file is read with ``Reader.read_file``; each document it
        yields is added to a fresh ``InvertedFile`` and its integer id is
        mapped to the file it came from.  Once all files are processed the
        posting-list file is generated, and both the inverted file and the
        id-to-filename mapping are pickled to ``self.PICKLES[0]`` and
        ``self.PICKLES[1]`` respectively.  ``self.current_status`` is
        updated at each stage so that progress can be observed.

        :param files: the paths to the files to index
        :param ignore_case: should case be ignored in the indexing?
        :param ignore_stop_words: should stop words be ignored?
        :param stemming: should the tokens be stemmed?
        :param use_weights: should words be weighted differently depending
            on their position in the document?
        :param title_weight: weight for words in the title
        :param date_weight: weight for words in the date
        :param memory_limit: limit on the memory before a flush to a temp
            file (forwarded to ``InvertedFile``)
        :param use_vbytes: use variable bytes for the final posting list?
        :return: None, once the indexing is finished
        """
        # Open a new indexing record on SC; it is stopped and logged at the
        # end of this method.
        SC.new_indexing()

        self.current_status = "Indexing - Starting"

        # Both structures are rebuilt from scratch on every run.
        self.__id_to_filename = SortedDict()
        self.inv_file = InvertedFile(use_vbytes, memory_limit)

        for path in files:
            self.current_status = "Indexing - {}".format(path)
            file_docs = Reader.read_file(path, ignore_case, ignore_stop_words,
                                         stemming, use_weights, title_weight,
                                         date_weight)
            for doc in file_docs:
                # Remember which file each document id was read from.
                self.__id_to_filename[int(doc.doc_id())] = path
                self.inv_file.add_document(doc)

        self.current_status = "Indexing - Making the inverted file"

        self.inv_file.gen_pl_file()

        self.current_status = "Indexing - Saving to pickle file"

        # PICKLES[0] receives the inverted file, PICKLES[1] the id mapping.
        with open(self.PICKLES[0], "wb") as inv_file_handle:
            pickle.dump(self.inv_file, inv_file_handle)
        with open(self.PICKLES[1], "wb") as mapping_handle:
            pickle.dump(self.__id_to_filename, mapping_handle)

        self.current_status = "Indexing - Finished - You can query"

        # Close the indexing record, then log the parameters of this run.
        SC.last_indexing().stop()
        SC.last_indexing().log(files, ignore_case, ignore_stop_words, stemming,
                               use_weights, title_weight, date_weight,
                               memory_limit, use_vbytes)