class Preprocessor(object):
    """Copy lines from a reader to a writer, filtering each one via ``process``.

    Directive handling is not implemented yet: ``define`` and ``include`` are
    placeholders and ``process`` accepts every line, so ``run`` currently
    forwards all lines from the reader to the writer unchanged.
    """

    def __init__(self, source_filename, target_filename, include_filenames):
        """Create the reader and writer and prepare the line stream.

        :param source_filename: passed to ``Reader``
        :param target_filename: passed to ``Writer``
        :param include_filenames: iterable of names, each registered on the
            reader via ``include`` before any line is requested
        """
        self.reader = Reader(source_filename)
        for filename in include_filenames:
            self.reader.include(filename)
        self.writer = Writer(target_filename)
        # Not read by any method of this class yet.
        self.defines = {}
        # Per the original author's note, get_lines() returns a generator.
        self.lines = self.reader.get_lines()

    def define(self, line):
        """Placeholder hook; currently does nothing with ``line``."""
        pass

    def include(self, line):
        """Placeholder hook; currently does nothing with ``line``."""
        pass

    def process(self, line):
        """Tell ``run`` whether ``line`` should be written to the output.

        :param line: the line under consideration
        :return: always ``True`` for now, so every line is kept
        """
        return True

    def run(self):
        """Write every line accepted by ``process`` to the writer."""
        for line in self.lines:
            if self.process(line):
                self.writer.write_line(line)

    def close(self):
        """Close the reader, then the writer.

        The writer is closed even when closing the reader raises, so a
        failure on the input side cannot leak the output handle. The
        exception still propagates to the caller.
        """
        try:
            self.reader.close()
        finally:
            self.writer.close()
 def __init__(self, source_filename, target_filename, include_filenames):
     """Set up the reader, its includes, the writer and the line stream.

     :param source_filename: handed to ``Reader``
     :param target_filename: handed to ``Writer``
     :param include_filenames: iterable of names registered on the reader
         via ``include`` before lines are requested
     """
     source_reader = Reader(source_filename)
     self.reader = source_reader
     for include_name in include_filenames:
         source_reader.include(include_name)
     self.writer = Writer(target_filename)
     self.defines = {}
     # Per the original author's note, get_lines() returns a generator.
     self.lines = source_reader.get_lines()
Exemple #3
0
    def indexing(self,
                 files,
                 ignore_case=True,
                 ignore_stop_words=True,
                 stemming=True,
                 use_weights=True,
                 title_weight=5,
                 date_weight=2,
                 memory_limit=50,
                 use_vbytes=True):
        """
        Index a list of files and save the result to the pickle files.

        Each file is read with ``Reader.read_file``; every document returned
        is added to a fresh ``InvertedFile`` and its integer id is mapped to
        the file it came from. After all files are read, ``gen_pl_file`` is
        called on the inverted file, then the inverted file and the
        id-to-filename map are pickled to ``self.PICKLES[0]`` and
        ``self.PICKLES[1]`` respectively. ``self.current_status`` is updated
        at each stage, and the run is recorded through ``SC``
        (``new_indexing`` at the start, ``stop`` and ``log`` at the end).

        :param files: the paths to the files to index
        :param ignore_case: should case be ignored in the indexing?
        :param ignore_stop_words: should stop words be ignored?
        :param stemming: should the tokens be stemmed?
        :param use_weights: should words be weighted differently depending on
            their position in the document?
        :param title_weight: weight for words in the title
        :param date_weight: weight for words in the date
        :param memory_limit: memory limit before flushing to a temporary file
            (passed to ``InvertedFile``; units are not visible here)
        :param use_vbytes: use variable bytes for the final posting list?
        :return: None; the method returns once indexing has finished
        """

        SC.new_indexing()

        self.current_status = "Indexing - Starting"

        self.__id_to_filename = SortedDict()

        self.inv_file = InvertedFile(use_vbytes, memory_limit)
        for path in files:
            self.current_status = "Indexing - {}".format(path)
            file_docs = Reader.read_file(path, ignore_case, ignore_stop_words,
                                         stemming, use_weights, title_weight,
                                         date_weight)
            for doc in file_docs:
                # Remember which source file each document id belongs to.
                self.__id_to_filename[int(doc.doc_id())] = path
                self.inv_file.add_document(doc)

        self.current_status = "Indexing - Making the inverted file"

        self.inv_file.gen_pl_file()

        self.current_status = "Indexing - Saving to pickle file"

        # PICKLES[0] stores the inverted file, PICKLES[1] the id -> filename map.
        with open(self.PICKLES[0], "wb") as pickle_out:
            pickle.dump(self.inv_file, pickle_out)
        with open(self.PICKLES[1], "wb") as pickle_out:
            pickle.dump(self.__id_to_filename, pickle_out)

        self.current_status = "Indexing - Finished - You can query"

        SC.last_indexing().stop()
        SC.last_indexing().log(files, ignore_case, ignore_stop_words, stemming,
                               use_weights, title_weight, date_weight,
                               memory_limit, use_vbytes)