Example #1
0
    def to_txt(self, job):
        """Extract the text of one document and add it to the index.

        Intended to run in a worker thread: the shared progress counters and
        the index update are protected by ``self.lock``.

        :param job: a ``(filename, cmds, date, md5)`` tuple; ``cmds`` is
            handed to ``Process().process`` to produce the text, ``date`` and
            ``md5`` are stored alongside the text via ``index().add``.
        :return: ``None`` on normal completion (including when the
            conversion reported a failure by returning ``False``), or
            ``(filename, False)`` when an exception was caught.
        """
        filename, cmds, date, md5 = job
        try:
            text, _extension = Process().process(
                cmds, filenames=[str(filename)], get_content=True,
            )
            if text is None:
                text = ''

            self.lock.acquire()
            try:
                self.no += 1
                self.update_library_progress.emit(
                    self.no * 100 / self.nb,
                    'Parsing the files %i/%i.' % (self.no, self.nb),
                    edocuments.short_path(filename),
                )
                print("%i/%i" % (self.no, self.nb))

                if text is False:
                    # ``False`` is treated as "conversion failed": count the
                    # error and leave the index untouched for this document.
                    print("Error with document: " + str(filename))
                    self.nb_error += 1
                else:
                    index().add(
                        filename,
                        "%s\n%s" % (filename, text),
                        date, md5
                    )
            finally:
                # Always release: a failure in emit() or index().add() must
                # not leave the lock held, or every other worker would block
                # forever on acquire().
                self.lock.release()
        except Exception:
            traceback.print_exc()
            return filename, False
Example #2
0
    def scan_browse(self, event):
        """Ask the user where to save the scan and show it in ``scan_to``.

        Opens a "Scan to" save dialog starting at ``self.filename()``, strips
        a trailing extension (a dot followed by 2 to 5 alphanumeric
        characters) from the chosen name, shortens it with
        ``edocuments.short_path`` and writes it into the ``scan_to`` field.

        :param event: the triggering UI event (unused).
        """
        filename = QFileDialog.getSaveFileName(
            self, "Scan to", directory=self.filename()
        )[0]
        if not filename:
            # The dialog was cancelled: keep the destination that is already
            # in the field instead of overwriting it with an empty choice.
            return

        filename = re.sub(r"\.[a-z0-9A-Z]{2,5}$", "", filename)
        self.ui.scan_to.setText(edocuments.short_path(filename))
Example #3
0
 def add(self, filename, text, date, md5):
     """Insert or replace the index entry of a (non-directory) document.

     The entry is stored under the short form of ``filename`` with the
     given ``text`` as content and ``date`` as date, and is flagged as not
     being a directory.

     NOTE(review): ``md5`` is accepted but is not written to the index by
     this method -- confirm whether it should be stored as well.
     """
     short_name = edocuments.short_path(filename)
     document = {
         PATH: short_name,
         CONTENT: text,
         DATE: date,
         DIRECTORY: False,
     }
     with self.index.writer() as writer:
         writer.update_document(**document)
Example #4
0
    def get(self, filename):
        """Return the indexed fields of ``filename``, or ``None`` if absent.

        The lookup is done on the ``path_id`` field using the short form of
        ``filename``. At most one entry is expected for a given path. The
        result maps every field name of the index schema to the value stored
        for that field in the matching entry.
        """
        short_name = edocuments.short_path(filename)
        with self.index.searcher() as searcher:
            hits = searcher.search(Term("path_id", short_name))
            if len(hits) == 0:
                return None
            assert len(hits) == 1

            hit = hits[0]
            return {
                name: hit.get(name)
                for name in self.index.schema.names()
            }
Example #5
0
    def do_update_library(self):
        """Synchronise the index with the documents found under the root folder.

        Steps, in order:

        1. Read every indexed document; remember its date and md5, and
           collect the ones to delete (duplicates, paths that no longer
           exist, paths that are not in short form).
        2. Add an entry for every directory that is not indexed yet.
        3. For every configured ``to_txt`` converter, browse the matching
           files and decide whether they are up to date, only need their
           date/md5 refreshed, or must be (re)parsed.
        4. Delete the documents collected in step 1.
        5. Parse the queued files in a thread pool through ``self.to_txt``.
        6. Optimise the index and report the outcome.

        Progress is reported through ``self.update_library_progress``; a
        non-zero error count is reported through ``self.scan_error``.
        """
        root = Path(edocuments.root_folder)
        index_folder = '.index'
        ignore_patterns = edocuments.config.get('ignore', [])

        def is_ignored(path):
            """Tell whether ``path`` matches one of the ignore patterns."""
            return any(path.match(pattern) for pattern in ignore_patterns)

        def in_index_folder(path):
            """Tell whether ``path`` is ``root/.index`` or lies below it."""
            parts = path.relative_to(root).parts
            return bool(parts) and parts[0] == index_folder

        docs_to_rm = []
        docs_date = {}
        with index().index.reader() as reader:
            for num, doc in reader.iter_docs():
                path = doc[PATH]
                if (
                    path in docs_date
                    or not Path(edocuments.long_path(path)).exists()
                    or path != edocuments.short_path(path)
                ):
                    print("Delete document: " + path)
                    docs_to_rm.append(num)
                else:
                    docs_date[path] = (doc.get(DATE), doc.get(MD5))

        self.update_library_progress.emit(
            0, 'Adding the directories...', '')
        for directory in root.rglob('*'):
            dir_ = edocuments.short_path(directory)
            if (
                dir_ in docs_date
                or not directory.is_dir()
                or in_index_folder(directory)
                or is_ignored(directory)
            ):
                continue
            with index().index.writer() as writer:
                writer.update_document(**{
                    PATH: dir_,
                    CONTENT: dir_,
                    DATE: directory.stat().st_mtime,
                    DIRECTORY: True,
                })

        self.update_library_progress.emit(
            0, 'Browsing the files (0)...', '')
        todo = []
        for conv in edocuments.config.get('to_txt', []):
            cmds = conv.get("cmds")
            for filename in root.rglob("*." + conv.get('extension')):
                # is_file() also skips directories whose name happens to
                # match the extension pattern (they cannot be opened below).
                if (
                    is_ignored(filename)
                    or not filename.is_file()
                    or in_index_folder(filename)
                ):
                    continue

                short_name = edocuments.short_path(filename)
                current_date, md5 = docs_date.get(short_name, (None, None))
                new_date = filename.stat().st_mtime
                if not (current_date is None or new_date > current_date):
                    # Already indexed and not modified since: nothing to do,
                    # so do not pay for hashing the file.
                    continue

                new_md5 = hashlib.md5()
                with open(str(filename), "rb") as f:
                    for chunk in iter(lambda: f.read(4096), b""):
                        new_md5.update(chunk)
                digest = new_md5.hexdigest()

                # Same content (or no md5 recorded yet): reuse the indexed
                # text and only refresh the date and md5.
                doc = None
                if current_date is not None and (md5 is None or md5 == digest):
                    doc = index().get(filename)

                if doc is not None:
                    index().add(
                        filename,
                        doc[CONTENT],
                        max(new_date, current_date),
                        digest
                    )
                else:
                    print("Add document: " + short_name)
                    todo.append((str(filename), cmds, new_date, digest))
                    self.update_library_progress.emit(
                        0, 'Browsing the files (%i)...' % len(todo),
                        short_name)

        self.nb = len(todo)
        self.nb_error = 0
        self.no = 0

        print('Removes %i old documents.' % len(docs_to_rm))

        # NOTE(review): these document numbers were read before the writes
        # above; confirm they are still valid at this point, or move the
        # deletion right after the initial scan.
        with index().index.writer() as writer:
            for num in docs_to_rm:
                writer.delete_document(num)

        self.update_library_progress.emit(
            0, 'Parsing the files %i/%i.' % (self.no, self.nb), '',
        )

        print('Process %i documents.' % len(todo))

        with ThreadPoolExecutor(
            max_workers=edocuments.config.get('nb_process', 8)
        ) as executor:
            futures = [executor.submit(self.to_txt, job) for job in todo]
            # Only used to wait for completion; to_txt reports its own
            # progress and errors.
            for _ in as_completed(futures):
                pass

        self.update_library_progress.emit(
            0, 'Optimise the index...', '',
        )
        index().optimize()

        if self.nb_error != 0:
            self.scan_error.emit("Finished with %i errors" % self.nb_error)
        else:
            self.update_library_progress.emit(
                100, 'Finish', '',
            )