def to_txt(self, job):
    """Convert one document to text and add it to the search index.

    Worker body submitted to the thread pool by ``do_update_library``.

    :param job: tuple ``(filename, cmds, date, md5)`` where *filename* is
        the document path (str), *cmds* the conversion command list for
        ``Process``, *date* the file mtime and *md5* its content hash.
    :return: ``(filename, False)`` in every case (the callers only drain
        the futures, the value is not inspected).
    """
    filename, cmds, date, md5 = job
    try:
        text, extension = Process().process(
            cmds, filenames=[str(filename)], get_content=True,
        )
        if text is None:
            text = ''
        # BUG FIX: the original paired lock.acquire()/lock.release()
        # without a finally, so any exception raised while the lock was
        # held (e.g. from index().add) left it locked forever and
        # deadlocked the other pool workers.  `with` guarantees release.
        with self.lock:
            self.no += 1
            self.update_library_progress.emit(
                self.no * 100 / self.nb,
                'Parsing the files %i/%i.' % (self.no, self.nb),
                edocuments.short_path(filename),
            )
            print("%i/%i" % (self.no, self.nb))
            if text is False:
                # The converter signals failure with False (None means
                # "no content", handled above).
                print("Error with document: " + filename)
                self.nb_error += 1
            else:
                index().add(
                    filename, "%s\n%s" % (filename, text), date, md5
                )
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate; conversion/indexing errors are best-effort.
        traceback.print_exc()
    return filename, False
def reset_library(self):
    """Ask the user for confirmation, then clear the whole index."""
    confirm = QMessageBox(self)
    confirm.setWindowTitle("Reset the library...")
    confirm.setInformativeText("Are you sure to reset all you index?")
    confirm.setStandardButtons(QMessageBox.Ok | QMessageBox.Cancel)
    answer = confirm.exec()
    # Only wipe the index on an explicit OK; Cancel/close is a no-op.
    if answer == QMessageBox.Ok:
        index().clear()
def do_scan(self, filename, cmds, postprocess):
    """Run a scan, post-process the result and index the produced text.

    :param filename: destination filename for the scanner output.
    :param cmds: scan command list passed to ``self.process``.
    :param postprocess: post-processing command list (e.g. OCR/convert).
    :raises: re-raises any failure after emitting ``scan_error`` with the
        exception message.
    """
    try:
        filename, extension = self.process.process(
            cmds, destination_filename=filename,
        )
    except Exception:
        # Narrowed from bare `except:` (all three handlers below too) so
        # KeyboardInterrupt/SystemExit are not reported as scan errors.
        traceback.print_exc()
        self.scan_error.emit(str(sys.exc_info()[1]))
        raise
    if filename is None:
        return
    self.scan_end.emit(filename)
    try:
        filename, extension = Process().process(
            postprocess, filenames=filename, in_extention=extension,
        )
        # Find the text-extraction config matching the produced extension.
        conv = [
            c for c in edocuments.config.get('to_txt')
            if c['extension'] == extension
        ]
        if len(conv) >= 1:
            conv = conv[0]
            cmds = conv.get("cmds")
            try:
                text, extension = Process().process(
                    cmds, filenames=filename, get_content=True,
                )
                new_md5 = hashlib.md5()
                new_date = Path(filename).stat().st_mtime
                # Hash the file in chunks to bound memory usage.
                with open(str(filename), "rb") as f:
                    for chunk in iter(lambda: f.read(4096), b""):
                        new_md5.update(chunk)
                index().add(
                    filename, text, new_date, new_md5.hexdigest()
                )
            except Exception:
                traceback.print_exc()
                self.scan_error.emit(str(sys.exc_info()[1]))
                raise
    except Exception:
        traceback.print_exc()
        self.scan_error.emit(str(sys.exc_info()[1]))
        raise
def search(self, text):
    """Run a full-text query and fill the result list widget.

    Directories that match get one entry carrying the count of their
    matching children; files whose parent directory also matched are
    folded into that directory entry.  Entries are ordered by a score
    derived from the raw result rank.

    NOTE: the *text* parameter is unused; the query is read from the
    search text widget, as in the original implementation.
    """
    list_model = self.ui.search_result_list.model()
    list_model.removeRows(0, list_model.rowCount())
    hits = index().search(self.ui.search_text.text())
    # Matching directories start at -1 so the final "+ 1" in the label
    # yields the true child count.
    dir_counts = {h.get('path'): -1 for h in hits if h.get('directory')}
    ranked = {}
    for rank, hit in enumerate(hits):
        hit_path = hit.get('path')
        parent = os.path.dirname(hit_path)
        if parent in dir_counts:
            # Parent directory matched too: fold this hit into it.
            dir_counts[parent] += 1
        else:
            ranked[hit_path] = [hit, float(rank) / len(hits)]
    # Boost directory entries by the number of folded children.
    for parent, children in dir_counts.items():
        if parent in ranked:
            ranked[parent][1] += children
    ordered = sorted(ranked.values(), key=lambda entry: -entry[1])
    for hit, score in ordered:
        suffix = ' (%i)' % (score + 1) if hit.get('directory') else ''
        entry = QListWidgetItem(
            hit['path'] + suffix, self.ui.search_result_list
        )
        entry.result = hit
def do_update_library(self):
    """Synchronise the search index with the files on disk.

    Pass 1: scan the existing index, remembering (date, md5) per live
    document and collecting stale entries (duplicates, vanished files,
    non-canonical paths) for deletion.
    Pass 2: index directories that are not yet known.
    Pass 3: walk the files per configured extension, hash them and build
    the ``todo`` list of documents needing (re-)conversion.
    Pass 4: delete stale entries, convert ``todo`` in a thread pool via
    ``self.to_txt`` and optimise the index; progress/errors are reported
    through the ``update_library_progress`` / ``scan_error`` signals.
    """
    docs_to_rm = []
    docs_date = {}
    with index().index.reader() as reader:
        for num, doc in reader.iter_docs():
            if \
                    doc[PATH] in docs_date or \
                    not Path(edocuments.long_path(doc[PATH])).exists() or \
                    doc[PATH] != edocuments.short_path(doc[PATH]):
                print("Delete document: " + doc[PATH])
                docs_to_rm.append(num)
            else:
                docs_date[doc[PATH]] = (doc.get(DATE), doc.get(MD5))
    self.update_library_progress.emit(
        0, 'Adding the directories...', '')
    index_folder = '.index'
    for directory in Path(edocuments.root_folder).rglob('*'):
        dir_ = edocuments.short_path(directory)
        # BUG FIX: the original compared the Path object `directory`
        # against the string '.index' (always unequal); compare the
        # short path string instead so the index folder is skipped.
        if \
                dir_ not in docs_date and \
                directory.is_dir() and \
                dir_ != index_folder:
            ignore = False
            for ignore_pattern in edocuments.config.get('ignore', []):
                if directory.match(ignore_pattern):
                    # BUG FIX: was `ignore = False`, which made ignore
                    # patterns a no-op for directories.
                    ignore = True
                    break
            if not ignore:
                with index().index.writer() as writer:
                    writer.update_document(**{
                        PATH: dir_,
                        CONTENT: dir_,
                        DATE: directory.stat().st_mtime,
                        DIRECTORY: True,
                    })
    self.update_library_progress.emit(
        0, 'Browsing the files (0)...', '')
    index_folder += '/'
    todo = []
    for conv in edocuments.config.get('to_txt'):
        cmds = conv.get("cmds")
        for filename in Path(edocuments.root_folder).rglob(
                "*." + conv.get('extension')):
            ignore = False
            for ignore_pattern in edocuments.config.get('ignore', []):
                # BUG FIX: the original tested `directory.match(...)`,
                # the stale variable from the directories loop above, and
                # set `ignore = False` on a match — ignore patterns never
                # applied to files.
                if filename.match(ignore_pattern):
                    ignore = True
                    break
            if not ignore and filename.exists() \
                    and str(filename).find(index_folder) != 0:
                current_date, md5 = docs_date.get(
                    edocuments.short_path(filename), (None, None))
                new_date = filename.stat().st_mtime
                new_md5 = hashlib.md5()
                # Hash in chunks to bound memory usage.
                with open(str(filename), "rb") as f:
                    for chunk in iter(lambda: f.read(4096), b""):
                        new_md5.update(chunk)
                if current_date is None or new_date > current_date:
                    if current_date is not None and (
                            md5 is None or md5 == new_md5.hexdigest()):
                        # Touched but content unchanged: refresh the
                        # stored date/md5, keep the indexed text.
                        doc = index().get(filename)
                        index().add(
                            filename, doc[CONTENT],
                            max(new_date, current_date),
                            new_md5.hexdigest()
                        )
                    else:
                        # New or modified content: queue for conversion.
                        print("Add document: " +
                              edocuments.short_path(filename))
                        todo.append((
                            str(filename), cmds, new_date,
                            new_md5.hexdigest()))
                    self.update_library_progress.emit(
                        0, 'Browsing the files (%i)...' % len(todo),
                        edocuments.short_path(filename))
    self.nb = len(todo)
    self.nb_error = 0
    self.no = 0
    print('Removes %i old documents.' % len(docs_to_rm))
    with index().index.writer() as writer:
        for num in docs_to_rm:
            writer.delete_document(num)
    self.update_library_progress.emit(
        0, 'Parsing the files %i/%i.' % (self.no, self.nb), '',
    )
    print('Process %i documents.' % len(todo))
    with ThreadPoolExecutor(
        max_workers=edocuments.config.get('nb_process', 8)
    ) as executor:
        future_results = {
            executor.submit(self.to_txt, t): t for t in todo
        }
        # Drain the futures; to_txt reports progress/errors itself.
        for feature in as_completed(future_results):
            pass
    self.update_library_progress.emit(
        0, 'Optimise the index...', '',
    )
    index().optimize()
    if self.nb_error != 0:
        self.scan_error.emit("Finished with %i errors" % self.nb_error)
    else:
        self.update_library_progress.emit(
            100, 'Finish', '',
        )
def optimize_library(self):
    """Compact the on-disk search index (delegates to the index object)."""
    index().optimize()