def main():
    old_folder_name = "Test"
    new_folder_name = old_folder_name + "_copy"
    os.mkdir(new_folder_name)

    pool = Pool(5)
    queue = Manager().Queue()

    file_list = os.listdir(old_folder_name)
    file_size = len(file_list)

    for file in file_list:
        pool.apply_async(
            copy_file_worker,
            (file, old_folder_name, new_folder_name, queue))

    num = 0
    while num < file_size:
        queue.get()
        num += 1
        print("Copy Rate: %.2f%%" % (num / file_size * 100))

    print("Done!")
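
# main() above assumes a copy_file_worker helper that copies a single file and
# then reports back on the shared queue. That helper is not shown in this
# snippet; a minimal sketch (name and signature taken from the call in main(),
# body assumed) could look like this, with the imports listed in case they are
# not already present:

import os
import shutil
from multiprocessing import Pool, Manager


def copy_file_worker(file_name, old_folder_name, new_folder_name, queue):
    # Copy one file from the source folder to the destination folder
    shutil.copy(
        os.path.join(old_folder_name, file_name),
        os.path.join(new_folder_name, file_name))
    # Tell the parent process this file is done so main() can update progress
    queue.put(file_name)


if __name__ == '__main__':
    # The guard matters: multiprocessing re-imports this module in child
    # processes on spawn-based platforms (Windows, macOS)
    main()
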
class BookSorter:
    def __init__(self, file_list, mode, database_path, auto_tags=True, temp_dir=None):
        # Have the GUI pass a list of files straight to here
        # Then, on the basis of what is needed, pass the
        # filenames to the requisite functions
        # This includes getting file info for the database
        # Parsing for the reader proper
        # Caching upon closing
        self.file_list = [i for i in file_list if os.path.exists(i)]
        self.statistics = [0, (len(file_list))]
        self.hashes_and_paths = {}
        self.work_mode = mode[0]
        self.addition_mode = mode[1]
        self.database_path = database_path
        self.auto_tags = auto_tags
        self.temp_dir = temp_dir
        if database_path:
            self.database_hashes()

        self.threading_completed = []
        self.queue = Manager().Queue()
        self.processed_books = []

        if self.work_mode == 'addition':
            progress_object_generator()

    def database_hashes(self):
        all_hashes_and_paths = database.DatabaseFunctions(
            self.database_path).fetch_data(
                ('Hash', 'Path'),
                'books',
                {'Hash': ''},
                'LIKE')

        if all_hashes_and_paths:
            # self.hashes = [i[0] for i in all_hashes]
            self.hashes_and_paths = {
                i[0]: i[1] for i in all_hashes_and_paths}

    def database_entry_for_book(self, file_hash):
        database_return = database.DatabaseFunctions(
            self.database_path).fetch_data(
                ('Title', 'Author', 'Year', 'ISBN', 'Tags',
                 'Position', 'Bookmarks', 'CoverImage', 'Annotations'),
                'books',
                {'Hash': file_hash},
                'EQUALS')[0]

        book_data = []
        for count, i in enumerate(database_return):
            if count in (5, 6, 8):  # Position, Bookmarks, and Annotations are pickled
                if i:
                    book_data.append(pickle.loads(i))
                else:
                    book_data.append(None)
            else:
                book_data.append(i)

        return book_data

    def read_book(self, filename):
        # filename is expected as a string containing the
        # full path of the ebook file

        with open(filename, 'rb') as current_book:
            # This should speed up addition for larger files
            # without compromising the integrity of the process
            first_bytes = current_book.read(1024 * 32)  # First 32KB of the file
            file_md5 = hashlib.md5(first_bytes).hexdigest()

        # Update the progress queue
        self.queue.put(filename)

        # This should not get triggered in reading mode
        # IF the file is NOT being loaded into the reader

        # Do not allow addition in case the file
        # is already in the database and it remains at its original path
        if self.work_mode == 'addition' and file_md5 in self.hashes_and_paths:
            if (self.hashes_and_paths[file_md5] == filename
                    or os.path.exists(self.hashes_and_paths[file_md5])):
                if not self.hashes_and_paths[file_md5] == filename:
                    print(
                        f'{os.path.basename(filename)} is already in database')
                return

        # This allows for eliminating issues with filenames that have
        # a dot in them. All hail the roundabout fix.
        valid_extension = False
        for i in sorter:
            if os.path.basename(filename).endswith(i):
                file_extension = i
                valid_extension = True
                break

        if not valid_extension:
            print(filename + ' has an unsupported extension')
            return

        book_ref = sorter[file_extension](filename, self.temp_dir, file_md5)

        # Everything following this is standard
        # None values are accounted for here
        is_valid = book_ref.read_book()
        if not is_valid:
            print('Cannot parse: ' + filename)
            return

        if book_ref.book:
            this_book = {}
            this_book[file_md5] = {
                'hash': file_md5,
                'path': filename}

            # Different modes require different values
            if self.work_mode == 'addition':
                # Reduce the size of the incoming image
                # if one is found
                title = book_ref.get_title()
                author = book_ref.get_author()
                year = book_ref.get_year()
                isbn = book_ref.get_isbn()

                tags = None
                if self.auto_tags:
                    tags = book_ref.get_tags()

                cover_image_raw = book_ref.get_cover_image()
                if cover_image_raw:
                    cover_image = resize_image(cover_image_raw)
                else:
                    cover_image = None

                this_book[file_md5]['cover_image'] = cover_image
                this_book[file_md5]['addition_mode'] = self.addition_mode

            if self.work_mode == 'reading':
                all_content = book_ref.get_contents()

                # get_contents() returns a tuple. Index 1 is a collection of
                # special settings that depend on the kind of data being parsed.
                # Currently, this includes:
                # Only images included    images_only    BOOL    Book contains only images

                content = all_content[0]
                images_only = all_content[1]['images_only']

                if not content:
                    content = [('Invalid', 'Something went horribly wrong')]

                book_data = self.database_entry_for_book(file_md5)
                title = book_data[0]
                author = book_data[1]
                year = book_data[2]
                isbn = book_data[3]
                tags = book_data[4]
                position = book_data[5]
                bookmarks = book_data[6]
                cover = book_data[7]
                annotations = book_data[8]

                this_book[file_md5]['position'] = position
                this_book[file_md5]['bookmarks'] = bookmarks
                this_book[file_md5]['content'] = content
                this_book[file_md5]['images_only'] = images_only
                this_book[file_md5]['cover'] = cover
                this_book[file_md5]['annotations'] = annotations

            this_book[file_md5]['title'] = title
            this_book[file_md5]['author'] = author
            this_book[file_md5]['year'] = year
            this_book[file_md5]['isbn'] = isbn
            this_book[file_md5]['tags'] = tags

            return this_book

    def read_progress(self):
        while True:
            processed_file = self.queue.get()
            self.threading_completed.append(processed_file)

            total_number = len(self.file_list)
            completed_number = len(self.threading_completed)

            if progress_emitter:  # Skip update in reading mode
                progress_emitter.update_progress(
                    completed_number * 100 // total_number)

            if total_number == completed_number:
                break

    def initiate_threads(self):
        if not self.file_list:
            return None

        def pool_creator():
            _pool = Pool(5)
            self.processed_books = _pool.map(
                self.read_book, self.file_list)
            _pool.close()
            _pool.join()

        start_time = time.time()

        worker_thread = threading.Thread(target=pool_creator)
        progress_thread = threading.Thread(target=self.read_progress)
        worker_thread.start()
        progress_thread.start()

        worker_thread.join()
        progress_thread.join(timeout=.5)

        return_books = {}
        # Exclude None returns generated in case of duplication / parse errors
        self.processed_books = [i for i in self.processed_books if i]
        for i in self.processed_books:
            for j in i:
                return_books[j] = i[j]

        del self.processed_books
        print('Finished processing in', time.time() - start_time)

        return return_books
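
# A rough usage sketch for the class above. The paths, mode tuple, and
# database location below are placeholders for whatever the GUI would
# normally supply; they are not defined anywhere in this code:

def example_addition_run():
    books_to_add = ['/path/to/first.epub', '/path/to/second.mobi']
    sorter_instance = BookSorter(
        books_to_add,
        ('addition', 'manual'),   # (work_mode, addition_mode) - placeholder values
        '/path/to/library.db')
    # initiate_threads() returns a dictionary keyed by each file's hash,
    # or None if nothing in the list existed on disk
    added_books = sorter_instance.initiate_threads()
    return added_books
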
class BookSorter:
    def __init__(self, file_list, mode, database_path, settings, temp_dir=None):
        # Have the GUI pass a list of files straight to here
        # Then, on the basis of what is needed, pass the
        # filenames to the requisite functions
        # This includes getting file info for the database
        # Parsing for the reader proper
        # Caching upon closing
        self.file_list = [i for i in file_list if os.path.exists(i)]
        self.statistics = [0, (len(file_list))]
        self.hashes_and_paths = {}
        self.work_mode = mode[0]
        self.addition_mode = mode[1]
        self.database_path = database_path
        self.auto_tags = settings['auto_tags']
        self.auto_cover = settings['auto_cover']
        self.temp_dir = temp_dir
        if database_path:
            self.database_hashes()

        self.threading_completed = []
        self.queue = Manager().Queue()
        self.errors = Manager().list()
        self.processed_books = []

        if self.work_mode == 'addition':
            progress_object_generator()

    def database_hashes(self):
        all_hashes_and_paths = database.DatabaseFunctions(
            self.database_path).fetch_data(
                ('Hash', 'Path'),
                'books',
                {'Hash': ''},
                'LIKE')

        if all_hashes_and_paths:
            self.hashes_and_paths = {
                i[0]: i[1] for i in all_hashes_and_paths}

    def database_entry_for_book(self, file_hash):
        database_return = database.DatabaseFunctions(
            self.database_path).fetch_data(
                ('Title', 'Author', 'Year', 'ISBN', 'Tags',
                 'Position', 'Bookmarks', 'CoverImage', 'Annotations'),
                'books',
                {'Hash': file_hash},
                'EQUALS')[0]

        book_data = []
        for count, i in enumerate(database_return):
            if count in (5, 6, 8):  # Position, Bookmarks, and Annotations are pickled
                if i:
                    book_data.append(pickle.loads(i))
                else:
                    book_data.append(None)
            else:
                book_data.append(i)

        return book_data

    def read_book(self, filename):
        # filename is expected as a string containing the
        # full path of the ebook file

        with open(filename, 'rb') as current_book:
            # This should speed up addition for larger files
            # without compromising the integrity of the process
            first_bytes = current_book.read(1024 * 32)  # First 32KB of the file
            file_md5 = hashlib.md5(first_bytes).hexdigest()

        # Update the progress queue
        self.queue.put(filename)

        # This should not get triggered in reading mode
        # IF the file is NOT being loaded into the reader

        # Do not allow addition in case the file
        # is already in the database and it remains at its original path
        if self.work_mode == 'addition' and file_md5 in self.hashes_and_paths:
            if (self.hashes_and_paths[file_md5] == filename
                    or os.path.exists(self.hashes_and_paths[file_md5])):
                if not self.hashes_and_paths[file_md5] == filename:
                    warning_string = (
                        f'{os.path.basename(filename)} is already in database')
                    logger.warning(warning_string)
                return

        # This allows for eliminating issues with filenames that have
        # a dot in them. All hail the roundabout fix.
        valid_extension = False
        for i in sorter:
            if os.path.basename(filename).endswith(i):
                file_extension = i
                valid_extension = True
                break

        if not valid_extension:
            this_error = 'Unsupported extension: ' + filename
            self.errors.append(this_error)
            logger.error(this_error)
            return

        book_ref = sorter[file_extension](filename, self.temp_dir, file_md5)

        # None of the following have an exception type specified
        # This will keep everything from crashing, but will make
        # troubleshooting difficult
        # TODO
        # In application notifications

        try:
            book_ref.read_book()
        except Exception as e:
            this_error = f'Error initializing: {filename}'
            self.errors.append(this_error)
            logger.exception(
                this_error + f' {type(e).__name__} Arguments: {e.args}')
            return

        this_book = {}
        this_book[file_md5] = {
            'hash': file_md5,
            'path': filename}

        # Different modes require different values
        if self.work_mode == 'addition':
            try:
                metadata = book_ref.generate_metadata()
            except Exception as e:
                this_error = f'Metadata generation error: {filename}'
                self.errors.append(this_error)
                logger.exception(
                    this_error + f' {type(e).__name__} Arguments: {e.args}')
                return

            title = metadata.title
            author = metadata.author
            year = metadata.year
            isbn = metadata.isbn

            tags = None
            if self.auto_tags:
                tags = metadata.tags

            cover_image_raw = metadata.cover
            if cover_image_raw:
                cover_image = resize_image(cover_image_raw)
            else:
                cover_image = None
                if self.auto_cover:
                    cover_image = fetch_cover(title, author)

            this_book[file_md5]['cover_image'] = cover_image
            this_book[file_md5]['addition_mode'] = self.addition_mode

        if self.work_mode == 'reading':
            try:
                book_breakdown = book_ref.generate_content()
            except Exception as e:
                this_error = f'Content generation error: {filename}'
                self.errors.append(this_error)
                logger.exception(
                    this_error + f' {type(e).__name__} Arguments: {e.args}')
                return

            toc = book_breakdown[0]
            content = book_breakdown[1]
            images_only = book_breakdown[2]

            try:
                book_data = self.database_entry_for_book(file_md5)
            except TypeError:
                logger.error(
                    f'Database error: {filename}. Re-add book to program')
                return

            title = book_data[0].replace('&', '&&')
            author = book_data[1]
            year = book_data[2]
            isbn = book_data[3]
            tags = book_data[4]
            position = book_data[5]
            bookmarks = book_data[6]
            cover = book_data[7]
            annotations = book_data[8]

            this_book[file_md5]['position'] = position
            this_book[file_md5]['bookmarks'] = bookmarks
            this_book[file_md5]['toc'] = toc
            this_book[file_md5]['content'] = content
            this_book[file_md5]['images_only'] = images_only
            this_book[file_md5]['cover'] = cover
            this_book[file_md5]['annotations'] = annotations

        this_book[file_md5]['title'] = title
        this_book[file_md5]['author'] = author
        this_book[file_md5]['year'] = year
        this_book[file_md5]['isbn'] = isbn
        this_book[file_md5]['tags'] = tags

        return this_book

    def read_progress(self):
        while True:
            processed_file = self.queue.get()
            self.threading_completed.append(processed_file)

            total_number = len(self.file_list)
            completed_number = len(self.threading_completed)

            # Just for the record, this slows down book searching by about 20%
            if _progress_emitter:  # Skip update in reading mode
                _progress_emitter.update_progress(
                    completed_number * 100 // total_number)

            if total_number == completed_number:
                break

    def initiate_threads(self):
        if not self.file_list:
            return None

        def pool_creator():
            _pool = Pool(thread_count)
            self.processed_books = _pool.map(
                self.read_book, self.file_list)
            _pool.close()
            _pool.join()

        start_time = time.time()

        worker_thread = threading.Thread(target=pool_creator)
        progress_thread = threading.Thread(target=self.read_progress)
        worker_thread.start()
        progress_thread.start()

        worker_thread.join()
        progress_thread.join(timeout=.5)

        return_books = {}
        # Exclude None returns generated in case of duplication / parse errors
        self.processed_books = [i for i in self.processed_books if i]
        for i in self.processed_books:
            for j in i:
                return_books[j] = i[j]

        del self.processed_books
        processing_time = str(time.time() - start_time)
        logger.info('Finished processing in ' + processing_time)

        return return_books, self.errors
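
# In this revision initiate_threads() returns a (books, errors) pair, with
# errors collected across worker processes in the shared Manager().list().
# A caller would unpack both; the settings keys mirror what __init__ reads,
# and the paths and mode values below are again placeholders:

def example_addition_run_with_errors():
    settings = {'auto_tags': True, 'auto_cover': False}
    sorter_instance = BookSorter(
        ['/path/to/book.epub'],
        ('addition', 'manual'),   # (work_mode, addition_mode) - placeholder values
        '/path/to/library.db',
        settings)
    result = sorter_instance.initiate_threads()
    if result:  # None is returned when the file list is empty
        added_books, addition_errors = result
        for error_string in addition_errors:
            print(error_string)
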
        return True
        # return "done"

    def _upload_pool(self):
        pool = ThreadPool(processes=self.n_treads)
        res = pool.map(self._upload_file, self.file_list)
        n_res = len(res)
        res_dict = Counter(res)
        succes = res_dict[True]
        self.q.put("Uploaded {}/{}".format(succes, n_res))
        self.is_active = False

    def start(self):
        proc = Process(target=self._upload_pool)
        proc.start()


if __name__ == '__main__':
    q = Manager().Queue()
    files_list = [
        os.path.join(os.getcwd(), "test_data", i)
        for i in os.listdir("./test_data")
    ]
    uploader = Uploader(files_list, 2, q)
    uploader.start()

    while uploader.is_active:
        progress = q.get()
        print(progress)