# These functions reference project-local modules (database, settings,
# googledrive, filecrawler, uploader, progressbar, and `db` as an alias for the
# database module in the listing helpers); those imports are assumed available.
import datetime
import logging
import os
import time

# Assumed module-level constants; the values are the ones used inside
# get_blacklisted_archives() below.
SETTINGS_FILE = "_settings.ini"
DATA_FILE = "_backuper.ini"


def delete_removed_from_local_db(google, local_path, dry_run=False):
    print("Deleting files removed from database from Google Drive ({}) ...".format(local_path)
          + (" (dry)" if dry_run else ""))
    db = database.GoogleDriveDB()
    archives = []
    q = db.model.select().where(db.model.path.startswith(local_path))
    for archive in q.namedtuples().iterator():
        if not os.path.exists(archive.path):
            archives.append(archive)
    removed = set()
    for archive in archives:
        if dry_run:
            print(archive.path, archive.drive_id)
            continue
        # Minimizing GD API calls is key for speed.
        if archive.drive_id in removed:
            continue
        print(archive.path, archive.drive_id)
        google.delete(archive.drive_id)
        # If this is a folder, then all the children will get removed as well.
        q = db.model.select().where(db.model.path.startswith(archive.path))
        for arch in q.iterator():
            removed.add(arch.drive_id)
            arch.delete_instance()
            logging.info("Removed {} ({}) from database and Google Drive.".format(arch.drive_id, arch.path))
    db.close()

def get_blacklisted_archives():
    SETTINGS_FILE = "_settings.ini"
    DATA_FILE = "_backuper.ini"
    conf = settings.Settings(SETTINGS_FILE, DATA_FILE)
    with database.GoogleDriveDB() as db:
        for archive in db.model.select().iterator():
            if conf.is_blacklisted_parent(archive.path, conf.sync_dirs):
                yield archive

def delete_nonlocal_in_gd(google, folder_id, dry_run=False):
    """Removes all files in the given remote folder that don't exist locally.

    SLOW.
    """
    print("Deleting nonlocal files from Google Drive ({}) ...".format(folder_id)
          + (" (dry)" if dry_run else ""))
    db = database.GoogleDriveDB()
    # Because the remote folder can contain files not in the database,
    # we have no choice but to walk through it.
    removed_archived = []
    removed_unarchived = []
    for dirpath, dirnames, filenames in google.walk_folder(folder_id):
        remote_path, file_id = dirpath
        # print(remote_path)
        archive = db.get("drive_id", file_id)
        if archive:
            if not os.path.exists(archive.path):
                removed_archived.append(archive)
                dirnames.clear()
                continue
        else:
            # A file that exists remotely but isn't in the database
            # is definitely safe to remove.
            # If a file exists with the same path as a local file, it should
            # still get removed because the local file will get uploaded
            # and put in the database.
            removed_unarchived.append((file_id, remote_path))
            dirnames.clear()
            continue
        for resp in filenames:
            file_id = resp["id"]
            archive = db.get("drive_id", file_id)
            if archive:
                if not os.path.exists(archive.path):
                    removed_archived.append(archive)
            else:
                name = resp["name"]
                removed_unarchived.append((file_id, os.path.join(remote_path, name)))
    # Database entries are removed because the files in question exist
    # neither locally nor remotely.
    for archive in removed_archived:
        # If a folder got removed, all children got removed as well.
        if dry_run:
            print(archive.drive_id, archive.path)
        else:
            google.delete(archive.drive_id)
            logging.info("Removed {} ({}) from database and Google Drive.".format(archive.drive_id, archive.path))
            q = db.model.delete().where(db.model.path.startswith(archive.path))
            q.execute()
    for file_id, remote_path in removed_unarchived:
        if dry_run:
            print(file_id, remote_path)
        else:
            google.delete(file_id)  # logging.info in google.delete
    db.close()

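# Hedged usage sketch, not part of the original module: one plausible way to run
# the two deletion passes above with a dry run first. The googledrive.GoogleDrive()
# constructor is taken from the test helpers below; `root_folder_id` and
# `local_root` are hypothetical placeholders.
def _example_cleanup_sketch(root_folder_id, local_root):
    google = googledrive.GoogleDrive()
    # Preview what would be deleted, without touching Google Drive or the database.
    delete_removed_from_local_db(google, local_root, dry_run=True)
    delete_nonlocal_in_gd(google, root_folder_id, dry_run=True)
    # Once the dry-run output looks right, run the destructive passes.
    delete_removed_from_local_db(google, local_root)
    delete_nonlocal_in_gd(google, root_folder_id)
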
def test_drivecrawler_folder(folder_id):
    db = database.GoogleDriveDB()
    conf = settings.Settings(SETTINGS_FILE, DATA_FILE)
    crawler = filecrawler.DriveFileCrawler(conf, googledrive.GoogleDrive())
    for obj in crawler.get_ids_to_download_in_folder(folder_id):
        print(obj)
    conf.exit()
    db.close()

def remove_blacklisted_paths(google):
    """Removes archived blacklisted paths from Google Drive and the database."""
    print("Deleting blacklisted files from Google Drive ...")
    db = database.GoogleDriveDB()
    archives = list(get_blacklisted_archives())
    for archive in progressbar.progressbar(archives):
        google.delete(archive.drive_id)
        logging.info("Removed {} ({}) from database and/or Google Drive.".format(archive.drive_id, archive.path))
        archive.delete_instance()
    db.close()

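# Hedged sketch, not in the original module: preview which archives
# remove_blacklisted_paths() would delete, using the generator above without
# performing any Google Drive calls.
def _example_preview_blacklisted_sketch():
    for archive in get_blacklisted_archives():
        print(archive.drive_id, archive.path)
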
def remove_gd_nonexistent_from_db(google, config):
    """Rebuild the database by removing files that no longer exist in Google Drive from the database archive.

    Used for maintenance.
    """
    print("Removing non-existent files in Google Drive from the database ...")
    logging.info("remove_gd_nonexistent_from_db()")
    with database.GoogleDriveDB() as db:
        for archive in db.model.select().iterator():
            if google.exists(archive.drive_id):
                continue
            if not os.path.exists(archive.path) or config.is_blacklisted(archive.path):
                logging.info("Removed {} from database.".format(archive.path))
                archive.delete_instance()

def get_unarchived_files_in_google_drive(google, folder_id):
    db = database.GoogleDriveDB()
    for dirpath, dirnames, filenames in google.walk_folder(folder_id, fields="files(id,name)"):
        file_id = dirpath[1]
        archive = db.get("drive_id", file_id)
        if archive is None:
            yield file_id
        for resp in filenames:
            file_id = resp["id"]
            archive = db.get("drive_id", file_id)
            if archive is None:
                yield file_id
    db.close()

def test_localfilecrawler():
    db = database.GoogleDriveDB()
    conf = settings.Settings(SETTINGS_FILE, DATA_FILE)
    crawler = filecrawler.LocalFileCrawler(conf)
    for p in crawler.get_folders_to_sync("tests/"):
        print(p)
    for p in crawler.get_files_to_sync("tests/"):
        print(p)
    for p in crawler.get_all_paths_to_sync("tests/"):
        print(p)
    conf.exit()
    db.close()

def test_drivecrawler_changes():
    db = database.GoogleDriveDB()
    conf = settings.Settings(SETTINGS_FILE, DATA_FILE)
    g = googledrive.GoogleDrive()
    crawler = filecrawler.DriveFileCrawler(conf, g)
    change_date = datetime.datetime(2019, 5, 20)
    change_date = googledrive.convert_datetime_to_google_time(change_date)
    conf.data_file.set_last_download_sync_time(change_date)
    conf.data_file.set_last_download_change_token(989626)
    for obj in crawler.get_changes_to_download():
        print(obj)
    g.exit()
    conf.exit()
    db.close()

def delete_all_removed_from_local_db(google):
    """:WARNING: Delete files removed from disk from Google Drive and the database."""
    print("Deleting files removed from disk from Google Drive ...")
    # 403 errors are the enemy. There is nothing that can be done.
    # The officially recommended strategy, exponential backoff, doesn't work.
    # Requests must be intentionally throttled to avoid getting stuck in a 403 loop.
    # For that reason, this function is single-threaded and unbatched.
    db = database.GoogleDriveDB()
    archives = list(get_all_removed_from_local_db())
    for archive in progressbar.progressbar(archives):
        google.delete(archive.drive_id)
        logging.info("Removed {} ({}) from database and/or Google Drive.".format(archive.drive_id, archive.path))
        archive.delete_instance()
    db.close()

def delete_all_removed_from_local_db_batched(google):
    """:WARNING: Delete files removed from disk from Google Drive and the database."""
    print("\rDeleting files removed from disk from Google Drive ...")
    RETRY_LIMIT = 5
    db = database.GoogleDriveDB()
    ids = {rem.drive_id for rem in get_all_removed_from_local_db()}
    retry_ids = set()
    retry_count = 0
    pbar = progressbar.progressbar(total=len(ids))

    def _batch_delete_callback(file_id, _, exception):
        nonlocal retry_count
        if exception is not None and exception.resp.status != 404:
            if exception.resp.status == 403:
                # Rate limit exceeded (probably).
                if retry_count >= RETRY_LIMIT:
                    raise exception
                logging.warning("RETRYING: " + repr(exception))
                retry_ids.add(file_id)
                time.sleep(2**retry_count)
                retry_count += 1
            else:
                raise exception
        else:
            if exception is not None and exception.resp.status == 404:
                # File does not exist.
                logging.warning("IGNORING: " + repr(exception))
            retry_count = 0
            archive = db.get("drive_id", file_id)
            pbar.update()
            logging.info("Removed {} ({}) from database and/or Google Drive.".format(archive.drive_id, archive.path))
            archive.delete_instance()

    google.batch_delete(ids, callback=_batch_delete_callback)
    while len(retry_ids) > 0:
        ids = set(retry_ids)
        retry_ids.clear()
        google.batch_delete(ids, callback=_batch_delete_callback)
    db.close()

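# Hedged usage sketch, not part of the original module: choosing between the two
# delete paths defined above. The comments in delete_all_removed_from_local_db()
# suggest the single-threaded, unbatched variant as the safer default because of
# 403 rate-limit loops; the batched variant trades that robustness for fewer
# HTTP round trips.
def _example_delete_removed_sketch(batched=False):
    google = googledrive.GoogleDrive()
    if batched:
        delete_all_removed_from_local_db_batched(google)
    else:
        delete_all_removed_from_local_db(google)
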
def db_upload_test(path):
    db = database.GoogleDriveDB()
    conf = settings.Settings(SETTINGS_FILE, DATA_FILE)
    google = googledrive.GoogleDrive()
    file_crawler = filecrawler.LocalFileCrawler(conf)
    drive_uploader = uploader.DBDriveUploader(google, update_db=True)
    folder_id = make_folder_structure(path, drive_uploader, file_crawler)
    q = drive_uploader.start_upload_queue(n_threads=4)
    for fpath in file_crawler.get_files_to_sync(path):
        q.put(drive_uploader.DUQEntry(fpath))
    drive_uploader.wait_for_queue(q)
    input("Press any key to clean up.")
    google.delete(folder_id)
    entry = database.unify_path(path)
    query = db.model.select().where(db.model.path.contains(entry))
    for archive in query.iterator():
        archive.delete_instance()
    conf.exit()
    db.close()

def list_paths_contains(path):
    with db.GoogleDriveDB() as gddb:
        q = gddb.model.select().where(gddb.model.path.contains(path))
        for archive in q.iterator():
            print(archive.path)

def list_all():
    with db.GoogleDriveDB() as gddb:
        for archive in gddb:
            print(archive.path)

def get_all_removed_from_local_db():
    with database.GoogleDriveDB() as db:
        for archive in db:
            if not os.path.exists(archive.path):
                yield archive