def main():
    from argparse import ArgumentParser
    parser = ArgumentParser(description='Print md5sum(s) for file/directory')
    parser.add_argument('credentials_path',
                        help='Path to credentials json file')
    parser.add_argument('path',
                        help='Path of file/directory to print md5sum(s) for')
    args = parser.parse_args()

    credentials_name = os.path.splitext(os.path.basename(
        args.credentials_path))[0]
    configure_logging('md5sum-{}.log'.format(credentials_name))

    creds = authenticate(args.credentials_path,
                         token_path=credentials_name + '.pickle')
    service = build('drive', 'v3', credentials=creds)
    files = service.files()

    LOGGER.debug('Printing md5sum(s) under {} ...'.format(args.path))
    for (root, dir_entries, file_entries) in walk(
            args.path,
            files,
            fields=('id', 'name', 'mimeType', 'md5Checksum')):
        for file_entry in file_entries:
            if file_entry.get('name') and file_entry.get('md5Checksum'):
                print('{} {}'.format(file_entry['md5Checksum'],
                                     file_entry['name']))
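# NOTE: configure_logging, authenticate, walk, LOGGER, and the other helpers
# used above are defined elsewhere in the repository. As a rough sketch of
# what authenticate might look like given how it is called (credentials JSON
# in, pickled token cache at token_path), based on the standard
# google-auth-oauthlib installed-app flow; the SCOPES value is an assumption:
import os
import pickle

from google.auth.transport.requests import Request
from google_auth_oauthlib.flow import InstalledAppFlow

SCOPES = ['https://www.googleapis.com/auth/drive']  # assumed scope


def authenticate(credentials_path, token_path='token.pickle'):
    """Return cached OAuth credentials, refreshing or re-authorizing as needed."""
    creds = None
    if os.path.exists(token_path):
        with open(token_path, 'rb') as token_file:
            creds = pickle.load(token_file)
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                credentials_path, SCOPES)
            creds = flow.run_local_server(port=0)
        with open(token_path, 'wb') as token_file:
            pickle.dump(creds, token_file)
    return creds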
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser(
        description='Copy files shared from an address and remove shared versions')
    parser.add_argument('credentials_path',
                        help='Path to credentials json file')
    parser.add_argument('from_email',
                        help='Email address whose shared files will be copied')
    parser.add_argument('to_email',
                        help='Email address to which files will be copied')
    args = parser.parse_args()

    credentials_name = os.path.splitext(os.path.basename(
        args.credentials_path))[0]
    configure_logging('migrate-drive-3-{}.log'.format(credentials_name))

    creds = authenticate(args.credentials_path,
                         token_path=credentials_name + '.pickle')
    service = build('drive', 'v3', credentials=creds)
    files = service.files()
    perms = service.permissions()
    parents_cache = FileCache(files)

    LOGGER.debug('Searching for files owned by {} ...'.format(args.from_email))
    file_request = files.list(
        q="'{}' in owners and not mimeType contains 'application/vnd.google-apps'"
        .format(args.from_email),
        pageSize=100,
        fields="nextPageToken, files(id, name, starred, owners, parents)")
    for (f, _) in service_method_iter(file_request, 'files', files.list_next):
        try:
            if not all(
                    parents_cache.is_owned(parent_id)
                    for parent_id in f.get('parents', [])):
                LOGGER.warning(
                    'Skipping {} in folder owned by someone else'.format(
                        f['name']))
            else:
                LOGGER.info('Copying {} and removing {}'.format(
                    f['name'], args.from_email))
                copy_response = files.copy(
                    fileId=f['id'],
                    enforceSingleParent=True,
                    fields='id',
                    body=dict(
                        (k, f[k]) for k in ('name', 'starred'))).execute()
                remove_user_permissions(perms, copy_response['id'],
                                        args.from_email)
                LOGGER.debug(
                    'Copied file id: {}; removing {} from original {}'.format(
                        copy_response['id'], args.to_email, f['id']))
                remove_user_permissions(perms, f['id'], args.to_email)
        except HttpError as ex:
            LOGGER.warning('Caught exception: {}'.format(ex))
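# The pagination helper service_method_iter is not shown in this section.
# Judging from its call sites, it yields each item together with the page
# response it came from and uses the client library's *_next methods to page
# through results; a minimal sketch under that assumption:
def service_method_iter(request, items_key, next_method):
    """Yield (item, response) pairs across every page of a list request.

    next_method is the matching *_next method of the collection, e.g.
    files.list_next or perms.list_next.
    """
    while request is not None:
        response = request.execute()
        for item in response.get(items_key, []):
            yield (item, response)
        request = next_method(request, response)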
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser(
        description='Load and compare metadata from drive files')
    parser.add_argument('old_path',
                        help='Path to jsonl metadata for old drive')
    parser.add_argument('new_path',
                        help='Path to jsonl metadata for new drive')
    args = parser.parse_args()

    configure_logging('compare-drive-metadata.log')

    drive_files = dict(old=DriveFiles(), new=DriveFiles())
    for (version, path) in (('old', args.old_path), ('new', args.new_path)):
        LOGGER.info('Loading {} data from {}'.format(version, path))
        with open(path) as f:
            for line in f:
                drive_files[version].add(json.loads(line))

    metadata_map = dict()
    for version in ('old', 'new'):
        for df in drive_files[version].list():
            key = (df.name, df.size, df.md5_checksum)
            if key not in metadata_map:
                metadata_map[key] = dict(old=[], new=[])
            metadata_map[key][version].append(df)

    LOGGER.info('Looking for files in old drive but not in new')
    for key in sorted(metadata_map, key=lambda k: k[1], reverse=True):
        copies_map = metadata_map[key]
        if len(copies_map['old']) > len(copies_map['new']):
            example_df = copies_map['old'][0]
            LOGGER.info('{:<8}: {:<20}: {:>3} x old, {:>3} x new'.format(
                example_df.human_friendly_size, example_df.name,
                len(copies_map['old']), len(copies_map['new'])))
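# DriveFiles and the per-file records it returns are defined elsewhere. As a
# minimal sketch covering only the attributes this script touches (name, size,
# md5 checksum, human-friendly size); the real class also tracks parents,
# permissions, paths, and error/trashed flags used by the other scripts:
class DriveFile:
    """One metadata record loaded from a jsonl line (field names assumed)."""

    def __init__(self, metadata):
        self.metadata = metadata
        self.name = metadata.get('name')
        self.size = int(metadata.get('size') or 0)
        self.md5_checksum = metadata.get('md5Checksum')

    @property
    def human_friendly_size(self):
        size = float(self.size)
        for unit in ('B', 'KiB', 'MiB', 'GiB'):
            if size < 1024:
                return '{:.1f}{}'.format(size, unit)
            size /= 1024
        return '{:.1f}TiB'.format(size)


class DriveFiles:
    """Append-only collection of DriveFile records."""

    def __init__(self):
        self._files = []

    def add(self, metadata):
        df = DriveFile(metadata)
        self._files.append(df)
        return df

    def list(self):
        return list(self._files)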
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser(
        description='Remove old email from files owned by new email')
    parser.add_argument('credentials_path',
                        help='Path to credentials json file')
    parser.add_argument('from_email',
                        help='Email address to remove from shared files')
    parser.add_argument('to_email',
                        help='Email address owning files to be updated')
    args = parser.parse_args()

    credentials_name = os.path.splitext(os.path.basename(
        args.credentials_path))[0]
    configure_logging('migrate-drive-2-{}.log'.format(credentials_name))

    creds = authenticate(args.credentials_path,
                         token_path=credentials_name + '.pickle')
    service = build('drive', 'v3', credentials=creds)
    files = service.files()
    perms = service.permissions()
    parents_cache = FileCache(files)

    LOGGER.debug(
        'Searching for files owned by {} and shared with {} ...'.format(
            args.to_email, args.from_email))
    # Request parents as well: the ownership check below needs them.
    file_request = files.list(
        q="'{}' in owners and '{}' in readers".format(args.to_email,
                                                      args.from_email),
        pageSize=100,
        fields="nextPageToken, files(id, name, parents)")
    for (f, _) in service_method_iter(file_request, 'files', files.list_next):
        try:
            if not all(
                    parents_cache.is_owned(parent_id)
                    for parent_id in f.get('parents', [])):
                LOGGER.warning(
                    'Skipping {} in folder owned by someone else'.format(
                        f['name']))
            else:
                LOGGER.info('Removing {} from owned file {}'.format(
                    args.from_email, f['name']))
                remove_user_permissions(perms, f['id'], args.from_email)
        except HttpError as ex:
            LOGGER.warning('Caught exception: {}'.format(ex))
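# remove_user_permissions and FileCache are shared helpers whose definitions
# are outside this section. Rough sketches consistent with how they are
# called; the ownedByMe field and the exact deletion loop are assumptions:
def remove_user_permissions(perms, file_id, email):
    """Delete every permission on the file that belongs to the given user."""
    request = perms.list(
        fileId=file_id,
        pageSize=10,
        fields='nextPageToken, permissions(id, type, emailAddress)')
    for (p, _) in service_method_iter(request, 'permissions', perms.list_next):
        if p['type'] == 'user' and p.get('emailAddress') == email:
            perms.delete(fileId=file_id, permissionId=p['id']).execute()


class FileCache:
    """Memoized file metadata lookups, used to check parent folder ownership."""

    def __init__(self, files):
        self._files = files
        self._cache = {}

    def is_owned(self, file_id):
        if file_id not in self._cache:
            self._cache[file_id] = self._files.get(
                fileId=file_id, fields='id, ownedByMe').execute()
        return self._cache[file_id].get('ownedByMe', False)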
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser(description='Remove unshared orphaned files')
    parser.add_argument('credentials_path',
                        help='Path to credentials json file')
    parser.add_argument(
        'email', help='Email address whose unshared files will be deleted')
    parser.add_argument('--sleep',
                        type=float,
                        help='Amount of time to sleep between modifying files')
    args = parser.parse_args()

    credentials_name = os.path.splitext(os.path.basename(
        args.credentials_path))[0]
    configure_logging('migrate-drive-4-{}.log'.format(credentials_name))

    creds = authenticate(args.credentials_path,
                         token_path=credentials_name + '.pickle')
    service = build('drive', 'v3', credentials=creds)
    files = service.files()
    perms = service.permissions()

    LOGGER.debug('Searching for files owned by {} ...'.format(args.email))
    file_request = files.list(
        q="'{}' in owners and not mimeType contains 'application/vnd.google-apps'"
        .format(args.email),
        pageSize=100,
        fields="nextPageToken, files(id, name, owners, parents)")
    for (f, _1) in service_method_iter(file_request, 'files', files.list_next):
        if not f.get('parents'):
            try:
                perm_request = perms.list(
                    fileId=f['id'],
                    pageSize=10,
                    fields="nextPageToken, permissions(id, type, emailAddress)")
                for (p, _2) in service_method_iter(perm_request, 'permissions',
                                                   perms.list_next):
                    if p['type'] != 'user' or p['emailAddress'] != args.email:
                        break
                else:
                    # Only reached if the loop never broke: every permission
                    # belongs to args.email, so the orphan is not shared.
                    LOGGER.info('Removing orphaned file {}'.format(f['name']))
                    files.delete(fileId=f['id']).execute()
            except HttpError as ex:
                LOGGER.warning('Caught exception: {}'.format(ex))
            if args.sleep:
                time.sleep(args.sleep)
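# configure_logging is another shared helper; a plausible minimal version,
# assuming its only argument is the log file name:
import logging


def configure_logging(log_filename):
    """Write DEBUG-and-above records to the named file and echo INFO to stderr."""
    logging.basicConfig(
        filename=log_filename,
        level=logging.DEBUG,
        format='%(asctime)s %(levelname)s %(name)s: %(message)s')
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    logging.getLogger().addHandler(console)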
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser(
        description='Load and summarize metadata from drive files')
    parser.add_argument('credentials_path',
                        help='Path to credentials json file')
    parser.add_argument('email', help='Email address to use when filtering')
    parser.add_argument('input_path', help='Path to input jsonl metadata file')
    parser.add_argument('--num-top-files',
                        type=int,
                        default=100,
                        help='Number of top (largest) files to show')
    parser.add_argument('--sleep',
                        type=float,
                        help='Amount of time to sleep between modifying files')
    parser.add_argument('--delete-duplicates',
                        action='store_true',
                        help='Delete all but one copy of each file')
    args = parser.parse_args()

    credentials_name = os.path.splitext(os.path.basename(
        args.credentials_path))[0]
    configure_logging(
        'summarize-drive-metadata-{}.log'.format(credentials_name))

    creds = authenticate(args.credentials_path,
                         token_path=credentials_name + '.pickle')
    service = build('drive', 'v3', credentials=creds)
    files = service.files()

    LOGGER.info('Loading data from {}'.format(args.input_path))
    drive_files = DriveFiles()
    with open(args.input_path) as f:
        for line in f:
            df = drive_files.add(json.loads(line))
            if df.error:
                LOGGER.warning('Error downloading metadata for {}'.format(df))

    LOGGER.info('Checking for trashed files')
    for df in drive_files.list():
        if df.trashed:
            LOGGER.warning('Trashed: {}'.format(df))

    LOGGER.info('Checking for files with no names')
    for df in drive_files.list():
        if df.metadata.get('name') is None:
            LOGGER.warning('No name: {}'.format(df))

    LOGGER.info('Checking for top-level entries beyond root')
    root_id = files.get(fileId='root').execute()['id']
    LOGGER.info('Root id: {}'.format(root_id))
    for df in drive_files.list():
        if df.id != root_id and not df.parents:
            LOGGER.warning('Top-level but not root: {}'.format(df))

    LOGGER.info('Checking for multiple parents')
    for df in drive_files.list():
        parents = df.parents
        if len(parents) > 1:
            LOGGER.warning('{} parents: {}'.format(len(parents), df))

    LOGGER.info('Checking for duplicate content')
    checksum_counts = defaultdict(list)
    for df in drive_files.list():
        checksum_counts[df.md5_checksum].append(df.path)
    for (cs, paths) in checksum_counts.items():
        if len(paths) > 1:
            LOGGER.warning(
                '{} copies of content here and elsewhere: {}'.format(
                    len(paths), paths[0]))

    LOGGER.info('Checking for duplicate content and metadata')
    metadata_counts = defaultdict(list)
    for df in drive_files.list():
        metadata_counts[(
            df.path,
            tuple(sorted(df.parent_ids)),
            df.size,
            df.md5_checksum,
            tuple(
                sorted((
                    p['type'],
                    p.get('role'),
                    p.get('emailAddress'),
                ) for p in df.permissions)),
        )].append(df.id)
    for (md, file_ids) in metadata_counts.items():
        if len(file_ids) > 1 and ('user', 'owner', args.email) in md[-1]:
            LOGGER.warning('{} copies of path: {}'.format(
                len(file_ids), md[0]))
            if args.delete_duplicates:
                try:
                    retrieved_file_ids = [
                        files.get(fileId=file_id).execute()['id']
                        for file_id in file_ids
                    ]
                    if sorted(retrieved_file_ids) == sorted(file_ids):
                        for file_id in retrieved_file_ids[1:]:
                            LOGGER.warning('Deleting {}'.format(file_id))
                            files.delete(fileId=file_id).execute()
                except HttpError as ex:
                    LOGGER.warning('Caught exception: {}'.format(ex))
                if args.sleep:
                    time.sleep(args.sleep)

    LOGGER.info('Listing {} largest files by size'.format(args.num_top_files))
    files_by_size = sorted(drive_files.list(),
                           key=lambda df: df.size,
                           reverse=True)
    for df in files_by_size[:args.num_top_files]:
        LOGGER.info('{:<8} {}'.format(df.human_friendly_size, df))
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser(
        description='Download and save metadata for drive files that '
        'produced errors last time, writing to new file')
    parser.add_argument('credentials_path',
                        help='Path to credentials json file')
    parser.add_argument('input_path', help='Path to input jsonl file')
    parser.add_argument('output_path', help='Path to output jsonl file')
    parser.add_argument('--sleep',
                        type=float,
                        help='Amount of time to sleep between files')
    args = parser.parse_args()

    credentials_name = os.path.splitext(os.path.basename(
        args.credentials_path))[0]
    configure_logging(
        'redownload-drive-metadata-{}.log'.format(credentials_name))

    creds = authenticate(args.credentials_path,
                         token_path=credentials_name + '.pickle')
    service = build('drive', 'v3', credentials=creds)
    files = service.files()
    perms = service.permissions()

    with open(args.input_path) as input_file, \
            open(args.output_path, 'w') as output_file:
        downloader = FileMetadataDownloader(files, perms)
        page_token = None
        batch_info = None

        LOGGER.info('Looking for files with errors ...')
        for line in input_file:
            metadata = json.loads(line)
            old_batch_info = batch_info
            batch_info = metadata['batch_info']
            # Track the page token that would re-fetch the batch currently
            # being read, so pagination can resume if the input ends mid-batch.
            if old_batch_info is not None and (
                    batch_info['next_page_token'] !=
                    old_batch_info['next_page_token']):
                page_token = old_batch_info['next_page_token']
            if metadata['error']:
                LOGGER.info('Redownloading metadata for {}'.format(
                    metadata['name']))
                try:
                    f = files.get(fileId=metadata['id'],
                                  fields=', '.join(
                                      FileMetadataDownloader.
                                      DEFAULT_FILE_FIELDS)).execute()
                except HttpError as ex:
                    if ex.resp.status == 404:
                        LOGGER.warning(
                            'Skipping file, caught 404: {}'.format(ex))
                        continue
                    else:
                        raise ex
                else:
                    metadata = downloader.get(f, metadata['batch_info'])
                if args.sleep:
                    time.sleep(args.sleep)
            output_file.write(json.dumps(metadata) + '\n')

        if batch_info is not None:
            if batch_info['item_index'] + 1 == batch_info['num_items']:
                # Input ended exactly at a batch boundary; continue from the
                # next page and skip nothing.
                page_token = batch_info['next_page_token']
                skip = 0
            elif page_token is not None:
                # Input ended mid-batch; re-fetch the current page and skip
                # the items already written.
                skip = batch_info['item_index'] + 1
            else:
                raise Exception(
                    'Previous pagination stopped in middle of batch and '
                    'we have no page token')
            if page_token is not None:
                LOGGER.info('Continuing pagination')
                for metadata in downloader.list(page_token=page_token):
                    if skip > 0:
                        skip -= 1
                    else:
                        output_file.write(json.dumps(metadata) + '\n')
                        if args.sleep:
                            time.sleep(args.sleep)
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser(description='Change owner of all entries in drive')
    parser.add_argument('credentials_path',
                        help='Path to credentials json file')
    parser.add_argument('from_email', help='Email address of current owner')
    parser.add_argument('to_email', help='Email address of new owner')
    args = parser.parse_args()

    credentials_name = os.path.splitext(os.path.basename(
        args.credentials_path))[0]
    configure_logging('migrate-drive-1-{}.log'.format(credentials_name))

    creds = authenticate(args.credentials_path,
                         token_path=credentials_name + '.pickle')
    service = build('drive', 'v3', credentials=creds)
    files = service.files()
    perms = service.permissions()

    LOGGER.debug('Searching for files owned by {} ...'.format(args.from_email))
    file_request = files.list(
        q="'{}' in owners and mimeType contains 'application/vnd.google-apps'"
        .format(args.from_email),
        pageSize=100,
        fields="nextPageToken, files(id, name)")
    for (f, _1) in service_method_iter(file_request, 'files', files.list_next):
        LOGGER.info('Changing owner of {} to {}'.format(f['name'],
                                                        args.to_email))
        try:
            perm_request = perms.list(
                fileId=f['id'],
                pageSize=10,
                fields="nextPageToken, permissions(id, type, emailAddress)")
            for (p, _2) in service_method_iter(perm_request, 'permissions',
                                               perms.list_next):
                if p['type'] == 'user' and p['emailAddress'] == args.to_email:
                    LOGGER.debug('Updating permission to owner for {}'.format(
                        args.to_email))
                    perms.update(
                        fileId=f['id'],
                        permissionId=p['id'],
                        transferOwnership=True,
                        body={'role': 'owner'},
                    ).execute()
                    break
            else:
                LOGGER.debug('Adding owner permission for {}'.format(
                    args.to_email))
                perms.create(
                    fileId=f['id'],
                    transferOwnership=True,
                    enforceSingleParent=True,
                    body={
                        'role': 'owner',
                        'type': 'user',
                        'emailAddress': args.to_email
                    },
                ).execute()
        except HttpError as ex:
            LOGGER.warning('Caught exception: {}'.format(ex))
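# Each of the entry points above assumes the same module-level preamble:
# standard-library imports, the Google API client imports, a module LOGGER,
# and the shared helpers sketched earlier. A sketch of that preamble plus the
# usual entry-point guard (the helper module is not shown in this section, so
# its import is indicative only):
import json
import logging
import os
import time
from collections import defaultdict

from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

# from <shared helper module> import (authenticate, configure_logging,
#                                     service_method_iter, ...)

LOGGER = logging.getLogger(__name__)


if __name__ == '__main__':
    main()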