Example #1
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser(description='Print md5sum(s) for file/directory')
    parser.add_argument('credentials_path',
                        help='Path to credentials json file')
    parser.add_argument('path',
                        help='Path of file/directory to print md5sum(s) for')
    args = parser.parse_args()

    credentials_name = os.path.splitext(os.path.basename(
        args.credentials_path))[0]
    configure_logging('md5sum-{}.log'.format(credentials_name))

    creds = authenticate(args.credentials_path,
                         token_path=credentials_name + '.pickle')
    service = build('drive', 'v3', credentials=creds)
    files = service.files()

    LOGGER.debug('Printing md5sum(s) under {} ...'.format(args.path))
    for (root, dir_entries,
         file_entries) in walk(args.path,
                               files,
                               fields=('id', 'name', 'mimeType',
                                       'md5Checksum')):
        for file_entry in file_entries:
            if file_entry.get('name') and file_entry.get('md5Checksum'):
                print('{}  {}'.format(file_entry['md5Checksum'],
                                      file_entry['name']))
Example #2
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser(
        description='Download and save metadata from drive files')
    parser.add_argument('credentials_path',
                        help='Path to credentials json file')
    parser.add_argument('output_path', help='Path to output jsonl file')
    parser.add_argument('--sleep',
                        type=float,
                        help='Amount of time to sleep between files')
    args = parser.parse_args()

    credentials_name = os.path.splitext(os.path.basename(
        args.credentials_path))[0]
    configure_logging(
        'download-drive-metadata-{}.log'.format(credentials_name))

    creds = authenticate(args.credentials_path,
                         token_path=credentials_name + '.pickle')
    service = build('drive', 'v3', credentials=creds)
    files = service.files()
    perms = service.permissions()

    with open(args.output_path, 'w') as output_file:
        downloader = FileMetadataDownloader(files, perms)
        for metadata in downloader.list():
            output_file.write(json.dumps(metadata) + '\n')
            if args.sleep:
                time.sleep(args.sleep)
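FileMetadataDownloader is another project-local helper not shown on this page. From its use here and in the resume logic of Example #7, it evidently pages through files.list(), attaches each file's permissions, and tags every record with batch_info (the page's nextPageToken plus the item's position in its batch) and an error flag. A minimal sketch under those assumptions; the field names below are inferred from the metadata keys the other examples read, not taken from the real class.

class FileMetadataDownloader(object):
    # Assumed fields, based on the keys read by the summarizing script below.
    DEFAULT_FILE_FIELDS = ('id', 'name', 'mimeType', 'size', 'md5Checksum',
                           'parents', 'trashed', 'ownedByMe')

    def __init__(self, files, perms):
        self.files = files
        self.perms = perms

    def get(self, f, batch_info):
        # Copy the file resource, attach its permissions, and record where in
        # the listing it came from so a later run can resume pagination.
        metadata = dict(f)
        metadata['batch_info'] = batch_info
        metadata['error'] = False
        try:
            metadata['permissions'] = self.perms.list(
                fileId=f['id'],
                fields='permissions(id, type, role, emailAddress)',
            ).execute().get('permissions', [])
        except HttpError as ex:
            LOGGER.warning('Caught exception: {}'.format(ex))
            metadata['error'] = True
        return metadata

    def list(self, page_token=None):
        kwargs = dict(pageSize=100,
                      fields='nextPageToken, files({})'.format(
                          ', '.join(self.DEFAULT_FILE_FIELDS)))
        if page_token is not None:
            kwargs['pageToken'] = page_token
        request = self.files.list(**kwargs)
        while request is not None:
            response = request.execute()
            items = response.get('files', [])
            for (i, f) in enumerate(items):
                yield self.get(
                    f,
                    dict(next_page_token=response.get('nextPageToken'),
                         item_index=i,
                         num_items=len(items)))
            request = self.files.list_next(request, response)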
Example #3
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser(
        description='Copy files shared from an address and remove shared '
        'versions')
    parser.add_argument('credentials_path',
                        help='Path to credentials json file')
    parser.add_argument('from_email',
                        help='Email address whose shared files will be copied')
    parser.add_argument('to_email',
                        help='Email address to which files will be copied')
    args = parser.parse_args()

    credentials_name = os.path.splitext(os.path.basename(
        args.credentials_path))[0]
    configure_logging('migrate-drive-3-{}.log'.format(credentials_name))

    creds = authenticate(args.credentials_path,
                         token_path=credentials_name + '.pickle')
    service = build('drive', 'v3', credentials=creds)
    files = service.files()
    perms = service.permissions()
    parents_cache = FileCache(files)

    LOGGER.debug('Searching for files owned by {} ...'.format(args.from_email))
    file_request = files.list(
        q="'{}' in owners and not mimeType contains 'application/vnd.google-apps'"
        .format(args.from_email),
        pageSize=100,
        fields="nextPageToken, files(id, name, starred, owners, parents)")
    for (f, _) in service_method_iter(file_request, 'files', files.list_next):
        try:
            if not all(
                    parents_cache.is_owned(parent_id)
                    for parent_id in f.get('parents', [])):
                LOGGER.warning(
                    'Skipping {} in folder owned by someone else'.format(
                        f['name']))
            else:
                LOGGER.info('Copying {} and removing {}'.format(
                    f['name'], args.from_email))
                copy_response = files.copy(
                    fileId=f['id'],
                    enforceSingleParent=True,
                    fields='id',
                    body=dict(
                        (k, f[k]) for k in ('name', 'starred'))).execute()
                remove_user_permissions(perms, copy_response['id'],
                                        args.from_email)
                LOGGER.debug(
                    'Copied file id: {}; removing {} from original {}'.format(
                        copy_response['id'], args.to_email, f['id']))
                remove_user_permissions(perms, f['id'], args.to_email)

        except HttpError as ex:
            LOGGER.warning('Caught exception: {}'.format(ex))
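This example introduces three more helpers that the remaining scripts reuse: service_method_iter (generic pagination over a googleapiclient list request), FileCache (a memoizing lookup used to check whether a parent folder is owned by the authenticated user), and remove_user_permissions. Their real implementations are not shown here; the following are minimal sketches consistent with the call sites, not the project's actual code.

def service_method_iter(request, items_key, list_next):
    # Yield (item, response) pairs across all pages of a list-style request,
    # using the collection's list_next method to follow nextPageToken.
    while request is not None:
        response = request.execute()
        for item in response.get(items_key, []):
            yield (item, response)
        request = list_next(request, response)


class FileCache(object):
    # Memoize files.get() lookups so repeated parent checks stay cheap.
    def __init__(self, files):
        self.files = files
        self._cache = {}

    def get(self, file_id):
        if file_id not in self._cache:
            self._cache[file_id] = self.files.get(
                fileId=file_id, fields='id, name, ownedByMe').execute()
        return self._cache[file_id]

    def is_owned(self, file_id):
        # ownedByMe is a standard Drive v3 file field.
        return self.get(file_id).get('ownedByMe', False)


def remove_user_permissions(perms, file_id, email):
    # Delete every permission on the file that grants access to this user.
    request = perms.list(
        fileId=file_id,
        fields='nextPageToken, permissions(id, type, emailAddress)')
    for (p, _) in service_method_iter(request, 'permissions',
                                      perms.list_next):
        if p.get('type') == 'user' and p.get('emailAddress') == email:
            perms.delete(fileId=file_id, permissionId=p['id']).execute()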
Example #4
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser(description='Remove unshared orphaned files')
    parser.add_argument('credentials_path',
                        help='Path to credentials json file')
    parser.add_argument(
        'email', help='Email address whose unshared files will be deleted')
    parser.add_argument('--sleep',
                        type=float,
                        help='Amount of time to sleep between modifying files')
    args = parser.parse_args()

    credentials_name = os.path.splitext(os.path.basename(
        args.credentials_path))[0]
    configure_logging('migrate-drive-4-{}.log'.format(credentials_name))

    creds = authenticate(args.credentials_path,
                         token_path=credentials_name + '.pickle')
    service = build('drive', 'v3', credentials=creds)
    files = service.files()
    perms = service.permissions()

    LOGGER.debug('Searching for files owned by {} ...'.format(args.email))
    file_request = files.list(
        q="'{}' in owners and not mimeType contains 'application/vnd.google-apps'"
        .format(args.email),
        pageSize=100,
        fields="nextPageToken, files(id, name, owners, parents)")
    for (f, _1) in service_method_iter(file_request, 'files', files.list_next):
        if not f.get('parents'):
            try:
                perm_request = perms.list(
                    fileId=f['id'],
                    pageSize=10,
                    fields="nextPageToken, permissions(id, type, emailAddress)"
                )
                for (p, _2) in service_method_iter(perm_request, 'permissions',
                                                   perms.list_next):
                    if p['type'] != 'user' or p['emailAddress'] != args.email:
                        break
                else:
                    LOGGER.info('Removing orphaned file {}'.format(f['name']))
                    files.delete(fileId=f['id']).execute()

            except HttpError as ex:
                LOGGER.warning('Caught exception: {}'.format(ex))

            if args.sleep:
                time.sleep(args.sleep)
Example #5
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser(
        description='Remove old email from files owned by new email')
    parser.add_argument('credentials_path',
                        help='Path to credentials json file')
    parser.add_argument('from_email',
                        help='Email address to remove from shared files')
    parser.add_argument('to_email',
                        help='Email address owning files to be updated')
    args = parser.parse_args()

    credentials_name = os.path.splitext(os.path.basename(
        args.credentials_path))[0]
    configure_logging('migrate-drive-2-{}.log'.format(credentials_name))

    creds = authenticate(args.credentials_path,
                         token_path=credentials_name + '.pickle')
    service = build('drive', 'v3', credentials=creds)
    files = service.files()
    perms = service.permissions()
    parents_cache = FileCache(files)

    LOGGER.debug(
        'Searching for files owned by {} and shared with {} ...'.format(
            args.to_email, args.from_email))
    file_request = files.list(
        q="'{}' in owners and '{}' in readers".format(
            args.to_email, args.from_email),
        pageSize=100,
        fields="nextPageToken, files(id, name, parents)")
    for (f, _) in service_method_iter(file_request, 'files', files.list_next):
        try:
            if not all(
                    parents_cache.is_owned(parent_id)
                    for parent_id in f.get('parents', [])):
                LOGGER.warning(
                    'Skipping {} in folder owned by someone else'.format(
                        f['name']))
            else:
                LOGGER.info('Removing {} from owned file {}'.format(
                    args.from_email, f['name']))
                remove_user_permissions(perms, f['id'], args.from_email)

        except HttpError as ex:
            LOGGER.warning('Caught exception: {}'.format(ex))
Example #6
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser(
        description='Load and summarize metadata from drive files')
    parser.add_argument('credentials_path',
                        help='Path to credentials json file')
    parser.add_argument('email', help='Email address to use when filtering')
    parser.add_argument('input_path', help='Path to input jsonl metadata file')
    parser.add_argument('--num-top-files',
                        type=int,
                        default=100,
                        help='Number of top (largest) files to show')
    parser.add_argument('--sleep',
                        type=float,
                        help='Amount of time to sleep between modifying files')
    parser.add_argument('--delete-duplicates',
                        action='store_true',
                        help='Delete all but one copy of each file')
    args = parser.parse_args()

    credentials_name = os.path.splitext(os.path.basename(
        args.credentials_path))[0]
    configure_logging(
        'summarize-drive-metadata-{}.log'.format(credentials_name))

    creds = authenticate(args.credentials_path,
                         token_path=credentials_name + '.pickle')
    service = build('drive', 'v3', credentials=creds)
    files = service.files()

    LOGGER.info('Loading data from {}'.format(args.input_path))
    drive_files = DriveFiles()
    with open(args.input_path) as f:
        for line in f:
            df = drive_files.add(json.loads(line))
            if df.error:
                LOGGER.warning('Error downloading metadata for {}'.format(df))

    LOGGER.info('Checking for trashed files')
    for df in drive_files.list():
        if df.trashed:
            LOGGER.warning('Trashed: {}'.format(df))

    LOGGER.info('Checking for files with no names')
    for df in drive_files.list():
        if df.metadata.get('name') is None:
            LOGGER.warning('No name: {}'.format(df))

    LOGGER.info('Checking for top-level entries beyond root')
    root_id = files.get(fileId='root').execute()['id']
    LOGGER.info('Root id: {}'.format(root_id))
    for df in drive_files.list():
        if df.id != root_id and not df.parents:
            LOGGER.warning('Top-level but not root: {}'.format(df))

    LOGGER.info('Checking for multiple parents')
    for df in drive_files.list():
        parents = df.parents
        if len(parents) > 1:
            LOGGER.warning('{} parents: {}'.format(len(parents), df))

    LOGGER.info('Checking for duplicate content')
    checksum_counts = defaultdict(list)
    for df in drive_files.list():
        checksum_counts[df.md5_checksum].append(df.path)
    for (cs, paths) in checksum_counts.items():
        if len(paths) > 1:
            LOGGER.warning(
                '{} copies of content here and elsewhere: {}'.format(
                    len(paths), paths[0]))

    LOGGER.info('Checking for duplicate content and metadata')
    metadata_counts = defaultdict(list)
    for df in drive_files.list():
        metadata_counts[(
            df.path,
            tuple(sorted(df.parent_ids)),
            df.size,
            df.md5_checksum,
            tuple(
                sorted((
                    p['type'],
                    p.get('role'),
                    p.get('emailAddress'),
                ) for p in df.permissions)),
        )].append(df.id)
    for (md, file_ids) in metadata_counts.items():
        if len(file_ids) > 1 and ('user', 'owner', args.email) in md[-1]:
            LOGGER.warning('{} copies of path: {}'.format(
                len(file_ids), md[0]))
            if args.delete_duplicates:
                try:
                    retrieved_file_ids = [
                        files.get(fileId=file_id).execute()['id']
                        for file_id in file_ids
                    ]
                    if sorted(retrieved_file_ids) == sorted(file_ids):
                        for file_id in retrieved_file_ids[1:]:
                            LOGGER.warning('Deleting {}'.format(file_id))
                            files.delete(fileId=file_id).execute()
                except HttpError as ex:
                    LOGGER.warning('Caught exception: {}'.format(ex))

                if args.sleep:
                    time.sleep(args.sleep)

    LOGGER.info('Listing {} largest files by size'.format(args.num_top_files))
    files_by_size = sorted(drive_files.list(),
                           key=lambda df: df.size,
                           reverse=True)
    for df in files_by_size[:args.num_top_files]:
        LOGGER.info('{:<8} {}'.format(df.human_friendly_size, df))
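DriveFiles and the per-file records it returns are also project-local. From the attribute accesses above, each record wraps the raw metadata and exposes derived properties: parents resolved through the in-memory index, a human-readable path, size, checksum, and permissions. A condensed sketch under those assumptions follows; the property implementations are illustrative guesses, not the real class.

class DriveFile(object):
    # Thin wrapper over one metadata record; property names mirror the
    # attributes read in the summarizing script above.
    def __init__(self, metadata, index):
        self.metadata = metadata
        self._index = index

    @property
    def id(self):
        return self.metadata['id']

    @property
    def error(self):
        return self.metadata.get('error', False)

    @property
    def trashed(self):
        return self.metadata.get('trashed', False)

    @property
    def parent_ids(self):
        return self.metadata.get('parents', [])

    @property
    def parents(self):
        return [self._index.get(i) for i in self.parent_ids
                if self._index.get(i) is not None]

    @property
    def size(self):
        return int(self.metadata.get('size', 0))

    @property
    def md5_checksum(self):
        return self.metadata.get('md5Checksum')

    @property
    def permissions(self):
        return self.metadata.get('permissions', [])

    @property
    def path(self):
        # Walk up through the first known parent of each ancestor.
        (parts, df) = ([], self)
        while df is not None:
            parts.append(df.metadata.get('name') or df.id)
            df = df.parents[0] if df.parents else None
        return '/' + '/'.join(reversed(parts))

    @property
    def human_friendly_size(self):
        size = float(self.size)
        for unit in ('B', 'K', 'M', 'G', 'T'):
            if size < 1024 or unit == 'T':
                return '{:.1f}{}'.format(size, unit)
            size /= 1024.

    def __str__(self):
        return '{} ({})'.format(self.path, self.id)


class DriveFiles(object):
    # In-memory index of all downloaded metadata records, keyed by file id.
    def __init__(self):
        self._by_id = {}

    def add(self, metadata):
        df = DriveFile(metadata, self)
        self._by_id[df.id] = df
        return df

    def get(self, file_id):
        return self._by_id.get(file_id)

    def list(self):
        return list(self._by_id.values())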
Example #7
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser(
        description='Download and save metadata for drive files that '
        'produced errors last time, writing to new file')
    parser.add_argument('credentials_path',
                        help='Path to credentials json file')
    parser.add_argument('input_path', help='Path to input jsonl file')
    parser.add_argument('output_path', help='Path to output jsonl file')
    parser.add_argument('--sleep',
                        type=float,
                        help='Amount of time to sleep between files')
    args = parser.parse_args()

    credentials_name = os.path.splitext(os.path.basename(
        args.credentials_path))[0]
    configure_logging(
        'redownload-drive-metadata-{}.log'.format(credentials_name))

    creds = authenticate(args.credentials_path,
                         token_path=credentials_name + '.pickle')
    service = build('drive', 'v3', credentials=creds)
    files = service.files()
    perms = service.permissions()

    with open(args.input_path) as input_file, open(args.output_path,
                                                   'w') as output_file:
        downloader = FileMetadataDownloader(files, perms)
        page_token = None
        batch_info = None

        LOGGER.info('Looking for files with errors ...')
        for line in input_file:
            metadata = json.loads(line)

            old_batch_info = batch_info
            batch_info = metadata['batch_info']
            if old_batch_info is not None and (
                    batch_info['next_page_token'] !=
                    old_batch_info['next_page_token']):
                page_token = old_batch_info['next_page_token']

            if metadata['error']:
                LOGGER.info('Redownloading metadata for {}'.format(
                    metadata['name']))
                try:
                    f = files.get(
                        fileId=metadata['id'],
                        fields=', '.join(
                            FileMetadataDownloader.DEFAULT_FILE_FIELDS)
                    ).execute()
                except HttpError as ex:
                    if ex.resp.status == 404:
                        LOGGER.warning(
                            'Skipping file, caught 404: {}'.format(ex))
                        continue
                    else:
                        raise ex
                else:
                    metadata = downloader.get(f, metadata['batch_info'])
                    if args.sleep:
                        time.sleep(args.sleep)

            output_file.write(json.dumps(metadata) + '\n')

        if batch_info is not None:
            if batch_info['item_index'] + 1 == batch_info['num_items']:
                page_token = batch_info['next_page_token']
                skip = 0
            elif page_token is not None:
                skip = batch_info['item_index'] + 1
            else:
                raise Exception(
                    'Previous pagination stopped in middle of batch and '
                    'we have no page token')

            if page_token is not None:
                LOGGER.info('Continuing pagination')
                for metadata in downloader.list(page_token=page_token):
                    if skip > 0:
                        skip -= 1
                    else:
                        output_file.write(json.dumps(metadata) + '\n')
                        if args.sleep:
                            time.sleep(args.sleep)
Example #8
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser(description='Change owner of all entries in drive')
    parser.add_argument('credentials_path',
                        help='Path to credentials json file')
    parser.add_argument('from_email', help='Email address of current owner')
    parser.add_argument('to_email', help='Email address of new owner')
    args = parser.parse_args()

    credentials_name = os.path.splitext(os.path.basename(
        args.credentials_path))[0]
    configure_logging('migrate-drive-1-{}.log'.format(credentials_name))

    creds = authenticate(args.credentials_path,
                         token_path=credentials_name + '.pickle')
    service = build('drive', 'v3', credentials=creds)
    files = service.files()
    perms = service.permissions()

    LOGGER.debug('Searching for files owned by {} ...'.format(args.from_email))
    file_request = files.list(
        q="'{}' in owners and mimeType contains 'application/vnd.google-apps'"
        .format(args.from_email),
        pageSize=100,
        fields="nextPageToken, files(id, name)")
    for (f, _1) in service_method_iter(file_request, 'files', files.list_next):
        LOGGER.info('Changing owner of {} to {}'.format(
            f['name'], args.to_email))
        try:
            perm_request = perms.list(
                fileId=f['id'],
                pageSize=10,
                fields="nextPageToken, permissions(id, type, emailAddress)")
            for (p, _2) in service_method_iter(perm_request, 'permissions',
                                               perms.list_next):
                if p['type'] == 'user' and p['emailAddress'] == args.to_email:
                    LOGGER.debug('Updating permission to owner for {}'.format(
                        args.to_email))
                    perms.update(
                        fileId=f['id'],
                        permissionId=p['id'],
                        transferOwnership=True,
                        body={
                            'role': 'owner'
                        },
                    ).execute()
                    break

            else:
                LOGGER.debug('Adding owner permission for {}'.format(
                    args.to_email))
                perms.create(
                    fileId=f['id'],
                    transferOwnership=True,
                    enforceSingleParent=True,
                    body={
                        'role': 'owner',
                        'type': 'user',
                        'emailAddress': args.to_email
                    },
                ).execute()

        except HttpError as ex:
            LOGGER.warning('Caught exception: {}'.format(ex))
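Each of these scripts presumably ends with the usual entry-point guard, which the excerpts above omit; invocation then follows each argparse signature, for instance passing the credentials JSON path followed by the positional arguments shown in the help strings.

# Standard entry-point guard (assumed; not shown in the excerpts above).
if __name__ == '__main__':
    main()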