Example #1
0
 def __init__(self, maxdepth=1000000, verbose=False, stats_only=False):
     """Create the Drive auth helper and reset all traversal state.

     ``client_secrets.json`` and ``credentials.json`` are looked up one
     directory above the directory that holds this module.

     Args:
         maxdepth: Stored as ``self.maxdepth``.
         verbose: Stored as ``self.verbose``.
         stats_only: Stored as ``self.stats_only``.
     """
     base_dir = os.path.dirname(os.path.dirname(__file__))
     self.drive_auth = DriveServiceAuth(
         os.path.join(base_dir, 'client_secrets.json'),
         os.path.join(base_dir, 'credentials.json'))

     # Caller-supplied options.
     self.maxdepth = maxdepth
     self.verbose = verbose
     self.stats_only = stats_only

     # Runtime state, initially empty.
     self.drive_service = None
     self.root_path = None
     self.stats_file = None
     self.depth = 0
     self.file_list = []

     print('GDriveDownloader maxdepth %d, verbose %r' % (maxdepth, verbose))
Example #2
0
 def __init__(self, maxdepth=1000000, verbose=False, stats_only=False):
     """Initialise the downloader: auth helper first, then plain attributes.

     Both JSON files handed to ``DriveServiceAuth`` live in the parent of
     this module's directory.
     """
     parent_dir = os.path.dirname(os.path.dirname(__file__))
     secrets_path, credentials_path = [
         os.path.join(parent_dir, name)
         for name in ('client_secrets.json', 'credentials.json')
     ]
     self.drive_auth = DriveServiceAuth(secrets_path, credentials_path)

     # No Drive service, root path or stats file yet.
     self.drive_service = self.root_path = self.stats_file = None

     self.depth = 0
     self.file_list = []
     self.maxdepth = maxdepth
     self.verbose = verbose
     self.stats_only = stats_only

     print('GDriveDownloader maxdepth %d, verbose %r' % (maxdepth, verbose))
Example #3
0
 def __init__(self, max_files=100, verbose=False, dry_run=False, mime_types=None):
     """Create the Drive auth helper and store the uploader options.

     ``client_secrets.json`` and ``credentials.json`` are looked up in the
     directory that holds this module.

     Args:
         max_files: Stored as ``self.max_files``.
         verbose: Stored as ``self.verbose``.
         dry_run: Stored as ``self.dry_run``.
         mime_types: A single MIME type, or a list/tuple of MIME types.
             Any falsy value (``None``, ``''``, ``[]``) selects the
             catch-all ``['*/*']``.
     """
     module_dir = os.path.dirname(__file__)
     secrets_path = os.path.join(module_dir, 'client_secrets.json')
     credentials_path = os.path.join(module_dir, 'credentials.json')
     self.drive_auth = DriveServiceAuth(secrets_path, credentials_path)
     self.drive_service = None
     self.verbose = verbose
     self.dry_run = dry_run

     # This has to be one if/elif/else chain: with two separate ``if``
     # statements the '*/*' default is immediately overwritten (``None``
     # would end up as ``[None]``).
     if not mime_types:
         self.mime_types = ['*/*']
     elif isinstance(mime_types, (list, tuple)):
         self.mime_types = mime_types
     else:
         self.mime_types = [mime_types]

     self.max_files = max_files
     self.counter = 0
     # Tag is the current local time truncated to the minute.
     self.tag = 'UPLOADER-' + datetime.now().replace(second=0, microsecond=0).isoformat()
     self.folders = {}
     self.links = {}
Example #4
0
class GDriveDownloader():
    """Recursively mirror a Google Drive folder into a local directory.

    In normal mode folders are created locally (each with a
    ``_folder_.yml``), other items are downloaded, and a YAML metadata file
    is written per item.  With ``stats_only`` set nothing is created or
    downloaded; tab-separated rows of ``STATS_META_FIELDS`` values are
    written to ``stats.tsv`` instead.
    """

    def __init__(self, maxdepth=1000000, verbose=False, stats_only=False):
        """Create the Drive auth helper and reset all traversal state.

        Args:
            maxdepth: Folder depth beyond which recursion stops.
            verbose: Print progress information when true.
            stats_only: Collect statistics instead of downloading.
        """
        base_dir = os.path.dirname(os.path.dirname(__file__))
        secrets_path = os.path.join(base_dir, 'client_secrets.json')
        credentials_path = os.path.join(base_dir, 'credentials.json')
        self.drive_auth = DriveServiceAuth(secrets_path, credentials_path)
        self.drive_service = None
        self.depth = 0
        self.root_path = None
        self.maxdepth = maxdepth
        self.verbose = verbose
        self.stats_only = stats_only
        self.stats_file = None
        self.file_list = []
        print('GDriveDownloader maxdepth %d, verbose %r' % (maxdepth, verbose))

    def initService(self):
        """Build the Drive service through the auth helper and keep it."""
        self.drive_service = self.drive_auth.build_service()

    def getLocalTitle(self, item, metadata=None):
        """Derive local naming information for a Drive item.

        Args:
            item: Drive item dict; 'title' and 'mimeType' are always read,
                'kind' and 'exportLinks' for Google Docs.
            metadata: Optional dict whose 'export_as' entry overrides the
                default HTML export of a Google Doc.

        Returns:
            Tuple ``(local_title, cleaned_title, sorted_title,
            sort_priority, file_type)``.  ``file_type`` is the MIME type to
            export as, or None when the item is not exported.
        """
        title = item['title'].strip()

        file_type = None
        if item['mimeType'] == 'application/vnd.google-apps.document':
            if item['kind'] == 'drive#file' and 'exportLinks' in item:
                file_type = 'text/html'

                # Override html export
                if metadata is not None and 'export_as' in metadata:
                    export_as = metadata['export_as']
                    file_type = GDRIVE_EXPORT_AS.get(export_as, None)
                    if file_type is None:
                        print('%s: export_as %s not in %r, using html' % (title, export_as, GDRIVE_EXPORT_AS.keys()))
                        file_type = 'text/html'
                    elif export_as == 'pdf' and title[-4:].lower() != '.pdf':
                        title += '.pdf'

                if file_type not in item['exportLinks']:
                    print('%s: unable to export type %s' % (title, file_type))
                    file_type = None
        elif item['mimeType'] == 'text/plain' and title.endswith('.md'):
            file_type = 'text/x-markdown'

        # Titles such as "010] Intro" carry a three-digit sort priority;
        # everything else sorts last with priority 999.
        m = re.match(r'^(([0-9]{3})\]\s*)(.+)$', title)
        if m:
            priority_text = m.group(2)
            title = m.group(3)
        else:
            priority_text = '999'
        sort_priority = int(priority_text)

        # Hyphenated, lower-case slug
        local_title = slugify(title)
        # Original file name, stripped of leading underscore or trailing extensions
        cleaned_title = re.sub(r'(^_|[_]*\.(pdf|yml|md|html)$)', '', title, flags=re.IGNORECASE)
        sorted_title = priority_text + ']' + cleaned_title

        if self.verbose:
            print('localTitle for "%s" kind "%s" mime "%s" ' % (item['title'], item['kind'], item['mimeType']))
            print('returning "%s" file_type "%s"' % (local_title, file_type))
        return (local_title, cleaned_title, sorted_title, sort_priority, file_type)

    def parseGDriveMeta(self, item):
        """Extract user metadata from a Drive item's description.

        The text before the first '---' is the summary; the '---' and
        everything after it is parsed as YAML.  From the YAML only keys
        listed in ``USER_META_KEYS`` with a non-None value are kept.

        Args:
            item: Drive item dict; only its optional 'description' is read.

        Returns:
            Dict of metadata (possibly empty).
        """
        raw_meta = None
        # A missing or None description is treated like an empty one.
        text = (item.get('description') or '').strip()
        yaml_i = text.find('---')
        if yaml_i >= 0:
            # safe_load: the description is user-editable text, so it must
            # not be able to construct arbitrary Python objects.
            raw_meta = yaml.safe_load(text[yaml_i:].strip())
            text = text[:yaml_i].strip()
            if not isinstance(raw_meta, dict):
                # A bare '---' or a scalar/list document carries no keys.
                raw_meta = None
        description = text or None

        gdrive_meta = {}
        if raw_meta is None:
            # No yaml part - create one
            if description is not None:
                gdrive_meta['summary'] = description
        else:
            # Update yaml part
            if description is not None and 'summary' not in raw_meta:
                raw_meta['summary'] = description
            for key in USER_META_KEYS:
                if raw_meta.get(key) is not None:
                    gdrive_meta[key] = raw_meta[key]
        return gdrive_meta

    def appendStats(self, item_type, item_meta):
        """Append one tab-separated row to the stats file.

        Args:
            item_type: Not used by this method.
            item_meta: Metadata dict; the ``STATS_META_FIELDS`` entries are
                written in order, with missing/falsy values as ''.
        """
        # The stats file is a text stream that encodes to UTF-8 itself, so
        # the values are written as text rather than pre-encoded bytes.
        item_vals = ['%s' % (item_meta.get(f) or '',) for f in STATS_META_FIELDS]
        self.stats_file.write('\t'.join(item_vals))
        self.stats_file.write('\n')

    def getDownloadContent(self, download_url):
        """Fetch ``download_url`` through the Drive service's HTTP object.

        Returns:
            The response body, or '' when ``download_url`` is empty/None.

        Raises:
            RuntimeError: If the response status is not 200.
        """
        content = None
        if download_url:
            resp, content = self.drive_service._http.request(download_url)
            if resp.status != 200:
                raise RuntimeError('An error occurred: %s' % resp)
        else:
            # The file doesn't have any content stored on Drive.
            content = ''
        return content

    def makeFolder(self, folder_item, path_to):
        """Create the local folder for a Drive folder and record its metadata.

        Args:
            folder_item: Drive item dict of the folder.
            path_to: Parent path relative to ``self.root_path``; '' for
                the top-level folder.

        Returns:
            Path of the new folder relative to ``self.root_path``.
        """
        local_title, cleaned_title, sorted_title, sort_priority, exported_type = self.getLocalTitle(folder_item)
        new_path = None
        new_folder = None
        if path_to == '':
            path_to = '/'
            new_path = local_title
            new_folder = os.path.join(self.root_path, local_title)
        else:
            new_path = os.path.join(path_to, local_title)
            new_folder = os.path.join(self.root_path, new_path)

        if not self.stats_only:
            exists_check = os.path.exists(new_folder)
            if not exists_check:
                os.mkdir(new_folder)
                if self.verbose:
                    print('Created folder "%s" in "%s"' % (local_title, path_to))

        # Pull description from Google Drive
        gdrive_meta = self.parseGDriveMeta(folder_item)
        folder_meta = {
            'author': folder_item['lastModifyingUserName'],
            'basename': local_title,
            'basename_raw': local_title,
            'date': folder_item['createdDate'],
            'dirname': path_to,
            'email': folder_item['lastModifyingUser']['emailAddress'],
            'exported_type': exported_type,
            'relative_url': local_title,
            'slug': local_title,
            'source_id': folder_item['id'],
            'source_type': folder_item['mimeType'],
            'sort_priority': sort_priority,
            'sorted_title': sorted_title,
            'summary': None, # TODO
            'template': None, # TODO
            'title': cleaned_title,
            'modified': folder_item['modifiedDate'],
            'version': folder_item['version']
        }
        folder_meta.update(gdrive_meta)

        if self.stats_only:
            self.appendStats('folder', folder_meta)
        else:
            meta_file = os.path.join(new_folder, '_folder_.yml')
            self.writeMeta(meta_file, folder_meta)
        return new_path

    def downloadFiles(self, fID_from, path_to, query_format):
        """Process every child of a Drive folder that matches a query.

        Folders are created locally and recursed into.  For any other item
        the metadata is assembled; unless ``stats_only`` is set the content
        is downloaded and a metadata file written.  Items with an export
        type are appended to ``self.file_list`` (or to the stats file).

        Args:
            fID_from: Drive id of the parent folder.
            path_to: Local folder path relative to ``self.root_path``.
            query_format: Drive query with one ``%s`` placeholder that
                receives ``fID_from``.
        """
        # Go through children with pagination
        query = query_format % fID_from
        page_token = None
        while True:
            result = self.drive_service.files().list(pageToken=page_token, q=query).execute()

            # Alternative way to get children:
            #   (returns `drive#childReference` instead of `drive#file`)
            # result = self.drive_service.children().list(folderId=fID_from).execute()
            for child in result['items']:
                if child['kind'] != 'drive#file':
                    print('Unknown object type (not file or folder): "%s"' % child['kind'])
                    pp(child)

                source_type = child['mimeType']
                if source_type == 'application/vnd.google-apps.folder':
                    self.depth += 1
                    new_folder = self.makeFolder(child, path_to)
                    self.recursiveDownloadInto(child['id'], new_folder)
                    self.depth -= 1
                    # print('Returned from "%s" (id: %s)' % (child['title'], child['id']))
                    # print('  back in folder %s at depth %d' % (path_to, self.depth))

                else:
                    gdrive_meta = self.parseGDriveMeta(child)
                    local_title, cleaned_title, sorted_title, sort_priority, exported_type = self.getLocalTitle(child, gdrive_meta)

                    meta_name = file_name = local_title

                    # Handle .yml files
                    if re.search(r'\.yml$', file_name):
                        # Depending on how you edit or upload .yml files in Google Drive
                        # The mime type reported could be text/plain or application/octet-stream
                        # Avoid improperly dealing with Google Docs or Sheets inadvertently saved with .yml extension
                        if re.match(r'(text|application)\/', source_type) and not re.match(r'application\/vnd\.google-apps', source_type):
                            source_type = 'text/yaml'
                            exported_type = None
                        else:
                            # NOTE(review): the run is aborted only in verbose
                            # mode; otherwise the item is processed like any
                            # other file. Confirm this is intended.
                            if self.verbose:
                                print('Unknown source type for .yml file: %s' % source_type)
                                sys.exit(1)

                    # Handle .html and .md exported files
                    if exported_type == 'text/html':
                        file_name += '.html'

                    raw_file_name = file_name
                    if exported_type in ['text/html', 'text/x-markdown']:
                        raw_file_name = make_raw_filename(file_name)

                    new_file = os.path.join(self.root_path, path_to, raw_file_name)
                    if self.verbose:
                        print('Trying to download "%s"' % child['title'])
                    try:
                        # Lower-case url, with .md converted to .html
                        relative_url = re.sub(r'\.md$', '.html', local_title, flags=re.IGNORECASE)

                        # Lower-case slug, stripped of .yml, .md, .html, and leading _
                        # .pdf and image extensions are left alone
                        slug = re.sub(r'\.(yml|md|html)$', '', local_title, flags=re.IGNORECASE)
                        if slug[:1] == '_' and relative_url[-5:] == '.html':
                            slug = slug[1:]

                        # Pull description from Google Drive
                        file_meta = {
                            'author': child['lastModifyingUserName'],
                            'basename': file_name,
                            'basename_raw': raw_file_name,
                            'date': child['createdDate'],
                            'dirname': path_to,
                            'email': child['lastModifyingUser']['emailAddress'],
                            'exported_type': exported_type,
                            'relative_url': relative_url,
                            'slug': slug,
                            'source_id': child['id'],
                            'source_type': source_type,
                            'sort_priority': sort_priority,
                            'sorted_title': sorted_title,
                            'summary': None,
                            'template': None,
                            'title': cleaned_title,
                            'modified': child['modifiedDate'],
                            'version': child['version']
                        }
                        file_meta.update(gdrive_meta)

                        if not self.stats_only:
                            # Download the file
                            download_url = None
                            if 'exportLinks' in child and exported_type in child['exportLinks']:
                                download_url = child['exportLinks'][exported_type]
                            elif 'downloadUrl' in child:
                                download_url = child['downloadUrl']
                            file_content = self.getDownloadContent(download_url)

                            if source_type == 'text/yaml':
                                # A .yml file is not stored as content; its
                                # keys are merged into the metadata instead.
                                try:
                                    # safe_load: the file comes from Drive and
                                    # must not construct arbitrary objects.
                                    source_meta = yaml.safe_load(file_content)
                                    if isinstance(source_meta, dict):
                                        file_meta.update(source_meta)
                                    else:
                                        raise Exception('YAML object %r is not a dict' % source_meta)
                                except Exception as e:
                                    print('Error parsing YAML from %s: %s' % (download_url, e))
                            else:
                                self.writeContent(new_file, file_content)
                                meta_name = make_meta_filename(file_name)

                            meta_file = os.path.join(self.root_path, path_to, meta_name)
                            self.writeMeta(meta_file, file_meta)

                            if self.verbose:
                                print('Write to file "%s" exported as %s' % (new_file, exported_type))

                        if exported_type is not None:
                            if self.stats_only:
                                self.appendStats('file', file_meta)
                            else:
                                self.file_list.append((path_to, raw_file_name, file_name, meta_name, exported_type))

                    except Exception as e:
                        print('  Failed: %s\n' % e)
                        raise

            # Get page
            page_token = result.get('nextPageToken')
            if not page_token:
                break


    def recursiveDownloadInto(self, fID_from, path_to):
        """Download the Drive folder ``fID_from`` and everything below it.

        At depth 0 ``path_to`` becomes ``self.root_path``, the top-level
        folder is created, and (in stats mode) the stats file is opened.
        Returns early when ``self.maxdepth`` is exceeded or the top-level
        item is not a folder.

        Args:
            fID_from: Drive id of the folder to download.
            path_to: At depth 0 the local root directory; deeper down the
                folder path relative to that root.
        """
        if self.depth > self.maxdepth:
            if self.verbose:
                print('Maximum depth %d exceeded' % self.depth)
            return

        if not self.drive_service:
            self.initService()

        item = self.drive_service.files().get(fileId=fID_from).execute()
        if self.verbose:
            print('Recursively downloading "%s" (id: %s)' % (item['title'], item['id']))
            print('  into folder %s at depth %d' % (path_to, self.depth))

        if self.depth == 0:
            if self.stats_only:
                stats_fname = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'stats.tsv')
                # Text stream that encodes to UTF-8 on write; it is closed
                # again in postProcessStats().
                self.stats_file = codecs.open(stats_fname, 'w', 'utf-8')
                self.stats_file.write('\t'.join(STATS_META_FIELDS))
                self.stats_file.write('\n')

            if item['kind'] == 'drive#file' and item['mimeType'] == 'application/vnd.google-apps.folder':
                self.root_path = path_to
                path_to = self.makeFolder(item, '')
            else:
                print('Top level item is not a folder')
                return

        # First get files in this folder
        self.downloadFiles(fID_from, path_to, '"%s" in parents and trashed = false and mimeType != "application/vnd.google-apps.folder"')
        # Then get subfolders in this folder
        self.downloadFiles(fID_from, path_to, '"%s" in parents and trashed = false and mimeType = "application/vnd.google-apps.folder"')

    def readMeta(self, meta_file):
        """Load and return the YAML metadata stored in ``meta_file``."""
        with codecs.open(meta_file, 'r', 'utf-8') as f:
            # safe_load matches the safe_dump used by writeMeta().
            metadata = yaml.safe_load(f)
        return metadata

    def writeMeta(self, meta_file, metadata):
        """Write ``metadata`` to ``meta_file`` as a block-style YAML document."""
        yaml_meta = yaml.safe_dump(metadata, default_flow_style=False,  explicit_start=True)
        with codecs.open(meta_file, 'w+', 'utf-8') as f:
            f.write(yaml_meta)

    def writeContent(self, content_file, content):
        """Write downloaded ``content`` to ``content_file`` unchanged."""
        # Using codecs will throw some decoding errors...
        # Bytes go out in binary mode so nothing is decoded or newline-
        # translated; text (the '' placeholder for empty items) stays in
        # text mode.
        mode = 'wb' if isinstance(content, bytes) else 'w'
        with open(content_file, mode) as f:
            f.write(content)

    def postProcessStats(self):
        """Close the stats file, if one is open, so every row is flushed."""
        if self.stats_file is not None:
            self.stats_file.close()
            self.stats_file = None

    def postProcessFiles(self):
        """Post-process every exported HTML/Markdown file in ``self.file_list``."""
        if self.verbose:
            print('Post-processing %d files' % len(self.file_list))

        for dirname, basename_raw, basename, meta_name, exported_type in self.file_list:
            file_in = os.path.join(self.root_path, dirname, basename_raw)
            file_out = os.path.join(self.root_path, dirname, basename)
            meta_file = os.path.join(self.root_path, dirname, meta_name)
            if exported_type == 'text/html':
                metadata = self.readMeta(meta_file)
                sanitize_html_file(file_in, file_out, metadata)
            elif exported_type == 'text/x-markdown':
                metadata = self.readMeta(meta_file)
                prepend_markdown_metadata(file_in, file_out, metadata)

    def postProcess(self):
        """Run the post-processing step that matches the current mode."""
        if self.stats_only:
            self.postProcessStats()
        else:
            self.postProcessFiles()
Example #5
0
class GDriveDownloader():
    def __init__(self, maxdepth=1000000, verbose=False, stats_only=False):
        """Create the Drive auth helper and reset all traversal state.

        ``client_secrets.json`` and ``credentials.json`` are looked up one
        directory above the directory that holds this module.
        """
        project_dir = os.path.dirname(os.path.dirname(__file__))

        def in_project(name):
            return os.path.join(project_dir, name)

        self.drive_auth = DriveServiceAuth(in_project('client_secrets.json'),
                                           in_project('credentials.json'))

        # Options supplied by the caller.
        self.maxdepth = maxdepth
        self.verbose = verbose
        self.stats_only = stats_only

        # Runtime state, initially empty.
        self.drive_service = None
        self.root_path = None
        self.stats_file = None
        self.depth = 0
        self.file_list = []

        print('GDriveDownloader maxdepth %d, verbose %r' % (maxdepth, verbose))

    def initService(self):
        self.drive_service = self.drive_auth.build_service()

    def getLocalTitle(self, item, metadata=None):
        """Derive local naming information for a Drive item.

        Args:
            item: Drive item dict; 'title' and 'mimeType' are always read,
                'kind' and 'exportLinks' for Google Docs.
            metadata: Optional dict whose 'export_as' entry overrides the
                default HTML export of a Google Doc.

        Returns:
            Tuple ``(local_title, cleaned_title, sorted_title,
            sort_priority, file_type)``.  ``file_type`` is the MIME type to
            export as, or None when the item is not exported.
        """
        title = item['title'].strip()
        mime_type = item['mimeType']

        file_type = None
        if mime_type == 'application/vnd.google-apps.document':
            if item['kind'] == 'drive#file' and 'exportLinks' in item:
                # Google Docs export as HTML unless the metadata says otherwise.
                file_type = 'text/html'
                if metadata is not None and 'export_as' in metadata:
                    export_as = metadata['export_as']
                    file_type = GDRIVE_EXPORT_AS.get(export_as)
                    if file_type is None:
                        print('%s: export_as %s not in %r, using html' %
                              (title, export_as, GDRIVE_EXPORT_AS.keys()))
                        file_type = 'text/html'
                    elif export_as == 'pdf' and title[-4:].lower() != '.pdf':
                        title += '.pdf'

                if file_type not in item['exportLinks']:
                    print('%s: unable to export type %s' % (title, file_type))
                    file_type = None
        elif mime_type == 'text/plain' and title.endswith('.md'):
            file_type = 'text/x-markdown'

        # Titles such as "010] Intro" carry a three-digit sort priority;
        # everything else sorts last with priority 999.
        prefixed = re.match(r'^(([0-9]{3})\]\s*)(.+)$', title)
        if prefixed:
            priority_text = prefixed.group(2)
            title = prefixed.group(3)
        else:
            priority_text = '999'
        sort_priority = int(priority_text)

        # Hyphenated, lower-case slug
        local_title = slugify(title)
        # Original file name, stripped of leading underscore or trailing extensions
        cleaned_title = re.sub(r'(^_|[_]*\.(pdf|yml|md|html)$)', '', title,
                               flags=re.IGNORECASE)
        sorted_title = priority_text + ']' + cleaned_title

        if self.verbose:
            print('localTitle for "%s" kind "%s" mime "%s" ' %
                  (item['title'], item['kind'], mime_type))
            print('returning "%s" file_type "%s"' % (local_title, file_type))
        return (local_title, cleaned_title, sorted_title, sort_priority,
                file_type)

    # Pull description from Google Drive
    # If there is '---' at the end of description, parse remaining bits as yaml.
    def parseGDriveMeta(self, item):
        description = None
        raw_meta = None
        if 'description' in item:
            description = item['description'].strip()
            yaml_i = description.find('---')
            if yaml_i >= 0:
                raw_meta = yaml.load(description[yaml_i:].strip())
                description = description[:yaml_i].strip()
            if len(description) == 0:
                description = None

        gdrive_meta = {}
        if raw_meta is None:
            # No yaml part - create one
            if description is not None:
                gdrive_meta['summary'] = description
        else:
            # Update yaml part
            if description is not None and 'summary' not in raw_meta:
                raw_meta['summary'] = description
            for key in [
                    k for k in USER_META_KEYS
                    if k in raw_meta and raw_meta[k] is not None
            ]:
                gdrive_meta[key] = raw_meta[key]
        return gdrive_meta

    def appendStats(self, item_type, item_meta):
        """Append one tab-separated row to the stats file.

        Args:
            item_type: Not used by this method.
            item_meta: Metadata dict; the ``STATS_META_FIELDS`` entries are
                written in order, with missing/falsy values as ''.
        """
        # NOTE(review): under Python 3 codecs.encode() returns bytes, which
        # '\t'.join() rejects - confirm the target interpreter and how
        # self.stats_file is opened before changing this.
        encoded_vals = []
        for field in STATS_META_FIELDS:
            value = item_meta.get(field) or ''
            encoded_vals.append(codecs.encode(value, 'utf-8'))
        row = '\t'.join(encoded_vals)
        self.stats_file.write(row)
        self.stats_file.write('\n')

    def getDownloadContent(self, download_url):
        content = None
        if download_url:
            resp, content = self.drive_service._http.request(download_url)
            if resp.status != 200:
                raise RuntimeError('An error occurred: %s' % resp)
        else:
            # The file doesn't have any content stored on Drive.
            content = ''
        return content

    def makeFolder(self, folder_item, path_to):
        """Create the local folder for a Drive folder and record its metadata.

        Unless ``self.stats_only`` is set, the directory is created when it
        does not exist yet and its metadata goes to ``_folder_.yml`` inside
        it; in stats mode the metadata is appended to the stats file.

        Args:
            folder_item: Drive item dict of the folder.
            path_to: Parent path relative to ``self.root_path``; '' for
                the top-level folder.

        Returns:
            Path of the new folder relative to ``self.root_path``.
        """
        (local_title, cleaned_title, sorted_title, sort_priority,
         exported_type) = self.getLocalTitle(folder_item)

        if path_to == '':
            # Top-level folder: it sits directly under the root and is
            # recorded with '/' as its dirname.
            path_to = '/'
            new_path = local_title
        else:
            new_path = os.path.join(path_to, local_title)
        new_folder = os.path.join(self.root_path, new_path)

        if not self.stats_only and not os.path.exists(new_folder):
            os.mkdir(new_folder)
            if self.verbose:
                print('Created folder "%s" in "%s"' % (local_title, path_to))

        # Pull description from Google Drive
        gdrive_meta = self.parseGDriveMeta(folder_item)
        modifier = folder_item['lastModifyingUserName']
        created = folder_item['createdDate']
        folder_meta = {
            'author': modifier,
            'basename': local_title,
            'basename_raw': local_title,
            'date': created,
            'dirname': path_to,
            'email': folder_item['lastModifyingUser']['emailAddress'],
            'exported_type': exported_type,
            'relative_url': local_title,
            'slug': local_title,
            'source_id': folder_item['id'],
            'source_type': folder_item['mimeType'],
            'sort_priority': sort_priority,
            'sorted_title': sorted_title,
            'summary': None,  # TODO
            'template': None,  # TODO
            'title': cleaned_title,
            'modified': folder_item['modifiedDate'],
            'version': folder_item['version'],
        }
        # Values parsed from the Drive description win over the defaults.
        folder_meta.update(gdrive_meta)

        if self.stats_only:
            self.appendStats('folder', folder_meta)
            return new_path

        self.writeMeta(os.path.join(new_folder, '_folder_.yml'), folder_meta)
        return new_path

    def downloadFiles(self, fID_from, path_to, query_format):
        # Go through children with pagination
        query = query_format % fID_from
        page_token = None
        while True:
            result = self.drive_service.files().list(pageToken=page_token,
                                                     q=query).execute()

            # Alternative way to get children:
            #   (returns `drive#childReference` instead of `drive#file`)
            # result = self.drive_service.children().list(folderId=fID_from).execute()
            for child in result['items']:
                if child['kind'] != 'drive#file':
                    print('Unknown object type (not file or folder): "%s"' %
                          child['kind'])
                    pp(child)

                source_type = child['mimeType']
                if source_type == 'application/vnd.google-apps.folder':
                    self.depth += 1
                    new_folder = self.makeFolder(child, path_to)
                    self.recursiveDownloadInto(child['id'], new_folder)
                    self.depth -= 1
                    # print('Returned from "%s" (id: %s)' % (child['title'], child['id']))
                    # print('  back in folder %s at depth %d' % (path_to, self.depth))

                else:
                    gdrive_meta = self.parseGDriveMeta(child)
                    local_title, cleaned_title, sorted_title, sort_priority, exported_type = self.getLocalTitle(
                        child, gdrive_meta)

                    meta_name = file_name = local_title

                    # Handle .yml files
                    if re.search(r'\.yml$', file_name):
                        # Depending on how you edit or upload .yml files in Google Drive
                        # The mime type reported could be text/plain or application/octet-stream
                        # Avoid improperly dealing with Google Docs or Sheets inadvertently saved with .yml extension
                        if re.match(r'(text|application)\/',
                                    source_type) and not re.match(
                                        r'application\/vnd\.google-apps',
                                        source_type):
                            source_type = 'text/yaml'
                            exported_type = None
                        else:
                            if self.verbose:
                                print('Unknown source type for .yml file: ' %
                                      source_type)
                                sys.exit(1)

                    # Handle .html and .md exported files
                    if exported_type == 'text/html':
                        file_name += '.html'

                    raw_file_name = file_name
                    if exported_type in ['text/html', 'text/x-markdown']:
                        raw_file_name = make_raw_filename(file_name)

                    new_file = os.path.join(self.root_path, path_to,
                                            raw_file_name)
                    if self.verbose:
                        print('Trying to download "%s"' % child['title'])
                    try:
                        # Lower-case url, with .md converted to .html
                        relative_url = re.sub(r'\.md$',
                                              '.html',
                                              local_title,
                                              flags=re.IGNORECASE)

                        # Lower-case slug, stripped of .yml, .md, .html, and leading _
                        # .pdf and image extensions are left alone
                        slug = re.sub(r'\.(yml|md|html)$',
                                      '',
                                      local_title,
                                      flags=re.IGNORECASE)
                        if slug[:1] == '_' and relative_url[-5:] == '.html':
                            slug = slug[1:]

                        # Pull description from Google Drive
                        file_meta = {
                            'author': child['lastModifyingUserName'],
                            'basename': file_name,
                            'basename_raw': raw_file_name,
                            'date': child['createdDate'],
                            'dirname': path_to,
                            'email':
                            child['lastModifyingUser']['emailAddress'],
                            'exported_type': exported_type,
                            'relative_url': relative_url,
                            'slug': slug,
                            'source_id': child['id'],
                            'source_type': source_type,
                            'sort_priority': sort_priority,
                            'sorted_title': sorted_title,
                            'summary': None,
                            'template': None,
                            'title': cleaned_title,
                            'modified': child['modifiedDate'],
                            'version': child['version']
                        }
                        file_meta.update(gdrive_meta)

                        if not self.stats_only:
                            # Download the file
                            download_url = None
                            if 'exportLinks' in child and exported_type in child[
                                    'exportLinks']:
                                download_url = child['exportLinks'][
                                    exported_type]
                            elif 'downloadUrl' in child:
                                download_url = child['downloadUrl']
                            file_content = self.getDownloadContent(
                                download_url)

                            if source_type == 'text/yaml':
                                try:
                                    source_meta = yaml.load(file_content)
                                    if isinstance(source_meta, dict):
                                        file_meta.update(source_meta)
                                    else:
                                        raise Exception(
                                            'YAML object %r is not a dict' %
                                            source_meta)
                                except Exception as e:
                                    print('Error parsing YAML from %s: %s' %
                                          (download_url, e))
                            else:
                                self.writeContent(new_file, file_content)
                                meta_name = make_meta_filename(file_name)

                            meta_file = os.path.join(self.root_path, path_to,
                                                     meta_name)
                            self.writeMeta(meta_file, file_meta)

                            if self.verbose:
                                print('Write to file "%s" exported as %s' %
                                      (new_file, exported_type))

                        if exported_type is not None:
                            if self.stats_only:
                                self.appendStats('file', file_meta)
                            else:
                                self.file_list.append(
                                    (path_to, raw_file_name, file_name,
                                     meta_name, exported_type))

                    except Exception as e:
                        print('  Failed: %s\n' % e)
                        raise

            # Get page
            page_token = result.get('nextPageToken')
            if not page_token:
                break

    def recursiveDownloadInto(self, fID_from, path_to):
        """Download the Drive item ``fID_from`` into ``path_to``.

        Nothing happens once ``self.depth`` is greater than ``self.maxdepth``.
        At depth 0 the item has to be a Drive folder: ``path_to`` is stored
        as ``self.root_path`` and then replaced by the result of
        ``self.makeFolder(item, '')``.  In stats-only mode the ``stats.tsv``
        file is opened at depth 0 and its header row is written.  The
        non-folder children are requested first, the sub-folders second.
        """
        if self.depth > self.maxdepth:
            if self.verbose:
                print('Maximum depth %d exceeded' % self.depth)
            return

        if not self.drive_service:
            self.initService()

        request = self.drive_service.files().get(fileId=fID_from)
        item = request.execute()
        if self.verbose:
            print('Recursively downloading "%s" (id: %s)' %
                  (item['title'], item['id']))
            print('  into folder %s at depth %d' % (path_to, self.depth))

        at_top_level = self.depth == 0

        if at_top_level and self.stats_only:
            project_dir = os.path.dirname(os.path.dirname(__file__))
            stats_fname = os.path.join(project_dir, 'stats.tsv')
            # The handle stays open on the instance for later stats rows.
            self.stats_file = codecs.EncodedFile(open(stats_fname, 'w'),
                                                 'utf-8')
            self.stats_file.write('\t'.join(STATS_META_FIELDS))
            self.stats_file.write('\n')

        if at_top_level:
            is_folder = (
                item['kind'] == 'drive#file' and
                item['mimeType'] == 'application/vnd.google-apps.folder')
            if not is_folder:
                print('Top level item is not a folder')
                return
            self.root_path = path_to
            path_to = self.makeFolder(item, '')

        # Files of this folder first, then its sub-folders.
        query_templates = (
            '"%s" in parents and trashed = false and mimeType != "application/vnd.google-apps.folder"',
            '"%s" in parents and trashed = false and mimeType = "application/vnd.google-apps.folder"',
        )
        for query_template in query_templates:
            self.downloadFiles(fID_from, path_to, query_template)

    def readMeta(self, meta_file):
        """Load the YAML metadata stored in ``meta_file``.

        Args:
            meta_file: Path of a UTF-8 encoded YAML file.

        Returns:
            The parsed YAML document.
        """
        with codecs.open(meta_file, 'r', 'utf-8') as f:
            # safe_load replaces the bare yaml.load(f): calling yaml.load
            # without a Loader can construct arbitrary Python objects and is
            # rejected by current PyYAML releases.  writeMeta() emits these
            # files with yaml.safe_dump, so the safe loader reads them back.
            return yaml.safe_load(f)

    def writeMeta(self, meta_file, metadata):
        """Serialise ``metadata`` as block-style YAML into ``meta_file``.

        The document is rendered first (with an explicit ``---`` start
        marker) and only then written out as UTF-8, so a serialisation
        error leaves any existing file untouched.
        """
        dump_options = {'default_flow_style': False, 'explicit_start': True}
        rendered = yaml.safe_dump(metadata, **dump_options)
        with codecs.open(meta_file, 'w+', 'utf-8') as out:
            out.write(rendered)

    def writeContent(self, content_file, content):
        """Write downloaded ``content`` to ``content_file``, replacing it.

        Args:
            content_file: Path of the file to create or overwrite.
            content: Payload to store, either raw bytes or text.
        """
        # Raw bytes must go through a binary handle: a text-mode handle
        # rejects bytes on Python 3 and translates newlines on Windows,
        # which corrupts non-text payloads.  Text keeps the original mode.
        # (codecs.open is deliberately not used here; the original author
        # noted that it throws decoding errors on this content.)
        mode = 'wb' if isinstance(content, bytes) else 'w+'
        with open(content_file, mode) as f:
            f.write(content)

    def postProcessStats(self):
        """Post-processing hook for stats-only runs; currently does nothing."""
        return None

    def postProcessFiles(self):
        """Post-process every HTML and Markdown entry of ``self.file_list``.

        Each entry is ``(dirname, basename_raw, basename, meta_name,
        exported_type)``.  HTML exports go through ``sanitize_html_file``
        and Markdown exports through ``prepend_markdown_metadata``, each
        with the metadata read back from the entry's meta file.  Entries of
        any other type are skipped.
        """
        if self.verbose:
            print('Post-processing %d files' % len(self.file_list))

        for entry in self.file_list:
            dirname, basename_raw, basename, meta_name, exported_type = entry
            file_in, file_out, meta_file = [
                os.path.join(self.root_path, dirname, leaf)
                for leaf in (basename_raw, basename, meta_name)
            ]
            if exported_type not in ('text/html', 'text/x-markdown'):
                continue

            metadata = self.readMeta(meta_file)
            if exported_type == 'text/html':
                sanitize_html_file(file_in, file_out, metadata)
            else:
                prepend_markdown_metadata(file_in, file_out, metadata)

    def postProcess(self):
        """Run the post-processing step that matches the current mode.

        Stats-only runs call ``postProcessStats``; every other run calls
        ``postProcessFiles``.
        """
        step = self.postProcessStats if self.stats_only else self.postProcessFiles
        step()
Example #6
0
class PageUploader():

    def __init__(self, max_files=100, verbose=False, dry_run=False, mime_types=None):
        """Set up an uploader; ``drive_service`` stays ``None`` until
        ``initService`` is called.

        Args:
            max_files: Upload limit checked by ``createFile``; a value
                of 0 or less disables the limit.
            verbose: Print extra diagnostic messages.
            dry_run: Build mock folders/files instead of calling Drive.
            mime_types: One MIME type, or a list/tuple of MIME types, that
                may be uploaded.  ``None`` or an empty value allows every
                type (``'*/*'``).
        """
        secrets_path = os.path.join(os.path.dirname(__file__), 'client_secrets.json')
        credentials_path = os.path.join(os.path.dirname(__file__), 'credentials.json')
        self.drive_auth = DriveServiceAuth(secrets_path, credentials_path)
        self.drive_service = None
        self.verbose = verbose
        self.dry_run = dry_run
        # One if/elif/else chain: with two separate ``if`` statements the
        # '*/*' default was immediately overwritten by ``[None]`` (or by the
        # empty list), which made createFile() skip every single file.
        if not mime_types:
            self.mime_types = ['*/*']
        elif isinstance(mime_types, (list, tuple)):
            self.mime_types = mime_types
        else:
            self.mime_types = [mime_types]
        self.max_files = max_files
        self.counter = 0
        # Tag attached to everything created by this run (minute precision).
        self.tag = 'UPLOADER-' + datetime.now().replace(second=0, microsecond=0).isoformat()
        self.folders = { }
        self.links = { }

    def mockFolder(self, parent_id, metadata={}):
        folder_id = uuid.uuid4().hex
        file = { 'id': folder_id }
        file.update(metadata)
        if self.verbose and '*/*' in self.mime_types:
            print('Folder %s mocked at %s' % (file['title'], parent_id))
        return file

    def mockFile(self, parent_id, metadata={}):
        file_id = uuid.uuid4().hex
        link = 'https://docs.google.com/a/kentfieldschools.org/document/d/%s/edit?usp=drivesdk' % file_id
        file = { 'id': file_id, 'alternateLink': link }
        file.update(metadata)
        print('File %s mocked at %s' % (file['title'], parent_id))
        return file

    def addLink(self, link_url, file):
        u = urlparse(link_url)
        link = urlunparse((u.scheme, u.netloc, u.path, None, None, None)).lower()
        self.links[link] = { 'href': file['alternateLink'], 'title': file['title'], 'id': file['id'] }

    def dumpLinks(self):
        print('\n\nLINKS:')
        print('title\tfrom\tto')
        for url in sorted(self.links.keys()):
            print('%s\t%s\t%s' % (self.links[url]['title'], url, self.links[url]['href']))

    def initService(self):
        self.drive_service = self.drive_auth.build_service()
        about = self.drive_service.about().get().execute()
        root_folder_id = about['rootFolderId']
        self.folders['/'] = { 'parent': None, 'drive_id': root_folder_id }

    def searchFolder(self, name, parent_id):
        query = 'mimeType=\'application/vnd.google-apps.folder\' and trashed=false and title=\'%s\'' % name
        if parent_id:
            query += ' and \'%s\' in parents' % parent_id

        param = {
            'q': query,
            'fields': 'items(id,kind,mimeType,modifiedDate,title)'
        }
        result = []
        page_token = None
        while True:
            try:
                if page_token:
                    param['pageToken'] = page_token
                files = self.drive_service.files().list(**param).execute()
                result.extend(files['items'])
                page_token = files.get('nextPageToken')
                if not page_token:
                    break
            except apierrors.HttpError as error:
                print('An error occurred: %s' % error)

        file = None
        if result:
            file = result[0]
        return file

    def createFolder(self, title, parent_id):
        if title is None or title.strip() == '':
            raise UntitledFolder
 
        folder_metadata = {
            'title': title,
            'parents': [ { 'id': parent_id } ],
            'mimeType': 'application/vnd.google-apps.folder',
            'properties': { 'tag': self.tag },
        }


        file = None
        if self.dry_run:
            file = self.mockFolder(parent_id, folder_metadata)
        else:
            try:
                file = self.drive_service.files().insert(body=folder_metadata).execute()
            except apierrors.HttpError as error:
                print('An error occurred: %s' % error)
            time.sleep(0.5)
        return file
 
    def findOrCreateFolder(self, name, parent_id):
        created = False
        file = None
        if not self.dry_run:
            file = self.searchFolder(name, parent_id)
        if file is None:
            file = self.createFolder(name, parent_id)
            created = True
        return (file, created)

    def createFile(self, title, description, parent_id, filename, mime_type, to_mime_type=None):
        """Insert new file.

        Args:
            service: Drive API service instance.
            title: Title of the file to insert, including the extension.
            description: Description of the file to insert.
            parent_id: Parent folder's ID.
            mime_type: MIME type of the file to insert.
            filename: Filename of the file to insert.
        Returns:
            Inserted file metadata if successful, None otherwise.
        """
        if self.max_files > 0 and self.counter >= self.max_files:
            raise MaxUpload('Maximum %d files uploaded' % self.counter)

        if not('*/*' in self.mime_types or mime_type in self.mime_types):
            return None

        self.counter += 1    

        if title is None or title.strip() == '':
            raise UntitledFile

        file_metadata = {
            'title': title,
            'mimeType': to_mime_type,
            'properties': { 'tag': self.tag },
        }

        file = None
        if self.dry_run:
            file = self.mockFile(parent_id, file_metadata)
        else:
            media_body = MediaFileUpload(filename, mimetype=mime_type, resumable=True)
            if to_mime_type is None:
                to_mime_type = mime_type

            if description:
                file_metadata['description'] = description
            if parent_id:
                file_metadata['parents'] = [ { 'id': parent_id } ]

            try:
                file = self.drive_service.files().insert(
                    body=file_metadata, media_body=media_body).execute()
                print('File %s inserted at %s' % (title, parent_id))
            except apierrors.HttpError as error:
                print('An error occurred: %s' % error)
            time.sleep(0.5)
        return file
 
    def getFolderId(self, folder):
        folder_stack = [ ]
        tmpf = folder
        while not (tmpf == '/' or tmpf in self.folders):
            folder_stack.append(tmpf)
            tmpf = os.path.dirname(tmpf)

        while folder_stack:
            tmpf = folder_stack.pop()
            parent, child = os.path.split(tmpf)

            f = self.folders.get(parent)
            if not f:
                raise Exception('Bad stack - folder %s' % tmpf)
            parent_id = f.get('drive_id')
            if not parent_id:
                raise Exception('No id for %s' % parent)

            file, created = self.findOrCreateFolder(child, parent_id)
            if not file:
                raise Exception('Failed to locate Drive folder %s in %s (%s)' % (child, parent, parent_id))
            tid = file['id']
            self.folders[tmpf] = { 'parent': parent, 'drive_id': tid }

        folder_id = self.folders[folder]['drive_id']
        return folder_id

    def getFolderPathFromUrl(self, url):
        u = urlparse(url)
        folder = None
        # Remove empty '' 
        parts = u.path.split('/')[1:]
        if u.hostname == 'www.kentfieldschools.org':
            if len(parts) > 0:
                if parts[0] == 'pages':
                    folder = '/Sites/District/Pages'
                    if len(parts) > 1 and parts[1] == 'Kentfield_School_District':
                        del parts[1]
                    if len(parts) > 1:
                        if parts[1] == 'News':
                            folder = '/Sites/District/Articles'
                            parts = []
                elif parts[0] == 'files':
                    folder = '/Sites/District/Files'
                    parts = []
        elif u.hostname == 'www.edlinesites.net':
            if len(parts) > 0:
                if parts[0] == 'pages':
                    if len(parts) > 1:
                        school = parts[1].split('_', 1)[0]
                        folder = '/Sites/' + school + '/Pages'
                        del parts[1]
                    if len(parts) > 1:
                        if parts[1] == 'News':
                            folder = '/Sites/' + school + '/Articles'
                            parts = []
                elif parts[0] == 'files':
                    folder = '/Sites/District/Files'
                    parts = []
        else:
            if self.verbose:
                print('foreign host %s' % u.hostname)
            parts = []

        if len(parts) > 2:
            parts = parts[1:-1]
            folder = folder + '/' + '/'.join([re.sub(r'[_]+', ' ', p).strip() for p in parts])
        # print('%s -> %s' % (url, folder))
        return folder

    def uploadImage(self, meta):
        folder_id = None
        folder = self.getFolderPathFromUrl(meta['location'])
        if folder is None:
            if self.verbose:
                print('cannot upload image %s - no folder' % meta['location'])
        else:
            folder_id = self.getFolderId(folder)
            if folder_id is None and self.verbose:
                print('cannot upload image %s - no folder id for %s' % (meta['location'], folder))

        if folder_id:
            mime_type = meta['content_type']
            path = os.path.join(IMAGES_STORE, meta['path'])
            file = self.createFile(meta['title'], None, folder_id, path, mime_type)
            if file:
                self.addLink(meta['link_url'], file)

    def uploadFile(self, meta):
        folder_id = None
        folder = self.getFolderPathFromUrl(meta['location'])
        if folder is None:
            if self.verbose:
                print('cannot upload file %s - no folder' % meta['location'])
        else:
            folder_id = self.getFolderId(folder)
            if folder_id is None and self.verbose:
                print('cannot upload file %s - no folder id for %s' % (meta['location'], folder))

        if folder_id:
            mime_type = None
            to_mime_type = None
            path = os.path.join(FILES_STORE, meta['path'])
            file_type = get_file_type(meta['content_type'])
            if file_type == 'html':
                mime_type = 'text/html'
                to_mime_type = 'application/vnd.google-apps.document'
                cleaned_path = make_cleaned_path(path)
                if sanitize_html_file(path, cleaned_path, meta['url'], self.links):
                    path = cleaned_path
            elif file_type == 'pdf':
                mime_type = 'application/pdf'
            if mime_type is None:
                if self.verbose:
                    print('cannot upload - no mime type for %s (%s)' % (meta['path'], meta['content_type']))
            else:
                file = self.createFile(meta['title'], None, folder_id, path, mime_type, to_mime_type)
                if file:
                    self.addLink(meta['link_url'], file)

    def uploadAllItems(self, fname):
        """Upload the images, files and inlines listed in a JSON file.

        ``fname`` is loaded with ``json.load`` and iterated as a sequence of
        items.  A first pass uploads each item's 'images' (through
        ``uploadImage``) and 'files' (through ``uploadFile``); a second pass
        uploads each item's 'inlines' (through ``uploadFile``).  Before an
        entry is uploaded it is updated in place with the entry at the same
        index of the matching '*_metas' list, when there is one.

        A ``MaxUpload`` exception ends the whole run quietly.  Any other
        exception is reported together with the offending entry and then
        re-raised.
        """
        self.initService()

        with open(fname) as data_file:    
            items = json.load(data_file)
            # First pass: images and regular files of every item.
            for item in items:
                if 'images' in item:
                    for i in range(len(item['images'])):
                        meta = item['images'][i]
                        # Merge the extra metadata stored at the same index.
                        if 'image_metas' in item and len(item['image_metas']) > i:
                            meta.update(item['image_metas'][i])
                        try:
                            self.uploadImage(meta)
                        except MaxUpload as e:
                            # Upload limit reached: stop the whole run.
                            return
                        except Exception as e:
                            print('images[%d] %s: %r' % (i, e.__class__.__name__, meta))
                            raise
                            # NOTE(review): unreachable, the raise above always exits.
                            return
                if 'files' in item:
                    for i in range(len(item['files'])):
                        meta = item['files'][i]
                        # Merge the extra metadata stored at the same index.
                        if 'file_metas' in item and len(item['file_metas']) > i:
                            meta.update(item['file_metas'][i])
                        try:
                            self.uploadFile(meta)
                        except MaxUpload as e:
                            # Upload limit reached: stop the whole run.
                            return
                        except Exception as e:
                            print('files[%d] %s: %r' % (i, e.__class__.__name__, meta))
                            raise
                            # NOTE(review): unreachable, the raise above always exits.
                            return
            # Second pass: inlines, after all images and files were handled.
            # NOTE(review): presumably so self.links is already populated
            # when inline HTML is sanitised; confirm.
            for item in items:
                if 'inlines' in item:
                    for i in range(len(item['inlines'])):
                        meta = item['inlines'][i]
                        # Merge the extra metadata stored at the same index.
                        if 'inline_metas' in item and len(item['inline_metas']) > i:
                            meta.update(item['inline_metas'][i])
                        try:
                            self.uploadFile(meta)
                        except MaxUpload as e:
                            # Upload limit reached: stop the whole run.
                            return
                        except Exception as e:
                            print('inlines[%d] %s: %r' % (i, e.__class__.__name__, meta))
                            raise
                            # NOTE(review): unreachable, the raise above always exits.
                            return