def __init__(self, maxdepth=1000000, verbose=False, stats_only=False):
    """Initialize downloader state and credential file locations.

    No network access happens here; the Drive service is built lazily.
    """
    base_dir = os.path.dirname(os.path.dirname(__file__))
    secrets_path = os.path.join(base_dir, 'client_secrets.json')
    credentials_path = os.path.join(base_dir, 'credentials.json')
    self.drive_auth = DriveServiceAuth(secrets_path, credentials_path)
    self.drive_service = None
    self.depth = 0
    self.root_path = None
    self.maxdepth = maxdepth
    self.verbose = verbose
    self.stats_only = stats_only
    self.stats_file = None
    self.file_list = []
    print('GDriveDownloader maxdepth %d, verbose %r' % (maxdepth, verbose))
def __init__(self, maxdepth=1000000, verbose=False, stats_only=False):
    """Set up downloader configuration; the Drive service is created later."""
    project_root = os.path.dirname(os.path.dirname(__file__))
    self.drive_auth = DriveServiceAuth(
        os.path.join(project_root, 'client_secrets.json'),
        os.path.join(project_root, 'credentials.json'))
    # Service handle and filesystem bookkeeping.
    self.drive_service = None
    self.root_path = None
    self.stats_file = None
    self.depth = 0
    self.file_list = []
    # Caller-supplied options.
    self.maxdepth = maxdepth
    self.verbose = verbose
    self.stats_only = stats_only
    print('GDriveDownloader maxdepth %d, verbose %r' % (maxdepth, verbose))
def __init__(self, max_files=100, verbose=False, dry_run=False, mime_types=None):
    """Initialize uploader state, auth paths, and the accepted MIME-type filter.

    mime_types may be None (accept everything), a single type string, or a
    list/tuple of type strings.
    """
    secrets_path = os.path.join(os.path.dirname(__file__), 'client_secrets.json')
    credentials_path = os.path.join(os.path.dirname(__file__), 'credentials.json')
    self.drive_auth = DriveServiceAuth(secrets_path, credentials_path)
    self.drive_service = None
    self.verbose = verbose
    self.dry_run = dry_run
    # BUG FIX: these were two independent `if` statements, so a falsy
    # mime_types first set ['*/*'] and then immediately overwrote it with
    # [None] in the else branch. Use a single if/elif/else chain.
    if not mime_types:
        self.mime_types = ['*/*']
    elif isinstance(mime_types, (list, tuple)):
        self.mime_types = mime_types
    else:
        self.mime_types = [mime_types]
    self.max_files = max_files
    self.counter = 0
    # Minute-resolution tag stamped on every upload so a run can be identified.
    self.tag = 'UPLOADER-' + datetime.now().replace(second=0, microsecond=0).isoformat()
    self.folders = {}
    self.links = {}
class GDriveDownloader():
    """Recursively mirrors a Google Drive folder tree to the local disk.

    Folders become directories with a `_folder_.yml` metadata file; files are
    exported (Google Docs to html/pdf/..., `.md` plain text to markdown) and
    written alongside a per-file YAML metadata file. With stats_only=True
    nothing is written except a tab-separated `stats.tsv` summary.
    """

    def __init__(self, maxdepth=1000000, verbose=False, stats_only=False):
        """Set up auth paths and bookkeeping; no network access happens here."""
        secrets_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'client_secrets.json')
        credentials_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'credentials.json')
        self.drive_auth = DriveServiceAuth(secrets_path, credentials_path)
        self.drive_service = None  # built lazily by initService()
        self.depth = 0  # current recursion depth
        self.root_path = None  # local root dir; set at depth 0
        self.maxdepth = maxdepth
        self.verbose = verbose
        self.stats_only = stats_only
        self.stats_file = None  # opened in recursiveDownloadInto when stats_only
        self.file_list = []  # (dirname, raw_name, name, meta_name, exported_type)
        print('GDriveDownloader maxdepth %d, verbose %r' % (maxdepth, verbose))

    def initService(self):
        """Build the authorized Drive service client."""
        self.drive_service = self.drive_auth.build_service()

    def getLocalTitle(self, item, metadata=None):
        """Derive local naming info for a Drive item.

        Returns (local_title, cleaned_title, sorted_title, sort_priority,
        file_type) where file_type is the export MIME type or None.
        """
        local_title = None
        cleaned_title = None
        sort_priority = None
        sorted_title = None
        title = item['title'].strip()
        file_type = None
        if item['mimeType'] == 'application/vnd.google-apps.document':
            if item['kind'] == 'drive#file' and 'exportLinks' in item:
                file_type = 'text/html'
                # Override html export
                if metadata is not None and 'export_as' in metadata:
                    export_as = metadata['export_as']
                    file_type = GDRIVE_EXPORT_AS.get(export_as, None)
                    if file_type is None:
                        print('%s: export_as %s not in %r, using html' % (title, export_as, GDRIVE_EXPORT_AS.keys()))
                        file_type = 'text/html'
                    elif export_as == 'pdf' and title[-4:].lower() != '.pdf':
                        title += '.pdf'
                if file_type not in item['exportLinks']:
                    print('%s: unable to export type %s' % (title, file_type))
                    file_type = None
        elif item['mimeType'] == 'text/plain' and title.endswith('.md'):
            file_type = 'text/x-markdown'
        # A leading "NNN] " prefix encodes an explicit sort priority.
        m = re.match(r'^(([0-9]{3})\]\s*)(.+)$', title)
        if m:
            # Hyphenated, lower-case slug
            title = m.group(3)
            local_title = slugify(title)
            sort_priority = int(m.group(2))
            # Original file name, stripped of leading underscore or trailing extensions
            cleaned_title = re.sub(r'(^_|[_]*\.(pdf|yml|md|html)$)', '', title, flags=re.IGNORECASE)
            sorted_title = m.group(2) + ']' + cleaned_title
        else:
            # Hyphenated, lower-case slug; 999 sorts unprioritized items last.
            local_title = slugify(title)
            sort_priority = 999
            # Original file name, stripped of leading underscore or trailing extensions
            cleaned_title = re.sub(r'(^_|[_]*\.(pdf|yml|md|html)$)', '', title, flags=re.IGNORECASE)
            sorted_title = '999]' + cleaned_title
        if self.verbose:
            print('localTitle for "%s" kind "%s" mime "%s" ' % (item['title'], item['kind'], item['mimeType']))
            print('returning "%s" file_type "%s"' % (local_title, file_type))
        return (local_title, cleaned_title, sorted_title, sort_priority, file_type)

    # Pull description from Google Drive
    # If there is '---' at the end of description, parse remaining bits as yaml.
    def parseGDriveMeta(self, item):
        """Parse the Drive 'description' field into a metadata dict.

        Text before an optional '---' marker becomes 'summary'; the YAML after
        it contributes any keys listed in USER_META_KEYS.
        """
        description = None
        raw_meta = None
        if 'description' in item:
            description = item['description'].strip()
            yaml_i = description.find('---')
            if yaml_i >= 0:
                # NOTE(review): yaml.load without an explicit Loader is unsafe
                # on untrusted input; consider yaml.safe_load (kept as-is).
                raw_meta = yaml.load(description[yaml_i:].strip())
                description = description[:yaml_i].strip()
            if len(description) == 0:
                description = None
        gdrive_meta = {}
        if raw_meta is None:
            # No yaml part - create one
            if description is not None:
                gdrive_meta['summary'] = description
        else:
            # Update yaml part
            if description is not None and 'summary' not in raw_meta:
                raw_meta['summary'] = description
            for key in [k for k in USER_META_KEYS if k in raw_meta and raw_meta[k] is not None]:
                gdrive_meta[key] = raw_meta[key]
        return gdrive_meta

    def appendStats(self, item_type, item_meta):
        """Append one tab-separated stats row for item_meta (item_type unused)."""
        # NOTE(review): codecs.encode yields bytes; joining them with a str
        # '\t' only works on Python 2 -- confirm the target interpreter.
        item_vals = [codecs.encode(item_meta.get(f) or '', 'utf-8') for f in STATS_META_FIELDS]
        self.stats_file.write('\t'.join(item_vals))
        self.stats_file.write('\n')

    def getDownloadContent(self, download_url):
        """Fetch download_url via the service's HTTP client; '' for a falsy url."""
        content = None
        if download_url:
            resp, content = self.drive_service._http.request(download_url)
            if resp.status != 200:
                raise RuntimeError('An error occurred: %s' % resp)
        else:
            # The file doesn't have any content stored on Drive.
            content = ''
        return content

    def makeFolder(self, folder_item, path_to):
        """Create the local directory for folder_item and write its metadata.

        Returns the new path relative to self.root_path.
        """
        local_title, cleaned_title, sorted_title, sort_priority, exported_type = self.getLocalTitle(folder_item)
        new_path = None
        new_folder = None
        if path_to == '':
            path_to = '/'
            new_path = local_title
            new_folder = os.path.join(self.root_path, local_title)
        else:
            new_path = os.path.join(path_to, local_title)
            new_folder = os.path.join(self.root_path, new_path)
        if not self.stats_only:
            exists_check = os.path.exists(new_folder)
            if not exists_check:
                os.mkdir(new_folder)
                if self.verbose:
                    print('Created folder "%s" in "%s"' % (local_title, path_to))
        # Pull description from Google Drive
        gdrive_meta = self.parseGDriveMeta(folder_item)
        folder_meta = {
            'author': folder_item['lastModifyingUserName'],
            'basename': local_title,
            'basename_raw': local_title,
            'date': folder_item['createdDate'],
            'dirname': path_to,
            'email': folder_item['lastModifyingUser']['emailAddress'],
            'exported_type': exported_type,
            'relative_url': local_title,
            'slug': local_title,
            'source_id': folder_item['id'],
            'source_type': folder_item['mimeType'],
            'sort_priority': sort_priority,
            'sorted_title': sorted_title,
            'summary': None,  # TODO
            'template': None,  # TODO
            'title': cleaned_title,
            'modified': folder_item['modifiedDate'],
            'version': folder_item['version']
        }
        folder_meta.update(gdrive_meta)
        if self.stats_only:
            self.appendStats('folder', folder_meta)
        else:
            meta_file = os.path.join(new_folder, '_folder_.yml')
            self.writeMeta(meta_file, folder_meta)
        return new_path

    def downloadFiles(self, fID_from, path_to, query_format):
        """Download/export every child of folder fID_from matching query_format."""
        # Go through children with pagination
        query = query_format % fID_from
        page_token = None
        while True:
            result = self.drive_service.files().list(pageToken=page_token, q=query).execute()
            # Alternative way to get children:
            # (returns `drive#childReference` instead of `drive#file`)
            # result = self.drive_service.children().list(folderId=fID_from).execute()
            for child in result['items']:
                if child['kind'] != 'drive#file':
                    print('Unknown object type (not file or folder): "%s"' % child['kind'])
                    pp(child)
                source_type = child['mimeType']
                if source_type == 'application/vnd.google-apps.folder':
                    self.depth += 1
                    new_folder = self.makeFolder(child, path_to)
                    self.recursiveDownloadInto(child['id'], new_folder)
                    self.depth -= 1
                    # print('Returned from "%s" (id: %s)' % (child['title'], child['id']))
                    # print(' back in folder %s at depth %d' % (path_to, self.depth))
                else:
                    gdrive_meta = self.parseGDriveMeta(child)
                    local_title, cleaned_title, sorted_title, sort_priority, exported_type = self.getLocalTitle(child, gdrive_meta)
                    meta_name = file_name = local_title
                    # Handle .yml files
                    if re.search(r'\.yml$', file_name):
                        # Depending on how you edit or upload .yml files in Google Drive
                        # The mime type reported could be text/plain or application/octet-stream
                        # Avoid improperly dealing with Google Docs or Sheets inadvertently saved with .yml extension
                        if re.match(r'(text|application)\/', source_type) and not re.match(r'application\/vnd\.google-apps', source_type):
                            source_type = 'text/yaml'
                            exported_type = None
                        else:
                            if self.verbose:
                                # BUG FIX: the format string was missing its %s
                                # placeholder, so this line raised TypeError.
                                print('Unknown source type for .yml file: %s' % source_type)
                            sys.exit(1)
                    # Handle .html and .md exported files
                    if exported_type == 'text/html':
                        file_name += '.html'
                    raw_file_name = file_name
                    if exported_type in ['text/html', 'text/x-markdown']:
                        raw_file_name = make_raw_filename(file_name)
                    new_file = os.path.join(self.root_path, path_to, raw_file_name)
                    if self.verbose:
                        print('Trying to download "%s"' % child['title'])
                    try:
                        # Lower-case url, with .md converted to .html
                        relative_url = re.sub(r'\.md$', '.html', local_title, flags=re.IGNORECASE)
                        # Lower-case slug, stripped of .yml, .md, .html, and leading _
                        # .pdf and image extensions are left alone
                        slug = re.sub(r'\.(yml|md|html)$', '', local_title, flags=re.IGNORECASE)
                        if slug[:1] == '_' and relative_url[-5:] == '.html':
                            slug = slug[1:]
                        # Pull description from Google Drive
                        file_meta = {
                            'author': child['lastModifyingUserName'],
                            'basename': file_name,
                            'basename_raw': raw_file_name,
                            'date': child['createdDate'],
                            'dirname': path_to,
                            'email': child['lastModifyingUser']['emailAddress'],
                            'exported_type': exported_type,
                            'relative_url': relative_url,
                            'slug': slug,
                            'source_id': child['id'],
                            'source_type': source_type,
                            'sort_priority': sort_priority,
                            'sorted_title': sorted_title,
                            'summary': None,
                            'template': None,
                            'title': cleaned_title,
                            'modified': child['modifiedDate'],
                            'version': child['version']
                        }
                        file_meta.update(gdrive_meta)
                        if not self.stats_only:
                            # Download the file
                            download_url = None
                            if 'exportLinks' in child and exported_type in child['exportLinks']:
                                download_url = child['exportLinks'][exported_type]
                            elif 'downloadUrl' in child:
                                download_url = child['downloadUrl']
                            file_content = self.getDownloadContent(download_url)
                            if source_type == 'text/yaml':
                                try:
                                    # NOTE(review): yaml.load on downloaded content
                                    # is unsafe; prefer yaml.safe_load (kept as-is).
                                    source_meta = yaml.load(file_content)
                                    if isinstance(source_meta, dict):
                                        file_meta.update(source_meta)
                                    else:
                                        raise Exception('YAML object %r is not a dict' % source_meta)
                                except Exception as e:
                                    print('Error parsing YAML from %s: %s' % (download_url, e))
                            else:
                                self.writeContent(new_file, file_content)
                            meta_name = make_meta_filename(file_name)
                            meta_file = os.path.join(self.root_path, path_to, meta_name)
                            self.writeMeta(meta_file, file_meta)
                            if self.verbose:
                                print('Write to file "%s" exported as %s' % (new_file, exported_type))
                        if exported_type is not None:
                            if self.stats_only:
                                self.appendStats('file', file_meta)
                            else:
                                self.file_list.append((path_to, raw_file_name, file_name, meta_name, exported_type))
                    except Exception as e:
                        print(' Failed: %s\n' % e)
                        raise
            # Get page
            page_token = result.get('nextPageToken')
            if not page_token:
                break

    def recursiveDownloadInto(self, fID_from, path_to):
        """Mirror Drive folder fID_from into local directory path_to, recursively."""
        if self.depth > self.maxdepth:
            if self.verbose:
                print('Maximum depth %d exceeded' % self.depth)
            return
        if not self.drive_service:
            self.initService()
        item = self.drive_service.files().get(fileId=fID_from).execute()
        if self.verbose:
            print('Recursively downloading "%s" (id: %s)' % (item['title'], item['id']))
            print(' into folder %s at depth %d' % (path_to, self.depth))
        if self.depth == 0:
            if self.stats_only:
                stats_fname = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'stats.tsv')
                self.stats_file = codecs.EncodedFile(open(stats_fname, 'w'), 'utf-8')
                self.stats_file.write('\t'.join(STATS_META_FIELDS))
                self.stats_file.write('\n')
            if item['kind'] == 'drive#file' and item['mimeType'] == 'application/vnd.google-apps.folder':
                self.root_path = path_to
                path_to = self.makeFolder(item, '')
            else:
                print('Top level item is not a folder')
                return
        # First get files in this folder
        self.downloadFiles(fID_from, path_to, '"%s" in parents and trashed = false and mimeType != "application/vnd.google-apps.folder"')
        # Then get subfolders in this folder
        self.downloadFiles(fID_from, path_to, '"%s" in parents and trashed = false and mimeType = "application/vnd.google-apps.folder"')

    def readMeta(self, meta_file):
        """Read and return the YAML metadata stored in meta_file."""
        metadata = {}
        with codecs.open(meta_file, 'r', 'utf-8') as f:
            metadata = yaml.load(f)
        return metadata

    def writeMeta(self, meta_file, metadata):
        """Serialize metadata as YAML (with explicit '---') into meta_file."""
        yaml_meta = yaml.safe_dump(metadata, default_flow_style=False, explicit_start=True)
        with codecs.open(meta_file, 'w+', 'utf-8') as f:
            f.write(yaml_meta)

    def writeContent(self, content_file, content):
        """Write raw downloaded content to content_file."""
        # Using codecs will throw some decoding errors...
        # with codecs.open(content_file, 'w+', 'utf-8') as f:
        with open(content_file, 'w+') as f:
            f.write(content)

    def postProcessStats(self):
        """Stats runs need no post-processing."""
        pass

    def postProcessFiles(self):
        """Sanitize html / prepend markdown metadata for every downloaded file."""
        if self.verbose:
            print('Post-processing %d files' % len(self.file_list))
        for dirname, basename_raw, basename, meta_name, exported_type in self.file_list:
            file_in = os.path.join(self.root_path, dirname, basename_raw)
            file_out = os.path.join(self.root_path, dirname, basename)
            meta_file = os.path.join(self.root_path, dirname, meta_name)
            if exported_type == 'text/html':
                metadata = self.readMeta(meta_file)
                sanitize_html_file(file_in, file_out, metadata)
            elif exported_type == 'text/x-markdown':
                metadata = self.readMeta(meta_file)
                prepend_markdown_metadata(file_in, file_out, metadata)

    def postProcess(self):
        """Dispatch to the stats or file post-processing pass."""
        if self.stats_only:
            self.postProcessStats()
        else:
            self.postProcessFiles()
class GDriveDownloader():
    """Recursively mirrors a Google Drive folder tree to the local disk.

    Folders become directories with a `_folder_.yml` metadata file; files are
    exported (Google Docs to html/pdf/..., `.md` plain text to markdown) and
    written alongside a per-file YAML metadata file. With stats_only=True
    nothing is written except a tab-separated `stats.tsv` summary.
    """

    def __init__(self, maxdepth=1000000, verbose=False, stats_only=False):
        """Set up auth paths and bookkeeping; no network access happens here."""
        secrets_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'client_secrets.json')
        credentials_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'credentials.json')
        self.drive_auth = DriveServiceAuth(secrets_path, credentials_path)
        self.drive_service = None  # built lazily by initService()
        self.depth = 0  # current recursion depth
        self.root_path = None  # local root dir; set at depth 0
        self.maxdepth = maxdepth
        self.verbose = verbose
        self.stats_only = stats_only
        self.stats_file = None  # opened in recursiveDownloadInto when stats_only
        self.file_list = []  # (dirname, raw_name, name, meta_name, exported_type)
        print('GDriveDownloader maxdepth %d, verbose %r' % (maxdepth, verbose))

    def initService(self):
        """Build the authorized Drive service client."""
        self.drive_service = self.drive_auth.build_service()

    def getLocalTitle(self, item, metadata=None):
        """Derive local naming info for a Drive item.

        Returns (local_title, cleaned_title, sorted_title, sort_priority,
        file_type) where file_type is the export MIME type or None.
        """
        local_title = None
        cleaned_title = None
        sort_priority = None
        sorted_title = None
        title = item['title'].strip()
        file_type = None
        if item['mimeType'] == 'application/vnd.google-apps.document':
            if item['kind'] == 'drive#file' and 'exportLinks' in item:
                file_type = 'text/html'
                # Override html export
                if metadata is not None and 'export_as' in metadata:
                    export_as = metadata['export_as']
                    file_type = GDRIVE_EXPORT_AS.get(export_as, None)
                    if file_type is None:
                        print('%s: export_as %s not in %r, using html' % (title, export_as, GDRIVE_EXPORT_AS.keys()))
                        file_type = 'text/html'
                    elif export_as == 'pdf' and title[-4:].lower() != '.pdf':
                        title += '.pdf'
                if file_type not in item['exportLinks']:
                    print('%s: unable to export type %s' % (title, file_type))
                    file_type = None
        elif item['mimeType'] == 'text/plain' and title.endswith('.md'):
            file_type = 'text/x-markdown'
        # A leading "NNN] " prefix encodes an explicit sort priority.
        m = re.match(r'^(([0-9]{3})\]\s*)(.+)$', title)
        if m:
            # Hyphenated, lower-case slug
            title = m.group(3)
            local_title = slugify(title)
            sort_priority = int(m.group(2))
            # Original file name, stripped of leading underscore or trailing extensions
            cleaned_title = re.sub(r'(^_|[_]*\.(pdf|yml|md|html)$)', '', title, flags=re.IGNORECASE)
            sorted_title = m.group(2) + ']' + cleaned_title
        else:
            # Hyphenated, lower-case slug; 999 sorts unprioritized items last.
            local_title = slugify(title)
            sort_priority = 999
            # Original file name, stripped of leading underscore or trailing extensions
            cleaned_title = re.sub(r'(^_|[_]*\.(pdf|yml|md|html)$)', '', title, flags=re.IGNORECASE)
            sorted_title = '999]' + cleaned_title
        if self.verbose:
            print('localTitle for "%s" kind "%s" mime "%s" ' % (item['title'], item['kind'], item['mimeType']))
            print('returning "%s" file_type "%s"' % (local_title, file_type))
        return (local_title, cleaned_title, sorted_title, sort_priority, file_type)

    # Pull description from Google Drive
    # If there is '---' at the end of description, parse remaining bits as yaml.
    def parseGDriveMeta(self, item):
        """Parse the Drive 'description' field into a metadata dict.

        Text before an optional '---' marker becomes 'summary'; the YAML after
        it contributes any keys listed in USER_META_KEYS.
        """
        description = None
        raw_meta = None
        if 'description' in item:
            description = item['description'].strip()
            yaml_i = description.find('---')
            if yaml_i >= 0:
                # NOTE(review): yaml.load without an explicit Loader is unsafe
                # on untrusted input; consider yaml.safe_load (kept as-is).
                raw_meta = yaml.load(description[yaml_i:].strip())
                description = description[:yaml_i].strip()
            if len(description) == 0:
                description = None
        gdrive_meta = {}
        if raw_meta is None:
            # No yaml part - create one
            if description is not None:
                gdrive_meta['summary'] = description
        else:
            # Update yaml part
            if description is not None and 'summary' not in raw_meta:
                raw_meta['summary'] = description
            for key in [k for k in USER_META_KEYS if k in raw_meta and raw_meta[k] is not None]:
                gdrive_meta[key] = raw_meta[key]
        return gdrive_meta

    def appendStats(self, item_type, item_meta):
        """Append one tab-separated stats row for item_meta (item_type unused)."""
        # NOTE(review): codecs.encode yields bytes; joining them with a str
        # '\t' only works on Python 2 -- confirm the target interpreter.
        item_vals = [codecs.encode(item_meta.get(f) or '', 'utf-8') for f in STATS_META_FIELDS]
        self.stats_file.write('\t'.join(item_vals))
        self.stats_file.write('\n')

    def getDownloadContent(self, download_url):
        """Fetch download_url via the service's HTTP client; '' for a falsy url."""
        content = None
        if download_url:
            resp, content = self.drive_service._http.request(download_url)
            if resp.status != 200:
                raise RuntimeError('An error occurred: %s' % resp)
        else:
            # The file doesn't have any content stored on Drive.
            content = ''
        return content

    def makeFolder(self, folder_item, path_to):
        """Create the local directory for folder_item and write its metadata.

        Returns the new path relative to self.root_path.
        """
        local_title, cleaned_title, sorted_title, sort_priority, exported_type = self.getLocalTitle(folder_item)
        new_path = None
        new_folder = None
        if path_to == '':
            path_to = '/'
            new_path = local_title
            new_folder = os.path.join(self.root_path, local_title)
        else:
            new_path = os.path.join(path_to, local_title)
            new_folder = os.path.join(self.root_path, new_path)
        if not self.stats_only:
            exists_check = os.path.exists(new_folder)
            if not exists_check:
                os.mkdir(new_folder)
                if self.verbose:
                    print('Created folder "%s" in "%s"' % (local_title, path_to))
        # Pull description from Google Drive
        gdrive_meta = self.parseGDriveMeta(folder_item)
        folder_meta = {
            'author': folder_item['lastModifyingUserName'],
            'basename': local_title,
            'basename_raw': local_title,
            'date': folder_item['createdDate'],
            'dirname': path_to,
            'email': folder_item['lastModifyingUser']['emailAddress'],
            'exported_type': exported_type,
            'relative_url': local_title,
            'slug': local_title,
            'source_id': folder_item['id'],
            'source_type': folder_item['mimeType'],
            'sort_priority': sort_priority,
            'sorted_title': sorted_title,
            'summary': None,  # TODO
            'template': None,  # TODO
            'title': cleaned_title,
            'modified': folder_item['modifiedDate'],
            'version': folder_item['version']
        }
        folder_meta.update(gdrive_meta)
        if self.stats_only:
            self.appendStats('folder', folder_meta)
        else:
            meta_file = os.path.join(new_folder, '_folder_.yml')
            self.writeMeta(meta_file, folder_meta)
        return new_path

    def downloadFiles(self, fID_from, path_to, query_format):
        """Download/export every child of folder fID_from matching query_format."""
        # Go through children with pagination
        query = query_format % fID_from
        page_token = None
        while True:
            result = self.drive_service.files().list(pageToken=page_token, q=query).execute()
            # Alternative way to get children:
            # (returns `drive#childReference` instead of `drive#file`)
            # result = self.drive_service.children().list(folderId=fID_from).execute()
            for child in result['items']:
                if child['kind'] != 'drive#file':
                    print('Unknown object type (not file or folder): "%s"' % child['kind'])
                    pp(child)
                source_type = child['mimeType']
                if source_type == 'application/vnd.google-apps.folder':
                    self.depth += 1
                    new_folder = self.makeFolder(child, path_to)
                    self.recursiveDownloadInto(child['id'], new_folder)
                    self.depth -= 1
                    # print('Returned from "%s" (id: %s)' % (child['title'], child['id']))
                    # print(' back in folder %s at depth %d' % (path_to, self.depth))
                else:
                    gdrive_meta = self.parseGDriveMeta(child)
                    local_title, cleaned_title, sorted_title, sort_priority, exported_type = self.getLocalTitle(child, gdrive_meta)
                    meta_name = file_name = local_title
                    # Handle .yml files
                    if re.search(r'\.yml$', file_name):
                        # Depending on how you edit or upload .yml files in Google Drive
                        # The mime type reported could be text/plain or application/octet-stream
                        # Avoid improperly dealing with Google Docs or Sheets inadvertently saved with .yml extension
                        if re.match(r'(text|application)\/', source_type) and not re.match(r'application\/vnd\.google-apps', source_type):
                            source_type = 'text/yaml'
                            exported_type = None
                        else:
                            if self.verbose:
                                # BUG FIX: the format string was missing its %s
                                # placeholder, so this line raised TypeError.
                                print('Unknown source type for .yml file: %s' % source_type)
                            sys.exit(1)
                    # Handle .html and .md exported files
                    if exported_type == 'text/html':
                        file_name += '.html'
                    raw_file_name = file_name
                    if exported_type in ['text/html', 'text/x-markdown']:
                        raw_file_name = make_raw_filename(file_name)
                    new_file = os.path.join(self.root_path, path_to, raw_file_name)
                    if self.verbose:
                        print('Trying to download "%s"' % child['title'])
                    try:
                        # Lower-case url, with .md converted to .html
                        relative_url = re.sub(r'\.md$', '.html', local_title, flags=re.IGNORECASE)
                        # Lower-case slug, stripped of .yml, .md, .html, and leading _
                        # .pdf and image extensions are left alone
                        slug = re.sub(r'\.(yml|md|html)$', '', local_title, flags=re.IGNORECASE)
                        if slug[:1] == '_' and relative_url[-5:] == '.html':
                            slug = slug[1:]
                        # Pull description from Google Drive
                        file_meta = {
                            'author': child['lastModifyingUserName'],
                            'basename': file_name,
                            'basename_raw': raw_file_name,
                            'date': child['createdDate'],
                            'dirname': path_to,
                            'email': child['lastModifyingUser']['emailAddress'],
                            'exported_type': exported_type,
                            'relative_url': relative_url,
                            'slug': slug,
                            'source_id': child['id'],
                            'source_type': source_type,
                            'sort_priority': sort_priority,
                            'sorted_title': sorted_title,
                            'summary': None,
                            'template': None,
                            'title': cleaned_title,
                            'modified': child['modifiedDate'],
                            'version': child['version']
                        }
                        file_meta.update(gdrive_meta)
                        if not self.stats_only:
                            # Download the file
                            download_url = None
                            if 'exportLinks' in child and exported_type in child['exportLinks']:
                                download_url = child['exportLinks'][exported_type]
                            elif 'downloadUrl' in child:
                                download_url = child['downloadUrl']
                            file_content = self.getDownloadContent(download_url)
                            if source_type == 'text/yaml':
                                try:
                                    # NOTE(review): yaml.load on downloaded content
                                    # is unsafe; prefer yaml.safe_load (kept as-is).
                                    source_meta = yaml.load(file_content)
                                    if isinstance(source_meta, dict):
                                        file_meta.update(source_meta)
                                    else:
                                        raise Exception('YAML object %r is not a dict' % source_meta)
                                except Exception as e:
                                    print('Error parsing YAML from %s: %s' % (download_url, e))
                            else:
                                self.writeContent(new_file, file_content)
                            meta_name = make_meta_filename(file_name)
                            meta_file = os.path.join(self.root_path, path_to, meta_name)
                            self.writeMeta(meta_file, file_meta)
                            if self.verbose:
                                print('Write to file "%s" exported as %s' % (new_file, exported_type))
                        if exported_type is not None:
                            if self.stats_only:
                                self.appendStats('file', file_meta)
                            else:
                                self.file_list.append((path_to, raw_file_name, file_name, meta_name, exported_type))
                    except Exception as e:
                        print(' Failed: %s\n' % e)
                        raise
            # Get page
            page_token = result.get('nextPageToken')
            if not page_token:
                break

    def recursiveDownloadInto(self, fID_from, path_to):
        """Mirror Drive folder fID_from into local directory path_to, recursively."""
        if self.depth > self.maxdepth:
            if self.verbose:
                print('Maximum depth %d exceeded' % self.depth)
            return
        if not self.drive_service:
            self.initService()
        item = self.drive_service.files().get(fileId=fID_from).execute()
        if self.verbose:
            print('Recursively downloading "%s" (id: %s)' % (item['title'], item['id']))
            print(' into folder %s at depth %d' % (path_to, self.depth))
        if self.depth == 0:
            if self.stats_only:
                stats_fname = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'stats.tsv')
                self.stats_file = codecs.EncodedFile(open(stats_fname, 'w'), 'utf-8')
                self.stats_file.write('\t'.join(STATS_META_FIELDS))
                self.stats_file.write('\n')
            if item['kind'] == 'drive#file' and item['mimeType'] == 'application/vnd.google-apps.folder':
                self.root_path = path_to
                path_to = self.makeFolder(item, '')
            else:
                print('Top level item is not a folder')
                return
        # First get files in this folder
        self.downloadFiles(fID_from, path_to, '"%s" in parents and trashed = false and mimeType != "application/vnd.google-apps.folder"')
        # Then get subfolders in this folder
        self.downloadFiles(fID_from, path_to, '"%s" in parents and trashed = false and mimeType = "application/vnd.google-apps.folder"')

    def readMeta(self, meta_file):
        """Read and return the YAML metadata stored in meta_file."""
        metadata = {}
        with codecs.open(meta_file, 'r', 'utf-8') as f:
            metadata = yaml.load(f)
        return metadata

    def writeMeta(self, meta_file, metadata):
        """Serialize metadata as YAML (with explicit '---') into meta_file."""
        yaml_meta = yaml.safe_dump(metadata, default_flow_style=False, explicit_start=True)
        with codecs.open(meta_file, 'w+', 'utf-8') as f:
            f.write(yaml_meta)

    def writeContent(self, content_file, content):
        """Write raw downloaded content to content_file."""
        # Using codecs will throw some decoding errors...
        # with codecs.open(content_file, 'w+', 'utf-8') as f:
        with open(content_file, 'w+') as f:
            f.write(content)

    def postProcessStats(self):
        """Stats runs need no post-processing."""
        pass

    def postProcessFiles(self):
        """Sanitize html / prepend markdown metadata for every downloaded file."""
        if self.verbose:
            print('Post-processing %d files' % len(self.file_list))
        for dirname, basename_raw, basename, meta_name, exported_type in self.file_list:
            file_in = os.path.join(self.root_path, dirname, basename_raw)
            file_out = os.path.join(self.root_path, dirname, basename)
            meta_file = os.path.join(self.root_path, dirname, meta_name)
            if exported_type == 'text/html':
                metadata = self.readMeta(meta_file)
                sanitize_html_file(file_in, file_out, metadata)
            elif exported_type == 'text/x-markdown':
                metadata = self.readMeta(meta_file)
                prepend_markdown_metadata(file_in, file_out, metadata)

    def postProcess(self):
        """Dispatch to the stats or file post-processing pass."""
        if self.stats_only:
            self.postProcessStats()
        else:
            self.postProcessFiles()
class PageUploader():
    """Uploads scraped site content (pages, files, images) into Google Drive.

    Maps old-site URLs to Drive folder paths, creates missing folders on
    demand, uploads files (optionally converting html to Google Docs), and
    records a url -> Drive-link table for later link rewriting. With
    dry_run=True no API calls are made and mock resources are returned.
    """

    def __init__(self, max_files=100, verbose=False, dry_run=False, mime_types=None):
        """Initialize uploader state, auth paths, and the MIME-type filter."""
        secrets_path = os.path.join(os.path.dirname(__file__), 'client_secrets.json')
        credentials_path = os.path.join(os.path.dirname(__file__), 'credentials.json')
        self.drive_auth = DriveServiceAuth(secrets_path, credentials_path)
        self.drive_service = None
        self.verbose = verbose
        self.dry_run = dry_run
        # BUG FIX: these were two independent `if` statements, so a falsy
        # mime_types set ['*/*'] and then immediately overwrote it with [None].
        if not mime_types:
            self.mime_types = ['*/*']
        elif isinstance(mime_types, (list, tuple)):
            self.mime_types = mime_types
        else:
            self.mime_types = [mime_types]
        self.max_files = max_files
        self.counter = 0
        # Minute-resolution tag stamped on every upload to identify this run.
        self.tag = 'UPLOADER-' + datetime.now().replace(second=0, microsecond=0).isoformat()
        self.folders = {}  # local folder path -> {'parent', 'drive_id'}
        self.links = {}  # normalized source url -> {'href', 'title', 'id'}

    def mockFolder(self, parent_id, metadata=None):
        """Return a fake folder resource for dry runs (no API call)."""
        # BUG FIX: mutable default argument {} replaced with a None sentinel.
        metadata = metadata or {}
        folder_id = uuid.uuid4().hex
        file = {'id': folder_id}
        file.update(metadata)
        if self.verbose and '*/*' in self.mime_types:
            print('Folder %s mocked at %s' % (file['title'], parent_id))
        return file

    def mockFile(self, parent_id, metadata=None):
        """Return a fake file resource with an alternateLink for dry runs."""
        # BUG FIX: mutable default argument {} replaced with a None sentinel.
        metadata = metadata or {}
        file_id = uuid.uuid4().hex
        link = 'https://docs.google.com/a/kentfieldschools.org/document/d/%s/edit?usp=drivesdk' % file_id
        file = {'id': file_id, 'alternateLink': link}
        file.update(metadata)
        print('File %s mocked at %s' % (file['title'], parent_id))
        return file

    def addLink(self, link_url, file):
        """Record the old-site url -> new Drive link mapping."""
        u = urlparse(link_url)
        # Normalize: drop params/query/fragment and lower-case the url.
        link = urlunparse((u.scheme, u.netloc, u.path, None, None, None)).lower()
        self.links[link] = {'href': file['alternateLink'], 'title': file['title'], 'id': file['id']}

    def dumpLinks(self):
        """Print all recorded link mappings as a tab-separated table."""
        print('\n\nLINKS:')
        print('title\tfrom\tto')
        for url in sorted(self.links.keys()):
            print('%s\t%s\t%s' % (self.links[url]['title'], url, self.links[url]['href']))

    def initService(self):
        """Build the Drive service and seed the folder cache with the root."""
        self.drive_service = self.drive_auth.build_service()
        about = self.drive_service.about().get().execute()
        root_folder_id = about['rootFolderId']
        self.folders['/'] = {'parent': None, 'drive_id': root_folder_id}

    def searchFolder(self, name, parent_id):
        """Return the first non-trashed Drive folder named `name` under parent_id, or None."""
        query = 'mimeType=\'application/vnd.google-apps.folder\' and trashed=false and title=\'%s\'' % name
        if parent_id:
            query += ' and \'%s\' in parents' % parent_id
        param = {'q': query, 'fields': 'items(id,kind,mimeType,modifiedDate,title)'}
        result = []
        page_token = None
        while True:
            try:
                if page_token:
                    param['pageToken'] = page_token
                files = self.drive_service.files().list(**param).execute()
                result.extend(files['items'])
                page_token = files.get('nextPageToken')
                if not page_token:
                    break
            except apierrors.HttpError as error:
                print('An error occurred: %s' % error)
                # BUG FIX: without this break an HttpError spun forever,
                # since page_token never advanced on the error path.
                break
        file = None
        if result:
            file = result[0]
        return file

    def createFolder(self, title, parent_id):
        """Create a Drive folder `title` under parent_id; mock it in dry runs."""
        if title is None or title.strip() == '':
            raise UntitledFolder
        folder_metadata = {
            'title': title,
            'parents': [{'id': parent_id}],
            'mimeType': 'application/vnd.google-apps.folder',
            'properties': {'tag': self.tag},
        }
        file = None
        if self.dry_run:
            file = self.mockFolder(parent_id, folder_metadata)
        else:
            try:
                file = self.drive_service.files().insert(body=folder_metadata).execute()
            except apierrors.HttpError as error:
                print('An error occurred: %s' % error)
            # Throttle to stay under API rate limits.
            time.sleep(0.5)
        return file

    def findOrCreateFolder(self, name, parent_id):
        """Return (folder_resource, created_flag) for `name` under parent_id."""
        created = False
        file = None
        if not self.dry_run:
            file = self.searchFolder(name, parent_id)
        if file is None:
            file = self.createFolder(name, parent_id)
            created = True
        return (file, created)

    def createFile(self, title, description, parent_id, filename, mime_type, to_mime_type=None):
        """Insert new file.

        Args:
          title: Title of the file to insert, including the extension.
          description: Description of the file to insert.
          parent_id: Parent folder's ID.
          filename: Local path of the file to upload.
          mime_type: MIME type of the local file.
          to_mime_type: Target Drive MIME type (conversion); defaults to mime_type.

        Returns:
          Inserted file metadata if successful, None otherwise.

        Raises:
          MaxUpload: when the max_files quota is exhausted.
          UntitledFile: when title is missing/blank.
        """
        if self.max_files > 0 and self.counter >= self.max_files:
            raise MaxUpload('Maximum %d files uploaded' % self.counter)
        if not ('*/*' in self.mime_types or mime_type in self.mime_types):
            return None
        self.counter += 1
        if title is None or title.strip() == '':
            raise UntitledFile
        # BUG FIX: to_mime_type was defaulted only AFTER file_metadata was
        # built, so real uploads sent 'mimeType': None; default it first.
        if to_mime_type is None:
            to_mime_type = mime_type
        file_metadata = {
            'title': title,
            'mimeType': to_mime_type,
            'properties': {'tag': self.tag},
        }
        file = None
        if self.dry_run:
            file = self.mockFile(parent_id, file_metadata)
        else:
            media_body = MediaFileUpload(filename, mimetype=mime_type, resumable=True)
            if description:
                file_metadata['description'] = description
            if parent_id:
                file_metadata['parents'] = [{'id': parent_id}]
            try:
                file = self.drive_service.files().insert(
                    body=file_metadata, media_body=media_body).execute()
                print('File %s inserted at %s' % (title, parent_id))
            except apierrors.HttpError as error:
                print('An error occurred: %s' % error)
            # Throttle to stay under API rate limits.
            time.sleep(0.5)
        return file

    def getFolderId(self, folder):
        """Resolve a local folder path to a Drive folder id, creating missing
        intermediate folders and caching results in self.folders."""
        folder_stack = []
        tmpf = folder
        # Walk up until a known (cached) ancestor or the root.
        while not (tmpf == '/' or tmpf in self.folders):
            folder_stack.append(tmpf)
            tmpf = os.path.dirname(tmpf)
        # Walk back down, creating/locating each path segment.
        while folder_stack:
            tmpf = folder_stack.pop()
            parent, child = os.path.split(tmpf)
            f = self.folders.get(parent)
            if not f:
                raise Exception('Bad stack - folder %s' % tmpf)
            parent_id = f.get('drive_id')
            if not parent_id:
                raise Exception('No id for %s' % parent)
            file, created = self.findOrCreateFolder(child, parent_id)
            if not file:
                raise Exception('Failed to locate Drive folder %s in %s (%s)' % (child, parent, parent_id))
            tid = file['id']
            self.folders[tmpf] = {'parent': parent, 'drive_id': tid}
        folder_id = self.folders[folder]['drive_id']
        return folder_id

    def getFolderPathFromUrl(self, url):
        """Map an old-site url to a destination Drive folder path, or None."""
        u = urlparse(url)
        folder = None
        # Remove empty ''
        parts = u.path.split('/')[1:]
        if u.hostname == 'www.kentfieldschools.org':
            if len(parts) > 0:
                if parts[0] == 'pages':
                    folder = '/Sites/District/Pages'
                    if len(parts) > 1 and parts[1] == 'Kentfield_School_District':
                        del parts[1]
                    if len(parts) > 1:
                        if parts[1] == 'News':
                            folder = '/Sites/District/Articles'
                            parts = []
                elif parts[0] == 'files':
                    folder = '/Sites/District/Files'
                    parts = []
        elif u.hostname == 'www.edlinesites.net':
            if len(parts) > 0:
                if parts[0] == 'pages':
                    if len(parts) > 1:
                        # School prefix of the page slug selects the site.
                        school = parts[1].split('_', 1)[0]
                        folder = '/Sites/' + school + '/Pages'
                        del parts[1]
                        if len(parts) > 1:
                            if parts[1] == 'News':
                                folder = '/Sites/' + school + '/Articles'
                                parts = []
                elif parts[0] == 'files':
                    folder = '/Sites/District/Files'
                    parts = []
        else:
            if self.verbose:
                print('foreign host %s' % u.hostname)
            parts = []
        # Intermediate path segments become subfolders (underscores -> spaces).
        if len(parts) > 2:
            parts = parts[1:-1]
            folder = folder + '/' + '/'.join([re.sub(r'[_]+', ' ', p).strip() for p in parts])
        # print('%s -> %s' % (url, folder))
        return folder

    def uploadImage(self, meta):
        """Upload one scraped image described by meta and record its link."""
        folder_id = None
        folder = self.getFolderPathFromUrl(meta['location'])
        if folder is None:
            if self.verbose:
                print('cannot upload image %s - no folder' % meta['location'])
        else:
            folder_id = self.getFolderId(folder)
            if folder_id is None and self.verbose:
                print('cannot upload image %s - no folder id for %s' % (meta['location'], folder))
        if folder_id:
            mime_type = meta['content_type']
            path = os.path.join(IMAGES_STORE, meta['path'])
            file = self.createFile(meta['title'], None, folder_id, path, mime_type)
            if file:
                self.addLink(meta['link_url'], file)

    def uploadFile(self, meta):
        """Upload one scraped file; html is sanitized and converted to a Doc."""
        folder_id = None
        folder = self.getFolderPathFromUrl(meta['location'])
        if folder is None:
            if self.verbose:
                print('cannot upload file %s - no folder' % meta['location'])
        else:
            folder_id = self.getFolderId(folder)
            if folder_id is None and self.verbose:
                print('cannot upload file %s - no folder id for %s' % (meta['location'], folder))
        if folder_id:
            mime_type = None
            to_mime_type = None
            path = os.path.join(FILES_STORE, meta['path'])
            file_type = get_file_type(meta['content_type'])
            if file_type == 'html':
                mime_type = 'text/html'
                to_mime_type = 'application/vnd.google-apps.document'
                cleaned_path = make_cleaned_path(path)
                if sanitize_html_file(path, cleaned_path, meta['url'], self.links):
                    path = cleaned_path
            elif file_type == 'pdf':
                mime_type = 'application/pdf'
            if mime_type is None:
                if self.verbose:
                    print('cannot upload - no mime type for %s (%s)' % (meta['path'], meta['content_type']))
            else:
                file = self.createFile(meta['title'], None, folder_id, path, mime_type, to_mime_type)
                if file:
                    self.addLink(meta['link_url'], file)

    def uploadAllItems(self, fname):
        """Upload every image, file, and inline attachment listed in the JSON
        manifest at fname. Stops quietly when MaxUpload is raised."""
        self.initService()
        with open(fname) as data_file:
            items = json.load(data_file)
            for item in items:
                if 'images' in item:
                    for i in range(len(item['images'])):
                        meta = item['images'][i]
                        if 'image_metas' in item and len(item['image_metas']) > i:
                            meta.update(item['image_metas'][i])
                        try:
                            self.uploadImage(meta)
                        except MaxUpload as e:
                            return
                        except Exception as e:
                            print('images[%d] %s: %r' % (i, e.__class__.__name__, meta))
                            raise
                            # NOTE(review): unreachable after raise in the
                            # original source; kept for fidelity.
                            return
                if 'files' in item:
                    for i in range(len(item['files'])):
                        meta = item['files'][i]
                        if 'file_metas' in item and len(item['file_metas']) > i:
                            meta.update(item['file_metas'][i])
                        try:
                            self.uploadFile(meta)
                        except MaxUpload as e:
                            return
                        except Exception as e:
                            print('files[%d] %s: %r' % (i, e.__class__.__name__, meta))
                            raise
                            # NOTE(review): unreachable after raise.
                            return
            # Inline attachments are processed in a second pass so that page
            # links recorded above are available for rewriting.
            for item in items:
                if 'inlines' in item:
                    for i in range(len(item['inlines'])):
                        meta = item['inlines'][i]
                        if 'inline_metas' in item and len(item['inline_metas']) > i:
                            meta.update(item['inline_metas'][i])
                        try:
                            self.uploadFile(meta)
                        except MaxUpload as e:
                            return
                        except Exception as e:
                            print('inlines[%d] %s: %r' % (i, e.__class__.__name__, meta))
                            raise
                            # NOTE(review): unreachable after raise.
                            return