def main():
    """ Main function to start the confluence-dumper. """

    # Configure console for unicode output via stdout/stderr
    sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
    sys.stderr = codecs.getwriter('utf-8')(sys.stderr)

    # Welcome output
    print_welcome_output()

    # Delete old export
    if os.path.exists(settings.EXPORT_FOLDER):
        shutil.rmtree(settings.EXPORT_FOLDER)
    os.makedirs(settings.EXPORT_FOLDER)

    # Read HTML template
    with open(settings.TEMPLATE_FILE) as template_file:
        html_template = template_file.read()

    # Fetch all spaces if spaces were not configured via settings
    if len(settings.SPACES_TO_EXPORT) > 0:
        spaces_to_export = settings.SPACES_TO_EXPORT
    else:
        spaces_to_export = []
        page_url = '%s/rest/api/space?limit=25' % settings.CONFLUENCE_BASE_URL
        while page_url:
            response = utils.http_get(page_url, auth=settings.HTTP_AUTHENTICATION,
                                      headers=settings.HTTP_CUSTOM_HEADERS,
                                      verify_peer_certificate=settings.VERIFY_PEER_CERTIFICATE,
                                      proxies=settings.HTTP_PROXIES)
            for space in response['results']:
                spaces_to_export.append(space['key'])
            if 'next' in response['_links'].keys():
                page_url = response['_links']['next']
                page_url = '%s%s' % (settings.CONFLUENCE_BASE_URL, page_url)
            else:
                page_url = None

    print('Exporting %d space(s): %s\n' % (len(spaces_to_export), ', '.join(spaces_to_export)))

    # Export spaces
    space_counter = 0
    duplicate_space_names = {}
    space_matching = {}
    for space in spaces_to_export:
        space_counter += 1

        # Create folders for this space
        space_folder_name = provide_unique_file_name(duplicate_space_names, space_matching, space, is_folder=True)
        space_folder = '%s/%s' % (settings.EXPORT_FOLDER, space_folder_name)
        try:
            os.makedirs(space_folder)
            download_folder = '%s/%s' % (space_folder, settings.DOWNLOAD_SUB_FOLDER)
            os.makedirs(download_folder)

            space_url = '%s/rest/api/space/%s?expand=homepage' % (settings.CONFLUENCE_BASE_URL, space)
            response = utils.http_get(space_url, auth=settings.HTTP_AUTHENTICATION,
                                      headers=settings.HTTP_CUSTOM_HEADERS,
                                      verify_peer_certificate=settings.VERIFY_PEER_CERTIFICATE,
                                      proxies=settings.HTTP_PROXIES)
            space_name = response['name']

            print('SPACE (%d/%d): %s (%s)' % (space_counter, len(spaces_to_export), space_name, space))

            # Start from a configured page id if set, otherwise from the space homepage
            space_page_id = settings.from_page_id or response['homepage']['id']
            path_collection = fetch_page_recursively(space_page_id, space_folder, download_folder, html_template)

            if path_collection:
                # Create index file for this space
                space_index_path = '%s/index.html' % space_folder
                space_index_title = 'Index of Space %s (%s)' % (space_name, space)
                space_index_content = create_html_index(path_collection)
                utils.write_html_2_file(space_index_path, space_index_title, space_index_content, html_template)
        except utils.ConfluenceException as e:
            error_print('ERROR: %s' % e)
        except OSError:
            print('WARNING: The space %s has already been exported. It may be listed twice in the settings.'
                  % space)

    # Finished output
    print_finished_output()
def fetch_page_recursively(page_id, folder_path, download_folder, html_template, depth=0,
                           page_duplicate_file_names=None, page_file_matching=None,
                           attachment_duplicate_file_names=None, attachment_file_matching=None):
    """ Fetches a Confluence page and its child pages (with referenced downloads).

    :param page_id: Confluence page id.
    :param folder_path: Folder to place downloaded pages in.
    :param download_folder: Folder to place downloaded files in.
    :param html_template: HTML template used to export Confluence pages.
    :param depth: (optional) Hierarchy depth of the handled Confluence page.
    :param page_duplicate_file_names: A dict in the structure {'<sanitized page filename>': amount of duplicates}
    :param page_file_matching: A dict in the structure {'<page title>': '<used offline filename>'}
    :param attachment_duplicate_file_names: A dict in the structure {'<sanitized attachment filename>': amount of \
        duplicates}
    :param attachment_file_matching: A dict in the structure {'<attachment title>': '<used offline filename>'}
    :returns: Information about downloaded files (pages, attachments, images, ...) as a dict (None for exceptions)
    """
    if not page_duplicate_file_names:
        page_duplicate_file_names = {}
    if not page_file_matching:
        page_file_matching = {}
    if not attachment_duplicate_file_names:
        attachment_duplicate_file_names = {}
    if not attachment_file_matching:
        attachment_file_matching = {}

    page_url = '%s/rest/api/content/%s?expand=children.page,children.attachment,body.view.value' \
               % (settings.CONFLUENCE_BASE_URL, page_id)
    try:
        response = utils.http_get(page_url, auth=settings.HTTP_AUTHENTICATION,
                                  headers=settings.HTTP_CUSTOM_HEADERS,
                                  verify_peer_certificate=settings.VERIFY_PEER_CERTIFICATE,
                                  proxies=settings.HTTP_PROXIES)
        page_content = response['body']['view']['value']

        page_title = response['title']
        print('%sPAGE: %s (%s)' % ('\t' * (depth + 1), page_title, page_id))

        # Construct unique file name
        file_name = provide_unique_file_name(page_duplicate_file_names, page_file_matching, str(page_id),
                                             explicit_file_extension='html')

        # Remember this file and all children
        path_collection = {'file_path': file_name, 'page_title': page_title, 'child_pages': [],
                           'child_attachments': []}

        # Download attachments of this page
        # TODO: Outsource/Abstract the following two while loops because of much duplicate code.
        page_url = '%s/rest/api/content/%s/child/attachment?limit=25' % (settings.CONFLUENCE_BASE_URL, page_id)
        counter = 0
        while page_url:
            response = utils.http_get(page_url, auth=settings.HTTP_AUTHENTICATION,
                                      headers=settings.HTTP_CUSTOM_HEADERS,
                                      verify_peer_certificate=settings.VERIFY_PEER_CERTIFICATE,
                                      proxies=settings.HTTP_PROXIES)
            counter += len(response['results'])
            for attachment in response['results']:
                download_url = attachment['_links']['download']
                attachment_id = attachment['id'][3:]
                attachment_info = download_attachment(download_url, download_folder, attachment_id,
                                                      attachment_duplicate_file_names, attachment_file_matching,
                                                      depth=depth + 1)
                path_collection['child_attachments'].append(attachment_info)
            if 'next' in response['_links'].keys():
                page_url = response['_links']['next']
                page_url = '%s%s' % (settings.CONFLUENCE_BASE_URL, page_url)
            else:
                page_url = None

        # Export HTML file
        page_content = handle_html_references(page_content, page_duplicate_file_names, page_file_matching,
                                              depth=depth + 1)
        file_path = '%s/%s' % (folder_path, file_name)
        page_content += create_html_attachment_index(path_collection['child_attachments'])
        utils.write_html_2_file(file_path, page_title, page_content, html_template)

        # Save another file with page id which forwards to the original one
        id_file_path = '%s/%s.html' % (folder_path, page_id)
        id_file_page_title = 'Forward to page %s' % page_title
        original_file_link = utils.encode_url(utils.sanitize_for_filename(file_name))
        id_file_page_content = settings.HTML_FORWARD_MESSAGE % (original_file_link, page_title)
        id_file_forward_header = '<meta http-equiv="refresh" content="0; url=%s" />' % original_file_link
        utils.write_html_2_file(id_file_path, id_file_page_title, id_file_page_content, html_template,
                                additional_headers=[id_file_forward_header])

        # Iterate through all child pages
        page_url = '%s/rest/api/content/%s/child/page?limit=25' % (settings.CONFLUENCE_BASE_URL, page_id)
        counter = 0
        while page_url:
            response = utils.http_get(page_url, auth=settings.HTTP_AUTHENTICATION,
                                      headers=settings.HTTP_CUSTOM_HEADERS,
                                      verify_peer_certificate=settings.VERIFY_PEER_CERTIFICATE,
                                      proxies=settings.HTTP_PROXIES)
            counter += len(response['results'])
            for child_page in response['results']:
                paths = fetch_page_recursively(child_page['id'], folder_path, download_folder, html_template,
                                               depth=depth + 1,
                                               page_duplicate_file_names=page_duplicate_file_names,
                                               page_file_matching=page_file_matching)
                if paths:
                    path_collection['child_pages'].append(paths)
            if 'next' in response['_links'].keys():
                page_url = response['_links']['next']
                page_url = '%s%s' % (settings.CONFLUENCE_BASE_URL, page_url)
            else:
                page_url = None
        return path_collection
    except utils.ConfluenceException as e:
        error_print('%sERROR: %s' % ('\t' * (depth + 1), e))
        return None
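

# A possible shape for the TODO in fetch_page_recursively: the paging loops for
# spaces, attachments, and child pages all walk a Confluence REST collection the
# same way. The generator below is only a sketch of that abstraction and is not
# wired into the functions above; the name _paged_results is an assumption, not
# part of the original code. It reuses the same utils.http_get call and settings
# that the existing loops already use.
def _paged_results(first_page_url):
    """ Yields every result of a paged Confluence REST collection, following 'next' links (sketch). """
    page_url = first_page_url
    while page_url:
        response = utils.http_get(page_url, auth=settings.HTTP_AUTHENTICATION,
                                  headers=settings.HTTP_CUSTOM_HEADERS,
                                  verify_peer_certificate=settings.VERIFY_PEER_CERTIFICATE,
                                  proxies=settings.HTTP_PROXIES)
        for result in response['results']:
            yield result
        if 'next' in response['_links'].keys():
            page_url = '%s%s' % (settings.CONFLUENCE_BASE_URL, response['_links']['next'])
        else:
            page_url = None

# With this helper, the attachment loop above could be reduced to:
#     for attachment in _paged_results('%s/rest/api/content/%s/child/attachment?limit=25'
#                                      % (settings.CONFLUENCE_BASE_URL, page_id)):
#         ...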