Example #1
def create_html_index(index_content):
    """ Creates an HTML index (mainly to navigate through the exported pages).

    :param index_content: Dictionary with the page's file path, page title and, recursively, its child pages.
    :returns: Content index as HTML.
    """
    file_path = utils.encode_url(index_content['file_path'])
    page_title = index_content['page_title']
    page_children = index_content['child_pages']

    html_content = '<a href="%s">%s</a>' % (utils.sanitize_for_filename(file_path), page_title)

    if page_children:
        html_content += '<ul>\n'
        for child in page_children:
            html_content += '\t<li>%s</li>\n' % create_html_index(child)
        html_content += '</ul>\n'

    return html_content
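
A minimal usage sketch (not part of the original module): the project's utils helpers are stubbed here with plausible stand-ins, since their real implementations live elsewhere.

from types import SimpleNamespace
from urllib.parse import quote

# Hypothetical stand-ins for the project's utils module
utils = SimpleNamespace(
    encode_url=quote,  # assumed: URL-encodes a link target
    sanitize_for_filename=lambda name: name.replace('/', '_'),  # assumed behavior
)

index = {
    'file_path': 'Home.html',
    'page_title': 'Home',
    'child_pages': [
        {'file_path': 'Child Page.html', 'page_title': 'Child Page', 'child_pages': []},
    ],
}
print(create_html_index(index))
# <a href="Home.html">Home</a><ul>
#     <li><a href="Child%20Page.html">Child Page</a></li>
# </ul>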
Example #2
def provide_unique_file_name(duplicate_file_names,
                             file_matching,
                             file_title,
                             is_folder=False,
                             explicit_file_extension=None):
    """ Provides an unique AND sanitized file name for a given page title. Confluence does not allow the same page title
    in one particular space but collisions are possible after filesystem sanitization.

    :param duplicate_file_names: A dict in the structure {'<sanitized filename>': amount of duplicates}
    :param file_matching: A dict in the structure {'<file title>': '<used offline filename>'}
    :param file_title: File title which is used to generate the unique file name
    :param is_folder: (optional) Flag which states whether the file is a folder
    :param explicit_file_extension: (optional) Explicitly set file extension (e.g. 'html')
    :returns: Unique and sanitized file name.
    """
    if file_title in file_matching:
        file_name = file_matching[file_title]
    else:
        file_name = utils.sanitize_for_filename(file_title)

        if is_folder:
            file_extension = None
        elif explicit_file_extension:
            file_extension = explicit_file_extension
        else:
            if '.' in file_name:
                file_name, file_extension = file_name.rsplit('.', 1)
            else:
                file_extension = None

        if file_name in duplicate_file_names:
            duplicate_file_names[file_name] += 1
            file_name = '%s_%d' % (file_name, duplicate_file_names[file_name])
        else:
            duplicate_file_names[file_name] = 0

        if file_extension:
            file_name += '.%s' % file_extension

        file_matching[file_title] = file_name
    return file_name
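
A short call sequence illustrating the collision handling; all names are illustrative, and utils.sanitize_for_filename is stubbed with an assumed behavior (stripping '/') purely for demonstration.

from types import SimpleNamespace

# Assumed stub: the real sanitizer removes filesystem-hostile characters
utils = SimpleNamespace(sanitize_for_filename=lambda name: name.replace('/', ''))

duplicates, matching = {}, {}
provide_unique_file_name(duplicates, matching, 'Setup.html')    # -> 'Setup.html'
provide_unique_file_name(duplicates, matching, 'Setup.html')    # -> 'Setup.html' (cached in file_matching)
provide_unique_file_name(duplicates, matching, 'Set/up.html')   # -> 'Setup_1.html' (collision after sanitization)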
Example #3
def fetch_page_recursively(page_id,
                           folder_path,
                           download_folder,
                           html_template,
                           depth=0,
                           page_duplicate_file_names=None,
                           page_file_matching=None,
                           attachment_duplicate_file_names=None,
                           attachment_file_matching=None):
    """ Fetches a Confluence page and its child pages (with referenced downloads).

    :param page_id: Confluence page id.
    :param folder_path: Folder to place downloaded pages in.
    :param download_folder: Folder to place downloaded files in.
    :param html_template: HTML template used to export Confluence pages.
    :param depth: (optional) Hierarchy depth of the handled Confluence page.
    :param page_duplicate_file_names: A dict in the structure {'<sanitized page filename>': amount of duplicates}
    :param page_file_matching: A dict in the structure {'<page title>': '<used offline filename>'}
    :param attachment_duplicate_file_names: A dict in the structure {'<sanitized attachment filename>': amount of \
                                            duplicates}
    :param attachment_file_matching: A dict in the structure {'<attachment title>': '<used offline filename>'}
    :returns: Information about downloaded files (pages, attachments, images, ...) as a dict, or None if an exception occurred.
    """
    # An explicitly passed (possibly empty) dict must be kept, because it carries shared state across recursive calls
    if page_duplicate_file_names is None:
        page_duplicate_file_names = {}
    if page_file_matching is None:
        page_file_matching = {}
    if attachment_duplicate_file_names is None:
        attachment_duplicate_file_names = {}
    if attachment_file_matching is None:
        attachment_file_matching = {}

    page_url = '%s/rest/api/content/%s?expand=children.page,children.attachment,body.view.value' \
               % (settings.CONFLUENCE_BASE_URL, page_id)
    try:
        response = utils.http_get(
            page_url,
            auth=settings.HTTP_AUTHENTICATION,
            headers=settings.HTTP_CUSTOM_HEADERS,
            verify_peer_certificate=settings.VERIFY_PEER_CERTIFICATE,
            proxies=settings.HTTP_PROXIES)
        page_content = response['body']['view']['value']

        page_title = response['title']
        print('%sPAGE: %s (%s)' % ('\t' * (depth + 1), page_title, page_id))

        # Construct unique file name
        file_name = provide_unique_file_name(page_duplicate_file_names,
                                             page_file_matching,
                                             str(page_id),
                                             explicit_file_extension='html')

        # Remember this file and all children
        path_collection = {
            'file_path': file_name,
            'page_title': page_title,
            'child_pages': [],
            'child_attachments': []
        }

        # Download attachments of this page
        # TODO: Factor the following two while loops out into a shared helper; they duplicate a lot of code.
        page_url = '%s/rest/api/content/%s/child/attachment?limit=25' % (
            settings.CONFLUENCE_BASE_URL, page_id)
        counter = 0
        while page_url:
            response = utils.http_get(
                page_url,
                auth=settings.HTTP_AUTHENTICATION,
                headers=settings.HTTP_CUSTOM_HEADERS,
                verify_peer_certificate=settings.VERIFY_PEER_CERTIFICATE,
                proxies=settings.HTTP_PROXIES)
            counter += len(response['results'])
            for attachment in response['results']:
                download_url = attachment['_links']['download']
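                # Confluence prefixes attachment ids with 'att' in the REST response; strip that prefix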
                attachment_id = attachment['id'][3:]
                attachment_info = download_attachment(
                    download_url,
                    download_folder,
                    attachment_id,
                    attachment_duplicate_file_names,
                    attachment_file_matching,
                    depth=depth + 1)
                path_collection['child_attachments'].append(attachment_info)

            if 'next' in response['_links']:
                page_url = response['_links']['next']
                page_url = '%s%s' % (settings.CONFLUENCE_BASE_URL, page_url)
            else:
                page_url = None

        # Export HTML file
        page_content = handle_html_references(page_content,
                                              page_duplicate_file_names,
                                              page_file_matching,
                                              depth=depth + 1)
        file_path = '%s/%s' % (folder_path, file_name)
        page_content += create_html_attachment_index(
            path_collection['child_attachments'])
        utils.write_html_2_file(file_path, page_title, page_content,
                                html_template)

        # Save another file named after the page id which forwards to the original one
        id_file_path = '%s/%s.html' % (folder_path, page_id)
        id_file_page_title = 'Forward to page %s' % page_title
        original_file_link = utils.encode_url(
            utils.sanitize_for_filename(file_name))
        id_file_page_content = settings.HTML_FORWARD_MESSAGE % (original_file_link, page_title)
        id_file_forward_header = '<meta http-equiv="refresh" content="0; url=%s" />' % original_file_link
        utils.write_html_2_file(id_file_path, id_file_page_title, id_file_page_content, html_template,
                                additional_headers=[id_file_forward_header])

        # Iterate through all child pages
        page_url = '%s/rest/api/content/%s/child/page?limit=25' % (
            settings.CONFLUENCE_BASE_URL, page_id)
        counter = 0
        while page_url:
            response = utils.http_get(
                page_url,
                auth=settings.HTTP_AUTHENTICATION,
                headers=settings.HTTP_CUSTOM_HEADERS,
                verify_peer_certificate=settings.VERIFY_PEER_CERTIFICATE,
                proxies=settings.HTTP_PROXIES)
            counter += len(response['results'])
            for child_page in response['results']:
                paths = fetch_page_recursively(
                    child_page['id'],
                    folder_path,
                    download_folder,
                    html_template,
                    depth=depth + 1,
                    page_duplicate_file_names=page_duplicate_file_names,
                    page_file_matching=page_file_matching,
                    attachment_duplicate_file_names=attachment_duplicate_file_names,
                    attachment_file_matching=attachment_file_matching)
                if paths:
                    path_collection['child_pages'].append(paths)

            if 'next' in response['_links']:
                page_url = response['_links']['next']
                page_url = '%s%s' % (settings.CONFLUENCE_BASE_URL, page_url)
            else:
                page_url = None
        return path_collection

    except utils.ConfluenceException as e:
        error_print('%sERROR: %s' % ('\t' * (depth + 1), e))
        return None
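
A hypothetical top-level invocation; the paths, page id and template are illustrative, and settings/utils come from the surrounding project, so this is a sketch rather than a self-contained script. The returned dict plugs directly into create_html_index from Example #1.

html_template = '<html><head><title>%s</title></head><body>%s</body></html>'  # assumed placeholder
path_collection = fetch_page_recursively('65537',             # hypothetical root page id
                                         'export/pages',      # folder for exported HTML pages
                                         'export/downloads',  # folder for attachments and images
                                         html_template)
if path_collection:
    index_html = create_html_index(path_collection)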
Example #4
from lxml import html
from lxml.etree import ParserError, XMLSyntaxError


def handle_html_references(html_content,
                           page_duplicate_file_names,
                           page_file_matching,
                           depth=0):
    """ Repairs links in the page contents with local links.

    :param html_content: Confluence HTML content.
    :param page_duplicate_file_names: A dict in the structure {'<sanitized filename>': amount of duplicates}
    :param page_file_matching: A dict in the structure {'<page title>': '<used offline filename>'}
    :param depth: (optional) Hierarchy depth of the handled Confluence page.
    :returns: Fixed HTML content.
    """
    try:
        html_tree = html.fromstring(html_content)
    except ParserError:
        print('%sWARNING: Could not parse HTML content because the page is empty. Original content is kept as it is.'
              % ('\t' * (depth + 1)))
        return html_content
    except XMLSyntaxError:
        print('%sWARNING: Could not parse HTML content of the last page. Original content is kept as it is.'
              % ('\t' * (depth + 1)))
        return html_content

    # Fix links to other Confluence pages
    # Example: /display/TES/pictest1
    #       => pictest1.html
    # TODO: This code does not work for "Recent space activity" areas in space pages because of a different url format.
    xpath_expr = '//a[contains(@href, "/display/")]'
    for link_element in html_tree.xpath(xpath_expr):
        if not link_element.get('class'):
            page_title = link_element.attrib['href'].split('/')[3]
            page_title = page_title.replace('+', ' ')
            decoded_page_title = utils.decode_url(page_title)
            offline_link = provide_unique_file_name(
                page_duplicate_file_names,
                page_file_matching,
                decoded_page_title,
                explicit_file_extension='html')
            link_element.attrib['href'] = utils.encode_url(offline_link)

    # Fix links to other Confluence pages when page ids are used
    xpath_expr = '//a[contains(@href, "/pages/viewpage.action?pageId=")]'
    for link_element in html_tree.xpath(xpath_expr):
        if not link_element.get('class'):
            page_id = link_element.attrib['href'].split(
                '/pages/viewpage.action?pageId=')[1]
            offline_link = '%s.html' % utils.sanitize_for_filename(page_id)
            link_element.attrib['href'] = utils.encode_url(offline_link)

    # Fix attachment links
    xpath_expr = '//a[contains(@class, "confluence-embedded-file")]'
    for link_element in html_tree.xpath(xpath_expr):
        file_url = link_element.attrib['href']
        file_name = derive_downloaded_file_name(file_url)
        relative_file_path = '%s/%s' % (settings.DOWNLOAD_SUB_FOLDER,
                                        file_name)
        link_element.attrib['href'] = relative_file_path

    # Fix file paths for img tags
    # TODO: Handle non-<img> tags as well if necessary.
    # TODO: Support files with different versions as well if necessary.
    possible_image_xpaths = [
        '//img[contains(@src, "/download/")]',
        '//img[contains(@src, "/rest/documentConversion/latest/conversion/thumbnail/")]'
    ]
    xpath_expr = '|'.join(possible_image_xpaths)
    for img_element in html_tree.xpath(xpath_expr):
        # Replace file path
        file_url = img_element.attrib['src']
        file_name = derive_downloaded_file_name(file_url)
        relative_file_path = '%s/%s' % (settings.DOWNLOAD_SUB_FOLDER,
                                        file_name)
        img_element.attrib['src'] = relative_file_path

        # Add alt attribute if it does not exist yet
        if 'alt' not in img_element.attrib:
            img_element.attrib['alt'] = relative_file_path

    # Return text (not bytes) so callers can concatenate further HTML content
    return html.tostring(html_tree, encoding='unicode')
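
A small demonstration with stubbed collaborators: utils is assumed to offer quote/unquote-style helpers, and provide_unique_file_name is the function from Example #2. Only a /display/ link is exercised here, so settings is never touched.

from types import SimpleNamespace
from urllib.parse import quote, unquote

# Hypothetical stand-ins for the project's utils module
utils = SimpleNamespace(
    encode_url=quote,
    decode_url=unquote,
    sanitize_for_filename=lambda name: name.replace('/', '_'),
)

sample = '<p><a href="/display/SPACE/Some+Page">Some Page</a></p>'
print(handle_html_references(sample, {}, {}))
# <p><a href="Some%20Page.html">Some Page</a></p>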