예제 #1
0
async def read_page(browserurl, targeturl):
    """
    read a page and get the title
    originally https://github.com/HyperionGray/trio-chrome-devtools-protocol/blob/master/examples/get_title.py

    Parameters

    browserurl:
        address handed to ``open_cdp`` to reach the browser

    targeturl:
        url of the page to navigate to

    Returns

    the outer html of the node matched by the ``title`` selector
    """
    logger.info("Connecting to browser: %s", browserurl)
    async with open_cdp(browserurl) as conn:
        logger.info("Listing targets")
        targets = await conn.execute(target.get_targets())
        # The first listed target is used unconditionally.
        target_id = targets[0].target_id

        logger.info("Attaching to target id=%s", target_id)
        session = await conn.open_session(target_id)

        logger.info("Navigating to %s", targeturl)
        await session.execute(page.enable())
        async with session.wait_for(page.LoadEventFired):
            await session.execute(page.navigate(targeturl))
            # Debug output: report the type of the second element of the
            # print_to_pdf result.
            # NOTE(review): this runs inside the wait_for block, i.e. right
            # after navigate() -- confirm that is intended.
            _, item = await session.execute(page.print_to_pdf())
            print(f"pdf is: {type(item)}")

        logger.info("Extracting page title")
        root_node = await session.execute(dom.get_document())
        title_node_id = await session.execute(
            dom.query_selector(root_node.node_id, "title")
        )
        html = await session.execute(dom.get_outer_html(title_node_id))
        # The extracted markup used to be dropped on the floor; hand it back
        # so that "get the title" actually yields the title to the caller.
        return html
예제 #2
0
async def main():
    """Print the outer HTML of the ``title`` element of a page.

    The browser address is read from ``sys.argv[1]`` and the URL to load from
    ``sys.argv[2]``. The first target that is of type ``page``, does not have
    a ``devtools://`` URL and is not already attached is used.

    Raises:
        RuntimeError: If none of the listed targets is a suitable page.
    """
    logger.info('Connecting to browser: %s', sys.argv[1])
    async with open_cdp_connection(sys.argv[1]) as conn:
        logger.info('Listing targets')
        targets = await conn.execute(target.get_targets())

        # Select the first free, regular page. Without a default, a browser
        # that offers no such target left ``target_id`` unbound and the code
        # below crashed with an unrelated UnboundLocalError.
        target_id = next(
            (t.target_id for t in targets
             if t.type == 'page'
             and not t.url.startswith('devtools://')
             and not t.attached),
            None)
        if target_id is None:
            raise RuntimeError('No unattached page target found in browser')

        logger.info('Attaching to target id=%s', target_id)
        session = await conn.open_session(target_id)

        logger.info('Navigating to %s', sys.argv[2])
        await session.execute(page.enable())
        # The listener is set up before navigate() is issued, so the load
        # event cannot be missed.
        async with session.wait_for(page.LoadEventFired):
            await session.execute(page.navigate(sys.argv[2]))

        logger.info('Extracting page title')
        root_node = await session.execute(dom.get_document())
        title_node_id = await session.execute(
            dom.query_selector(root_node.node_id, 'title'))
        html = await session.execute(dom.get_outer_html(title_node_id))
        print(html)
예제 #3
0
async def save_pdf(browserurl, targeturl, pdfpath, sleeptime):
    """
    make a pdf from a webpage
    originally https://github.com/HyperionGray/trio-chrome-devtools-protocol/blob/master/examples/screenshot.py

    Parameters

    browserurl: str
        ws address for chrome developer protocol commands

    targeturl: str
        url of page to print to pdf

    pdfpath: str
        filename the pdf is written to

    sleeptime: float
        seconds to wait after the load event before reading the page and
        printing it
    """
    logger.info("Connecting to browser: %s", browserurl)
    async with open_cdp(browserurl) as conn:
        logger.info("Listing targets")
        targets = await conn.execute(target.get_targets())
        # The first listed target is used unconditionally.
        target_id = targets[0].target_id

        logger.info("Attaching to target id=%s", target_id)
        async with conn.open_session(target_id) as session:
            logger.info("Setting device emulation")
            await session.execute(
                emulation.set_device_metrics_override(
                    width=1200, height=2000, device_scale_factor=1, mobile=False
                )
            )

            logger.info("Enabling page events")
            await session.execute(page.enable())

            logger.info("Navigating to %s", targeturl)
            async with session.wait_for(page.LoadEventFired):
                await session.execute(page.navigate(url=targeturl))

            # Cooperative sleep: time.sleep() would block the whole trio
            # event loop (and every other task in it) for the duration.
            await trio.sleep(sleeptime)

            root_node = await session.execute(dom.get_document())
            body_node_id = await session.execute(
                dom.query_selector(root_node.node_id, "body")
            )
            body_html = await session.execute(dom.get_outer_html(body_node_id))

            logger.debug(body_html)

            logger.info("Saving a pdf")
            # TODO: make sure that javascript finishes rendering
            pdf_data, _ = await session.execute(page.print_to_pdf())

            # The pdf payload is base64 text; decode it before writing.
            async with await trio.open_file(pdfpath, "wb") as pdf_file:
                await pdf_file.write(b64decode(pdf_data))
            logger.info("wrote %s", pdfpath)
예제 #4
0
파일: cargo_doc.py 프로젝트: emmericp/ethox
async def convert_page(session, path, reduction_code):
    """Load a local HTML file, run a script in it and return its ``#main``.

    Args:
        session: Session used to issue the page, runtime and DOM commands.
        path: Filesystem path of the HTML file; it is made absolute and
            opened through a ``file://`` URL.
        reduction_code: Script source evaluated in the page once it has
            loaded.

    Returns:
        The outer HTML of the node matched by the ``#main`` selector.
    """
    urlpath = 'file://' + os.path.abspath(path)
    logger.info('Navigating to %s', urlpath)
    async with session.wait_for(page.LoadEventFired):
        await session.execute(page.navigate(url=urlpath))

    (_, exc) = await session.execute(runtime.evaluate(reduction_code))
    if exc is not None:
        # Stay best-effort (the page is still returned), but do not hide a
        # failing script: its details used to be discarded silently.
        logger.warning('Evaluating reduction code failed for %s: %s',
                       urlpath, exc)

    root_id = (await session.execute(dom.get_document())).node_id
    main_id = await session.execute(dom.query_selector(root_id, '#main'))
    return await session.execute(dom.get_outer_html(main_id))
예제 #5
0
async def merge_pages_in(ethox_doc, ws_addr, reduction_code):
    """Merge the ``#main`` section of every doc page into one page and print it.

    Every ``*.html`` file below ``ethox_doc`` is passed through
    ``convert_page`` and its returned HTML collected. ``index.html`` is then
    loaded again and, for each collected page, its ``#main`` node is copied
    and the copy's outer HTML replaced with the collected HTML. Finally
    ``print_page`` is called with ``'cargo_doc.pdf'``.

    Args:
        ethox_doc: Root directory of the generated HTML documentation.
        ws_addr: Address passed to ``open_cdp_connection``.
        reduction_code: Script source forwarded to ``convert_page``.
    """
    async with open_cdp_connection(ws_addr) as conn:
        logger.info('Listing targets')
        targets = await conn.execute(target.get_targets())
        # The first listed target is used unconditionally.
        target_id = targets[0].target_id

        logger.info('Attaching to target id=%s', target_id)
        session = await conn.open_session(target_id)

        logger.info('Setting device emulation')
        await session.execute(
            emulation.set_device_metrics_override(width=800,
                                                  height=600,
                                                  device_scale_factor=1,
                                                  mobile=False))

        logger.info('Enabling page events')
        await session.execute(page.enable())

        logger.info('Starting to crawl documentation')

        # convert_page returns a single HTML value, so store it directly,
        # keyed by the file it came from (unpacking it into two names would
        # fail). Insertion order of the dict is the crawl order.
        contents = {}
        pattern = os.path.join(ethox_doc, '**', '*.html')
        for doc_page in glob.iglob(pattern, recursive=True):
            contents[doc_page] = await convert_page(session, doc_page,
                                                    reduction_code)

        # Reload the index page; it is the document the copies go into.
        await convert_page(session, os.path.join(ethox_doc, 'index.html'),
                           reduction_code)
        root_id = (await session.execute(dom.get_document())).node_id
        body_id = await session.execute(dom.query_selector(root_id, 'body'))
        footer_id = await session.execute(dom.query_selector(
            root_id, 'footer'))
        main_id = await session.execute(dom.query_selector(root_id, '#main'))

        # Only the HTML values are needed here. Using a separate loop name
        # avoids rebinding ``contents`` while it is being iterated.
        for main_contents in contents.values():
            # presumably copies #main into body, in front of the footer --
            # confirm against the DOM.copyTo argument order.
            cloned_id = await session.execute(
                dom.copy_to(main_id, body_id, footer_id))
            await session.execute(dom.set_outer_html(cloned_id,
                                                     main_contents))

        await print_page(session, 'cargo_doc.pdf')
예제 #6
0
async def main():
    """Load a URL in the browser's first target and print its title markup.

    ``sys.argv[1]`` is the browser address and ``sys.argv[2]`` the URL to
    navigate to. The outer HTML of the node matched by the ``title`` selector
    is written to stdout.
    """
    browser_address = sys.argv[1]
    logger.info('Connecting to browser: %s', browser_address)
    async with open_cdp_connection(browser_address) as connection:
        logger.info('Listing targets')
        available = await connection.execute(target.get_targets())
        first_id = available[0].target_id

        logger.info('Attaching to target id=%s', first_id)
        session = await connection.open_session(first_id)

        page_url = sys.argv[2]
        logger.info('Navigating to %s', page_url)
        await session.execute(page.enable())
        await session.execute(page.navigate(page_url))
        # NOTE(review): the wait starts only after navigate() has returned;
        # confirm the load event cannot fire before this point.
        await session.wait_for(page.LoadEventFired)

        logger.info('Extracting page title')
        document = await session.execute(dom.get_document())
        title_query = dom.query_selector(document.node_id, 'title')
        title_id = await session.execute(title_query)
        title_markup = await session.execute(dom.get_outer_html(title_id))
        print(title_markup)