async def convert_page(session, path, reduction_code):
    """Load a local file in the browser, run a script on it, and extract HTML.

    Parameters
    ----------
    session
        CDP session object used to execute commands and wait for events.
    path: str
        filesystem path of the page; it is made absolute and prefixed
        with ``file://`` to build the URL that is navigated to
    reduction_code: str
        script source handed to ``runtime.evaluate`` after the page has
        loaded (presumably JavaScript that trims the DOM -- confirm with
        the caller)

    Returns
    -------
    tuple
        ``(target_item, content_html)``: the outer HTML of the first ``h1``
        found under ``#main``, and the outer HTML of ``#main`` itself.
    """
    urlpath = 'file://' + os.path.abspath(path)
    logger.info('Navigating to %s', urlpath)
    # Register for the load event before navigating so it cannot be missed.
    async with session.wait_for(page.LoadEventFired):
        await session.execute(page.navigate(url=urlpath))

    _, exc = await session.execute(runtime.evaluate(reduction_code))
    if exc is not None:
        # The evaluation reported exception details. Extraction still
        # proceeds (best effort), but the failure is no longer silent.
        logger.warning('Script evaluation failed on %s: %s', urlpath, exc)

    root_id = (await session.execute(dom.get_document())).node_id
    main_id = await session.execute(dom.query_selector(root_id, '#main'))
    target_item_id = await session.execute(dom.query_selector(main_id, 'h1'))
    target_item = await session.execute(dom.get_outer_html(target_item_id))
    content_html = await session.execute(dom.get_outer_html(main_id))
    return (target_item, content_html)
async def read_page(browserurl, targeturl):
    """
    read a page and get the title

    originally
    https://github.com/HyperionGray/trio-chrome-devtools-protocol/blob/master/examples/get_title.py

    Parameters

    browserurl: str
        address of the browser's CDP endpoint to connect to
    targeturl: str
        url of the page to load

    Returns

    the outer HTML of the page's ``title`` element, as returned by
    ``dom.get_outer_html``
    """
    logger.info("Connecting to browser: %s", browserurl)
    async with open_cdp(browserurl) as conn:
        logger.info("Listing targets")
        targets = await conn.execute(target.get_targets())
        # The first listed target is used unconditionally.
        target_id = targets[0].target_id

        logger.info("Attaching to target id=%s", target_id)
        session = await conn.open_session(target_id)

        logger.info("Navigating to %s", targeturl)
        await session.execute(page.enable())
        # Register for the load event before navigating so it cannot be missed.
        async with session.wait_for(page.LoadEventFired):
            await session.execute(page.navigate(targeturl))

        # Diagnostic only: reports the type of the second element of the
        # print_to_pdf result.
        _, item = await session.execute(page.print_to_pdf())
        print(f"pdf is: {type(item)}")

        logger.info("Extracting page title")
        root_node = await session.execute(dom.get_document())
        title_node_id = await session.execute(
            dom.query_selector(root_node.node_id, "title")
        )
        html = await session.execute(dom.get_outer_html(title_node_id))
        # Previously the title markup was computed and then discarded.
        return html
async def main():
    """Print the outer HTML of the ``title`` element of a page.

    Reads the browser address from ``sys.argv[1]`` and the URL to load from
    ``sys.argv[2]``. The page is loaded in the first target that is of type
    ``'page'``, is not a ``devtools://`` page, and is not already attached.

    Raises
    ------
    RuntimeError
        if the browser reports no target satisfying those conditions.
    """
    logger.info('Connecting to browser: %s', sys.argv[1])
    async with open_cdp_connection(sys.argv[1]) as conn:
        logger.info('Listing targets')
        targets = await conn.execute(target.get_targets())

        # Pick the first usable page target. A default of None avoids an
        # unbound local when nothing matches.
        target_id = next(
            (
                t.target_id
                for t in targets
                if t.type == 'page'
                and not t.url.startswith('devtools://')
                and not t.attached
            ),
            None,
        )
        if target_id is None:
            raise RuntimeError(
                'No unattached page target found in browser at %s'
                % sys.argv[1]
            )

        logger.info('Attaching to target id=%s', target_id)
        session = await conn.open_session(target_id)

        logger.info('Navigating to %s', sys.argv[2])
        await session.execute(page.enable())
        # Register for the load event before navigating so it cannot be missed.
        async with session.wait_for(page.LoadEventFired):
            await session.execute(page.navigate(sys.argv[2]))

        logger.info('Extracting page title')
        root_node = await session.execute(dom.get_document())
        title_node_id = await session.execute(
            dom.query_selector(root_node.node_id, 'title'))
        html = await session.execute(dom.get_outer_html(title_node_id))
        print(html)
async def save_pdf(browserurl, targeturl, pdfpath, sleeptime):
    """
    make a pdf from a webpage

    originally
    https://github.com/HyperionGray/trio-chrome-devtools-protocol/blob/master/examples/screenshot.py

    Parameters

    browserurl: str
        ws address for chrome developer protocol commands
    targeturl: str
        url of page to print to pdf
    pdfpath: str
        filename the pdf is written to
    sleeptime: float
        seconds to wait after the load event before reading the page,
        to give the page time to finish rendering
    """
    logger.info("Connecting to browser: %s", browserurl)
    async with open_cdp(browserurl) as conn:
        logger.info("Listing targets")
        targets = await conn.execute(target.get_targets())
        # The first listed target is used unconditionally.
        target_id = targets[0].target_id

        logger.info("Attaching to target id=%s", target_id)
        async with conn.open_session(target_id) as session:
            logger.info("Setting device emulation")
            await session.execute(
                emulation.set_device_metrics_override(
                    width=1200, height=2000, device_scale_factor=1, mobile=False
                )
            )

            logger.info("Enabling page events")
            await session.execute(page.enable())

            logger.info("Navigating to %s", targeturl)
            # Register for the load event before navigating so it cannot
            # be missed.
            async with session.wait_for(page.LoadEventFired):
                await session.execute(page.navigate(url=targeturl))

            # Yield to the event loop while waiting. A blocking
            # time.sleep() here would stall every other trio task,
            # including the one servicing the browser connection.
            await trio.sleep(sleeptime)

            root_node = await session.execute(dom.get_document())
            body_node_id = await session.execute(
                dom.query_selector(root_node.node_id, "body")
            )
            body_html = await session.execute(dom.get_outer_html(body_node_id))
            logger.debug(body_html)

            logger.info("Saving a pdf")
            # TODO: make sure that javascript finishes rendering
            pdf_data, _ = await session.execute(page.print_to_pdf())

            # pdf_data is base64 text; decode it to raw bytes before writing.
            async with await trio.open_file(pdfpath, "wb") as pdf_file:
                await pdf_file.write(b64decode(pdf_data))
            logger.info("wrote %s", pdfpath)
async def main():
    """Load a page in the browser's first target and print its title markup.

    ``sys.argv[1]`` is the browser address to connect to and ``sys.argv[2]``
    is the URL to navigate to. The outer HTML of the ``title`` element is
    written to standard output.
    """
    browser_url = sys.argv[1]
    logger.info("Connecting to browser: %s", browser_url)

    async with open_cdp_connection(browser_url) as connection:
        logger.info("Listing targets")
        available = await connection.execute(target.get_targets())
        first_target_id = available[0].target_id

        logger.info("Attaching to target id=%s", first_target_id)
        cdp_session = await connection.open_session(first_target_id)

        page_url = sys.argv[2]
        logger.info("Navigating to %s", page_url)
        await cdp_session.execute(page.enable())
        await cdp_session.execute(page.navigate(page_url))
        # NOTE(review): the wait starts only after navigation was issued;
        # if the load event can fire before this call it may be missed --
        # confirm against the installed library's wait_for semantics.
        await cdp_session.wait_for(page.LoadEventFired)

        logger.info("Extracting page title")
        document = await cdp_session.execute(dom.get_document())
        title_query = dom.query_selector(document.node_id, "title")
        title_id = await cdp_session.execute(title_query)
        title_markup = await cdp_session.execute(dom.get_outer_html(title_id))
        print(title_markup)