def wtl_thread(cli_args):
    """Build a workflow from parsed command-line arguments and run it to completion.

    Args:
        cli_args: Parsed arguments; ``config``, ``url`` and ``output`` are read from it.

    The ``policy`` passed to the workflow is looked up in the enclosing module scope.
    ``quit()`` is called even when registering the classifier or ``run()`` raises,
    so whatever ``quit()`` releases (presumably the browser session) is not left
    behind by a failed run.
    """
    workflow = wtl.Workflow(
        config=wtl.Config(cli_args.config),
        policy=policy,
        url=cli_args.url,
        output=cli_args.output,
    )
    try:
        # Active elements get a Click action and are highlighted.
        workflow.classifiers.add(wtl.ActiveElementFilter(action=wtl.actions.Click, highlight=True))
        workflow.run()
    finally:
        workflow.quit()
def test_complex():
    """Run three steps of random clicks on "menu"-scored actions and count the policy calls."""

    def menu_classifier_func(elements, _):
        # Keep anchor elements located at x < 10 and y < 200.
        selected = []
        for elem in elements:
            position = elem.location
            if position.x < 10 and position.y < 200 and elem.metadata["tag"] == "a":
                selected.append(elem)
        return selected

    @wtl.single_tab
    def policy(w, view):
        click_actions = view.actions.by_type(Click)
        candidates = click_actions.by_score("menu")
        # Count every policy invocation in the workflow metadata.
        w.metadata["clicks"] += 1
        return random.choice(candidates)

    workflow = wtl.Workflow(
        url=TESTURL,
        config=Config.default(["headless", "desktop"]),
        policy=policy,
        goal=wtl.goals.N_STEPS(3),
    )
    workflow.metadata["clicks"] = 0

    active_filter = wtl.ActiveElementFilter(action=Click)
    workflow.classifiers.add(active_filter)

    menu_classifier = wtl.ElementClassifier(
        name="menu",
        action=Click,
        subset="is_active",
        highlight=True,
        callback=menu_classifier_func,
    )
    workflow.classifiers.add(menu_classifier)

    workflow.run()

    assert workflow.success
    assert workflow.metadata["clicks"] == 3

    workflow.quit()
def test_abort():
    """A policy that answers with Abort must leave the workflow unsuccessful."""

    def abort_policy(*_, **__):
        # Abort the single tab no matter what arguments the workflow passes in.
        return {wtl.Workflow.SINGLE_TAB: Abort()}

    headless = Config.default(["headless"])
    workflow = wtl.Workflow(url=TESTURL, config=headless, policy=abort_policy, goal=wtl.goals.N_STEPS(3))

    workflow.run()

    assert not workflow.success

    workflow.quit()
def test_multiple():
    """Drive four tabs in two windows and check the number of distinct URLs after each step."""
    config = Config.default(["headless"])

    def page_urls(views):
        # URL of every view, in the iteration order of the views mapping.
        return [v.snapshot.page_metadata["url"] for v in views.values()]

    def click_nth_link(n):
        return wtl.actions.Click(wtl.Selector(f".sidenav a:nth-of-type({n})"))

    @wtl.multi_tab_coroutine
    def policy():
        yield

        # Step 1: no actions; every tab is expected to show the same URL.
        _, views = yield {}
        current = page_urls(views)
        print(current)
        assert len(set(current)) == 1

        # Step 2: click in tabs "1" and "3" only; expect three distinct URLs.
        _, views = yield {
            "1": click_nth_link(2),
            "2": None,
            "3": click_nth_link(3),
            "4": None,
        }
        current = page_urls(views)
        print(current)
        assert len(set(current)) == 3

        # Step 3: click in tab "4"; expect four distinct URLs.
        _, views = yield {"4": click_nth_link(4)}
        assert len(set(page_urls(views))) == 4

        yield {}

    start_urls = {
        "A": {"1": TESTURL, "2": TESTURL},
        "C": {"3": TESTURL, "4": TESTURL},
    }
    workflow = wtl.Workflow(url=start_urls, config=config, policy=policy)

    workflow.run()

    assert workflow.loop_idx == 4

    workflow.quit()
def test_simple(browser):
    """Open a single tab, only wait for three steps, then verify a second run after quit raises."""

    def wait_policy(*_, **__):
        # Do nothing but wait, regardless of the arguments received.
        return {wtl.Workflow.SINGLE_TAB: Wait(1)}

    options = ["headless", browser]
    workflow = wtl.Workflow(
        url=TESTURL,
        config=Config.default(options),
        policy=wait_policy,
        goal=wtl.goals.N_STEPS(3),
    )

    workflow.run()
    assert workflow.success
    workflow.quit()

    # Running again after quit() is expected to fail with Error.
    with pytest.raises(Error):
        workflow.run()
def test_mhtml_export():
    """Run a workflow with MHTML saving enabled and check that page.mhtml files are written.

    The workflow runs inside an ``Xvfb`` context and writes to ``./mhtml/``. The test
    expects a non-empty ``page.mhtml`` under both the ``0`` and ``1`` subdirectories.

    The output directory is removed in a ``finally`` clause, so a failing run or a
    failing assertion does not leave files behind that could satisfy the assertions
    of a later run.
    """
    output_dir = Path("./mhtml/")

    try:
        with Xvfb():
            workflow = wtl.Workflow(
                url=TESTURL,
                policy=test_policy,
                config=wtl.Config([
                    "default",
                    "browser.enable_mhtml=True",
                    "scraping.save_mhtml=True",
                    "debug.save=True",
                ]),
                output=output_dir,
            )
            workflow.run()

        for step in ("0", "1"):
            page = output_dir / step / "page.mhtml"
            assert page.exists()
            # The file must also have content, not merely exist.
            assert page.stat().st_size
    finally:
        # Only remove what exists, so a run that failed before writing anything
        # does not replace its own error with a FileNotFoundError.
        if output_dir.exists():
            shutil.rmtree(output_dir)
def _start_btn(elements, _):
    """Return the elements with id "sync-task-cover" whose display value contains "block"."""
    matches = []
    for element in elements:
        meta = element.metadata
        if meta["id"] == "sync-task-cover" and "block" in meta["display"]:
            matches.append(element)
    return matches


def _tile_div(elements, _):
    """Return the span elements whose id starts with "ttt" and whose parent tag is a div."""
    matches = []
    for element in elements:
        meta = element.metadata
        if meta["tag"] != "span":
            continue
        if not meta["id"].startswith("ttt"):
            continue
        if element.tag.parent.name == "div":
            matches.append(element)
    return matches


if __name__ == "__main__":
    cli_args = parse_cli_args()

    config = wtl.Config(cli_args.config)
    workflow = wtl.Workflow(
        config=config,
        policy=policy,
        url="https://stanfordnlp.github.io/miniwob-plusplus/html/miniwob/tic-tac-toe.html",
        output=cli_args.output,
    )

    # One unnamed filter plus two named, clickable ones backed by the callbacks above.
    filters = (
        wtl.ActiveElementFilter(),
        wtl.ActiveElementFilter(name="start", callback=_start_btn, action=Click),
        wtl.ActiveElementFilter(name="tile", callback=_tile_div, action=Click),
    )
    for element_filter in filters:
        workflow.classifiers.add(element_filter)

    workflow.run()
    workflow.quit()
# After seven deletions, start over from step 3 if workflow.loop_idx == 7: return wtl.actions.Revert(3) # Randomly pick one of the deleting actions return [ random.choice(view.actions.by_type(wtl.actions.Remove)), wtl.actions.Wait(0.25), wtl.actions.Clear(viewport=False), wtl.actions.WaitForUser(), ] if __name__ == "__main__": cli_args = parse_cli_args() wf = wtl.Workflow(config=wtl.Config(cli_args.config), policy=policy, url=cli_args.url, output=cli_args.output) wf.classifiers.add( wtl.ElementClassifier( name="dementor", enabled=True, highlight=False, action=wtl.actions.Remove, callback=lambda e, _: e, # Will label _all_ elements removable ) ) wf.run() wf.quit()
# For now, we consider all input fields where the type attribute has a specific value. return [ e for e in elements if e.metadata["tag"] == "input" and e.metadata["type"] in ("text", "email", "password") ] if __name__ == "__main__": cli_args = parse_cli_args() workflow = wtl.Workflow( config=wtl.Config(cli_args.config), policy=policy, goal=goal, url="https://www.getharvest.com/signup", output=cli_args.output, ) # We just need a text field classifier, no need to consider what's active (all of them should be). workflow.classifiers.add( wtl.ElementClassifier(name="textfield", action=FillText, callback=text_field_classifier_func, highlight=True)) workflow.run() workflow.quit() print("Workflow successful?", workflow.success)
wtl.actions.Navigate(search_url), Click(search_results[i + 1]) ] i += 1 except IndexError: print("Search result exhausted!!") break yield None if __name__ == "__main__": cli_args = parse_cli_args() wf = wtl.Workflow( config=wtl.Config(cli_args.config), policy=policy, url="https://en.wikipedia.org/wiki/Special:Random", output=cli_args.output, ) wf.classifiers.add(wtl.ActiveElementFilter(action=Click)) wf.classifiers.add( wtl.ElementClassifier(name="textfield", action=wtl.actions.FillText, highlight=True)) wf.run() wf.quit()
def policy(_, view: wtl.View) -> Dict[wtl.View, wtl.Action]:
    """Choose one random Click action for each view.

    ``view`` is iterated with ``.values()``, so it is used as a mapping of views here.
    Returns a dict that maps each of those views to the action chosen for it.
    """
    chosen = {}
    for v in view.values():
        clicks = v.actions.by_type(wtl.actions.Click)
        chosen[v] = choice(clicks)
    return chosen


if __name__ == "__main__":
    cli_args = parse_cli_args()

    config = wtl.Config(cli_args.config)
    # Two groups of start pages: "first" holds A and B, "second" holds C.
    start_urls = {
        "first": {
            "A": "www.uppsalahandkraft.se",
            "B": "https://www.uppsalamodemassa.se",
        },
        "second": {
            "C": "shop.biskopsgarden.com",
        },
    }
    workflow = wtl.Workflow(config=config, policy=policy, url=start_urls, output=cli_args.output)

    click_filter = wtl.ActiveElementFilter(action=wtl.actions.Click)
    workflow.classifiers.add(click_filter)

    workflow.run()
    workflow.quit()
def test_workflow(browser):
    """A workflow for the given browser can be constructed on a blank page and is truthy."""
    options = ["headless", f"browser.browser={browser}"]
    workflow = wtl.Workflow(
        url="about:blank",
        config=wtl.Config.default(options),
        policy=wtl.policies.DUMMY,
    )

    assert workflow
import webtraversallibrary as wtl
from webtraversallibrary.actions import Clear, Click, Highlight

from .util import parse_cli_args, start_server


@wtl.single_tab_coroutine
def policy():
    """Highlight the first five title links one at a time, then click the first one."""
    yield

    for index in range(1, 6):
        title_link = wtl.Selector(f"h2:nth-of-type({index}) > a")
        # Clear first, then highlight the current title.
        yield [Clear(), Highlight(target=title_link)]

    yield Click(wtl.Selector("h2:nth-of-type(1) > a"))
    # Once the generator is exhausted, the workflow interprets StopIteration
    # as cancelling the tabs.


if __name__ == "__main__":
    cli_args = parse_cli_args()

    config = wtl.Config(cli_args.config)
    blog_url = start_server() + "/blog"
    workflow = wtl.Workflow(config=config, policy=policy, url=blog_url, output=cli_args.output)

    workflow.run()
    workflow.quit()