def click_and_wait(self, selector: str, click_count=1, timeout=30000) -> None:
    """
    Clicks the element which matches selector and waits for navigation.

    Note: It is recommended to use the crawler's crawl method to follow links instead.

    :param selector: an element selector
    :param click_count: the number of clicks, defaults to 1
    :param timeout: the maximum time to wait for navigation (in milliseconds), defaults to 30000
    :raise CrawlerNotRunningError: if the crawler is not running
    :raise NoSuchElementError: if there is no element matching selector
    :raise NavigationTimeoutError: if the timeout is exceeded
    """
    self._check_if_crawler_running()

    try:
        syncer.sync(
            asyncio.gather(
                self._page.waitForNavigation(options={'timeout': timeout}),
                self._page.click(selector, options={'clickCount': click_count})))
    except PageError:
        raise NoSuchElementError(selector)
    except pyppeteer.errors.TimeoutError:
        raise NavigationTimeoutError(timeout)
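# Hypothetical usage sketch for click_and_wait above; `crawler` stands in for
# a started crawler instance and 'a.next-page' for a real link selector on
# the current page (both are assumptions, not part of the source).
crawler.click_and_wait('a.next-page', click_count=1, timeout=10000)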
def tearDown(self):
    if self.proc.returncode is None:
        self.proc.terminate()
        self.proc.wait()
        self.proc.poll()
    sync(self.wait(times=10))
    super().tearDown()
def wait_for_selector(self, selector: str, visible: bool = False,
                      hidden: bool = False, timeout: int = 30000) -> None:
    """
    Waits until an element which matches selector appears on the page.

    :param selector: an element selector
    :param visible: wait for the element to be present in the DOM and visible,
        i.e. to not have display: none or visibility: hidden CSS properties,
        defaults to False
    :param hidden: wait for the element to not be found in the DOM or to be
        hidden, i.e. to have display: none or visibility: hidden CSS
        properties, defaults to False
    :param timeout: maximum time to wait for (in milliseconds), defaults to 30000
    :raise CrawlerNotRunningError: if the crawler is not running
    :raise WaitTimeoutError: if the timeout is exceeded
    """
    self._check_if_crawler_running()

    # The WaitTask class is not compatible with syncer; this async wrapper is a workaround.
    async def wait_for_selector() -> None:
        await self._page.waitForSelector(selector, {
            'visible': visible,
            'hidden': hidden,
            'timeout': timeout
        })

    try:
        syncer.sync(wait_for_selector())
    except pyppeteer.errors.TimeoutError:
        raise WaitTimeoutError(timeout, selector)
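# Hypothetical usage of wait_for_selector above: block until a dynamically
# rendered element is present and visible, with a 5-second cap. The selector
# and the `crawler` instance are illustrative assumptions.
crawler.wait_for_selector('#results .row', visible=True, timeout=5000)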
def _run(self) -> None:
    while not self._stop_initiated and self._crawl_frontier.has_next_request():
        self._aborted_request = False
        self._next_request = self._crawl_frontier.get_next_request()

        # Send a HEAD request first
        self._send_head_request = True
        try:
            syncer.sync(self._page.goto(self._next_request.url))
        except PageError as error:
            # Ignore exceptions that are caused by aborted requests
            if self._aborted_request:
                # The request was redirected; create a new crawl request for it
                self._handle_redirect(self._next_request, self._last_request,
                                      self._last_response)
                continue
            else:
                raise error

        # Send a GET request
        self._send_head_request = False
        self._handle_response(
            self._next_request,
            syncer.sync(self._page.goto(self._next_request.url)))
def main():
    """Main entry point of the app"""
    print("Starting Now")
    cfg = Config("config-file.txt")
    sourcedirList = cfg.sourceDirectories
    targetdir = cfg.targetDirectory

    # Find the volume the target directory lives on, e.g. /Volumes/<drive>.
    # `drive` is initialized here so the mount check below cannot hit a
    # NameError when the path contains no "Volumes" component.
    drive = None
    path = os.path.normpath(targetdir)
    folder_list = iter(path.split(os.sep))
    for folder in folder_list:
        if folder == "Volumes":
            drive = "/Volumes/" + next(folder_list)
            break

    if drive is not None and os.path.exists(drive):
        for sourcedir in sourcedirList:
            if os.path.isdir(sourcedir):
                tail = getPathTail(sourcedir)
                newtargetdir = os.path.join(targetdir, tail)
                # print(newtargetdir)
                if not os.path.exists(newtargetdir):
                    os.makedirs(newtargetdir)
                options = {"verbose": True}
                sync(sourcedir, newtargetdir, "sync", **options)
    else:
        print("Target disk is not mounted.")
    print("All Done!")
def test_wrap_async(self):
    async def a():
        return 1

    b = sync(a)
    self.assertEqual(b(), 1)
    self.assertEqual(sync(a()), 1)
def tearDown(self):
    if os.path.exists(self.tmp):
        os.remove(self.tmp)
    if self.proc.returncode is None:
        self.proc.terminate()
    sync(self.wait(times=10))
    super().tearDown()
def test_func_error(self):
    def a():
        return 1

    with self.assertRaises(TypeError):
        sync(a)
    with self.assertRaises(TypeError):
        sync(a())
def test_wrap_aioco(self):
    @asyncio.coroutine
    def a():
        yield from asyncio.sleep(0)
        return 1

    b = sync(a)
    self.assertEqual(b(), 1)
    self.assertEqual(sync(a()), 1)
def setUp(self):
    self.doc = get_document()
    self.doc.body.appendChild(self.app())
    self.server = start_server(port=0)
    self.addr = server_config['address']
    self.port = server_config['port']
    self.url = 'http://{}:{}/'.format(self.addr, self.port)
    self.page = page
    sync(self.page.goto(self.url))
def spider_closed(self, spider):
    """Shut down the driver when the spider is closed"""
    PrintFormatUtil.print_line("spider {} : finished processing".format(spider.name))
    if spider.crawl_type.value == 'selenium' and self.driver is not None:
        PrintFormatUtil.print_line("spider {} : destroying selenium driver".format(spider.name))
        self.driver.close()
        self.driver.quit()
    if spider.crawl_type.value == 'puppeeter' and self.driver is not None:
        PrintFormatUtil.print_line("spider {} : destroying puppeeter driver".format(spider.name))
        sync(self.driver.close())
def wait_for_timeout(self, milliseconds: int) -> None:
    """
    Waits for the given timeout.

    :param milliseconds: the time to wait for (in milliseconds)
    :raise CrawlerNotRunningError: if the crawler is not running
    """
    self._check_if_crawler_running()
    syncer.sync(self._page.waitFor(milliseconds))
def test_built_query_getter_with_exception_query(self, mocked):
    exist = self.EM()
    mocked.get(
        'http://127.0.0.1:8080/exist/apps/testapp/test1.xql?thing=bosh',
        status=200, body=ERROR_XML)
    with pytest.raises(ExistQueryExceptionError) as err:
        sync(exist.test1)(thing='bosh')
    str(err.value) | should.be.equal.to("[FAKEEXISTERRMSG]")
def delete_cookie(self, cookie: Cookie) -> None:
    """
    Deletes the given cookie.

    :param cookie: the cookie to delete
    :raise CrawlerNotRunningError: if the crawler is not running
    """
    self._check_if_crawler_running()
    syncer.sync(self._page.deleteCookie(cookie.as_dict()))
def setUp(self):
    from syncer import sync
    super().setUp()
    self.doc = get_document()
    self.root = self.get_elements()
    self.doc.body.prepend(self.root)
    self.server = server.start_server(port=0)
    self.address = server_config['address']
    self.port = server_config['port']
    self.url = 'http://{}:{}'.format(self.address, self.port)
    sync(self.page.goto(self.url))
    self.element = sync(self.get_element_handle(self.root))
async def main() -> None:
    browsers = []
    for i in range(3):
        browser = await launch(headless=False, args=['--no-sandbox'])
        # print(browser.wsEndpoint, flush=True)
        # endpoint = browser.wsEndpoint
        # browser1 = await connect(browserWSEndpoint=endpoint)
        browsers.append(browser)

    # loop = asyncio.get_event_loop()
    for browser in browsers:
        # Await the coroutine directly: syncer.sync() cannot be nested inside
        # an already-running event loop.
        await test(browser)
def parse(self, response):
    service_pic_path = os.path.join(CONST.PIC_PATH, response.meta['r_dict']['title'])
    os.makedirs(service_pic_path, exist_ok=True)
    current_time = str(int(time.time()))
    service_pic_name = os.path.join(service_pic_path, current_time + ".png")
    service_pic_small_name = os.path.join(service_pic_path, current_time + "_s.png")
    service_pic_diff_name = os.path.join(service_pic_path, current_time + "_diff.png")
    service_pic_oc_diff_name = os.path.join(service_pic_path, current_time + "_oc_diff.png")
    PrintFormatUtil.print_line("pic save path {}".format(service_pic_name))
    sync(response.meta['page'].screenshot({'path': service_pic_name, 'fullPage': True}))

    # Compress the screenshot to half size
    image = Image.open(service_pic_name)
    w, h = image.size
    PrintFormatUtil.print_line("original image size: width {} height {}".format(w, h))
    d_img = image.resize((int(w / 2), int(h / 2)), Image.ANTIALIAS)
    w, h = d_img.size
    PrintFormatUtil.print_line("resized image size: width {} height {}".format(w, h))
    d_img.save(service_pic_small_name, quality=95)
    del response, image

    # Read the 'latest' file
    latest_path = os.path.join(service_pic_path, 'latest')
    if os.path.exists(latest_path) and os.path.isfile(latest_path):
        with open(latest_path, 'r') as f:
            old_file_info = f.read()
        old_file_info = old_file_info.split(" ")
        old_file_info_name = old_file_info[0]
        old_file_info_md5 = old_file_info[1]
        old_service_pic_name = os.path.join(service_pic_path, old_file_info_name)
        PrintFormatUtil.print_line("old pic path {}".format(old_service_pic_name))
        if old_file_info_md5 == FileUtil.get_md5(old_service_pic_name):
            PrintFormatUtil.print_line("comparing images {} | {}".format(service_pic_small_name, old_file_info_name))
            # Compare the images (both PIL and OpenCV modes)
            iss = ImageSSIM(service_pic_small_name, old_service_pic_name, service_pic_diff_name)
            o_iss = OpenCVSSIM(service_pic_small_name, old_service_pic_name, service_pic_oc_diff_name)
            pil_s_code = iss.compare_images()
            oc_s_code = o_iss.compare_images()
            PrintFormatUtil.print_line("PIL similarity: {}".format(pil_s_code))
            PrintFormatUtil.print_line("OpenCV similarity: {}".format(oc_s_code))
            # This threshold is configurable (0-1); 1 is the strictest
            if pil_s_code < 1 and oc_s_code < 1:
                iss.output_diff()
                o_iss.output_diff()
        else:
            PrintFormatUtil.print_line("old pic md5 error. new {} old {}".format(
                FileUtil.get_md5(old_service_pic_name), old_file_info_md5))

    # Regenerate the 'latest' file
    with open(latest_path, "w") as file:
        file.write(os.path.basename(service_pic_small_name) + " " +
                   FileUtil.get_md5(service_pic_small_name))
def get_pages(self) -> List[BrowserPage]:
    """
    Returns a list of pages in the browser.

    :return: a list of pages
    :raise CrawlerNotRunningError: if the crawler is not running
    """
    self._check_if_crawler_running()

    return [
        BrowserPage(index, page.url, syncer.sync(page.title()))
        for index, page in enumerate(syncer.sync(self._browser.pages()))
    ]
def set_cookie(self, cookie: Cookie) -> None:
    """
    Sets a cookie.

    :param cookie: the cookie to set
    :raise CrawlerNotRunningError: if the crawler is not running
    :raise ValueError: if the cookie cannot be set due to an invalid page URL
    """
    self._check_if_crawler_running()

    try:
        syncer.sync(self._page.setCookie(cookie.as_dict()))
    except PageError as error:
        raise ValueError(error)
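# Hypothetical round trip using the set_cookie/delete_cookie pair above. The
# Cookie constructor arguments are assumptions for illustration only; the
# class definition is not shown in this source.
session_cookie = Cookie(name='sessionid', value='abc123', domain='example.com')
crawler.set_cookie(session_cookie)
# ... crawl pages that require the session ...
crawler.delete_cookie(session_cookie)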
def test_wrap_async_args(self):
    async def a(b, c, d=2):
        return d

    b = sync(a)
    self.assertEqual(b(1, 2), 2)
    self.assertEqual(b(1, 2, 3), 3)
def iter_file_user_trade_list(user=0):
    if not user:
        from syncer import sync
        sync_get_user = U.get_or_set('sync_get_user', sync(taobao_trade.get_user))
        user = sync_get_user()
        print('sync_get_user :', user)
    v = []
    # NOTE: the file-iteration call here was redacted in the source;
    # F.iter_files is a hypothetical stand-in for whatever helper listed the
    # files under C:\test\taobao_trade.
    for f in F.iter_files(r'C:\test\taobao_trade'):
        if user not in f:
            continue
        b = F.read_byte(f)
        d = T.json_loads(b)
        assert d['user'] == user
        for n, t in enumerate(d['ts']):
            # '查看物流' is the "view logistics" link text on the Taobao page
            e = T.xpath(t, "//a[contains(., '查看物流')]")
            if len(e) != 1:
                t = T.html2text(t)
                t = T.replace_all_space(t)[:99]
                u = py.No(t)
            else:
                e = e[0]
                u = convert_wuliu_url(e.attrib['href'])
            row = [d['user'], d['n'], n, u]
            v.append(row)
    if not v:
        return py.No('cannot find user: %s trade_list' % user)
    U.set(user, v)
    return F.dill_dump(obj=v, file=f'{user}-{len(v)}-trade_list')
def setUpClass(cls):
    cls.port = get_free_port()
    time.sleep(0.1)
    cls.app = get_application()
    cls.server = cls.app.listen(cls.port)
    cls.browser = launch()
    cls.page = sync(cls.browser.newPage())
def test_wrap_method(self):
    class A:
        async def a(self):
            return 1

    a = A()
    sync_a = sync(a.a)
    self.assertEqual(sync_a(), 1)
def test_wrap_method_args(self):
    class A:
        async def a(self, b, c, d=2):
            return d

    a = A()
    sync_a = sync(a.a)
    self.assertEqual(sync_a(1, 2), 2)
    self.assertEqual(sync_a(1, 2, 3), 3)
def setUp(self):
    super().setUp()
    self.port = free_port()
    sync(self.wait(times=3))
    cmd = [sys.executable, '-c', script, '--port', str(self.port)] + self.cmd
    self.url = 'http://localhost:{}'.format(self.port)
    self.ws_url = 'ws://localhost:{}/wdom_ws'.format(self.port)
    self.proc = subprocess.Popen(
        cmd,
        cwd=root,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
        bufsize=0,
    )
    sync(self.wait(times=10))
def close_page(self, page: BrowserPage) -> None:
    """
    Closes the given page.

    Note: There must be at least one open page.

    :param page: the page to close
    :raise CrawlerNotRunningError: if the crawler is not running
    :raise ValueError: if the given page is the last one open
    """
    self._check_if_crawler_running()

    pages = syncer.sync(self._browser.pages())
    if len(pages) == 1:
        raise ValueError('Cannot close the last page')

    syncer.sync(pages[page.index].close())
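# Hypothetical usage of get_pages/close_page above: list the open pages and
# close the most recently opened one. Indices come from a fresh get_pages()
# call, since closing a page shifts the indices of the pages after it.
pages = crawler.get_pages()
if len(pages) > 1:
    crawler.close_page(pages[-1])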
def send_text_sync(self, channel, text, public=False, team=False, topic_name=None):
    return syncer.sync(
        self.send_text(channel, text, public, team, topic_name))
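# Hypothetical call of the synchronous facade above; `client` stands in for
# whatever object mixes in send_text/send_text_sync.
client.send_text_sync('general', 'deployment finished', public=True)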
def click(self, selector: str, click_count=1) -> None:
    """
    Clicks the element which matches selector.

    :param selector: an element selector
    :param click_count: the number of clicks, defaults to 1
    :raise CrawlerNotRunningError: if the crawler is not running
    :raise NoSuchElementError: if there is no element matching selector
    """
    self._check_if_crawler_running()

    try:
        syncer.sync(
            self._page.click(selector, options={'clickCount': click_count}))
    except PageError:
        raise NoSuchElementError(selector)
def test_wrap_aioco_args(self):
    @asyncio.coroutine
    def a(b, c, d=2):
        yield from asyncio.sleep(0)
        return d

    b = sync(a)
    self.assertEqual(b(1, 2), 2)
    self.assertEqual(b(1, 2, 3), 3)
def run(*args, **kwargs):
    try:
        return sync(f(*args, **kwargs))
    except RuntimeError:
        # syncer.sync() raises RuntimeError when the current thread has no
        # usable event loop; fall back to a private one.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        res = loop.run_until_complete(f(*args, **kwargs))
        loop.close()
        return res
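# A minimal sketch of the context a wrapper like `run` above typically lives
# in: a decorator (hypothetically named sync_compatible here) that makes an
# async function callable synchronously, with the same event-loop fallback.
# All names besides asyncio/syncer are illustrative assumptions.
import asyncio
from syncer import sync

def sync_compatible(f):
    def run(*args, **kwargs):
        try:
            return sync(f(*args, **kwargs))
        except RuntimeError:
            # No usable loop in this thread: create, use, and dispose of one.
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            try:
                return loop.run_until_complete(f(*args, **kwargs))
            finally:
                loop.close()
    return run

@sync_compatible
async def double(x):
    await asyncio.sleep(0)
    return x * 2

assert double(21) == 42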
def get_text(self) -> str:
    """
    Returns the text content of this element.

    :return: the text content of this element
    """
    return syncer.sync(
        self._element_handle.executionContext.evaluate(
            'element => element.textContent', self._element_handle))
def setUp(self):
    super().setUp()
    self.port = free_port()
    env = os.environ.copy()
    env['PYTHONPATH'] = root
    _ = tempfile.NamedTemporaryFile(mode='w+', suffix='.py', delete=False)
    with _ as f:
        self.tmp = f.name
        f.write(script)
    cmd = [sys.executable, self.tmp, '--port', str(self.port)] + self.cmd
    self.url = 'http://localhost:{}'.format(self.port)
    self.ws_url = 'ws://localhost:{}/rimo_ws'.format(self.port)
    self.proc = subprocess.Popen(
        cmd,
        cwd=curdir,
        env=env,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
    )
    sync(self.wait(times=10))
def setUp(self):
    super().setUp()
    sync(self.page.goto(self.url + 'empty'))
    self.result = False
def tearDown(self):
    sync(self.page.close())
def tearDown(self):
    # exists must be called: the bare bound method is always truthy.
    if self.target_path.exists():
        self.target_path.unlink()
    sync(self.browser.close())
def setUp(self):
    self.browser = sync(launch(args=['--no-sandbox']))
    self.target_path = Path(__file__).resolve().parent / 'test.pdf'
    if self.target_path.exists():
        self.target_path.unlink()
def setUp(self):
    self.page = sync(self.browser.newPage())
    self.result = False
def tearDownClass(cls):
    sync(cls.browser.close())
    cls.server.stop()
def setUpClass(cls):
    cls.port = get_free_port()
    cls.app = get_application()
    cls.server = cls.app.listen(cls.port)
    cls.browser = sync(launch(DEFAULT_OPTIONS))
    cls.url = 'http://localhost:{}/'.format(cls.port)
def test_future(self):
    f = asyncio.Future()
    f.set_result(1)
    self.assertEqual(sync(f), 1)
def setUp(self) -> None:
    super().setUp()
    self.start()
    sync(self.wait())
    self.ws_url = 'ws://localhost:{}/rimo_ws'.format(self.port)
    self.ws = sync(self.ws_connect(self.ws_url))