Beispiel #1
0
    def click_and_wait(self,
                       selector: str,
                       click_count=1,
                       timeout=30000) -> None:
        """
        Clicks the element matching selector and waits for the resulting navigation.
        Note: It's more recommended to use the crawler's crawl method to follow links.

        :param selector: an element selector
        :param click_count: the number of clicks, defaults to 1
        :param timeout: the maximum time to wait for navigation (in milliseconds), defaults to 30000
        :raise CrawlerNotRunningError: if the crawler is not running
        :raise NoSuchElementError: if there is no element matching selector
        :raise NavigationTimeoutError: if the timeout is exceeded
        """

        self._check_if_crawler_running()

        # The navigation wait and the click must run concurrently: the click
        # is what triggers the navigation the first coroutine waits for.
        navigation = self._page.waitForNavigation(options={'timeout': timeout})
        click = self._page.click(selector, options={'clickCount': click_count})
        try:
            syncer.sync(asyncio.gather(navigation, click))
        except PageError:
            raise NoSuchElementError(selector)
        except pyppeteer.errors.TimeoutError:
            raise NavigationTimeoutError(timeout)
Beispiel #2
0
 def tearDown(self):
     """Terminate the child process if still alive, then run base teardown."""
     # returncode stays None until the process has actually exited.
     if self.proc.returncode is None:
         self.proc.terminate()
     self.proc.wait()
     self.proc.poll()
     sync(self.wait(times=10))
     super().tearDown()
Beispiel #3
0
    def wait_for_selector(self,
                          selector: str,
                          visible: bool = False,
                          hidden: bool = False,
                          timeout: int = 30000) -> None:
        """
        Waits until an element matching selector appears on the page.

        :param selector: an element selector
        :param visible: wait until the element is present in the DOM and
                        visible, i.e. has neither display: none nor
                        visibility: hidden, defaults to False
        :param hidden: wait until the element is absent from the DOM or is
                       hidden via display: none / visibility: hidden,
                       defaults to False
        :param timeout: maximum time to wait for (in milliseconds), defaults to 30000
        :raise WaitTimeoutError: if the timeout is exceeded
        """

        self._check_if_crawler_running()

        options = {'visible': visible, 'hidden': hidden, 'timeout': timeout}

        # pyppeteer's WaitTask is not compatible with syncer, so wrap the call
        # in a small coroutine and drive that instead.
        async def _wait() -> None:
            await self._page.waitForSelector(selector, options)

        try:
            syncer.sync(_wait())
        except pyppeteer.errors.TimeoutError:
            raise WaitTimeoutError(timeout, selector)
Beispiel #4
0
    def _run(self) -> None:
        # Main crawl loop: keep pulling requests from the frontier until a
        # stop is requested or the frontier is exhausted.
        while not self._stop_initiated and self._crawl_frontier.has_next_request(
        ):
            # NOTE(review): _aborted_request / _send_head_request appear to be
            # read elsewhere (presumably by a request interceptor) while
            # page.goto runs — confirm before reordering these assignments.
            self._aborted_request = False
            self._next_request = self._crawl_frontier.get_next_request()

            # Send a HEAD request first
            self._send_head_request = True
            try:
                syncer.sync(self._page.goto(self._next_request.url))
            except PageError as error:
                # Ignore exceptions that are caused by aborted requests
                if self._aborted_request:
                    # Request was redirected, create a new crawl request for it
                    self._handle_redirect(self._next_request,
                                          self._last_request,
                                          self._last_response)
                    continue
                else:
                    raise error

            # Send a GET request
            self._send_head_request = False
            self._handle_response(
                self._next_request,
                syncer.sync(self._page.goto(self._next_request.url)))
Beispiel #5
0
def main():
    """Main entry point of the app.

    Reads the configured source directories and target directory, locates the
    mounted "/Volumes/<drive>" the target lives on, and syncs every source
    directory into the target.
    """
    print("Starting Now")
    cfg = Config("config-file.txt")
    sourcedirList = cfg.sourceDirectories
    targetdir = cfg.targetDirectory

    # Walk the target path components to find the "/Volumes/<drive>" prefix.
    # fix: drive was previously unbound (NameError) when the target path
    # contained no "Volumes" component.
    drive = None
    path = os.path.normpath(targetdir)
    folder_list = iter(path.split(os.sep))
    for folder in folder_list:
        if folder == "Volumes":
            drive = "/Volumes/" + next(folder_list)
            break

    if drive is not None and os.path.exists(drive):
        for sourcedir in sourcedirList:
            if os.path.isdir(sourcedir):
                tail = getPathTail(sourcedir)
                newtargetdir = os.path.join(targetdir, tail)
                # Mirror the source directory name under the target root.
                if not os.path.exists(newtargetdir):
                    os.makedirs(newtargetdir)
                sync(sourcedir, newtargetdir, "sync", verbose=True)
    else:
        print("Target disk is not mounted.")
    print("All Done!")
Beispiel #6
0
    def test_wrap_async(self):
        """sync() wraps a coroutine function and also runs a coroutine object."""
        async def coro():
            return 1

        wrapped = sync(coro)
        self.assertEqual(wrapped(), 1)
        # An already-created coroutine object is executed directly.
        self.assertEqual(sync(coro()), 1)
Beispiel #7
0
 def tearDown(self):
     """Remove the temporary script file and stop the child process."""
     if os.path.exists(self.tmp):
         os.remove(self.tmp)
     # Only terminate if the process has not already exited.
     if self.proc.returncode is None:
         self.proc.terminate()
     sync(self.wait(times=10))
     super().tearDown()
Beispiel #8
0
    def test_func_error(self):
        """sync() must reject plain (non-async) functions and their results."""
        def plain():
            return 1

        with self.assertRaises(TypeError):
            sync(plain)
        with self.assertRaises(TypeError):
            sync(plain())
Beispiel #9
0
    def test_wrap_aioco(self):
        """sync() should also support generator-based (@asyncio.coroutine) coroutines."""
        @asyncio.coroutine
        def coro():
            yield from asyncio.sleep(0)
            return 1

        wrapped = sync(coro)
        self.assertEqual(wrapped(), 1)
        self.assertEqual(sync(coro()), 1)
Beispiel #10
0
 def setUp(self):
     """Build a fresh document, start a test server, and open its URL."""
     self.doc = get_document()
     self.doc.body.appendChild(self.app())
     # port=0 lets the OS pick a free port; the chosen address/port are
     # published via server_config.
     self.server = start_server(port=0)
     self.addr = server_config['address']
     self.port = server_config['port']
     self.url = 'http://{}:{}/'.format(self.addr, self.port)
     self.page = page
     sync(self.page.goto(self.url))
Beispiel #11
0
 def spider_closed(self, spider):
     """Shutdown the driver when spider is closed.

     :param spider: the spider being closed; its crawl_type selects which
                    driver teardown path applies
     """
     PrintFormatUtil.print_line("spider {} : 结束处理".format(spider.name))
     # fix: PEP 8 — use `is not None` instead of `not ... is None`.
     if spider.crawl_type.value == 'selenium' and self.driver is not None:
         PrintFormatUtil.print_line("spider {} : selenium driver 销毁".format(spider.name))
         self.driver.close()
         self.driver.quit()
     # 'puppeeter' is the project's own spelling for the pyppeteer driver;
     # its close() is a coroutine and must be driven with sync().
     if spider.crawl_type.value == 'puppeeter' and self.driver is not None:
         PrintFormatUtil.print_line("spider {} : puppeeter driver 销毁".format(spider.name))
         sync(self.driver.close())
Beispiel #12
0
    def wait_for_timeout(self, milliseconds: int) -> None:
        """
        Blocks for the given amount of time.

        :param milliseconds: the time to wait for (in milliseconds)
        """

        self._check_if_crawler_running()

        # page.waitFor with a number simply resolves after the delay.
        syncer.sync(self._page.waitFor(milliseconds))
Beispiel #13
0
    def test_built_query_getter_with_exception_query(self, mocked):
        # An eXist-db error payload (ERROR_XML) returned with HTTP 200 must
        # surface as ExistQueryExceptionError rather than a successful result.
        exist = self.EM()
        mocked.get(
            'http://127.0.0.1:8080/exist/apps/testapp/test1.xql?thing=bosh',
            status=200,
            body=ERROR_XML)

        # sync(...) wraps the async getter; invoking it fires the request.
        with pytest.raises(ExistQueryExceptionError) as err:
            sync(exist.test1)(thing='bosh')

        # should-style assertion DSL: the message comes from the XML body.
        str(err.value) | should.be.equal.to("[FAKEEXISTERRMSG]")
Beispiel #14
0
    def delete_cookie(self, cookie: Cookie) -> None:
        """
        Deletes the given cookie from the current page.

        :param cookie: the cookie to delete
        :raise CrawlerNotRunningError: if the crawler is not running
        """

        self._check_if_crawler_running()

        # pyppeteer expects a plain dict describing the cookie.
        cookie_dict = cookie.as_dict()
        syncer.sync(self._page.deleteCookie(cookie_dict))
Beispiel #15
0
 def setUp(self):
     """Prepare the document tree, start the server, and load the page."""
     from syncer import sync
     super().setUp()
     self.doc = get_document()
     self.root = self.get_elements()
     self.doc.body.prepend(self.root)
     # port=0: let the OS choose; actual values come from server_config.
     self.server = server.start_server(port=0)
     self.address = server_config['address']
     self.port = server_config['port']
     self.url = 'http://{}:{}'.format(self.address, self.port)
     sync(self.page.goto(self.url))
     self.element = sync(self.get_element_handle(self.root))
Beispiel #16
0
async def main() -> None:
    """Launch three visible browsers, then run the test against each one."""
    browsers = [
        await launch(headless=False, args=['--no-sandbox']) for _ in range(3)
    ]

    for browser in browsers:
        sync(test(browser))
    def parse(self, response):
        """Screenshot the page, downscale it, diff it against the previously
        saved screenshot to detect visual changes, and refresh the 'latest'
        marker file. Expects response.meta to carry 'r_dict' and 'page'."""
        service_pic_path = os.path.join(CONST.PIC_PATH, response.meta['r_dict']['title'])
        os.makedirs(service_pic_path, exist_ok=True)
        current_time = str(int(time.time()))
        service_pic_name = os.path.join(service_pic_path, current_time + ".png")
        service_pic_small_name = os.path.join(service_pic_path, current_time + "_s.png")
        service_pic_diff_name = os.path.join(service_pic_path, current_time + "_diff.png")
        service_pic_oc_diff_name = os.path.join(service_pic_path, current_time + "_oc_diff.png")
        PrintFormatUtil.print_line("pic save path {}".format(service_pic_name))
        sync(response.meta['page'].screenshot({'path': service_pic_name, 'fullPage': True}))

        # Downscale the screenshot to half size for cheaper comparison.
        image = Image.open(service_pic_name)
        w,h  = image.size
        PrintFormatUtil.print_line("原有图片大小 width {} height {}".format(w,h))
        d_img = image.resize((int(w/2),int(h/2)),Image.ANTIALIAS)
        w, h = d_img.size
        PrintFormatUtil.print_line("处理后的图片大小 width {} height {}".format(w, h))
        d_img.save(service_pic_small_name, quality=95)
        del response, image

        # Read the 'latest' marker file ("<filename> <md5>" of the last run).
        latest_path = os.path.join(service_pic_path, 'latest')
        if os.path.exists(latest_path) and os.path.isfile(latest_path):
            with open(latest_path, 'r') as f:
                old_file_info = f.read()
            old_file_info = old_file_info.split(" ")
            old_file_info_name = old_file_info[0]
            old_file_info_md5 = old_file_info[1]
            old_service_pic_name = os.path.join(service_pic_path, old_file_info_name)
            PrintFormatUtil.print_line("old pic path {}".format(old_service_pic_name))
            # Only diff when the stored md5 still matches the old file on disk.
            if old_file_info_md5 == FileUtil.get_md5(old_service_pic_name):
                PrintFormatUtil.print_line("比对图片 {} | {}".format(service_pic_small_name, old_file_info_name))
                # Compare the images in both modes: PIL-based and OpenCV-based.
                iss = ImageSSIM(service_pic_small_name, old_service_pic_name, service_pic_diff_name)
                o_iss = OpenCVSSIM(service_pic_small_name, old_service_pic_name, service_pic_oc_diff_name)
                pil_s_code = iss.compare_images()
                oc_s_code = o_iss.compare_images()
                PrintFormatUtil.print_line("PIL库两者的相似度: {}".format(pil_s_code))
                PrintFormatUtil.print_line("OPEN_CV库两者的相似度: {}".format(oc_s_code))
                # Threshold is tunable in (0, 1); 1 is the strictest setting.
                if pil_s_code < 1 and oc_s_code < 1:
                    iss.output_diff()
                    o_iss.output_diff()
            else:
                PrintFormatUtil.print_line("old pic md5 error. new {} old {}".format(
                    FileUtil.get_md5(old_service_pic_name),old_file_info_md5))
        # Regenerate the 'latest' marker file for the next run.
        with open(latest_path, "w") as file:
            file.write(os.path.basename(service_pic_small_name) + " " + FileUtil.get_md5(service_pic_small_name))
Beispiel #18
0
    def get_pages(self) -> List[BrowserPage]:
        """
        Returns a list of pages in the browser.

        :return: a list of pages
        :raise CrawlerNotRunningError: if the crawler is not running
        """

        self._check_if_crawler_running()

        result = []
        for index, page in enumerate(syncer.sync(self._browser.pages())):
            # page.title() is a coroutine and must be resolved per page.
            title = syncer.sync(page.title())
            result.append(BrowserPage(index, page.url, title))
        return result
Beispiel #19
0
    def set_cookie(self, cookie: Cookie) -> None:
        """
        Sets a cookie on the current page.

        :param cookie: the cookie to set
        :raise CrawlerNotRunningError: if the crawler is not running
        :raise ValueError: if the cookie cannot be set due to invalid page URL
        """

        self._check_if_crawler_running()

        cookie_dict = cookie.as_dict()
        try:
            syncer.sync(self._page.setCookie(cookie_dict))
        except PageError as error:
            # pyppeteer signals an unusable page URL via PageError.
            raise ValueError(error)
Beispiel #20
0
    def test_wrap_async_args(self):
        """Wrapped coroutine functions must forward positional and default args."""
        async def coro(b, c, d=2):
            return d

        wrapped = sync(coro)
        self.assertEqual(wrapped(1, 2), 2)
        self.assertEqual(wrapped(1, 2, 3), 3)
Beispiel #21
0
def iter_file_user_trade_list(user=0):
    # Collect per-user trade rows from cached taobao_trade files and persist
    # them with dill; returns py.No(...) when nothing is found.
    if not user:
        from syncer import sync
        sync_get_user = U.get_or_set('sync_get_user',
                                     sync(taobao_trade.get_user))

        user = sync_get_user()
    # NOTE(review): the next line is garbled in the scraped source — the
    # original presumably printed `user` and then iterated files under
    # r'C:\test\taobao_trade' (binding each path to `f`) with `v = []`
    # initialized beforehand. As written it is not valid Python; recover the
    # original line from the upstream project before using this function.
    print('sync_get_user :'******'C:\test\taobao_trade'):
        if user not in f:
            continue
        b = F.read_byte(f)
        d = T.json_loads(b)
        assert d['user'] == user

        for n, t in enumerate(d['ts']):
            # Rows with exactly one logistics link get a converted URL;
            # otherwise a truncated plain-text summary wrapped in py.No.
            e = T.xpath(t, "//a[contains(., '查看物流')]")
            if len(e) != 1:
                t = T.html2text(t)
                t = T.replace_all_space(t)[:99]
                u = py.No(t)
            else:
                e = e[0]
                u = convert_wuliu_url(e.attrib['href'])

            row = [d['user'], d['n'], n, u]
            v.append(row)
    if not v:
        return py.No('can not found user: %s trade_list' % user)
    U.set(user, v)
    return F.dill_dump(obj=v, file=f'{user}-{len(v)}-trade_list')
Beispiel #22
0
 def setUpClass(cls):
     """Start the test HTTP server and open a fresh browser page."""
     cls.port = get_free_port()
     # Short pause so the just-freed port is actually usable again.
     time.sleep(0.1)
     cls.app = get_application()
     cls.server = cls.app.listen(cls.port)
     # NOTE(review): launch() is not wrapped in sync() here, unlike sibling
     # fixtures — confirm launch is synchronous in this module.
     cls.browser = launch()
     cls.page = sync(cls.browser.newPage())
Beispiel #23
0
 def test_wrap_method(self):
     """sync() should wrap bound coroutine methods too."""
     class Holder:
         async def a(self):
             return 1

     wrapped = sync(Holder().a)
     self.assertEqual(wrapped(), 1)
Beispiel #24
0
    def test_wrap_method(self):
        """A bound async method can be wrapped by sync() and called directly."""
        class Owner:
            async def a(self):
                return 1

        bound = Owner().a
        self.assertEqual(sync(bound)(), 1)
Beispiel #25
0
 def test_wrap_method_args(self):
     """Wrapped bound methods must forward their arguments."""
     class Holder:
         async def a(self, b, c, d=2):
             return d

     wrapped = sync(Holder().a)
     self.assertEqual(wrapped(1, 2), 2)
     self.assertEqual(wrapped(1, 2, 3), 3)
Beispiel #26
0
 def setUp(self):
     """Launch the sample app in a subprocess and wait until it responds."""
     super().setUp()
     self.port = free_port()
     sync(self.wait(times=3))
     cmd = [sys.executable, '-c', script, '--port',
            str(self.port), *self.cmd]
     self.url = 'http://localhost:{}'.format(self.port)
     self.ws_url = 'ws://localhost:{}/wdom_ws'.format(self.port)
     # Merge stderr into stdout and disable buffering so the test can read
     # the server's output immediately.
     self.proc = subprocess.Popen(
         cmd,
         cwd=root,
         stdout=subprocess.PIPE,
         stderr=subprocess.STDOUT,
         universal_newlines=True,
         bufsize=0,
     )
     sync(self.wait(times=10))
Beispiel #27
0
    def close_page(self, page: BrowserPage) -> None:
        """
        Closes the given page.
        Note: There must be at least one open page.

        :param page: the page to close
        :raise CrawlerNotRunningError: if the crawler is not running
        :raise ValueError: if the given page is the last one open
        """

        self._check_if_crawler_running()

        open_pages = syncer.sync(self._browser.pages())
        # Refuse to close the final page: the crawler needs one to operate on.
        if len(open_pages) == 1:
            raise ValueError('Cannot close the last page')

        syncer.sync(open_pages[page.index].close())
 def send_text_sync(self,
                    channel,
                    text,
                    public=False,
                    team=False,
                    topic_name=None):
     """Blocking wrapper around the async send_text coroutine."""
     coro = self.send_text(channel, text, public, team, topic_name)
     return syncer.sync(coro)
Beispiel #29
0
    def click(self, selector: str, click_count=1) -> None:
        """
        Clicks the element which matches selector.

        :param selector: an element selector
        :param click_count: the number of clicks, defaults to 1
        :raise CrawlerNotRunningError: if the crawler is not running
        :raise NoSuchElementError: if there is no element matching selector
        """

        self._check_if_crawler_running()

        options = {'clickCount': click_count}
        try:
            syncer.sync(self._page.click(selector, options=options))
        except PageError:
            # pyppeteer raises PageError when the selector matches nothing.
            raise NoSuchElementError(selector)
Beispiel #30
0
    def test_wrap_method_args(self):
        """Arguments pass through sync()-wrapped bound methods unchanged."""
        class Owner:
            async def a(self, b, c, d=2):
                return d

        wrapped = sync(Owner().a)
        self.assertEqual(wrapped(1, 2), 2)
        self.assertEqual(wrapped(1, 2, 3), 3)
Beispiel #31
0
    def test_wrap_aioco_args(self):
        """Generator-based coroutines keep their argument handling when wrapped."""
        @asyncio.coroutine
        def coro(b, c, d=2):
            yield from asyncio.sleep(0)
            return d

        wrapped = sync(coro)
        self.assertEqual(wrapped(1, 2), 2)
        self.assertEqual(wrapped(1, 2, 3), 3)
Beispiel #32
0
 def run(*args, **kwargs):
     """Run the coroutine f synchronously, creating an event loop if needed."""
     try:
         return sync(f(*args, **kwargs))
     except RuntimeError:
         # sync() failed (e.g. no usable event loop): fall back to a
         # dedicated loop for this single call.
         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
         result = loop.run_until_complete(f(*args, **kwargs))
         loop.close()
         return result
Beispiel #33
0
    def get_text(self) -> str:
        """
        Returns the text content of this element.

        :return: the text content of this element
        """

        # Evaluate textContent inside the element's own JS execution context.
        context = self._element_handle.executionContext
        return syncer.sync(
            context.evaluate('element => element.textContent',
                             self._element_handle))
Beispiel #34
0
 def setUp(self):
     """Write the test script to a temp file and launch it as a server."""
     super().setUp()
     self.port = free_port()
     env = os.environ.copy()
     env['PYTHONPATH'] = root
     # delete=False: the file must outlive this block so the subprocess can
     # read it; tearDown removes it.
     with tempfile.NamedTemporaryFile(mode='w+', suffix='.py',
                                      delete=False) as f:
         self.tmp = f.name
         f.write(script)
     cmd = [sys.executable, self.tmp, '--port', str(self.port)] + self.cmd
     self.url = 'http://localhost:{}'.format(self.port)
     self.ws_url = 'ws://localhost:{}/rimo_ws'.format(self.port)
     self.proc = subprocess.Popen(
         cmd, cwd=curdir, env=env,
         stdout=subprocess.PIPE,
         stderr=subprocess.STDOUT,
         universal_newlines=True,
     )
     sync(self.wait(times=10))
Beispiel #35
0
 def setUp(self):
     """Navigate to the empty page and reset the result flag."""
     super().setUp()
     sync(self.page.goto(self.url + 'empty'))
     self.result = False
Beispiel #36
0
 def tearDown(self):
     """Close the page opened during setUp."""
     sync(self.page.close())
Beispiel #37
0
 def test_wrap_async(self):
     """sync() handles both coroutine functions and coroutine objects."""
     async def coro():
         return 1
     self.assertEqual(sync(coro)(), 1)
     self.assertEqual(sync(coro()), 1)
Beispiel #38
0
 def test_wrap_async_args(self):
     """Positional and default arguments survive sync() wrapping."""
     async def coro(b, c, d=2):
         return d
     wrapped = sync(coro)
     self.assertEqual(wrapped(1, 2), 2)
     self.assertEqual(wrapped(1, 2, 3), 3)
Beispiel #39
0
 def tearDown(self):
     """Delete the generated PDF (if present) and close the browser.

     fix: the original tested `self.target_path.exists` without calling it;
     a bound method is always truthy, so unlink() ran unconditionally and
     raised FileNotFoundError when no PDF was produced. The setUp
     counterpart calls exists() correctly.
     """
     if self.target_path.exists():
         self.target_path.unlink()
     sync(self.browser.close())
Beispiel #40
0
 def setUp(self):
     """Launch a browser and ensure no stale test.pdf is left over."""
     self.browser = sync(launch(args=['--no-sandbox']))
     self.target_path = Path(__file__).resolve().parent / 'test.pdf'
     if self.target_path.exists():
         self.target_path.unlink()
Beispiel #41
0
 def setUp(self):
     """Open a fresh page and reset the result flag."""
     self.page = sync(self.browser.newPage())
     self.result = False
Beispiel #42
0
 def tearDownClass(cls):
     """Close the shared browser, then stop the test server."""
     sync(cls.browser.close())
     cls.server.stop()
Beispiel #43
0
 def setUpClass(cls):
     """Start the app server and launch the shared browser once per class."""
     cls.port = get_free_port()
     cls.app = get_application()
     cls.server = cls.app.listen(cls.port)
     cls.browser = sync(launch(DEFAULT_OPTIONS))
     cls.url = 'http://localhost:{}/'.format(cls.port)
Beispiel #44
0
 def test_future(self):
     """sync() resolves an already-completed Future to its result."""
     fut = asyncio.Future()
     fut.set_result(1)
     self.assertEqual(sync(fut), 1)
Beispiel #45
0
 def setUp(self) -> None:
     """Start the server and open a websocket connection to it."""
     super().setUp()
     self.start()
     sync(self.wait())
     self.ws_url = 'ws://localhost:{}/rimo_ws'.format(self.port)
     self.ws = sync(self.ws_connect(self.ws_url))