Code Example #1
File: scheduler.py  Project: jiayunwu/feapder
    def all_thread_is_done(self):
        for _ in range(3):  # check a few times to reduce flukes: the stages are not concurrent, so a state may read False now and True on the next probe; a single check can easily hit that window
            # check collector status
            if (
                self._collector.is_collector_task()
                or self._collector.get_requests_count() > 0
            ):
                return False

            # check parser_control status
            for parser_control in self._parser_controls:
                if not parser_control.is_not_task():
                    return False

            # check item_buffer status
            if (
                self._item_buffer.get_items_count() > 0
                or self._item_buffer.is_adding_to_db()
            ):
                return False

            # check request_buffer status
            if (
                self._request_buffer.get_requests_count() > 0
                or self._request_buffer.is_adding_to_db()
            ):
                return False

            tools.delay_time(1)

        return True
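
A note on the triple check above: because the pipeline stages are not concurrent, a single snapshot can report "idle" while work is in flight between stages. A minimal standalone sketch of the same guard, with hypothetical check callables rather than the feapder components:

    import time

    def all_idle(checks, attempts=3, interval=1):
        # Re-probe a few times: one stage may look idle while another is
        # about to hand it work, so a single snapshot is not enough.
        for _ in range(attempts):
            if not all(check() for check in checks):
                return False
            time.sleep(interval)
        return True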
Code Example #2
File: item_buffer.py  Project: yufengsoft/feapder
    def run(self):
        self._thread_stop = False
        while not self._thread_stop:
            self.flush()
            tools.delay_time(1)

        self.close()
Code Example #3
File: item_buffer.py  Project: NonAnaconda/feapder
    def run(self):
        self._thread_stop = False
        while not self._thread_stop:
            self.flush()
            tools.delay_time(setting.ITEM_UPLOAD_INTERVAL)

        self.close()
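
Examples #2 and #3 are the same stop-flag flush loop; they differ only in the interval (a hard-coded 1 second vs. setting.ITEM_UPLOAD_INTERVAL). A self-contained sketch of the pattern, with hypothetical flush and close callables:

    import threading
    import time

    class FlushLoop(threading.Thread):
        def __init__(self, flush, close, interval=1):
            super().__init__(daemon=True)
            self._flush = flush
            self._close = close
            self._interval = interval
            self._thread_stop = False

        def stop(self):
            self._thread_stop = True

        def run(self):
            # Flush periodically until asked to stop, then clean up.
            while not self._thread_stop:
                self._flush()
                time.sleep(self._interval)
            self._close()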
Code Example #4
File: guest_user_pool.py  Project: ibryang/feapder
    def run(self):
        while True:
            try:
                now_user_count = self._redisdb.hget_count(self._tab_user_pool)
                need_user_count = self._min_users - now_user_count

                if need_user_count > 0:
                    log.info("当前在线user数为 {} 小于 {}, 生产user".format(
                        now_user_count, self._min_users))
                    try:
                        user = self.login()
                        if user:
                            self.add_user(user)
                    except Exception as e:
                        log.exception(e)
                else:
                    log.debug("当前user数为 {} 数量足够 暂不生产".format(now_user_count))

                    if self._keep_alive:
                        tools.delay_time(10)
                    else:
                        break

            except Exception as e:
                log.exception(e)
                tools.delay_time(1)
Code Example #5
File: scheduler.py  Project: jiayunwu/feapder
    def run(self):
        if not self.is_reach_next_spider_time():
            return

        self._start()

        while True:
            if self.all_thread_is_done():
                if not self._is_notify_end:
                    self.spider_end()  # finished one round
                    self.record_spider_state(
                        spider_type=1,
                        state=1,
                        spider_end_time=tools.get_current_date(),
                        batch_interval=self._batch_interval,
                    )

                    self._is_notify_end = True

                if self._auto_stop_when_spider_done:
                    self._stop_all_thread()
                    break

            else:
                self._is_notify_end = False

            self.check_task_status()

            tools.delay_time(1)  # check the spider status once per second
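
The _is_notify_end flag here (and in example #6) is an edge trigger: spider_end fires once when the state flips to "done" and re-arms if work reappears. The same idea in isolation, with hypothetical is_done and on_done callables:

    import time

    def watch(is_done, on_done, interval=1):
        notified = False
        while True:
            if is_done():
                if not notified:
                    on_done()      # fire once per done-transition
                    notified = True
            else:
                notified = False   # re-arm when work reappears
            time.sleep(interval)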
Code Example #6
File: scheduler.py  Project: NonAnaconda/feapder
    def run(self):
        if not self.is_reach_next_spider_time():
            return

        self._start()

        while True:
            try:
                self.heartbeat()
                if self.all_thread_is_done():
                    if not self._is_notify_end:
                        self.spider_end()  # finished one round
                        self.record_spider_state(
                            spider_type=1,
                            state=1,
                            spider_end_time=tools.get_current_date(),
                            batch_interval=self._batch_interval,
                        )

                        self._is_notify_end = True

                    if not self._keep_alive:
                        self._stop_all_thread()
                        break

                else:
                    self._is_notify_end = False

                self.check_task_status()

            except Exception as e:
                log.exception(e)

            tools.delay_time(1)  # check the spider status once per second
Code Example #7
File: guest_user_pool.py  Project: ibryang/feapder
    def get_user(self, block=True) -> Optional[GuestUser]:
        """

        Args:
            block: whether to wait when no user is available
            block: 无用户时是否等待

        Returns:

        """
        while True:
            try:
                user_id = self._get_user_id()
                user_str = None
                if user_id:
                    user_str = self._redisdb.hget(self._tab_user_pool, user_id)
                    # if no user was fetched, another spider may have deleted it; refresh the locally cached user ids
                    if not user_str:
                        self._load_users_id()
                        continue

                if not user_id and block:
                    self._keep_alive = False
                    with RedisLock(key=self._tab_user_pool,
                                   lock_timeout=3600,
                                   wait_timeout=0) as _lock:
                        if _lock.locked:
                            self.run()
                    continue

                return user_str and GuestUser(**eval(user_str))
            except Exception as e:
                log.exception(e)
                tools.delay_time(1)
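
The blocking branch above takes a RedisLock with wait_timeout=0, so exactly one starving consumer becomes the producer while the others loop and retry. A process-local sketch of that election, using threading.Lock as a stand-in for RedisLock (get_item and produce are hypothetical callables):

    import threading
    import time

    _producer_lock = threading.Lock()

    def get_or_produce(get_item, produce):
        while True:
            item = get_item()
            if item is not None:
                return item
            # Non-blocking acquire: one caller produces, the rest retry.
            if _producer_lock.acquire(blocking=False):
                try:
                    produce()
                finally:
                    _producer_lock.release()
            else:
                time.sleep(1)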
Code Example #8
File: request_buffer.py  Project: zhouxinfei/feapder
    def run(self):
        while not self._thread_stop:
            try:
                self.__add_request_to_db()
            except Exception as e:
                log.exception(e)

            tools.delay_time(1)
Code Example #9
File: cookie_pool.py  Project: kopa-kongpan/feapder
    def run(self):
        while True:
            try:
                now_cookie_count = self._redisdb.lget_count(self._tab_cookie_pool)
                need_cookie_count = self._min_cookies - now_cookie_count

                if need_cookie_count > 0:
                    log.info(
                        "当前cookie数为 {} 小于 {}, 生产cookie".format(
                            now_cookie_count, self._min_cookies
                        )
                    )
                    try:
                        cookies = self.create_cookie()
                        if cookies:
                            self.add_cookies(cookies)
                    except Exception as e:
                        log.exception(e)
                else:
                    log.info("当前cookie数为 {} 数量足够 暂不生产".format(now_cookie_count))

                    # if the cookie pool count has not changed in the last minute, assume no spider is using it anymore and exit
                    last_count_info = self._redisdb.strget(
                        self._tab_cookie_pool_last_count
                    )
                    if not last_count_info:
                        self._redisdb.strset(
                            self._tab_cookie_pool_last_count,
                            "{}:{}".format(time.time(), now_cookie_count),
                        )
                    else:
                        last_time, last_count = last_count_info.split(":")
                        last_time = float(last_time)
                        last_count = int(last_count)

                        if time.time() - last_time > 60:
                            if now_cookie_count == last_count:
                                log.info("近一分钟,cookie池数量无变化,判定爬虫未使用,退出生产")
                                break
                            else:
                                self._redisdb.strset(
                                    self._tab_cookie_pool_last_count,
                                    "{}:{}".format(time.time(), now_cookie_count),
                                )

                    if self._keep_alive:
                        log.info("sleep 10")
                        tools.delay_time(10)
                    else:
                        break

            except Exception as e:
                log.exception(e)
                tools.delay_time(1)
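
The exit condition above stores a "timestamp:count" snapshot in Redis and stops producing once the pool size has been flat for a minute, on the assumption that no spider is consuming. The same staleness check in isolation (in-memory rather than Redis):

    import time

    class StalenessCheck:
        """True once the observed count has not changed for `window` seconds."""

        def __init__(self, window=60):
            self.window = window
            self._last_time = None
            self._last_count = None

        def is_stale(self, count):
            now = time.time()
            if self._last_count is None or count != self._last_count:
                # First sample, or the count changed: take a new snapshot.
                self._last_time, self._last_count = now, count
                return False
            return now - self._last_time > self.window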
Code Example #10
    def run(self):
        """
        @summary: overridden run method: checks whether the tasks in MySQL are done and stops when they are
        ---------
        ---------
        @result:
        """
        try:
            self.create_batch_record_table()

            if not self._parsers:  # not in add_parser mode
                self._parsers.append(self)

            self._start()

            while True:
                try:
                    self.heartbeat()
                    if (
                            self.task_is_done() and self.all_thread_is_done()
                    ):  # all Redis tasks are done and all MySQL tasks are done (all_thread_is_done is checked too, so the task state is not updated while threads are still working, which would end the program early)
                        if not self._is_notify_end:
                            self.spider_end()
                            self.record_spider_state(
                                spider_type=2,
                                state=1,
                                batch_date=self._batch_date_cache,
                                spider_end_time=tools.get_current_date(),
                                batch_interval=self._batch_interval,
                            )

                            self._is_notify_end = True

                        if not self._keep_alive:
                            self._stop_all_thread()
                            break
                    else:
                        self._is_notify_end = False

                    self.check_task_status()

                except Exception as e:
                    log.exception(e)

                tools.delay_time(10)  # check the spider status every 10 seconds

        except Exception as e:
            msg = "《%s》主线程异常 爬虫结束 exception: %s" % (self._batch_name, e)
            log.error(msg)
            self.send_msg(msg,
                          level="error",
                          message_prefix="《%s》spider ended abnormally" % self._batch_name)

            os._exit(137)  # the raw wait status is then 137 << 8 == 35072, so the spider manager can restart it
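
About the final os._exit(137): a parent that reaps the child with os.wait() sees the raw 16-bit status, which carries the exit code in its high byte, hence the 35072 (137 << 8) in the comment. A quick POSIX-only demonstration:

    import os

    pid = os.fork()
    if pid == 0:
        os._exit(137)              # child exits with raw code 137
    else:
        _, status = os.wait()      # raw status as seen by the parent
        print(status)              # 35072 == 137 << 8
        print(os.waitstatus_to_exitcode(status))  # 137 (Python 3.9+)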
Code Example #11
File: spider.py  Project: ORG-MARS/feapder
    def run(self):
        if not self._parsers:  # not in add_parser mode
            self._parsers.append(self)

        self._start()

        while True:
            if self.all_thread_is_done():
                self._stop_all_thread()
                break

            tools.delay_time(1)  # check the spider status once per second

        self.delete_tables([self._redis_key + "*"])
Code Example #12
File: air_spider.py  Project: tikazyq/feapder
    def all_thread_is_done(self):
        for _ in range(3):  # check a few times to reduce flukes: the stages are not concurrent, so a state may read False now and True on the next probe; a single check can easily hit that window
            # check parser_control status
            for parser_control in self._parser_controls:
                if not parser_control.is_not_task():
                    return False

            # check the task queue status
            if not self._memory_db.empty():
                return False

            tools.delay_time(1)

        return True
Code Example #13
    def run(self):
        while True:
            try:
                try:
                    with RedisLock(
                        key=self._tab_user_pool, lock_timeout=3600, wait_timeout=0
                    ) as _lock:
                        if _lock.locked:
                            for user in self._load_user():
                                retry_times = 0
                                while retry_times <= self._login_retry_times:
                                    try:
                                        user = self.login(user)
                                        if user:
                                            self.add_user(user)
                                        else:
                                            self.handle_login_failed_user(user)
                                        break
                                    except NotImplementedError:
                                        log.error(
                                            f"{self.__class__.__name__} must be implementation login method!"
                                        )
                                        os._exit(0)
                                    except Exception as e:
                                        self.handel_exception(e)
                                    log.debug(
                                        f"login failed, user: {user} retry_times: {retry_times}"
                                    )
                                    retry_times += 1
                                else:
                                    self.handle_login_failed_user(user)

                            now_user_count = self._redisdb.hget_count(
                                self._tab_user_pool
                            )
                            log.info("当前在线user数为 {}".format(now_user_count))

                except Exception as e:
                    log.exception(e)

                if self._keep_alive:
                    tools.delay_time(10)
                else:
                    break

            except Exception as e:
                log.exception(e)
                tools.delay_time(1)
Code Example #14
File: cookie_pool.py  Project: kopa-kongpan/feapder
    def get_cookie(self, wait_when_null=True) -> Optional[User]:
        while True:
            try:
                user_cookie = self._redisdb.rpoplpush(self._tab_cookie_pool)
                if not user_cookie and wait_when_null:
                    log.info("暂无cookie 生产中...")
                    self.login()
                    continue

                if user_cookie:
                    user_cookie = eval(user_cookie)
                    return User(**user_cookie)

                return None
            except Exception as e:
                log.exception(e)
                tools.delay_time(1)
Code Example #15
File: cookie_pool.py  Project: kopa-kongpan/feapder
    def get_cookie(self, wait_when_null=True):
        while True:
            try:
                cookie_info = self._redisdb.rpoplpush(self._tab_cookie_pool)
                if not cookie_info and wait_when_null:
                    log.info("No cookies available; producing...")
                    self._keep_alive = False
                    self._min_cookies = 1
                    with RedisLock(
                        key=self._tab_cookie_pool, lock_timeout=3600, wait_timeout=5
                    ) as _lock:
                        if _lock.locked:
                            self.run()
                    continue
                return eval(cookie_info) if cookie_info else {}
            except Exception as e:
                log.exception(e)
                tools.delay_time(1)
Code Example #16
File: batch_spider.py  Project: yufengsoft/feapder
    def run(self):
        self.start_monitor_task()

        if not self._parsers:  # not in add_parser mode
            self._parsers.append(self)

        self._start()

        while True:
            try:
                if self.all_thread_is_done():
                    self._stop_all_thread()
                    break

            except Exception as e:
                log.exception(e)

            tools.delay_time(1)  # check the spider status once per second

        self.delete_tables([self._redis_key + "*"])
Code Example #17
File: air_spider.py  Project: NonAnaconda/feapder
    def run(self):
        self.start_callback()

        for i in range(self._thread_count):
            parser_control = AirSpiderParserControl(self._memory_db,
                                                    self._item_buffer)
            parser_control.add_parser(self)
            parser_control.start()
            self._parser_controls.append(parser_control)

        self._item_buffer.start()

        self.distribute_task()

        while True:
            try:
                if self.all_thread_is_done():
                    # stop the parser_controls
                    for parser_control in self._parser_controls:
                        parser_control.stop()

                    # close the item_buffer
                    self._item_buffer.stop()

                    # close the webdriver pool
                    if Request.webdriver_pool:
                        Request.webdriver_pool.close()

                    log.info("无任务,爬虫结束")
                    break

            except Exception as e:
                log.exception(e)

            tools.delay_time(1)  # check the spider status once per second

        self.end_callback()
        # so the thread can be started again
        self._started.clear()
        # close metrics reporting
        metrics.close()
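
Example #17 shows the full AirSpider lifecycle: start the worker threads and the item buffer, distribute tasks, poll once per second until everything is drained, then stop each component. A condensed generic version of that orchestration (handle is a hypothetical per-task callable, not a feapder API):

    import queue
    import threading

    def run_workers(tasks, handle, thread_count=4):
        q = queue.Queue()
        stop = threading.Event()

        def worker():
            while not stop.is_set():
                try:
                    task = q.get(timeout=1)
                except queue.Empty:
                    continue
                try:
                    handle(task)
                finally:
                    q.task_done()

        threads = [threading.Thread(target=worker, daemon=True)
                   for _ in range(thread_count)]
        for t in threads:
            t.start()
        for task in tasks:
            q.put(task)

        q.join()    # blocks until every queued task has been processed
        stop.set()  # then tell the workers to exit
        for t in threads:
            t.join()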
Code Example #18
    def get_user(self, block=True) -> Optional[NormalUser]:
        while True:
            try:
                user_id = self._get_user_id()
                user_str = None
                if user_id:
                    user_str = self._redisdb.hget(self._tab_user_pool, user_id)
                    # if no user was fetched, another spider may have deleted it; refresh the locally cached user ids
                    if not user_str:
                        self._load_users_id()
                        continue

                if not user_id and block:
                    self._keep_alive = False
                    self.run()
                    continue

                return user_str and NormalUser(**eval(user_str))
            except Exception as e:
                log.exception(e)
                tools.delay_time(1)
Code Example #19
    def get_response(self, save_cached=False):
        """
        Get a response with selector support
        @param save_cached: save to cache so debugging does not re-download every time
        @return:
        """
        # default timeout
        self.requests_kwargs.setdefault("timeout", 22)  # connect=22 read=22

        # stream
        # By default the response body is downloaded as soon as the request returns. With stream=True the body download is deferred until Response.content is accessed; only the headers are fetched up front. Drawback: with stream=True, Requests cannot release the connection back to the pool until all the data is consumed or Response.close is called, which hurts connection reuse.
        self.requests_kwargs.setdefault("stream", True)

        # disable certificate verification
        self.requests_kwargs.setdefault("verify", False)

        # request method
        method = self.__dict__.get("method")
        if not method:
            if "data" in self.requests_kwargs:
                method = "POST"
            else:
                method = "GET"

        # random user-agent
        headers = self.requests_kwargs.get("headers", {})
        if "user-agent" not in headers and "User-Agent" not in headers:
            if self.random_user_agent and setting.RANDOM_HEADERS:
                headers.update(
                    {"User-Agent": self.__class__.user_agent_pool.get()})
                self.requests_kwargs.update(headers=headers)
            else:
                self.requests_kwargs.setdefault(
                    "headers", {"User-Agent": setting.DEFAULT_USERAGENT})

        # proxy
        proxies = self.requests_kwargs.get("proxies", -1)
        if proxies == -1 and setting.PROXY_ENABLE and self.__class__.proxies_pool:
            while True:
                proxies = self.__class__.proxies_pool.get()
                if proxies:
                    self.requests_kwargs.update(proxies=proxies)
                    break
                else:
                    log.debug("暂无可用代理 ...")

        log.debug("""
                -------------- %srequest for ----------------
                url  = %s
                method = %s
                body = %s
                """ % (
            "" if not self.parser_name else "%s.%s " % (
                self.parser_name,
                (self.callback and callable(self.callback) and getattr(
                    self.callback, "__name__") or self.callback) or "parse",
            ),
            self.url,
            method,
            self.requests_kwargs,
        ))

        # def hooks(response, *args, **kwargs):
        #     print(response.url)
        #
        # self.requests_kwargs.update(hooks={'response': hooks})

        use_session = (setting.USE_SESSION if self.use_session is None else
                       self.use_session)  # self.use_session takes precedence

        if self.render:
            browser = self._webdriver_pool.get()

            try:
                browser.get(self.url)
                render_time = setting.WEBDRIVER.get("render_time", 0)
                if render_time:
                    tools.delay_time(render_time)

                html = browser.page_source
                response = Response.from_dict({
                    "url": browser.current_url,
                    "cookies": browser.cookies,
                    "text": html,
                    "_content": html.encode(),
                    "status_code": 200,
                    "elapsed": 666,
                    "headers": {
                        "User-Agent":
                        browser.execute_script("return navigator.userAgent")
                    },
                })

                response._cached_text = html
                # response.browser = browser  # the browser is released right after rendering, so it must not be bound to the response
                self._webdriver_pool.put(browser)
            except Exception as e:
                self._webdriver_pool.remove(browser)
                raise e

        elif use_session:
            response = self._session.request(method, self.url,
                                             **self.requests_kwargs)
            response = Response(response)
        else:
            response = requests.request(method, self.url,
                                        **self.requests_kwargs)
            response = Response(response)

        if save_cached:
            self.save_cached(response,
                             expire_time=self.__class__.cached_expire_time)

        return response
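
The stream comment above describes standard requests behavior: with stream=True only the headers are fetched up front, and the connection is not returned to the pool until the body is consumed or the response is closed. A minimal illustration (any reachable URL works):

    import requests

    resp = requests.get("https://example.com", stream=True, timeout=22)
    try:
        # Only the headers have been read at this point; the body is
        # downloaded lazily when .content or iter_content() is accessed.
        body = resp.content
    finally:
        resp.close()  # release the connection back to the pool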
Code Example #20
    def check_batch(self, is_first_check=False):
        """
        @summary: check whether the current batch is finished
        ---------
        @param: is_first_check whether this is the first check; if it is and the batch is already finished, do not send the batch-finished message, since it was sent before
        ---------
        @result: True when finished, otherwise False
        """

        sql = 'select date_format(batch_date, "{date_format}"), total_count, done_count from {batch_record_table} order by id desc limit 1'.format(
            date_format=self._date_format.replace(":%M", ":%i"),
            batch_record_table=self._batch_record_table,
        )
        batch_info = self._mysqldb.find(sql)  # (('2018-08-19', 49686, 0),)

        if batch_info:
            batch_date, total_count, done_count = batch_info[0]

            now_date = datetime.datetime.now()
            last_batch_date = datetime.datetime.strptime(
                batch_date, self._date_format)
            time_difference = now_date - last_batch_date

            if total_count == done_count and time_difference < datetime.timedelta(
                    days=self._batch_interval):  # still within this batch: re-check the task table for newly added tasks
                # query the task table to see whether it is really empty, since the counts in the batch_record table may lag behind
                task_count = self.__get_task_state_count()

                total_count = task_count.get("total_count")
                done_count = task_count.get("done_count")

            if total_count == done_count:
                # check whether the related spiders are finished
                related_spider_is_done = self.related_spider_is_done()
                if related_spider_is_done is False:
                    msg = "《{}》batch not finished; waiting for dependent spider {} to finish. batch date {}, progress {}/{}".format(
                        self._batch_name,
                        self._related_batch_record
                        or self._related_task_tables,
                        batch_date,
                        done_count,
                        total_count,
                    )
                    log.info(msg)
                    # if the batch has timed out, send an alert
                    if time_difference >= datetime.timedelta(
                            days=self._batch_interval):  # already overdue
                        if (not self._last_send_msg_time
                                or now_date - self._last_send_msg_time >=
                                self._send_msg_interval):
                            self._last_send_msg_time = now_date
                            self.send_msg(msg, level="error")

                    return False

                elif related_spider_is_done is True:
                    # update the is_done state
                    self.update_is_done()

                else:
                    self.update_is_done()

                msg = "《{}》本批次完成 批次时间 {} 共处理 {} 条任务".format(
                    self._batch_name, batch_date, done_count)
                log.info(msg)
                if not is_first_check:
                    self.send_msg(msg)

                # check whether the next batch is due
                if time_difference >= datetime.timedelta(
                        days=self._batch_interval):
                    msg = "《{}》下一批次开始".format(self._batch_name)
                    log.info(msg)
                    self.send_msg(msg)

                    # reset the task table state
                    if self.init_task() != False:  # returns False when the update fails, True/None otherwise
                        # reset properties
                        self.init_property()

                        is_success = (
                            self.record_batch()
                        )  # the insert may fail even though the task table was already reset; since the current time now falls in the next batch, the batch check skips the task table, so the next run will reset it again
                        if is_success:
                            log.info(
                                "New batch record inserted; tasks will be dispatched in 1 minute")  # give workers time to pick up the new batch date
                            tools.delay_time(60)

                            return False  # the next batch starts

                        else:
                            return True  # do not start the next batch yet: inserting the new batch record failed, so tasks are dispatched only after it succeeds

                else:
                    log.info("《{}》下次批次时间未到".format(self._batch_name))
                    if not is_first_check:
                        self.send_msg("《{}》下次批次时间未到".format(self._batch_name))
                    return True

            else:
                if time_difference >= datetime.timedelta(
                        days=self._batch_interval):  # already overdue
                    time_out = time_difference - datetime.timedelta(
                        days=self._batch_interval)
                    time_out_pretty = tools.format_seconds(
                        time_out.total_seconds())

                    msg = "《{}》本批次已超时{} 批次时间 {}, 批次进度 {}/{}".format(
                        self._batch_name,
                        time_out_pretty,
                        batch_date,
                        done_count,
                        total_count,
                    )
                    if self._batch_interval >= 1:
                        msg += ", expected duration {} days".format(self._batch_interval)
                    else:
                        msg += ", expected duration {} hours".format(self._batch_interval * 24)

                    result = self.get_deal_speed(
                        total_count=total_count,
                        done_count=done_count,
                        last_batch_date=last_batch_date,
                    )
                    if result:
                        deal_speed, need_time, overflow_time, calculate_speed_time = (
                            result)
                        msg += ", 任务处理速度于{}统计, 约 {}条/小时, 预计还需 {}".format(
                            calculate_speed_time,
                            deal_speed,
                            tools.format_seconds(need_time),
                        )

                        if overflow_time > 0:
                            msg += ", 该批次预计总超时 {}, 请及时处理".format(
                                tools.format_seconds(overflow_time))

                    log.info(msg)

                    if (not self._last_send_msg_time
                            or now_date - self._last_send_msg_time >=
                            self._send_msg_interval):
                        self._last_send_msg_time = now_date
                        self.send_msg(msg, level="error")

                else:  # not yet overdue
                    remaining_time = (
                        datetime.timedelta(days=self._batch_interval) -
                        time_difference)
                    remaining_time_pretty = tools.format_seconds(
                        remaining_time.total_seconds())

                    if self._batch_interval >= 1:
                        msg = "《{}》本批次正在进行, 批次时间 {}, 批次进度 {}/{}, 期望时间{}天, 剩余{}".format(
                            self._batch_name,
                            batch_date,
                            done_count,
                            total_count,
                            self._batch_interval,
                            remaining_time_pretty,
                        )
                    else:
                        msg = "《{}》本批次正在进行, 批次时间 {}, 批次进度 {}/{}, 期望时间{}小时, 剩余{}".format(
                            self._batch_name,
                            batch_date,
                            done_count,
                            total_count,
                            self._batch_interval * 24,
                            remaining_time_pretty,
                        )

                    result = self.get_deal_speed(
                        total_count=total_count,
                        done_count=done_count,
                        last_batch_date=last_batch_date,
                    )
                    if result:
                        deal_speed, need_time, overflow_time, calculate_speed_time = (
                            result)
                        msg += ", 任务处理速度于{}统计, 约 {}条/小时, 预计还需 {}".format(
                            calculate_speed_time,
                            deal_speed,
                            tools.format_seconds(need_time),
                        )

                        if overflow_time > 0:
                            msg += ", 该批次可能会超时 {}, 请及时处理".format(
                                tools.format_seconds(overflow_time))
                            # send an alert
                            if (not self._last_send_msg_time
                                    or now_date - self._last_send_msg_time >=
                                    self._send_msg_interval):
                                self._last_send_msg_time = now_date
                                self.send_msg(msg, level="error")

                        elif overflow_time < 0:
                            msg += ", 该批次预计提前 {} 完成".format(
                                tools.format_seconds(-overflow_time))

                    log.info(msg)

        else:
            # insert the batch_date
            self.record_batch()

            # reset the task table state; this may run code that generates tasks
            self.init_task()

            return False
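
Most of check_batch's branching reduces to comparing now - batch_date against the batch interval expressed in days. A condensed sketch of that overdue calculation:

    import datetime

    def batch_overdue(batch_date, batch_interval_days):
        """Return (is_overdue, overdue_by) for a batch started at batch_date."""
        elapsed = datetime.datetime.now() - batch_date
        allowed = datetime.timedelta(days=batch_interval_days)
        return elapsed >= allowed, max(elapsed - allowed, datetime.timedelta(0))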
Code Example #21
def run():
    while True:
        redisdb = RedisDB()
        try:
            block_ip = redisdb.sget(setting.CAPTCHA_BLOCK_IP_REDIS_KEY)
            if not block_ip:
                log.debug("暂无被封ip")
            for ip in block_ip:
                task = redisdb.hget(setting.CAPTCHA_REDIS_KEY, ip, is_pop=True)
                task = eval(task)
                ua = task.get("ua")
                url = task.get("url")

                with WebDriver(proxy=ip, user_agent=ua) as browser:
                    log.info("解封ip {}, url {}".format(ip, url))
                    browser.get(url)
                    browser.implicitly_wait(5)
                    frame = browser.find_element_by_id("tcaptcha_iframe")
                    browser.switch_to.frame(frame)
                    for _ in range(20):
                        for _ in range(1000):
                            bg_url = browser.find_element_by_id(
                                "slideBg").get_attribute("src")
                            slide_url = browser.find_element_by_id(
                                "slideBlock").get_attribute("src")
                            if bg_url and slide_url:
                                break
                        else:
                            log.error("滑块加载失败")
                            return

                        bg_image = os.path.join(
                            CAPTCHA_PATH,
                            "bg_" + tools.get_md5(bg_url) + ".png")
                        slide_image = os.path.join(
                            CAPTCHA_PATH,
                            "slider_" + tools.get_md5(slide_url) + ".png")
                        if tools.download_file(
                                bg_url, bg_image) and tools.download_file(
                                    slide_url, slide_image):
                            # locate the center of the gap
                            x, y = get_gap_center_point(bg_image,
                                                        slide_image,
                                                        show=False)
                            # scale the x coordinate
                            x = x * 340 / 680
                            x = x - 27.5 - 30
                            # slide
                            slide_btn = browser.find_element_by_id(
                                "tcaptcha_drag_thumb")
                            tracks = track.get_tracks(x)
                            drag_and_drop(browser, slide_btn, tracks)
                            # delete the images
                            os.remove(bg_image)
                            os.remove(slide_image)

                            tools.delay_time(2)
                            if "verify.maoyan.com" not in browser.current_url:
                                log.info("解封成功")
                                break
                            else:
                                try:
                                    browser.find_element_by_css_selector(
                                        ".tc-action-icon").click()
                                except:
                                    pass
            tools.delay_time(3)
        except Exception as e:
            log.error(e)
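
Example #21 leans on Python's for/else: the else clause of the inner polling loop runs only when the loop exhausts without hitting break, i.e. when the slider images never load. The construct in isolation:

    def wait_until(condition, attempts=1000):
        for _ in range(attempts):
            if condition():
                break
        else:
            # Reached only when the loop finished without a break.
            raise TimeoutError("condition never became true")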
Code Example #22
File: item_buffer.py  Project: ybqdren/feapder
    def run(self):
        while not self._thread_stop:
            self.flush()
            tools.delay_time(0.5)

        self.close()
Code Example #23
File: request.py  Project: yufengsoft/feapder
    def get_response(self, save_cached=False):
        """
        Get a response with selector support
        @param save_cached: save to cache so debugging does not re-download every time
        @return:
        """
        # default timeout
        self.requests_kwargs.setdefault(
            "timeout", setting.REQUEST_TIMEOUT)  # connect=22 read=22

        # stream
        # By default the response body is downloaded as soon as the request returns. With stream=True the body download is deferred until Response.content is accessed; only the headers are fetched up front. Drawback: with stream=True, Requests cannot release the connection back to the pool until all the data is consumed or Response.close is called, which hurts connection reuse.
        self.requests_kwargs.setdefault("stream", True)

        # disable certificate verification
        self.requests_kwargs.setdefault("verify", False)

        # request method
        method = self.__dict__.get("method")
        if not method:
            if "data" in self.requests_kwargs or "json" in self.requests_kwargs:
                method = "POST"
            else:
                method = "GET"

        # random user-agent
        headers = self.requests_kwargs.get("headers", {})
        if "user-agent" not in headers and "User-Agent" not in headers:
            if self.render:  # when rendering, prefer the user agent configured in WEBDRIVER
                ua = setting.WEBDRIVER.get(
                    "user_agent") or self.__class__.user_agent_pool.get(
                        setting.USER_AGENT_TYPE)
            else:
                ua = self.__class__.user_agent_pool.get(
                    setting.USER_AGENT_TYPE)

            if self.random_user_agent and setting.RANDOM_HEADERS:
                headers.update({"User-Agent": ua})
                self.requests_kwargs.update(headers=headers)
            else:
                self.requests_kwargs.setdefault(
                    "headers", {"User-Agent": setting.DEFAULT_USERAGENT})

        # proxy
        proxies = self.requests_kwargs.get("proxies", -1)
        if proxies == -1 and setting.PROXY_ENABLE and setting.PROXY_EXTRACT_API:
            while True:
                proxies = self._proxies_pool.get()
                if proxies:
                    self.requests_kwargs.update(proxies=proxies)
                    break
                else:
                    log.debug("暂无可用代理 ...")

        log.debug("""
                -------------- %srequest for ----------------
                url  = %s
                method = %s
                body = %s
                """ % (
            "" if not self.parser_name else "%s.%s " % (
                self.parser_name,
                (self.callback and callable(self.callback) and getattr(
                    self.callback, "__name__") or self.callback) or "parse",
            ),
            self.url,
            method,
            self.requests_kwargs,
        ))

        # def hooks(response, *args, **kwargs):
        #     print(response.url)
        #
        # self.requests_kwargs.update(hooks={'response': hooks})

        use_session = (setting.USE_SESSION if self.use_session is None else
                       self.use_session)  # self.use_session takes precedence

        if self.render:
            # use the request's user_agent, cookies and proxy
            user_agent = headers.get("User-Agent") or headers.get("user-agent")
            cookies = self.requests_kwargs.get("cookies")
            if cookies and isinstance(cookies, RequestsCookieJar):
                cookies = cookies.get_dict()

            if not cookies:
                cookie_str = headers.get("Cookie") or headers.get("cookie")
                if cookie_str:
                    cookies = tools.get_cookies_from_str(cookie_str)

            proxy = None
            if proxies and proxies != -1:
                # str.strip removes a set of characters rather than a prefix, so split the scheme off instead
                raw = proxies.get("http") or proxies.get("https") or ""
                proxy = raw.split("://", 1)[-1] or None

            browser = self._webdriver_pool.get(user_agent=user_agent,
                                               proxy=proxy)

            url = self.url
            if self.requests_kwargs.get("params"):
                url = tools.joint_url(self.url,
                                      self.requests_kwargs.get("params"))

            try:
                browser.get(url)
                if cookies:
                    browser.cookies = cookies
                if self.render_time:
                    tools.delay_time(self.render_time)

                html = browser.page_source
                response = Response.from_dict({
                    "url": browser.current_url,
                    "cookies": browser.cookies,
                    "_content": html.encode(),
                    "status_code": 200,
                    "elapsed": 666,
                    "headers": {
                        "User-Agent":
                        browser.execute_script("return navigator.userAgent"),
                        "Cookie":
                        tools.cookies2str(browser.cookies),
                    },
                })

                response.browser = browser
            except Exception as e:
                self._webdriver_pool.remove(browser)
                raise e

        elif use_session:
            response = self._session.request(method, self.url,
                                             **self.requests_kwargs)
            response = Response(response)
        else:
            response = requests.request(method, self.url,
                                        **self.requests_kwargs)
            response = Response(response)

        if save_cached:
            self.save_cached(response,
                             expire_time=self.__class__.cached_expire_time)

        return response