コード例 #1
0
    def insert_one(self, item: ProxyDO):
        """Upsert a proxy record keyed by IP.

        Inserts *item* when no row with the same IP exists; otherwise
        copies its status fields onto the existing row. Arguments that
        are not ProxyDO instances are silently ignored.
        """
        if not isinstance(item, ProxyDO):
            return
        db_session = self.session()
        try:
            existing = db_session.query(ProxyDO).filter(
                ProxyDO.ip == item.ip).first()
            if existing is None:
                # Unknown IP: stage a fresh insert.
                db_session.add(item)
                db_session.flush()
            else:
                # Known IP: refresh its mutable status fields in place.
                for field in ('origin', 'update_time', 'failed_count',
                              'response_speed', 'validity'):
                    setattr(existing, field, getattr(item, field))
            db_session.commit()
        except Exception as exc:
            db_session.rollback()
            utils.log(exc)
            raise
        finally:
            db_session.close()
コード例 #2
0
    def check_ip_availability_task(self):
        """Periodic self-check of stored proxies (Mongo-backed version).

        Rate-limited via redis: if the previous run happened less than
        TASK_INTERVAL minutes ago, return immediately. Each stored proxy
        is probed with a small HTTP request; a failed probe deletes the
        record, a successful one refreshes its response time and validity.
        """
        last_check_time = self.redis_client.get(REDIS_KEY_LAST_CHECK_IP_TIME)
        now_time = datetime.utcnow().timestamp()
        if last_check_time is not None and (
                now_time - float(last_check_time)) < (TASK_INTERVAL * 60):
            return
        self.redis_client.set(REDIS_KEY_LAST_CHECK_IP_TIME, now_time)

        for proxy in self.collection.find():
            ip = proxy['ip']
            start_time = time.time()
            # NOTE(review): the proxy under test is never passed to
            # http_request — confirm utils.http_request applies it,
            # otherwise this measures a direct connection.
            try:
                response = utils.http_request('http://lwons.com/wx',
                                              timeout=10)
                is_success = response.status_code == 200
                response.close()
            except Exception as error:
                # A network error counts as a failed check instead of
                # aborting the whole sweep (previously uncaught).
                utils.log(error)
                is_success = False
            if not is_success:
                # Dead proxy: drop it. Best effort — log DB errors
                # instead of swallowing them silently.
                try:
                    self.collection.delete_one({'ip': ip})
                except Exception as error:
                    utils.log(error)
                utils.log('Check ip %s FAILED' % ip)
            else:
                elapsed = round(time.time() - start_time, 4)
                try:
                    self.collection.update_one({'ip': ip}, {
                        "$set": {
                            'update_time': utils.get_utc_time(),
                            'response_speed': elapsed,
                            'validity': True
                        }
                    })
                except Exception as error:
                    utils.log(error)
                utils.log('Check ip %s SUCCESS' % ip)
コード例 #3
0
    def check_ip_availability_task(self):
        """Periodic self-check of stored proxies (SQL-backed version).

        Rate-limited via redis: if the previous run happened less than
        TASK_INTERVAL minutes ago, return immediately. Each stored proxy
        is probed with a small HTTP request; a failed probe deletes the
        row, a successful one refreshes its response time and validity
        so fast proxies are preferred on the next fetch.
        """
        last_check_time = self.redis_client.get(REDIS_KEY_LAST_CHECK_IP_TIME)
        now_time = datetime.utcnow().timestamp()
        if last_check_time is not None and (
                now_time - float(last_check_time)) < (TASK_INTERVAL * 60):
            return
        self.redis_client.set(REDIS_KEY_LAST_CHECK_IP_TIME, now_time)

        for proxy in self.db.find_all():
            ip = proxy.ip
            start_time = time.time()
            # NOTE(review): the proxy under test is never passed to
            # http_request — confirm utils.http_request applies it,
            # otherwise this measures a direct connection.
            try:
                response = utils.http_request('http://www.baidu.com',
                                              timeout=10)
                is_success = response.status_code == 200
                response.close()
            except Exception as error:
                # A network error counts as a failed check instead of
                # aborting the whole sweep (previously uncaught).
                utils.log(error)
                is_success = False
            if not is_success:
                # Dead proxy: drop it. The DAO method is spelled
                # `detele_one` (was `delete_one`, which does not exist
                # on the SQL DAO — the AttributeError was silently
                # swallowed, so dead proxies were never removed).
                try:
                    self.db.detele_one(ip)
                except Exception as error:
                    utils.log(error)
                utils.log('Check ip %s FAILED' % ip)
            else:
                # Success: record the response time so the fastest
                # proxies are handed out first.
                elapsed = round(time.time() - start_time, 4)
                try:
                    proxy.update_time = utils.get_utc_time()
                    proxy.response_speed = elapsed
                    proxy.validity = 1
                    self.db.insert_one(proxy)
                except Exception as error:
                    utils.log(error)
                utils.log('Check ip %s SUCCESS' % ip)
コード例 #4
0
 def count(self):
     """Return the total number of ProxyDO rows."""
     db_session = self.session()
     try:
         return db_session.query(ProxyDO).count()
     except Exception as exc:
         utils.log(exc)
         raise
     finally:
         db_session.close()
コード例 #5
0
 def detele_all(self):
     """Delete every ProxyDO row.

     NOTE(review): the name is a misspelling of ``delete_all`` but is
     kept as-is — it is the public API callers already use.
     """
     db_session = self.session()
     try:
         db_session.query(ProxyDO).delete()
         db_session.commit()
     except Exception as exc:
         db_session.rollback()
         utils.log(exc)
         raise
     finally:
         db_session.close()
コード例 #6
0
 def detele_one(self, ip):
     """Delete the ProxyDO row matching *ip*, if any.

     NOTE(review): the name is a misspelling of ``delete_one`` but is
     kept as-is — it is the public API callers already use.
     """
     db_session = self.session()
     try:
         db_session.query(ProxyDO).filter(ProxyDO.ip == ip).delete()
         db_session.commit()
     except Exception as exc:
         db_session.rollback()
         utils.log(exc)
         raise
     finally:
         db_session.close()
コード例 #7
0
    def find_one(self, ip: str):
        """Return the ProxyDO row matching *ip*, or None."""
        db_session = self.session()
        try:
            record = db_session.query(ProxyDO).filter(
                ProxyDO.ip == ip).first()
            return record
        except Exception as exc:
            db_session.rollback()
            utils.log(exc)
            raise
        finally:
            db_session.close()
コード例 #8
0
def db_connect_engine():
    """Create the SQLAlchemy engine, creating the database and tables
    on first use.

    Connection parameters come from the DATABASES config dict. Errors
    during database/table creation are logged and swallowed so callers
    still receive an engine (connections may fail later instead).
    """
    utils.log('db_connect_engine')
    engine = create_engine(
        "%s://%s:%s@%s:%s/%s?charset=utf8mb4" %
        (DATABASES['DRIVER'], DATABASES['USER'], DATABASES['PASSWORD'],
         DATABASES['HOST'], DATABASES['PORT'], DATABASES['NAME']),
        echo=False)
    try:
        if not database_exists(engine.url):
            create_database(engine.url)  # create the database
        # create_all is idempotent, so run it unconditionally: the old
        # code only created tables when the database itself was missing,
        # leaving an existing-but-empty database without tables.
        Base.metadata.create_all(engine)
    except Exception as e:
        # Was `log.error(e)`; the rest of this module logs via utils.log.
        utils.log(e)
    return engine
コード例 #9
0
    def _probe(self, proxy, url, prefix, proxies):
        """Request *url* through *proxies* and record the outcome on
        *proxy* as ``<prefix>_validity`` / ``<prefix>_response_speed``
        (speed in ms, -1 on failure); bumps ``used_count`` on a reply.
        """
        start_time = time.time()
        try:
            response = utils.http_request(url, timeout=5, proxies=proxies)
            with lock:
                setattr(proxy, prefix + '_validity',
                        response.status_code == 200)
                proxy.used_count = proxy.used_count + 1
                setattr(proxy, prefix + '_response_speed',
                        round(time.time() - start_time, 4) * 1000)
            response.close()
        except KeyboardInterrupt:
            exit()
        except Exception:
            # Any request failure marks this direction invalid.
            with lock:
                setattr(proxy, prefix + '_validity', False)
                setattr(proxy, prefix + '_response_speed', -1)

    def _thread_check_ip(self, proxy):
        """Worker: probe *proxy* against an external and a domestic
        endpoint, then recompute its weight and persist the result.

        The two probe sections were identical except for the URL and
        attribute prefix, so they now share ``_probe``.
        """
        with lock:
            ip = proxy.ip
        proxy.last_use_time = utils.get_utc_date()
        proxies = {
            "http": "http://" + ip,
            "https": "http://" + ip,
        }
        # External (outside-the-wall) reachability.
        self._probe(proxy, 'https://google.com', 'external', proxies)
        # Domestic reachability.
        self._probe(proxy, 'https://www.baidu.com', 'internal', proxies)
        with lock:
            utils.log('Check IP:' + ip + ' finished i:' +
                      str(proxy.internal_validity) + ' e:' +
                      str(proxy.external_validity))
            self.calc_proxy_weight(proxy)
            self.session.commit()
コード例 #10
0
    def find_all(self):
        """Return all ProxyDO rows, best candidates first.

        Ordering: fewest failures, then valid before invalid, then
        fastest response, then most recently updated.
        """
        db_session = self.session()
        try:
            ordering = (ProxyDO.failed_count.asc(),
                        ProxyDO.validity.desc(),
                        ProxyDO.response_speed.asc(),
                        ProxyDO.update_time.desc())
            return db_session.query(ProxyDO).order_by(*ordering).all()
        except Exception as exc:
            db_session.rollback()
            utils.log(exc)
            raise
        finally:
            db_session.close()
コード例 #11
0
 def crawl_proxy_task(self, check_num: bool = True):
     """Crawl new proxies and store each IP not already present.

     When *check_num* is True the crawl is skipped while the stored
     count exceeds MIN_PROXY_COUNT.
     """
     if check_num and self.collection.count() > MIN_PROXY_COUNT:
         return
     utils.log("开始抓取代理")
     crawled = proxy_strategy.crawl_proxy()
     utils.log("开始保存")
     for item in crawled:
         if self.collection.find_one({'ip': item.ip}):
             continue  # already stored
         self.collection.insert_one(item.__dict__)
         utils.log('保存了:' + item.ip)
     utils.log("保存结束")
コード例 #12
0
 def crawl_proxy_task(self, check_num: bool = True):
     """Crawl new proxies and store each IP not already present
     (SQL-backed version).

     When *check_num* is True the crawl is skipped while the stored
     count exceeds MIN_PROXY_COUNT.
     """
     if check_num and self.db.count() > MIN_PROXY_COUNT:
         return
     utils.log("开始抓取代理")
     crawled = proxy_strategy.crawl_proxy()
     utils.log("开始保存")
     for item in crawled:
         if self.db.find_one(item.ip):
             continue  # already stored
         self.db.insert_one(self.db.convert(item))
         utils.log('保存了:' + item.ip)
     utils.log("保存结束")
コード例 #13
0
 def add_failed_time(self, ip):
     """Record a failed use of *ip* (Mongo-backed version).

     Increments the proxy's failure counter; once it exceeds
     FAILED_COUNT_BORDER the proxy is deleted. Finally tops up the
     pool if it has run low.
     """
     proxy = self.collection.find_one({'ip': ip})
     if proxy is not None:
         failed_count = proxy['failed_count'] + 1
         utils.log("ip: %s 失败次数+1 已失败次数%s次" % (ip, failed_count))
         if failed_count <= FAILED_COUNT_BORDER:
             # Below the limit: persist the incremented counter.
             # Best effort — log DB errors instead of a bare
             # `except: pass` that hid every failure.
             try:
                 self.collection.update_one({'ip': ip}, {
                     "$set": {
                         'update_time': utils.get_utc_time(),
                         'failed_count': failed_count
                     }
                 })
             except Exception as error:
                 utils.log(error)
         else:
             # Over the limit: the proxy is considered dead.
             try:
                 self.collection.delete_one({'ip': ip})
             except Exception as error:
                 utils.log(error)
     # Refill the pool if it dropped below the minimum.
     self.crawl_proxy_task()
コード例 #14
0
 def add_failed_time(self, ip):
     """Record a failed use of *ip* (SQL-backed version).

     Increments the proxy's failure counter; once it exceeds
     FAILED_COUNT_BORDER the proxy is deleted. Finally tops up the
     pool if it has run low.
     """
     proxy = self.db.find_one(ip)
     if proxy is not None:
         failed_count = proxy.failed_count + 1
         utils.log("ip: %s 失败次数+1 已失败次数%s次" % (ip, failed_count))
         if failed_count <= FAILED_COUNT_BORDER:
             # Below the limit: persist the incremented counter.
             # Best effort — log DB errors instead of a bare
             # `except: pass` that hid every failure.
             try:
                 proxy.update_time = utils.get_utc_time()
                 proxy.failed_count = failed_count
                 self.db.insert_one(proxy)
             except Exception as error:
                 utils.log(error)
         else:
             # Over the limit: the proxy is considered dead.
             try:
                 self.db.detele_one(ip)
             except Exception as error:
                 utils.log(error)
     # Refill the pool if it dropped below the minimum.
     self.crawl_proxy_task()
コード例 #15
0
 def check_ip_availability_task(self, time):
     """Re-check every proxy not used within the last *time* units
     (unit defined by utils.get_utc_date — presumably hours; confirm).

     NOTE(review): the parameter shadows the stdlib ``time`` module;
     renaming would break keyword callers, so it is kept.
     """
     cutoff = utils.get_utc_date(-time)
     stale = self.session.query(Proxy).filter(
         Proxy.last_use_time < cutoff).all()
     utils.log('Start check count:' + str(len(stale)))
     self._check_ip_availability_task(stale)