Ejemplo n.º 1
0
class ProxyCheck(Utility):
    """Maintenance pass over the ``PROXY_VALIDATED`` sorted set.

    ``check_valid`` is the entry point: it resets scores, re-validates the
    set and finally checks whether enough proxies remain.
    """

    def __init__(self):
        self.redis = RedisClient()
        self.valid = IpValidation()

    def check_num(self):
        """Call ``run()`` when too few proxies hold exactly ``VALIDATED_SCORE``."""
        valid_num = self.redis.count(VALIDATED_SCORE,
                                     VALIDATED_SCORE,
                                     name=PROXY_VALIDATED)
        if valid_num < VALIDATED_PROXY_NUM:
            run()

    def init_score(self):
        """Reset every proxy scoring at least ``DISCARD_SCORE`` to ``INITIAL_SCORE``.

        The earlier implementation fetched one batch of 1000 members and then
        added 1000 to the *score* lower bound as if it were a page offset, so
        the loop ended after the first batch and larger pools were only
        partially reset.  The whole score range is now read in one call and
        rewritten in one ``ZADD``.
        """
        # NOTE(review): assumes the wrapper's default key for
        # get_proxy_by_score()/count() was PROXY_VALIDATED - confirm.
        members = self.redis.db.zrangebyscore(PROXY_VALIDATED,
                                              DISCARD_SCORE, '+inf')
        # redis-py rejects an empty mapping, so skip the write when the
        # range is empty.
        if members:
            self.redis.db.zadd(PROXY_VALIDATED,
                               {ip: INITIAL_SCORE for ip in members})
        logger.info('initiation finished')

    def check_valid(self):
        """Run one maintenance cycle: reset scores, validate, check the count."""
        settings.SPIDER_RUNNING = False
        self.init_score()
        self.valid.run_validation(key=PROXY_VALIDATED)
        self.check_num()
Ejemplo n.º 2
0
 def exists_proxy(self):
     """
     Check whether the proxy exists
     :return: bool
     """
     # A fresh client is created (and kept on the instance) for every call.
     client = RedisClient()
     self.redis = client
     return client.exists(CLIENT_NAME)
Ejemplo n.º 3
0
class GetProxy:
    """Serves proxies out of the ``PROXY_FOR_USE`` sorted set."""

    def __init__(self):
        self.redis = RedisClient()

    def clear_old_key(self):
        """Empty ``PROXY_FOR_USE`` when it currently holds any member."""
        if self.redis.count('-inf', '+inf', name=PROXY_FOR_USE) > 0:
            # Rank range 0..-1 covers the whole sorted set.
            self.redis.db.zremrangebyrank(PROXY_FOR_USE, 0, -1)

    def init_redis_key(self):
        """Rebuild ``PROXY_FOR_USE`` as a copy of ``PROXY_VALIDATED``.

        The copy is only made when the client's default-key count over the
        full score range is positive.
        """
        self.clear_old_key()
        if self.redis.count('-inf', '+inf') > 0:
            self.redis.db.zunionstore(PROXY_FOR_USE, [PROXY_VALIDATED])

    def get_proxy(self):
        """Return one proxy from ``PROXY_FOR_USE``.

        The stored entry is split on ``'-'`` and its second field is returned.

        :raises Exception: when ``PROXY_FOR_USE`` holds no entries
        """
        lowest, highest = '-inf', '+inf'
        available = self.redis.count(lowest, highest, name=PROXY_FOR_USE)
        if not available > 0:
            raise Exception('no proxy to use')
        candidates = self.redis.get_proxy_by_score(
            lowest, highest, 1, key=PROXY_FOR_USE)
        return candidates[0].split('-')[1]
Ejemplo n.º 4
0
class Test_ip(object):
    """Probes every stored proxy and adjusts its score in Redis."""

    def __init__(self, timeout=10):
        """
        :param timeout: seconds to wait for the test request; without a
            timeout a dead proxy can block its tester thread forever
        """
        self.db = RedisClient()
        self.headers = headers
        self.url = test_url
        self.timeout = timeout

    def get_url(self, proxy):
        """Return True when the test URL answers 200 through ``proxy``.

        :param proxy: ``proxies`` mapping in the form ``requests`` expects
        """
        try:
            con = requests.get(self.url, headers=self.headers,
                               proxies=proxy, timeout=self.timeout)
        except Exception:
            # Any failure means the proxy is unusable.  Unlike the former
            # bare ``except:``, this lets KeyboardInterrupt/SystemExit through.
            return False
        return con.status_code == 200

    def test(self, ip):
        """Test one proxy and record the outcome.

        :param ip: proxy address as UTF-8 encoded bytes
        """
        ip = ip.decode('utf-8')
        proxy = {'http': 'http://' + ip}
        if self.get_url(proxy):
            self.db.max(ip)
        else:
            self.db.decrease(ip)

    def run(self):
        """Start one daemon tester thread per stored proxy, throttled."""
        proxies = self.db.all()
        for i, ip in enumerate(proxies):
            t = threading.Thread(target=self.test, args=(ip,), daemon=True)
            t.start()
            random_time()
            # Extra pause on every 100th index (including the first one).
            if i % 100 == 0:
                time.sleep(5)
Ejemplo n.º 5
0
 def __init__(self, site, accounts_pool_size, single_cycle_limit):
     """Store the limits and open the account store for ``site``.

     :param site: site identifier, passed to the Redis client
     :param accounts_pool_size: upper limit on the account pool size
     :param single_cycle_limit: upper limit on registrations per cycle
     """
     self.site = site
     self.accounts_pool_size = accounts_pool_size  # pool size cap
     self.single_cycle_limit = single_cycle_limit  # per-cycle registration cap
     self.accounts_db = RedisClient('accounts', self.site)
Ejemplo n.º 6
0
class Getter():
    """Fills the proxy pool by running each registered crawl function."""

    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """
        Tell whether the proxy pool has reached its size limit
        :return: True when the stored count is at least POOL_UPPER_THRESHOLD
        """
        return self.redis.count() >= POOL_UPPER_THRESHOLD

    def run(self):
        """Crawl with every registered function and store the results.

        Nothing is crawled when the pool is already at its limit.
        """
        print("获取器开始执行")
        if self.is_over_threshold():
            return
        for index in range(self.crawler.__CrawlFuncCount__):
            crawl_func = self.crawler.__CrawlFunc__[index]
            # Fetch the proxies produced by this crawl function
            found = self.crawler.get_proxies(crawl_func)
            sys.stdout.flush()
            for item in found:
                self.redis.add(item)
Ejemplo n.º 7
0
class Getter():
    """Adds freshly crawled proxies to the pool while it is below its limit."""

    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """
        Tell whether the proxy pool limit has been reached
        """
        return self.redis.count() >= POOL_UPPER_THRESHOLD

    def run(self):
        """Fetch proxies from the crawler in one call and store each of them."""
        print('开始向代理池中添加代理')
        if self.is_over_threshold():
            return
        fetched = self.crawler.get_proxies()
        sys.stdout.flush()
        for item in fetched:
            self.redis.add(item)
Ejemplo n.º 8
0
class PoolGetter(object):
    """Crawls proxies and stores only those that pass a validity test."""

    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """
        Tell whether the proxy pool limit has been reached
        """
        return self.redis.count() >= POOL_UPPER_THRESHOLD

    def test_proxy_add(self, proxy):
        """Test ``proxy`` and add it to Redis when it is usable."""
        if not test_proxy_vaild(proxy):
            return
        print(Fore.GREEN + '成功获取到代理', proxy)
        self.redis.add(proxy)

    def run(self):
        """Run every crawl function and validate its proxies on a thread pool.

        Nothing is crawled when the pool is already at its limit.
        """
        print("[-] 代理池获取器开始执行......")
        if self.is_over_threshold():
            return
        for index in range(self.crawler.__CrawlFuncCount__):
            crawl_func = self.crawler.__CrawlFunc__[index]
            # Fetch the proxies
            candidates = self.crawler.get_proxies(crawl_func)
            # Flush pending output
            sys.stdout.flush()
            # Leaving the ``with`` block waits for all submitted checks.
            with ThreadPoolExecutor(ThreadCount) as executor:
                executor.map(self.test_proxy_add, candidates)
Ejemplo n.º 9
0
def proxy():
    """Return a random stored IP joined with ``config.PORT``.

    Responds with status 200 and ``ip:port`` when an IP is available,
    otherwise with status 400 and an "empty pool" message.
    """
    ip = RedisClient().random()
    if not ip:
        return Response(response='代理池为空', status=400)
    return Response(response=ip + ':' + str(config.PORT), status=200)
Ejemplo n.º 10
0
 def __init__(self, website='default'):
     """
     Parent class; initialises the shared store objects
     :param website: site name
     """
     self.website = website
     # One store per record type, both scoped to the same site.
     self.cookies_db = RedisClient('cookies', self.website)
     self.accounts_db = RedisClient('accounts', self.website)
Ejemplo n.º 11
0
def get_cookies_from_db(website):
    """
    Convenience helper that fetches random cookies straight from the database
    :param website: site name, all lowercase
    :return: random cookies for the site, as str; convert to a dict or CookieJar before use
    """
    return RedisClient('cookies', website).random()
Ejemplo n.º 12
0
 def _redis_init(self):
     """(Re)create ``self.redis``, closing any existing client first.

     If closing or creating raises ``redis.ConnectionError``, a new client is
     created once more and a warning is logged.
     """
     try:
         current = getattr(self, 'redis', None)
         if current:
             current.close()
         self.redis = RedisClient()
     except redis.ConnectionError:
         self.redis = RedisClient()
         logger.warning("redis ConnectionError")
Ejemplo n.º 13
0
 def __init__(self):
     """Set up the spider's signal hook, Redis client, URL template and headers."""
     super(nanjingLandDetailSpider, self).__init__()
     # Have CloseSpider called when the spider_closed signal fires.
     dispatcher.connect(self.CloseSpider, signals.spider_closed)
     self.redisClient = RedisClient('nanjing', 'LandDetail')
     self.duplicateUrl = 0
     # ``{}`` is the page-number placeholder.
     self.targetUrl = 'https://jy.landnj.cn/default.aspx?page={}'
     self.header = {'User-Agent': random.choice(agent_list)}
     # Raw string: same runtime value as before, but no invalid-escape
     # SyntaxWarning for sequences such as \w, \( or \- on Python 3.12+.
     # Presumably spliced into a regex character class - confirm at the use site.
     self.reStr = r'()\w\.:: 。 \(\)〔〕㎡㎡≤;,≥《》\-\/\%,、\.﹪㎡'
Ejemplo n.º 14
0
 def __init__(self):
     """Prepare the Redis client and the validation settings."""
     self.redis = RedisClient()
     self.real_ip = ''
     # Points deducted for every failed validation: the span between the
     # initial and the discard score, spread over VALIDATE_TIME (floor division).
     score_span = INITIAL_SCORE - DISCARD_SCORE
     self.minus_every_time = score_span // VALIDATE_TIME
     self.key = PROXY_ORIGINAL
     self.anon_check_url = 'http://httpbin.org/ip'
Ejemplo n.º 15
0
 def remove_proxy(self):
     """
     Remove the proxy
     :return: None
     """
     # A fresh client is created (and kept on the instance) for every call.
     client = RedisClient()
     self.redis = client
     client.remove(CLIENT_NAME)
     print('Successfully Removed Proxy')
Ejemplo n.º 16
0
 def __init__(self):
     """Set up the spider's signal hook, Redis client, URL template and headers."""
     super(shanxiTransformNoticeSpider, self).__init__()
     # Have CloseSpider called when the spider_closed signal fires.
     dispatcher.connect(self.CloseSpider, signals.spider_closed)
     # TODO
     self.redisClient = RedisClient('shanxi', 'shanxiTransformResult')
     self.duplicateUrl = 0
     # ``{}`` is the page-index placeholder.
     self.targetUrl = 'http://zrzyt.shanxi.gov.cn/zwgk/zwgkjbml/tdgl_836/crjg/index_{}.shtml'
     self.header = {'User-Agent': random.choice(agent_list)}
     # Raw string: same runtime value as before, but no invalid-escape
     # SyntaxWarning for sequences such as \w, \( or \- on Python 3.12+.
     # Presumably spliced into a regex character class - confirm at the use site.
     self.reStr = r'()\w\.:: 。 \(\)〔〕㎡㎡≤;,≥《》\-\/\%,、\.﹪㎡'
Ejemplo n.º 17
0
 def __init__(self):
     """Create the Redis client and choose the ``cache_peer`` line template."""
     self.server = RedisClient()
     plain_template = "cache_peer %s parent %s 0 weighted-round-robin weight=2\n"
     # Password and password-less setups are meant to use different templates.
     # NOTE: the credential-bearing variant is disabled, so both branches
     # currently yield the same template. Disabled variant, kept for reference:
     # self.peer_conf = "cache_peer %s parent %s 0 no-query proxy-only login={}:{} never_direct allow all round-robin weight=1 connect-fail-limit=2 allow-miss max-conn=5\n".format(
     #     settings.USERNAME, settings.PASSWORD)
     if settings.USE_PASSWORD:
         self.peer_conf = plain_template
     else:
         self.peer_conf = plain_template
Ejemplo n.º 18
0
 def save_accounts():
     """
     Record the accounts and passwords of every site listed in ACCOUNTS
     :return:
     """
     for site_name, site_accounts in ACCOUNTS.items():
         # One store per site.
         RedisClient('accounts', site_name).set_many(site_accounts)
         print('%s的所有账号已保存成功...' % site_name)
Ejemplo n.º 19
0
 def set_proxy(self, proxy):
     """
     Set the proxy
     :param proxy: proxy
     :return: None
     """
     client = RedisClient()
     self.redis = client
     stored = client.set(CLIENT_NAME, proxy)
     # Only report success when the store call returned a truthy result.
     if stored:
         print('Successfully Set Proxy', proxy)
Ejemplo n.º 20
0
def record(key):
    """Store the caller's remote address under ``key``.

    :param key: must be one of ``config.KEYS``
    :return: a confirmation message, or ``'Invalid Key'`` for an unknown key
    """
    if key not in config.KEYS:
        return 'Invalid Key'
    ip = request.remote_addr
    print(ip)
    RedisClient().put(key, ip)
    return 'Successfully saved: {}'.format(ip)
Ejemplo n.º 21
0
 def set_proxy(self, proxy):
     """
     Set the proxy
     :param proxy: proxy
     :return: None
     """
     client = RedisClient()
     self.redis = client
     stored = client.set(self.CLIENT_NAME, proxy)
     # Only log success when the store call returned a truthy result.
     if stored:
         logger.info(f'Successfully set proxy {proxy}')
Ejemplo n.º 22
0
 def __init__(self):
     """Set up the spider's signal hook, Redis client, URL template and headers."""
     super(longyanTransformNoticeSpider, self).__init__()
     # Have CloseSpider called when the spider_closed signal fires.
     dispatcher.connect(self.CloseSpider, signals.spider_closed)
     # TODO
     self.redisClient = RedisClient('longyan', 'longyanTransformNotice')
     self.duplicateUrl = 0
     # TODO
     # ``{}`` is the page-number placeholder.
     self.targetUrl = 'https://www.lyggzy.com.cn/lyztb/tdky/084002/?pageing={}'
     self.header = {'User-Agent': random.choice(agent_list)}
     # Raw string: same runtime value as before, but no invalid-escape
     # SyntaxWarning for sequences such as \w, \( or \- on Python 3.12+.
     # Presumably spliced into a regex character class - confirm at the use site.
     self.reStr = r'()\w\.:: 。 \(\)〔〕㎡㎡≤;,≥《》\-\/\%,、\.﹪㎡'
Ejemplo n.º 23
0
def main():
    """Print the length of the ``employment`` queue and run the scheduler."""
    scheduler = Scheduler()
    print('程序开始运行。。')
    queue_length = RedisClient().llen('employment')
    print('redis队列长度:' + str(queue_length))
    # NOTE(review): if llen() mirrors Redis LLEN it never returns a negative
    # number, so this guard always passes - confirm whether "> 0" was intended.
    if not queue_length >= 0:
        return
    scheduler.run()
Ejemplo n.º 24
0
 def __init__(self, website='default'):
     """
     Parent class; initialises the shared store objects and the browser
     :param website: site name
     """
     self.website = website
     # One store per record type, both scoped to the same site.
     self.cookies_db = RedisClient('cookies', self.website)
     self.accounts_db = RedisClient('accounts', self.website)
     # Browser setup runs last, after both stores exist.
     self.init_browser()
Ejemplo n.º 25
0
 def __init__(self):
     """Set up the spider's signal hook, Redis client, URL template and headers."""
     super(zhengzhouLandTransformNoticeSpider, self).__init__()
     # Have CloseSpider called when the spider_closed signal fires.
     dispatcher.connect(self.CloseSpider, signals.spider_closed)
     # TODO
     self.redisClient = RedisClient('zhengzhou',
                                    'zhengzhouLandTransformNotice')
     self.duplicateUrl = 0
     # ``{}`` is the page-index placeholder.
     self.targetUrl = 'http://zzland.zhengzhou.gov.cn/xycrgg/index_{}.jhtml'
     self.header = {'User-Agent': random.choice(agent_list)}
     # Raw string: same runtime value as before, but no invalid-escape
     # SyntaxWarning for sequences such as \w, \( or \- on Python 3.12+.
     # Presumably spliced into a regex character class - confirm at the use site.
     self.reStr = r'()\w\.:: 。 \(\)〔〕㎡㎡≤;,≥《》\-\/\%,、\.﹪㎡'
Ejemplo n.º 26
0
 def __init__(self):
     """Set up the spider's signal hook, Redis client, URL template and headers."""
     super(hefeiLandSupplySpider, self).__init__()
     # Have CloseSpider called when the spider_closed signal fires.
     dispatcher.connect(self.CloseSpider, signals.spider_closed)
     # TODO
     self.redisClient = RedisClient('hefei', 'hefeiLandSupply')
     self.duplicateUrl = 0
     # TODO
     # ``{}`` is the page-number placeholder.
     self.targetUrl = 'http://ggzy.hefei.gov.cn/hftd/tdgy/?Paging={}'
     self.header = {'User-Agent': random.choice(agent_list)}
     # Raw string: same runtime value as before, but no invalid-escape
     # SyntaxWarning for sequences such as \w, \( or \- on Python 3.12+.
     # Presumably spliced into a regex character class - confirm at the use site.
     self.reStr = r'()\w\.:: 。 \(\)〔〕㎡㎡≤;,≥《》\-\/\%,、\.﹪㎡'
Ejemplo n.º 27
0
class Checker(object):
    """Repeatedly tests stored proxies and removes those that keep failing."""

    def __init__(self):
        self.db = RedisClient()
        # Number of failed checks seen so far, keyed by proxy address.
        self.counts = defaultdict(int)

    def check(self, proxy):
        """
        Test a proxy and return the result
        :param proxy: proxy address, used for both the http and https schemes
        :return: True when settings.TEST_URL answers 200 through the proxy, else False
        """
        proxy_map = {
            'http': 'http://' + proxy,
            'https': 'https://' + proxy
        }
        # Logged before the request so that failing attempts are visible too.
        logger.debug(f'Using {proxy} to test {settings.TEST_URL}...')
        try:
            response = requests.get(settings.TEST_URL,
                                    proxies=proxy_map,
                                    timeout=settings.TEST_TIMEOUT)
        except (requests.exceptions.RequestException, ConnectionError,
                ReadTimeout):
            # Any request failure marks the proxy as bad instead of aborting
            # the whole round; the originally caught names are kept as well.
            return False
        return response.status_code == 200

    def run(self):
        """
        Run one round of tests over every stored proxy
        :return:
        """
        proxies = self.db.all()
        logger.info(f'Try to get all proxies {proxies}')
        for name, proxy in proxies.items():
            if self.check(proxy):
                logger.info(f'Proxy {proxy} valid')
            else:
                logger.info(f'Proxy {proxy} invalid')
                self.counts[proxy] += 1
            count = self.counts.get(proxy, 0)
            logger.debug(
                f'Count {count}, TEST_MAX_ERROR_COUNT {settings.TEST_MAX_ERROR_COUNT}'
            )
            if count >= settings.TEST_MAX_ERROR_COUNT:
                self.db.remove(name)
                # Drop the tally with the entry; otherwise the same address,
                # once registered again, would be removed on its first round
                # even if it tested valid.
                self.counts.pop(proxy, None)

    def loop(self):
        """
        Test in an endless loop, sleeping settings.TEST_CYCLE seconds between rounds
        :return:
        """
        while True:
            logger.info('Check for infinite')
            self.run()
            logger.info(f'Tested, sleeping for {settings.TEST_CYCLE}s...')
            time.sleep(settings.TEST_CYCLE)
Ejemplo n.º 28
0
def delete_account(site, type, username):
    """Delete ``username`` from the ``type``/``site`` store, retrying up to 5 times.

    Prints a success message and returns as soon as a delete call reports
    success.  The failure message is printed only when all five attempts
    failed; it used to be printed after a successful delete as well.

    :param site: site identifier passed to the Redis client
    :param type: record type passed to the Redis client
    :param username: key to delete
    """
    conn = RedisClient(type, site)
    for _ in range(5):
        if conn.delete(username):
            print('删除 {} 成功! '.format(username))
            return
        # Short pause before the next attempt.
        time.sleep(1)
    print('删除失败, 请手动删除! ')
Ejemplo n.º 29
0
class Getter(object):
    """Stores every IP returned by the crawler in Redis."""

    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def run(self):
        """Run the crawler once and add each returned IP to the store."""
        print("开始抓取代理ip")

        for address in self.crawler.run():
            self.redis.add(address)
Ejemplo n.º 30
0
def set_account(site, type, account, sep=' '):
    """Store one account in the ``type``/``site`` store, retrying up to 5 times.

    :param site: site identifier passed to the Redis client
    :param type: record type passed to the Redis client
    :param account: ``username`` and value joined by ``sep``; must split into
        exactly two parts
    :param sep: separator between username and value
    """
    conn = RedisClient(type, site)
    username, value = account.split(sep)
    for _attempt in range(5):
        if conn.set(username, value):
            print('{}--{} 录入成功! '.format(username, value))
            return
        # Short pause before the next attempt.
        time.sleep(1)
    print('录入失败, 请检查 redis 内存是否已满, 尝试手动录入! ')
Ejemplo n.º 31
0
class Getter:
    """Fills the proxy pool by running each registered crawl function."""

    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Return True when the stored count is at least POOL_UPPER_THRESHOLD."""
        return self.redis.count() >= POOL_UPPER_THRESHOLD

    def run(self):
        """Crawl with every registered function unless the pool is full."""
        print('获取器开始执行')
        if self.is_over_threshold():
            return
        for position in range(self.crawler.__CrawlFuncCount__):
            crawl_func = self.crawler.__CrawlFunc__[position]
            harvested = self.crawler.get_proxies(crawl_func)
            sys.stdout.flush()
            for entry in harvested:
                self.redis.add(entry)
Ejemplo n.º 32
0
 def __init__(self, font_key, url=None, imgSize=(0, 0), imgMode='RGB',
              bg_color=(0, 0, 0), fg_color=(255, 255, 255), fontsize=30):
     """Store the rendering options, load the font and create the clients.

     :param font_key: font identifier; also used to build the default URL
     :param url: font URL; falls back to ``self.make_url`` when falsy
     :param imgSize: image size, ``(0, 0)`` by default
     :param imgMode: image mode
     :param bg_color: background colour
     :param fg_color: text colour
     :param fontsize: font size passed on when the font is loaded
     """
     # font_key goes first: the make_url fallback below reads it.
     self.font_key = font_key
     self.fontsize = fontsize
     self.imgMode = imgMode
     self.imgSize = imgSize
     self.fg_color = fg_color
     self.bg_color = bg_color
     self.url = url or self.make_url
     # Needs self.url to be set already.
     self.get_ttl()
     self.client = AipClient(APP_ID, API_KEY, SECRET_KEY, REDIS_URL)
     self.r = RedisClient(REDIS_URL)
Ejemplo n.º 33
0
 def verify_cookie(cls):
     """Return an instance built around a stored cookie that weibo.cn accepts.

     When no cookie is stored, a login is performed first and verification
     starts over.  A cookie answered with a non-200 status is penalised via
     ``add_score`` and verification starts over as well.

     The cookie is read once per attempt, so the value that is tested is the
     one that is returned or penalised.  The retry returns
     ``cls.verify_cookie()`` directly; it used to pass that already-built
     instance as ``cookie=`` to a second constructor call.

     :return: an instance of ``cls``, or None when the check itself raised
     """
     baseurl = 'https://weibo.cn/'
     conn = RedisClient()
     cookie = conn.get()
     if not cookie:
         # Nothing stored yet: log in to obtain a cookie, then start over.
         Login().save_cookies()
         return cls.verify_cookie()
     try:
         response = requests.get(baseurl, cookies=cookie)
         if response.status_code == 200:
             return cls(cookie=cookie)
         conn.add_score(cookie)
     except Exception:
         print('verify error')
         return None
     return cls.verify_cookie()
Ejemplo n.º 34
0
 def save_cookies(self):
     """Log in, then store ``self._cookie`` through a new Redis client."""
     self.login()
     RedisClient().add(self._cookie)
Ejemplo n.º 35
0
 def __init__(self):
     """Create the Redis client and the crawler this object works with."""
     self.redis = RedisClient()
     self.crawler = Crawler()
Ejemplo n.º 36
0
 def __init__(self, appid, api_key, secrrt_key, redis_url):
     """Keep the credentials and build the OCR and Redis clients.

     :param appid: application id passed to ``AipOcr``
     :param api_key: API key passed to ``AipOcr``
     :param secrrt_key: secret key passed to ``AipOcr``
     :param redis_url: passed to ``RedisClient``
     """
     self.appid, self.api_key, self.secrrt_key = appid, api_key, secrrt_key
     self.client = AipOcr(appid, api_key, secrrt_key)
     self.redis = RedisClient(redis_url)
Ejemplo n.º 37
0
class TycTTF():
    """Decodes text rendered with an obfuscating web font.

    Each character covered by the font is rendered to an image and recognised
    through the OCR client; recognised characters are looked up in a Redis
    hash keyed by ``font_key`` first.
    """
    # Instances already created, keyed by the first constructor argument.
    _instance = {}

    def __init__(self, font_key, url=None, imgSize=(0, 0), imgMode='RGB',
                 bg_color=(0, 0, 0), fg_color=(255, 255, 255), fontsize=30):
        """
        :param font_key: font identifier; also the Redis hash name for results
        :param url: font URL; falls back to ``make_url`` when falsy
        :param imgSize: image size; ``(0, 0)`` means "derive from the text"
        :param imgMode: image mode
        :param bg_color: background colour
        :param fg_color: text colour
        :param fontsize: font size used when loading the font
        """
        self.imgSize = imgSize
        self.imgMode = imgMode
        self.fontsize = fontsize
        self.bg_color = bg_color
        self.fg_color = fg_color
        self.font_key = font_key
        self.url = url or self.make_url
        self.get_ttl()
        self.client = AipClient(APP_ID, API_KEY, SECRET_KEY, REDIS_URL)
        self.r = RedisClient(REDIS_URL)

    def __new__(cls, url, *args, **kw):
        '''
        Pseudo-singleton used as a cache: one instance per first argument.

        The parameter is named ``url``, but it receives the first positional
        constructor argument, which ``__init__`` calls ``font_key``.
        ``__init__`` still runs on every construction, cached or not.
        '''
        if url not in cls._instance:
            cls._instance[url] = super().__new__(cls)
        return cls._instance[url]

    @property
    def make_url(self):
        """Default font URL derived from ``font_key``."""
        return 'https://static.tianyancha.com/fonts-styles/fonts/%s/%s/tyc-num.woff' % (self.font_key[:2], self.font_key)

    def get_ttl(self):
        """Download the font and collect the characters it covers."""
        res = requests.get(self.url)
        # PIL font object
        self.font = ImageFont.truetype(BytesIO(res.content), self.fontsize)
        # TTF font object
        self.ttf = TTFont(BytesIO(res.content))
        # Every character the font maps.  chr() yields the same characters as
        # the former hex -> "\x.."/"\u...." escape round-trip for 2- and
        # 4-digit code points and is also correct for all other code points.
        self.strings = {chr(code_point) for code_point in self.ttf.getBestCmap()}

    def GenLetterImage(self, letters: str):
        """Render ``letters`` centred on a new image stored in ``self.img``."""
        self.letters = letters
        # NOTE(review): ImageFont.getsize was removed in Pillow 10 - confirm
        # the Pillow version this project pins.
        (self.letterWidth, self.letterHeight) = self.font.getsize(letters)
        if self.imgSize == (0, 0):
            # Text size plus 10 pixels in each dimension.  The size is stored
            # on the instance, so later calls reuse it.
            self.imgSize = (self.letterWidth + 10, self.letterHeight + 10)
        self.imgWidth, self.imgHeight = self.imgSize
        # New blank image
        self.img = Image.new(self.imgMode, self.imgSize, self.bg_color)
        # Drawing handle
        self.drawBrush = ImageDraw.Draw(self.img)
        textY0 = int((self.imgHeight - self.letterHeight + 1) / 2)
        textX0 = int((self.imgWidth - self.letterWidth + 1) / 2)
        # Draw the text with the loaded font onto the blank image
        self.drawBrush.text((textX0, textY0), self.letters, fill=self.fg_color, font=self.font)

    def _orc(self, word: str):
        """Render ``word`` and recognise it through the OCR client."""
        self.GenLetterImage(word)
        # Image byte container
        img = ImageBytes()
        # Write the rendered image into the container as JPEG
        self.img.save(img, 'JPEG')
        if word in {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'x'}:
            # Digits are recognised as English
            kwarg = {'language_type': 'ENG'}
        else:
            # Everything else as mixed Chinese/English
            kwarg = {'language_type': 'CHN_ENG'}
        return self.client.run(img.img, self.font_key, word, **kwarg)

    def orc(self, word: str):
        """Return the recognised text for ``word``, using the Redis hash first.

        The hash is probed under ``font_key``, the same name the value is read
        from (and that the OCR client is given for storing results).  The
        probe used to use ``self.url``, a different name from the one read.
        """
        if self.r.hexists(self.font_key, word):
            return self.r.hget(self.font_key, word).decode('utf-8')
        return self._orc(word)

    def run(self, word: str):
        """Decode ``word``: font-covered characters are replaced by their
        recognised text, all other characters are kept as they are."""
        return ''.join(
            self.orc(letter) if letter in self.strings else letter
            for letter in word
        )
Ejemplo n.º 38
0
class AipClient(object):
    '''
    Baidu recognition API wrapper with a Redis-backed result cache
    '''
    def __init__(self, appid, api_key, secrrt_key, redis_url):
        self.appid = appid
        self.api_key = api_key
        self.secrrt_key = secrrt_key
        self.client = AipOcr(appid, api_key, secrrt_key)
        self.redis = RedisClient(redis_url)

    def __new__(cls, *args, **kw):
        '''
        Singleton: every construction returns the same instance.
        ``__init__`` still runs on each construction.
        '''
        if not hasattr(cls, '_instance'):
            cls._instance = super().__new__(cls)
        return cls._instance

    @property
    def options(self):
        """Default request options; a new dict on every access."""
        return {"language_type": "CHN_ENG",
                "detect_direction": "false",
                "detect_language": "false",
                "probability": "false"}

    def General(self, image, **kwargs):
        """Call the general recognition endpoint.

        :param image: image bytes
        :param kwargs: option overrides (for example ``language_type``) that
            are merged over the defaults; they used to be silently dropped
        """
        print('调取General_api  识别')
        return self.client.basicGeneral(image, {**self.options, **kwargs})

    def Accurate(self, image):
        """Call the accurate recognition endpoint with the default options."""
        print('调取Accurate_api  识别')
        return self.client.basicAccurate(image, self.options)

    def _first_words(self, results):
        """Return the first recognised text of a response, or None if unusable."""
        words_result = results.get('words_result')
        if not words_result or words_result == '*':
            return None
        return words_result[0]['words']

    def orc(self, image, font_key, word, **kwargs):
        """Recognise ``image`` and cache the result.

        The general endpoint is tried first and the accurate endpoint only
        when it yields nothing usable.  A usable result is stored under the
        image hash and in the ``font_key`` hash under ``word``.

        A response whose ``words_result`` equals ``'*'`` counts as "nothing
        usable"; it used to reach ``return result`` with ``result`` unbound.

        :return: the recognised text, or ``'*'`` when both endpoints fail
        """
        hash_value = MD5.md5(image)
        attempts = (
            lambda: self.General(image, **kwargs),
            lambda: self.Accurate(image),
        )
        for recognise in attempts:
            result = self._first_words(recognise())
            if result is None:
                continue
            self.redis.add(hash_value, result)
            self.redis.hadd(font_key, word, result)
            return result
        return '*'

    def run(self, image, font_key, word, **kwargs):
        """Return the text for ``image``, preferring the cached result.

        A cache hit is also copied into the ``font_key`` hash under ``word``.
        """
        hash_value = MD5.md5(image)
        if not self.redis.exists(hash_value):
            return self.orc(image, font_key, word, **kwargs)
        result = self.redis.get(hash_value)
        self.redis.hadd(font_key, word, result)
        return result