Ejemplo n.º 1
0
class Getter():
    """Fetches proxies through a Crawler and adds them to a RedisClient."""

    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """Return whether the proxy pool has reached POOL_UPPER_THRESHOLD."""
        return self.redis.count() >= POOL_UPPER_THRESHOLD

    def run_all(self):
        """Run every registered crawl function and store the proxies found.

        Only the start banner is printed when the pool is already full.
        """
        print(' 获取器开始执行 ')
        if self.is_over_threshold():
            return
        for index in range(self.crawler.__CrawlFuncCount__):
            crawl_func = self.crawler.__CrawlFunc__[index]
            for proxy in self.crawler.get_proxies(crawl_func):
                self.redis.add(proxy)

    def run_specific(self, callback):
        """Run a single crawl function and store the proxies it yields.

        :param callback: crawl function identifier passed to
            ``crawler.get_proxies``
        """
        print(' 获取器开始执行 ')
        if self.is_over_threshold():
            return
        for proxy in self.crawler.get_proxies(callback):
            self.redis.add(proxy)
Ejemplo n.º 2
0
class Getter():
    """Fetches proxies through a Crawler and adds them to a RedisClient."""

    def __init__(self):
        self.db = RedisClient()
        self.crawler = Crawler()

    def is_over_threshold(self):
        """
        Return whether the pool holds more than MAX_POOL_COUNT proxies.
        """
        return self.db.count() > MAX_POOL_COUNT

    def run(self):
        """
        Run every registered crawl function and store the proxies found.

        Only the start message is printed when the pool is over the limit.

        :return: None
        """
        print('start to get proxy')
        if self.is_over_threshold():
            return
        for index in range(self.crawler.__CrawlFuncCount__):
            crawl_func = self.crawler.__CrawlFunc__[index]
            for proxy in self.crawler.get_proxy(crawl_func):
                self.db.add(proxy)
Ejemplo n.º 3
0
 def schedule_getter(self, cycle=GETTER_CYCLE):
     """Fetch proxies on a fixed schedule, forever.

     Each round runs the 'crawl_xdaili' crawl function through a Getter,
     calls clear() on a RedisClient, and then sleeps.

     :param cycle: seconds to sleep between rounds
     """
     proxy_getter = Getter()
     store = RedisClient()
     while True:
         print(' 开始抓取代理 ')
         proxy_getter.run_specific('crawl_xdaili')
         # NOTE(review): clear() runs right after each fetch; confirm what it
         # is meant to remove.
         store.clear()
         time.sleep(cycle)
Ejemplo n.º 4
0
 def __init__(self, test_url=url):
     """Store the test URL, create the RedisClient and build request headers.

     :param test_url: URL requested when testing; defaults to the
         module-level ``url``
     """
     user_agent = (
         'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
         'Chrome/84.0.4147.135 Safari/537.36 Edg/84.0.522.63'
     )
     self.test_url = test_url
     self.db = RedisClient()
     self.headers = {
         'User-Agent': user_agent,
         'Origin': 'https://www.bilibili.com',
         'accept-encoding': 'gzip, deflate, br',
         'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
     }
Ejemplo n.º 5
0
def start():
    """Run the crawler synchronously (without coroutines).

    Builds a GetIpPv from a RedisClient and a MysqlClient, then keeps taking
    a domain and printing its result for as long as ``get_num()`` is truthy.
    """
    redis_client = RedisClient('url', '127.0.0.1', None)
    mysql_client = MysqlClient()
    crawler = GetIpPv(redis_client, mysql_client)
    while crawler.get_num():
        print(crawler.get_result(crawler.get_domain()))
Ejemplo n.º 6
0
def start_coro():
    """Run the crawler on an asyncio event loop.

    Builds a GetIpPv from a RedisClient and a MysqlClient, drives its
    ``download()`` coroutine to completion, and always closes the loop
    afterwards.
    """
    redis_client = RedisClient('url', '127.0.0.1', None)
    mysql_client = MysqlClient()
    crawler = GetIpPv(redis_client, mysql_client)
    loop = asyncio.get_event_loop()
    try:
        loop.run_until_complete(crawler.download())
    finally:
        loop.close()
Ejemplo n.º 7
0
 def run(self):
     """Generate cookies for every account that does not have any yet.

     Usernames are read from the 'accounts' store and compared with the
     usernames already present in the 'cookies' store. For each missing
     username a BilibiliCookies login is attempted; the result is saved only
     when it is truthy, so a failed attempt is retried on the next run
     instead of leaving an empty entry behind.
     """
     accounts = RedisClient('accounts', self.website)
     cookies = RedisClient('cookies', self.website)
     accounts_usernames = accounts.all_keys()
     # Build the lookup once; set membership is O(1) per username.
     known_usernames = set(cookies.all_keys())
     for username in accounts_usernames:
         if username in known_usernames:
             continue
         password = accounts.get(username)
         # NOTE(review): this prints the plaintext password; consider masking.
         print('正在生成Cookies, 账号:{} 密码: {}'.format(username, password))
         new_cookies = BilibiliCookies(username, password).cookie()
         if new_cookies:
             print('生成Cookies成功')
             cookies.set(username, new_cookies)
         else:
             # Nothing is stored on failure: an empty value would make the
             # account look as if it already had cookies.
             print('生成Cookies失败')
Ejemplo n.º 8
0
def init_module():
    """Create the module-wide Redis client, detector and booter.

    The objects are published as the globals ``redis_client``, ``detector``
    and ``booter``.
    """
    global redis_client, detector, booter
    redis_client = RedisClient(
        host=REDIS_HOST,
        port=REDIS_PORT,
        password=REDIS_PASSWORD,
        s_key=REDIS_KEY,
        num=GET_NUM,
    )
    proxy_helper = XiCiProxyHelper(quantity=QUANTITY, threshold=THRESHOLD)
    detector = Detector(redis_client, test_url=TEST_URL)
    booter = Booter(redis_client, proxy_helper, capacity=CAPACITY)
Ejemplo n.º 9
0
class ValidTester(object):
    """Base class that checks every stored cookie set for one website.

    Subclasses implement ``test``; ``run`` feeds it each (username, cookies)
    pair returned by the cookies store.
    """

    def __init__(self, website='default'):
        """Create the cookies and accounts stores for ``website``."""
        self.website = website
        self.cookies_db = RedisClient('cookies', self.website)
        self.accounts_db = RedisClient('accounts', self.website)

    def test(self, username, cookies):
        """Check one cookie set; must be overridden by subclasses.

        :param username: account the cookies belong to
        :param cookies: stored cookies for that account
        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError

    def run(self):
        """Call ``test`` for every username/cookies pair in the cookies store."""
        cookies_groups = self.cookies_db.all()
        # Mappings expose items(); the previous item() raised AttributeError.
        for username, cookies in cookies_groups.items():
            self.test(username, cookies)
Ejemplo n.º 10
0
class Tester(object):
    """Tests the proxies held by a RedisClient against TEST_URL."""

    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        Test a single proxy.

        The proxy is promoted with ``redis.max`` when both the HTTP status and
        the ``code`` field of the JSON body are in VALID_STATUS_CODES.
        Otherwise, or when the request or JSON decoding fails, it is demoted
        with ``redis.decrease``.

        :param proxy: proxy address without scheme ('http://' is prepended);
            bytes are decoded as UTF-8
        :return: None
        """
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print(' 正在测试 ', proxy)
                async with session.get(TEST_URL,
                                       proxy=real_proxy,
                                       headers=HEADERS,
                                       timeout=15) as response:
                    # response.json() is a coroutine in aiohttp: without the
                    # await, .get() raised AttributeError and every proxy was
                    # demoted. Read it once and reuse the parsed body.
                    payload = await response.json()
                    code = payload.get('code')
                    print(payload)
                    if response.status in VALID_STATUS_CODES and code in VALID_STATUS_CODES:
                        self.redis.max(proxy)
                        print(' 代理可用 ', proxy)
                    else:
                        self.redis.decrease(proxy)
                        print(' 请求响应码不合法, 不可用 ', proxy)
            # asyncio.TimeoutError covers request timeouts on interpreters
            # where it differs from the builtin; ValueError covers a body
            # that is not valid JSON.
            except (ClientError, ClientConnectorError, TimeoutError,
                    asyncio.TimeoutError, AttributeError, ValueError):
                self.redis.decrease(proxy)
                print(' 代理请求失败 ', proxy)

    def run(self):
        """
        Main entry point of the tester.

        Tests all stored proxies in batches of BATCH_TEST_SIZE and sleeps
        5 seconds after each batch. Any exception is printed, not propagated.

        :return: None
        """
        print(' 测试器开始运行 ')
        try:
            proxies = self.redis.all()
            loop = asyncio.get_event_loop()
            # Test in batches
            for i in range(0, len(proxies), BATCH_TEST_SIZE):
                test_proxies = proxies[i:i + BATCH_TEST_SIZE]
                # asyncio.wait() rejects bare coroutines on Python 3.11+, so
                # each one is wrapped in a Task first.
                tasks = [
                    loop.create_task(self.test_single_proxy(proxy))
                    for proxy in test_proxies
                ]
                loop.run_until_complete(asyncio.wait(tasks))
                time.sleep(5)
        except Exception as e:
            print(' 测试器发生错误 ', e.args)
Ejemplo n.º 11
0
class Tester(object):
    """Tests the proxies held by a RedisClient against TEST_URL."""

    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        Test a single proxy.

        A response whose status is in VALID_STATUS_CODES promotes the proxy
        with ``redis.max``; any other status, or a request failure, demotes it
        with ``redis.decrease``.

        :param proxy: proxy address without scheme ('http://' is prepended);
            bytes are decoded as UTF-8
        :return: None
        """
        connector = aiohttp.TCPConnector(ssl=False)
        async with aiohttp.ClientSession(connector=connector) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                proxy_url = 'http://' + proxy
                print('正在测试', proxy)
                async with session.get(TEST_URL, proxy=proxy_url,
                                       timeout=15) as response:
                    if response.status not in VALID_STATUS_CODES:
                        self.redis.decrease(proxy)
                        print('请求响应码不合法', proxy)
                    else:
                        self.redis.max(proxy)
                        print('代理可用', proxy)
            except (ClientError, aiohttp.ClientProxyConnectionError,
                    asyncio.TimeoutError, AttributeError):
                self.redis.decrease(proxy)
                print('代理请求失败', proxy)

    def run(self):
        """
        Main entry point of the tester.

        Installs the uvloop event loop policy, then tests all stored proxies
        in batches of BATCH_TEST_SIZE, sleeping 5 seconds after each batch.
        Any exception is printed, not propagated.

        :return: None
        """
        print('检测器开始运行')
        try:
            proxies = self.redis.all()
            asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
            loop = asyncio.get_event_loop()
            # Test in batches
            total = len(proxies)
            for offset in range(0, total, BATCH_TEST_SIZE):
                batch = proxies[offset:offset + BATCH_TEST_SIZE]
                tasks = [
                    asyncio.ensure_future(self.test_single_proxy(candidate))
                    for candidate in batch
                ]
                loop.run_until_complete(asyncio.wait(tasks))
                time.sleep(5)
        except Exception as e:
            print("检测器发生错误", e)
Ejemplo n.º 12
0
class Tester():
    """Tests the proxies held by a RedisClient against TEST_API in batches."""

    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """Test one proxy and record the outcome in Redis.

        A response whose status is in VALID_STATUS_CODE calls
        ``redis.enable``; any other status, or any error while requesting,
        calls ``redis.decrease``.

        :param proxy: proxy address without scheme ('http://' is prepended);
            bytes are decoded as UTF-8
        """
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print('正在测试', real_proxy)
                async with session.get(TEST_API, proxy=real_proxy, timeout=15,
                                       allow_redirects=False) as response:
                    if response.status in VALID_STATUS_CODE:
                        self.redis.enable(proxy)
                        print('代理 ', proxy, ' 可用')
                    else:
                        self.redis.decrease(proxy)
                        print('代理 ', proxy, ' 请求失败')
            # Exception rather than a bare except, so task cancellation and
            # KeyboardInterrupt propagate instead of penalizing the proxy.
            except Exception:
                self.redis.decrease(proxy)
                print('代理 ', proxy, ' 不可用')

    def run(self):
        """Test every stored proxy, BATCH_TEST_COUNT at a time.

        Flushes stdout and sleeps 5 seconds after each batch. Any exception is
        printed, not propagated.
        """
        print('开始测试')
        try:
            loop = asyncio.get_event_loop()
            for i in range(0, self.redis.count(), BATCH_TEST_COUNT):
                start = i
                end = min(i + BATCH_TEST_COUNT, self.redis.count())
                proxies = self.redis.batch(start, end)
                print('正在测试第 ', start, '到', end, '个代理')
                # asyncio.wait() rejects bare coroutines on Python 3.11+, so
                # each one is wrapped in a Task first.
                tasks = [
                    loop.create_task(self.test_single_proxy(proxy))
                    for proxy in proxies
                ]
                # asyncio.wait() raises ValueError on an empty collection; an
                # empty batch must not abort the remaining ones.
                if tasks:
                    loop.run_until_complete(asyncio.wait(tasks))
                sys.stdout.flush()
                time.sleep(5)
        except Exception as e:
            print('测试错误 ', e.args)
Ejemplo n.º 13
0
class Tester():
    """Tests stored proxies one by one with blocking ``requests`` calls."""

    def __init__(self, test_url=url):
        """Store the test URL, create the RedisClient and build request headers.

        :param test_url: URL requested through each proxy; defaults to the
            module-level ``url``
        """
        self.test_url = test_url
        self.db = RedisClient()
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/84.0.4147.135 Safari/537.36 Edg/84.0.522.63',
            'Origin':
            'https://www.bilibili.com',
            'accept-encoding':
            'gzip, deflate, br',
            'accept-language':
            'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6'
        }

    def test_single_proxy(self, proxy):
        """Request ``test_url`` through ``proxy`` and record the outcome.

        The proxy is promoted with ``db.max`` when the JSON body has a
        non-None ``data`` field; otherwise, or on any request or decoding
        error, it is demoted with ``db.decrease``.

        :param proxy: proxy address without scheme
        """
        # NOTE(review): the https entry uses an 'https://' proxy scheme;
        # confirm the proxies actually accept TLS connections.
        proxies = {'http': 'http://' + proxy, 'https': 'https://' + proxy}
        try:
            response = requests.get(self.test_url,
                                    headers=self.headers,
                                    proxies=proxies,
                                    timeout=5)
            # Named `payload` so the local does not shadow the json module.
            payload = response.json()
            data = payload.get('data')
            if data is not None:
                self.db.max(proxy)
                print(' 代理可用 ', proxy)
            else:
                print(' 请求响应码不合法, 不可用 ', proxy)
                self.db.decrease(proxy)
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit stop the run instead of penalizing the proxy.
        except Exception:
            print(' 代理请求失败 ', proxy)
            self.db.decrease(proxy)

    def run(self):
        """Test every proxy returned by ``db.all()``, sequentially."""
        print(' 测试器开始运行 ')
        proxies = self.db.all()
        for proxy in proxies:
            print(' 正在测试 ', proxy)
            self.test_single_proxy(proxy)
0
        'Please input config file path(if you use default file type \'d\'.): ')
    if path == 'd':
        path = 'proxy.conf'
    sure = input(
        'Are you sure the config file in \'{}\'. [y/n]: '.format(path))
    if sure == 'y':
        break

# Read the Redis connection settings from the config file chosen above.
cfg = ConfigParser()
cfg.read(path)
REDIS_HOST = try_to_get_options(cfg.get, 'redis', 'host')
REDIS_PORT = try_to_get_options(cfg.getint, 'redis', 'port')
REDIS_PASSWORD = try_to_get_options(cfg.get, 'redis', 'password')
REDIS_KEY = try_to_get_options(cfg.get, 'redis', 'key')
redis_client = RedisClient(host=REDIS_HOST,
                           port=REDIS_PORT,
                           password=REDIS_PASSWORD,
                           s_key=REDIS_KEY)
count = redis_client.count()
if count == 0:
    print('Already cleaning!')
else:
    # Show what is stored and ask for confirmation before removing anything.
    redis_client.show()
    sure = input(
        'Are you sure remove that data? amount {} items! [y/n]: '.format(
            count))
    if sure == 'y':
        # NOTE(review): the 0-100 range is hard-coded; confirm it covers all
        # `count` items reported in the prompt.
        redis_client.remove_by_range(0, 100)
    else:
        print('Good luck! Bye Bye')
Ejemplo n.º 15
0
from crawl import XiCiProxyHelper
from storage import Booter, RedisClient
from detector import Detector

if __name__ == "__main__":
    # Demo run: collect proxies, test them, report, then remove a range.
    proxy_helper = XiCiProxyHelper(quantity=40, threshold=1.000)
    client = RedisClient()

    booter = Booter(client, proxy_helper)
    booter.run()

    detector = Detector(client)
    detector.run()

    # Report how many entries are stored and list them.
    print('一共有{}'.format(client.count()))
    client.show()

    client.remove_by_range(0, 100)
Ejemplo n.º 16
0
    def get_domain(self):
        """Pop the next domain from ``self.redis_db`` and return it."""
        domain = self.redis_db.pop()
        return domain

    def get_rest_domain_num(self):
        """Return the number of domains left, as reported by ``redis_db.get_num()``."""
        remaining = self.redis_db.get_num()
        return remaining

    def save(self, text):
        """Append ``text`` to ``title.txt``.

        The file is written as UTF-8 so that non-ASCII page titles are stored
        the same way on every platform instead of depending on the locale's
        default encoding.

        :param text: already formatted line(s) to append
        """
        with open('title.txt', 'a', encoding='utf-8') as f:
            f.write(text)

    def download(self):
        """Fetch and record the parsed result of every remaining domain.

        For each domain, a ``url;parsed\\n`` line is saved on success. On any
        failure the error is logged and a ``url;\\n`` line is saved instead,
        so one bad domain does not stop the run.
        """
        while self.get_rest_domain_num():
            url = self.get_domain()
            logging.info('req %s', url)
            try:
                response = self.get_page(url)
                # Use the detected encoding so the text decodes correctly.
                response.encoding = response.apparent_encoding
                logging.info(response.status_code)
                doc = self.parse(response)
                self.save(url + ';' + doc + '\n')
            except Exception as err:
                # Best effort: record the domain with an empty result, but
                # leave a trace of why it failed rather than dropping it.
                logging.warning('req %s failed: %r', url, err)
                self.save(url + ';\n')


if __name__ == '__main__':
    # Script entry point: build the clients, then run the crawler.
    redis_client = RedisClient('url', '127.0.0.1', None)
    mysql_client = MysqlClient()
    crawler = ThemeCrawler(redis_client, mysql_client)
    crawler.download()
Ejemplo n.º 17
0
 def __init__(self, website='default'):
     """Remember the website name and create its cookies and accounts stores.

     :param website: name passed to both RedisClient instances
     """
     self.website = website
     self.cookies_db = RedisClient('cookies', website)
     self.accounts_db = RedisClient('accounts', website)
Ejemplo n.º 18
0
def conn():
    """Return the RedisClient cached on ``g``, creating it on first use.

    :return: the client stored as ``g.conn``
    """
    if not hasattr(g, 'conn'):
        g.conn = RedisClient()
    # Return on every call; previously only the creating call returned the
    # client and later calls returned None.
    return g.conn
Ejemplo n.º 19
0
def get_coon():
    """Return the RedisClient cached on ``g``, creating it on first use."""
    if hasattr(g, 'redis'):
        return g.redis
    g.redis = RedisClient()
    return g.redis
Ejemplo n.º 20
0
from storage import RedisClient

# Read ul.txt line by line (trailing whitespace stripped) and add every
# line to the RedisClient.
rc = RedisClient('url', '127.0.0.1', None)
with open('ul.txt', 'r') as url_file:
    urls = [raw_line.rstrip() for raw_line in url_file]

for url in urls:
    rc.add(url)
Ejemplo n.º 21
0
 def __init__(self):
     """Create the RedisClient and the Crawler used by this object."""
     self.redis = RedisClient()
     self.crawler = Crawler()
Ejemplo n.º 22
0
 def __init__(self):
     """Create the RedisClient used by this object."""
     self.redis = RedisClient()