Example #1
0
class TestProxyManager(TestCase):
    """Call-through checks for ProxyManager.

    Each test invokes one manager operation; results are printed rather
    than asserted, so a test only fails if the call itself raises.
    """

    # Proxy fixture shared by the tests that mutate proxy state.
    item = ProxyItem(ip='123')

    def test_count(self):
        """Print the value returned by ``count()``."""
        total = ProxyManager().count()
        print(total)

    def test_available_count(self):
        """Print the value returned by ``available_count()``."""
        usable = ProxyManager().available_count()
        print(usable)

    def test_get(self):
        """Fetch one proxy from the manager and print it."""
        print(ProxyManager().get())

    def test_remove(self):
        """Pass the shared fixture item to ``remove()``."""
        manager = ProxyManager()
        manager.remove(self.item)

    def test_success(self):
        """Pass a ProxyItem built from the fixture to ``success()``."""
        manager = ProxyManager()
        manager.success(ProxyItem(self.item))

    def test_fail(self):
        """Pass a ProxyItem built from the fixture to ``fail()``."""
        manager = ProxyManager()
        manager.fail(ProxyItem(self.item))

    def test_banned(self):
        """Pass a ProxyItem built from the fixture to ``banned()``."""
        manager = ProxyManager()
        manager.banned(ProxyItem(self.item))

    def test_sql(self):
        """Print every attribute of ``proxy_manager`` whose name starts with ``_sql``."""
        for name, statement in vars(proxy_manager).items():
            if not name.startswith('_sql'):
                continue
            print(f'{name} = {statement}')
Example #2
0
 async def fetch(self, sql):
     """Run ``sql`` through the manager and enqueue each row as a ProxyItem.

     Nothing is printed or enqueued when the query yields no rows.
     """
     rows = await self.manager.fetch(sql)
     if not rows:
         return

     print(f'读取到 ip {len(rows)} 个')
     for row in rows:
         # Each record is expanded into keyword arguments for ProxyItem.
         self._proxy_queue.put(ProxyItem(**row))
Example #3
0
 def parse_proxy_from_element(element,
                              element_xpath_formatter=None,
                              element_xpath_dict=None):
     """Parse a proxy out of an element.

     For every ``field -> spec`` pair in ``element_xpath_dict`` an XPath is
     built and evaluated against ``element``; the first result is stored in
     the returned ProxyItem under ``field``.

     A spec is either

     * a plain value, substituted into ``element_xpath_formatter`` with ``%``, or
     * ``"<value>-<xpath template>"``, in which case the text after the
       FIRST hyphen is used as the template instead of the formatter.

     Only the first hyphen separates value and template, so templates that
     themselves contain hyphens (e.g. the ``following-sibling::`` axis) are
     accepted instead of failing on tuple unpacking.

     If either ``element_xpath_formatter`` or ``element_xpath_dict`` is
     falsy, an empty ProxyItem is returned.

     Indexing ``[0]`` on the XPath result raises (``IndexError`` for a list
     result) when the expression matches nothing.
     """
     proxy_item = ProxyItem()
     if not (element_xpath_formatter and element_xpath_dict):
         return proxy_item

     for field, spec in element_xpath_dict.items():
         # partition() splits at the first '-' only; ``sep`` is empty when
         # the spec carries no custom template.
         value, sep, custom_template = spec.partition('-')
         template = custom_template if sep else element_xpath_formatter
         proxy_item[field] = element.xpath(template % value)[0]
     return proxy_item
Example #4
0
    def parse_proxy_list(self, text):
        """Parse proxies out of ``text``.

        All matches of ``self.pattern`` are converted with
        ``self.parse_proxy_from_match`` into ``(ip, port, http_type)``; the
        first occurrence of each ip becomes a ProxyItem, later duplicates
        are dropped.

        Returns a list of ProxyItem in first-seen order (empty when nothing
        matches).
        """
        # Remove line breaks so the regex can match across lines.
        flattened = text.replace('\n', '')

        proxy_list = []
        # Track ips already emitted in a set so each duplicate check is O(1)
        # instead of rebuilding a list of every stored ip per match.
        seen_ips = set()
        for match in re.findall(self.pattern, flattened):
            ip, port, http_type = self.parse_proxy_from_match(match)
            if ip in seen_ips:
                continue
            seen_ips.add(ip)

            proxy_item = ProxyItem()
            proxy_item['ip'] = ip
            proxy_item['port'] = port
            proxy_item['http_type'] = http_type
            proxy_list.append(proxy_item)
        return proxy_list
Example #5
0
class ProxyPostgreSQLPipelineTest(BasePostgreSQLPipelineTest):
    """Configure BasePostgreSQLPipelineTest for the proxy pipeline."""

    # Pipeline instance, created once when this class body is executed.
    pipeline = ProxyPostgreSQLPipeline()
    # Item supplied to the base test class; presumably used for its insert
    # test — confirm against BasePostgreSQLPipelineTest.
    insert_item = ProxyItem(ip=1)
Example #6
0
class ProxyPostgreSQLPipeline(BasePostgreSQLPipeline):
    """Save proxies."""
    # ProxyItem instance exposed as a class attribute; how the base pipeline
    # consumes it is not visible here — see BasePostgreSQLPipeline.
    item = ProxyItem()
Example #7
0
 def test_banned(self):
     """Pass a ProxyItem built from the fixture item to ``banned()``."""
     manager = ProxyManager()
     proxy = ProxyItem(self.item)
     manager.banned(proxy)
Example #8
0
 def test_fail(self):
     """Pass a ProxyItem built from the fixture item to ``fail()``."""
     manager = ProxyManager()
     proxy = ProxyItem(self.item)
     manager.fail(proxy)
Example #9
0
 def test_success(self):
     """Pass a ProxyItem built from the fixture item to ``success()``."""
     manager = ProxyManager()
     proxy = ProxyItem(self.item)
     manager.success(proxy)
Example #10
0
    def __init__(self):
        """Create the database manager, the proxy queue and the SQL templates.

        The ``_sql_*`` attributes are plain strings. Those derived from
        ``_sql_update`` still contain ``{...}`` placeholders (``{ip}``,
        ``{now}``, ``{key}``) for a later formatting step.

        Finally the database connection is opened and the table is created,
        both run synchronously on the current asyncio event loop.
        """
        proxy_schema = ProxyItem()
        self.manager = PostgreSQLManager(proxy_schema)
        self._proxy_queue = queue.Queue()

        # --- SQL building blocks ---
        table = proxy_schema.get_table_name()
        pk_column = 'ip'

        # Rows with available=1, or rows with available=2 (the value written
        # by the "banned" update below) whose banned_time is older than the
        # "now minus one hour" epoch value.
        usable_filter = """
        WHERE available=1
        OR (available=2 AND EXTRACT(EPOCH from NOW()- INTERVAL'1 HOUR') > banned_time)
        """

        # Ordering by usage count lets every stored proxy get used.
        #
        # Every concurrent request should be able to obtain a proxy. If one
        # request takes 1s, the same ip could be handed out again every 1s;
        # multiplying by 10 means repeats only happen after 10 rounds of
        # requests. The batch size is capped at 200.
        fetch_limit = min(douyin_spider.CONCURRENT_REQUESTS * 10, 200)

        # Fetch the available proxies.
        select_sql = proxy_schema.generate_get_sql()
        self._sql_fetch_available = select_sql + usable_filter + f"""
        ORDER BY used_times LIMIT {fetch_limit}
        """

        # Count all rows.
        self._sql_count = f"""
        SELECT COUNT(*) FROM {table}
        """

        # Count the usable rows.
        self._sql_available_count = f"""
        SELECT COUNT(*) FROM {table}
        {usable_filter}
        """

        # In the triple braces the inner {} is evaluated now, while the outer
        # doubled {{ }} leaves a single literal {} for later formatting, so
        # the WHERE clause ends up as: ip = '{ip}'. The %s slot receives the
        # SET clause.
        self._sql_update = f"""
        UPDATE {table}
        %s
        WHERE {pk_column} = '{{{pk_column}}}'
        """

        # Read the failure count of one proxy.
        self._sql_get_fail_times_by_ip = f"""
        SELECT fail_times FROM {table}
        WHERE {pk_column} = '{{{pk_column}}}'
        """

        # SET clauses plugged into the generic UPDATE template.
        increment_clause = 'SET {key}={key}+1 ,update_time={now}'
        success_clause = 'SET success_times=success_times+1 , update_time={now} , available=1'
        fail_clause = 'SET update_time={now} , available=0'
        banned_clause = 'SET fail_times=fail_times+1 , banned_time={now} , update_time={now} , available=2'

        # Increment a counter column.
        self._sql_add_times = self._sql_update % increment_clause
        # Mark as successful.
        self._sql_update_success = self._sql_update % success_clause
        # Mark as failed.
        self._sql_update_fail = self._sql_update % fail_clause
        # Mark as banned.
        self._sql_update_banned = self._sql_update % banned_clause

        # Initialisation: connect, then make sure the table exists.
        loop = asyncio.get_event_loop()
        loop.run_until_complete(self.manager.connect_database())
        loop.run_until_complete(self.manager.create_table())
Example #11
0
 def test_insert(self):
     """Open the proxy pipeline and push one ProxyItem through it."""
     proxy_pipeline = ProxyPostgreSQLPipeline()
     proxy_pipeline.open_spider(None)

     sample = ProxyItem(ip=1)
     proxy_pipeline.process_item(sample, None)