async def get_proxy(self, timeout=60):
    headers = {'User-Agent': UserAgent.random()}
    async with aiohttp.ClientSession(headers=headers) as session:
        url = self.url.format(self.count)
        async with session.get(url, timeout=timeout) as r:
            content = await r.text()
    return re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}', content)
async def get_proxy(self, timeout=60):
    headers = {'User-Agent': UserAgent.random()}
    proxies = []
    async with aiohttp.ClientSession(headers=headers) as session:
        for url in self.urls:
            async with session.get(url, timeout=timeout) as r:
                content = await r.text()
            selector = html.fromstring(content)
            ul_list = selector.xpath('//ul[@class="l2"]')
            for ul in ul_list:
                # The first two <li> cells hold the IP and the port
                ips = ul.xpath('.//li/text()')[0:2]
                proxy = ':'.join(map(lambda x: x.strip(' \t\n'), ips))
                proxies.append(proxy)
    return proxies
async def get_proxy(self, timeout=60):
    headers = {'User-Agent': UserAgent.random()}
    proxies = []
    async with aiohttp.ClientSession(headers=headers) as session:
        for page in range(1, 10):
            url = self.url.format(page=page)
            async with session.get(url, timeout=timeout) as r:
                content = await r.text()
            selector = html.fromstring(content)
            proxy_list = selector.xpath('//td[@class="ip"]')
            for each_proxy in proxy_list:
                ips = each_proxy.xpath('.//text()')
                proxy = ''.join(map(lambda x: x.strip(' \t\n'), ips))
                proxies.append(proxy)
    return proxies
async def get_proxy(self, timeout=60):
    headers = {'User-Agent': UserAgent.random()}
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(self.url, timeout=timeout) as r:
            content = await r.text()
    proxies = []
    selector = html.fromstring(content)
    # Drop the first <tr>, which is the table header row
    tr_list = selector.xpath('//tr')[1:]
    for tr in tr_list:
        ips = tr.xpath('./td/text()')[0:2]
        proxy = ':'.join(map(lambda x: x.strip(' \t\n'), ips))
        proxies.append(proxy)
    return proxies
async def get_proxy(self, timeout=60):
    proxies = []
    async with aiohttp.ClientSession() as session:
        for url in self.urls:
            for i in range(1, self.total + 1):
                headers = {'User-Agent': UserAgent.random()}
                target = url.format(i)
                async with session.get(target, headers=headers, timeout=timeout) as r:
                    content = await r.text()
                selector = html.fromstring(content)
                tr_list = selector.xpath('//tbody/tr')
                for tr in tr_list:
                    ip = tr.xpath('.//td[@data-title="IP"]/text()')
                    port = tr.xpath('.//td[@data-title="PORT"]/text()')
                    proxies.append(':'.join([ip[0], port[0]]))
                # Throttle page requests so the source site does not block us
                await asyncio.sleep(3)
    return proxies
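The per-site get_proxy coroutines above all share the same signature, so they can be fanned out concurrently. A minimal sketch follows, assuming each crawler object exposes the async get_proxy() method shown above; the crawler class names in the usage comment are hypothetical, not taken from the source.

# Sketch: run all crawlers' get_proxy() coroutines concurrently and merge the results.
# The crawler instances passed in are assumed to provide the async get_proxy() method above.
import asyncio

async def collect_proxies(crawlers):
    results = await asyncio.gather(
        *(crawler.get_proxy() for crawler in crawlers),
        return_exceptions=True,
    )
    proxies = []
    for result in results:
        if isinstance(result, Exception):
            continue  # skip sources that failed or timed out
        proxies.extend(result)
    return proxies

# Usage (hypothetical crawler class names):
# proxies = asyncio.run(collect_proxies([FreeProxyListCrawler(), KuaidailiCrawler()]))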
from utils.color import Colored
import httpx
from httpx import Response
from typing import Dict, List, Set, Coroutine, Any, Union
import time
import logme

from core.queue import Task
from utils import generate_token, create_folder, dumps_content, loads_content, is_file_exists
from utils import decode_content
from utils.proxy_utls import get_random_proxy
from lxml import etree
from lxml.etree import HTMLParser
from bs4 import BeautifulSoup as bs
from config.constant import ENVIRONMENT, EnvironmentType, GET, POST, TEXT, JSON, CSS
from utils import UserAgent

ua = UserAgent()


@logme.log(name="Downloader")
# TODO: use https://www.python-httpx.org/ to download web pages
class Downloader(object):
    __slots__ = ['sleep_factor', 'session_container']

    def __init__(self):
        super().__init__()
        self.sleep_factor = 2
        self.session_container = dict()

    def close(self):
        self.session_container.clear()
def random_headers():
    headers = {'User-Agent': UserAgent.random()}
    return headers
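Given the TODO in Downloader about moving to httpx, a minimal sketch of an async fetch that reuses random_headers() is shown below. The fetch() helper is illustrative only; proxy selection via get_random_proxy(), retries, and the Downloader's session_container caching are not wired in here.

# Sketch: async page download with httpx, using the random_headers() helper above.
# fetch() is a hypothetical helper, not part of the original Downloader class.
import asyncio
import httpx

async def fetch(url: str, timeout: float = 60) -> str:
    async with httpx.AsyncClient(headers=random_headers()) as client:
        response = await client.get(url, timeout=timeout)
        response.raise_for_status()
        return response.text

# Usage:
# html_text = asyncio.run(fetch("https://example.com"))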