from acrawler import Handler, get_logger


class HorrorHandler(Handler):
    family = "MovieItem"
    logger = get_logger("horrorlog")

    async def handle_after(self, item):
        if item["genres"] and "Horror" in item["genres"]:
            self.logger.warning(f"({item['title']}) is a horror movie!!!!")
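
# A minimal sketch of how a handler like HorrorHandler could be attached to a
# crawler via `middleware_config`, mirroring the other examples below; the
# dotted path "mymovies.handlers.HorrorHandler" and MovieCrawler are hypothetical.
from acrawler import Crawler


class MovieCrawler(Crawler):
    middleware_config = {
        "mymovies.handlers.HorrorHandler": 500,  # handler path -> priority
    }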
import time

from acrawler import Handler, Request, get_logger
from aioredis import Redis as _Redis  # assumed import for the type hint below


class ProxyLogRedis(Handler):
    family = 'Request'
    logger = get_logger('validator')

    async def on_start(self):
        self.keys = self.crawler.keys
        # get redis connection from crawler
        self.redis: _Redis = self.crawler.redis

    async def handle_after(self, req: Request):
        if req.exceptions or not (await self.crawler.is_ok(req.response)):
            await self.update_proxy_to_redis(
                req.meta['proxy'], False, old_score=req.meta['old_score'])
        else:
            await self.update_proxy_to_redis(
                req.meta['proxy'], True, req.meta['speed'],
                int(time.time()), req.meta['old_score'])

    async def update_proxy_to_redis(self, proxy: str, success: bool,
                                    speed: int = None, last: int = None,
                                    old_score: int = None):
        """Update the proxy's info in three Redis sorted sets."""
        if success:
            tr = self.redis.multi_exec()
            tr.zadd(self.keys['speed'], speed, proxy)
            tr.zadd(self.keys['last'], last, proxy)
            if old_score is None or old_score < 5:
                tr.zincrby(self.keys['score'], 1, proxy)
            else:
                tr.zincrby(self.keys['score'], round(5 / old_score, 2), proxy)
            await tr.execute()
            # self.logger.info('{} speed:{}'.format(proxy, speed))
        else:
            if old_score and old_score <= -4:
                await self.delete_proxy(proxy)
                # self.logger.info('delete proxy:{}'.format(proxy))
            else:
                await self.redis.zincrby(self.keys['score'], -1, proxy)
                # self.logger.info('proxy failed:{}'.format(proxy))

    async def delete_proxy(self, proxy: str):
        tr = self.redis.multi_exec()
        tr.zrem(self.keys['speed'], proxy)
        tr.zrem(self.keys['last'], proxy)
        tr.zrem(self.keys['score'], proxy)
        tr.srem(self.keys['init'], proxy)
        await tr.execute()
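
# The handler above assumes that `crawler.keys` maps logical names to Redis
# keys and that each request's meta carries the proxy plus its previous score.
# The concrete key names and values below are illustrative assumptions only:
#
#   crawler.keys = {
#       'speed': 'aproxy:speed',   # zset: proxy -> latest response time
#       'last':  'aproxy:last',    # zset: proxy -> timestamp of last success
#       'score': 'aproxy:score',   # zset: proxy -> health score
#       'init':  'aproxy:init',    # set of all known proxies
#   }
#   req.meta = {'proxy': 'http://1.2.3.4:8080', 'old_score': 5, 'speed': 3}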
import asyncio

from acrawler import Handler, get_logger
from prometheus_client import Counter, Gauge, Histogram, start_http_server

logger = get_logger("prometheus")


class PromExporter(Handler):
    family = "Task"
    priority = 100

    async def on_start(self):
        self.name = self.crawler.name
        self.port = self.crawler.config.get("PROMETHEUS_PORT", 8000)
        self.addr = self.crawler.config.get("PROMETHEUS_ADDR", "localhost")
        self.interval = self.crawler.config.get("PROMETHEUS_INTERVAL", 1)
        self.reqs = Gauge(f"{self.name}_requests_progress",
                          "Number of working requests")
        self.reqs_q = Gauge(f"{self.name}_requests_queue",
                            "Number of requests in ready queue")
        self.reqs_w = Gauge(f"{self.name}_requests_waiting",
                            "Number of requests in waiting queue")
        self.counts = {}
        self.crawler.create_task(self.start_server())
        self.crawler.create_task(self.export())

    async def start_server(self):
        logger.info(
            # assumed: announce and expose the metrics endpoint
            f"serving Prometheus metrics on {self.addr}:{self.port}")
        start_http_server(self.port, self.addr)
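
# A generic sketch of the periodic-export idea used above: set a Gauge on a
# fixed interval from an async loop. The metric name and the value source are
# made up for illustration; the real export() reads the crawler's own counters.
import asyncio
from prometheus_client import Gauge, start_http_server

demo_gauge = Gauge("demo_requests_progress", "Number of working requests")


async def export_loop(get_value, interval: float = 1.0):
    while True:
        demo_gauge.set(get_value())   # push the current value into Prometheus
        await asyncio.sleep(interval)

# start_http_server(8000); asyncio.run(export_loop(lambda: 0))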
import asyncio
import os
import re
import sys
import time

from acrawler import Crawler, Item, Parser, ParselItem, Request, get_logger
from aproxy.rules import COMMON_TASKS, TEST_TASKS
from aproxy.task import ProxyGen, ProxyItemForWeb, ProxyParseItem

logger = get_logger('aproxy')


class ProxyCrawler(Crawler):
    config = {
        'DOWNLOAD_DELAY': 3,
        'MAX_REQUESTS_PER_HOST': 1,
        'MAX_REQUESTS': 12,
        'REDIS_ENABLE': True,
        'WEB_ENABLE': True,
        # 'LOG_TO_FILE': 'proxycrawler.log'
    }

    middleware_config = {
        'aproxy.handlers.ToRedisInit': 500,
        'aproxy.handlers.WebQuery': 2000,
        'acrawler.handlers.RequestPrepareBrowser': 1000,
    }

    parsers = [
        Parser(css_divider='table tr', item_type=ProxyParseItem),
import json
from pathlib import Path

import cloudscraper
from yarl import URL

from acrawler import ReScheduleImmediatelyError, get_logger
from acrawler.handlers import ExpiredWatcher

logger = get_logger("cfscrape")


class CfscrapeHandler(ExpiredWatcher):
    """Bypass Cloudflare's anti-bot page."""

    family = "Request"
    priority = 500
    ttl = 20

    async def custom_on_start(self):
        """Load the local token and update cookies if possible."""
        self.p = Path(
            self.crawler.config.get("CFS_COOKIES_FILE",
                                    Path.home() / ".cfscookies"))
        self.proxies = self.crawler.config.get("CFS_PROXIES", None)
        self.url = URL(self.crawler.config.get("CFS_URL"))
        self.ua = self.crawler.config.get(
            "CFS_USERAGENT",
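
# A minimal sketch (hypothetical crawler and dotted handler path) of the config
# keys CfscrapeHandler reads above; all values are placeholders.
from acrawler import Crawler


class SiteCrawler(Crawler):
    config = {
        "CFS_URL": "https://example.com",        # Cloudflare-protected entry page
        "CFS_COOKIES_FILE": "/tmp/cfscookies",   # where clearance cookies are cached
        "CFS_PROXIES": None,                     # optional proxies for cloudscraper
        # "CFS_USERAGENT": "...",                # optional user-agent override
    }
    middleware_config = {
        "myproject.handlers.CfscrapeHandler": 500,  # register like the other examples
    }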
# Scrape quotes from http://quotes.toscrape.com/
from acrawler import get_logger
from acrawler import Parser, Crawler, ParselItem, Request

logger = get_logger("quotes")


class QuoteItem(ParselItem):
    log = True
    default = {"type": "quote"}
    css = {"author": "small.author::text"}
    xpath = {
        "text": ['.//span[@class="text"]/text()', lambda s: s.strip("“")[:20]]
    }


class AuthorItem(ParselItem):
    log = True
    default = {"type": "author"}
    css = {
        "name": "h3.author-title::text",
        "born": "span.author-born-date::text"
    }


class QuoteCrawler(Crawler):
    main_page = r"quotes.toscrape.com/page/\d+"
    author_page = r"quotes.toscrape.com/author/.*"
    parsers = [
        Parser(
from acrawler.http import BrowserRequest
from acrawler import Crawler, get_logger

logger = get_logger("pyclock")


class ClockCrawler(Crawler):
    middleware_config = {
        # you should enable this handler to support BrowserRequest
        "acrawler.handlers.RequestPrepareBrowser": 800
    }

    async def start_requests(self):
        yield BrowserRequest(url="https://pythonclock.org",
                             page_callback=self.operate_page)

    async def operate_page(self, page, response):
        logger.info(await response.text())
        logger.info(await page.text())
        # the plain HTTP response lacks the countdown element,
        # but the browser-rendered page contains it
        assert "countdown-amount" not in (await response.text())
        assert "countdown-amount" in (await page.text())
        await page.screenshot(show=True)


if __name__ == "__main__":
    ClockCrawler().run()
import random
import re

from acrawler import Crawler, ParselItem, Parser, Request, get_logger

logger = get_logger()

# matches an "IP ... port" pair: group 1 is the IPv4 address, group 2 the port
PATTERN = re.compile(
    r"\b((?:(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])\.){3}(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9]))\D*([0-9]{1,4}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])"
)

# Some of these sites only allow access from IP addresses inside China
WEBSITES = [
    {
        "name": "xicidaili.com",
        "resource": ["http://www.xicidaili.com/nn/%s" % i for i in range(1, 6)]
        + ["http://www.xicidaili.com/wn/%s" % i for i in range(1, 6)]
        + ["http://www.xicidaili.com/wt/%s" % i for i in range(1, 6)],
        "enable": 1,
    },
    {
        "name": "kuaidaili.com",
        "resource": ["https://www.kuaidaili.com/free/inha/%s" % i for i in range(1, 6)]
        + ["https://www.kuaidaili.com/proxylist/%s" % i for i in range(1, 11)],
        "enable": 1,
    },
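
# A quick illustration (hypothetical HTML snippet, not taken from the sites
# above) of what PATTERN extracts -- group 1 is the IP, group 2 the port:
#
#   PATTERN.findall("<td>61.135.185.90</td><td>8080</td>")
#   # -> [('61.135.185.90', '8080')]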
import asyncio
import json
import random
import time

import aioredis

from acrawler import Crawler, Handler, Item, Request, Response, get_logger
from aproxy.handlers import RequestSpeed

logger = get_logger('validator')


class ValidatedItem(Item):
    def __init__(self, name, extra=None, **kwargs):
        super().__init__(**kwargs)
        self.content['name'] = name
        self.content['speed'] = None
        self.content['last'] = None
        self.content['score'] = 5
        if extra:
            self.content.update(extra)


class HTTPValidator(Crawler):
    middleware_config = {
        'aproxy.handlers.RequestSpeed': 1000,
        'aproxy.handlers.ProxyLogRedis': 800
    }

    config = {
        'REDIS_ENABLE': True,
    }
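
# A small usage sketch (hypothetical proxy address) of ValidatedItem: `name`
# is the proxy, and anything in `extra` overrides the defaults set above, so
# the fields set in __init__ end up roughly as:
#   {'name': '1.2.3.4:8080', 'speed': 2, 'last': None, 'score': 5}
item = ValidatedItem('1.2.3.4:8080', extra={'speed': 2})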