import aiohttp
import requests
import redis
import asyncio
from settings import *
from requests import Request
from lxml import etree
import pickle
from m_queue import TaskQueue
import random
from log import log

tt = TaskQueue()


class Aio_Req(object):
    """Asynchronous request: fetch a URL with aiohttp, hand the page to a
    callback, and re-queue itself if the request fails."""

    def __init__(self, url, callback, meta=None):
        self.url = url
        self.callback = callback
        # avoid sharing a mutable default dict between instances
        self.meta = meta if meta is not None else {}
        self.headers = {"User-Agent": random.choice(agents)}

    async def aio_req(self):
        async with aiohttp.ClientSession() as session:
            try:
                async with session.get(url=self.url, headers=self.headers) as resp:
                    page = await resp.text()
                    self.meta["response"] = page
                    log.log(str(resp.status) + " " + self.url, "info")
                    self.callback(self.meta)
                    # mark the URL as finished so it is not fetched again
                    tt.old_task(self.url)
            except Exception as e:
                print(e)
                # push a fresh request for the same URL back onto the queue
                tt.add_task(Aio_Req(self.url, self.callback, meta=self.meta))
                log.log(self.url + " " + str(e), "error")
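# A minimal sketch of the m_queue.TaskQueue interface as it is used above, not
# the project's actual module: add_task() enqueues a pickled request object,
# pop_task() pops one back off, and old_task() records a URL as already fetched.
# Backing the queue with Redis and pickle is an assumption suggested by the
# redis/pickle imports in req.py; key names and connection settings are placeholders.
import pickle

import redis


class TaskQueue(object):
    def __init__(self, host="127.0.0.1", port=6379):
        self.conn = redis.StrictRedis(host=host, port=port)

    def add_task(self, req_obj):
        # serialize the request object and push it onto the pending list
        self.conn.lpush("spider:tasks", pickle.dumps(req_obj))

    def pop_task(self):
        # pop one pending task; returns None when the queue is empty
        raw = self.conn.rpop("spider:tasks")
        return pickle.loads(raw) if raw is not None else None

    def old_task(self, url):
        # remember URLs that have already been fetched successfully
        self.conn.sadd("spider:seen", url)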
from req import Aio_Req, Sin_Req
from lxml import etree
import asyncio
from settings import *
import threading as th
from m_queue import TaskQueue
import requests
import json
from selenium import webdriver
from log import log
from urllib.parse import urljoin
from db import Mon

task_queue = TaskQueue()
mon = Mon()


class BIN_Spider(object):
    def __init__(self):
        # user profile endpoint, formatted with a user id
        self.base_link = "https://api.zhihu.com/people/{}"

    def start_req(self):
        # ids of known followers, and ids whose detail record is already stored
        detail_lst = set()
        after = set()
        for i in mon.data_find("follower"):
            detail_lst.add(i["id"])
        for d in mon.data_find("user_detail"):
            if "id" in d:
                after.add(d["id"])
        # how many followers still need their detail page fetched
        print(len(detail_lst - after))

    def parse_detail(self, response):
        pass
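# A minimal sketch of the db.Mon helper assumed above, not the project's actual
# db module. data_find() is inferred from its use in BIN_Spider.start_req, where
# it is iterated and yields documents carrying an "id" field; the pymongo
# backing, database name and connection settings are assumptions.
import pymongo


class Mon(object):
    def __init__(self, host="127.0.0.1", port=27017):
        self.client = pymongo.MongoClient(host, port)
        # database name is a placeholder
        self.db = self.client["zhihu"]

    def data_find(self, collection, query=None):
        # return a cursor over all documents in the named collection
        return self.db[collection].find(query or {})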
from m_queue import TaskQueue
from lxml import etree
from req import Sin_Req

tt = TaskQueue()

url = "https://sz.lianjia.com/ershoufang/pg2/"


def A(response):
    # parse the fetched page and print its <title>
    html = etree.HTML(response["response"])
    title = html.xpath('//title/text()')
    print(title)


# queue a synchronous request, pop it back off and execute it
rr = Sin_Req(url=url, callback=A)
tt.add_task(rr)
a = tt.pop_task()
a.get()
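# A minimal sketch of the Sin_Req class used above, inferred from how it is
# called rather than taken from req.py: the constructor mirrors Aio_Req, and
# get() fetches the page synchronously with requests and passes
# {"response": html} to the callback. Retry and re-queue handling are omitted.
import random

import requests

from settings import agents  # assumes settings defines a list of User-Agent strings


class Sin_Req(object):
    def __init__(self, url, callback, meta=None):
        self.url = url
        self.callback = callback
        self.meta = meta if meta is not None else {}
        self.headers = {"User-Agent": random.choice(agents)}

    def get(self):
        # blocking fetch; the page text is handed to the callback via meta
        resp = requests.get(self.url, headers=self.headers)
        self.meta["response"] = resp.text
        self.callback(self.meta)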