Example #1
import aiohttp
import requests
import redis
import asyncio
from settings import *
from requests import Request
from lxml import etree
import pickle
from m_queue import TaskQueue
import random
from log import log
tt = TaskQueue()
class Aio_Req(object):
    def __init__(self, url, callback, meta=None):
        # meta defaults to None so each instance gets its own dict
        # (a mutable default would be shared across instances).
        self.url = url
        self.callback = callback
        self.meta = meta if meta is not None else {}
        self.headers = {"User-Agent": random.choice(agents)}

    async def aio_req(self):
        # Keep the session and the response as separate names.
        async with aiohttp.ClientSession() as session:
            try:
                async with session.get(url=self.url, headers=self.headers) as resp:
                    page = await resp.text()
                    self.meta["response"] = page
                    log.log(str(resp.status) + "   " + self.url, "info")
                    self.callback(self.meta)
                    # Mark this URL as finished so it is not fetched again.
                    tt.old_task(self.url)
            except Exception as e:
                # On any failure, log it and re-queue the request for a retry.
                print(e)
                tt.add_task(Aio_Req(self.url, self.callback, meta=self.meta))
                log.log(self.url + "   " + str(e), "error")
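Aio_Req.aio_req is a coroutine, so it does nothing until it is scheduled on an event loop. A minimal sketch of driving one request end to end, assuming the modules above are importable and that agents in settings is a list of User-Agent strings; the parse callback and the URL here are hypothetical, not part of the original:

def parse(meta):
    # Callback invoked with the meta dict once the page text is attached.
    print(len(meta["response"]))


async def main():
    req = Aio_Req("https://example.com", parse)
    await req.aio_req()


asyncio.run(main())  # requires Python 3.7+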
Example #2
from req import Aio_Req, Sin_Req
from lxml import etree
import asyncio
from settings import *
import threading as th
from m_queue import TaskQueue
import requests
import json
from selenium import webdriver
from log import log
from urllib.parse import urljoin
from db import Mon
task_queue = TaskQueue()
mon = Mon()


class BIN_Spider(object):
    def __init__(self):
        self.base_link = "https://api.zhihu.com/people/{}"

    def start_req(self):
        # Gather follower ids and the ids of users whose details are already
        # stored, then report how many detail pages remain to be fetched.
        detail_lst = set()
        after = set()
        for i in mon.data_find("follower"):
            detail_lst.add(i["id"])
        for d in mon.data_find("user_detail"):
            if "id" in d:
                after.add(d["id"])
        print(len(detail_lst - after))

    def parse_detail(self, response):
        # Body not shown in this example.
        ...
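Example #2 relies on a project-local db.Mon helper whose data_find(collection) yields stored documents. A minimal sketch of what such a wrapper could look like, assuming MongoDB via pymongo; the connection URL and database name are assumptions, not taken from the original:

import pymongo


class Mon(object):
    def __init__(self):
        # Assumed: a local MongoDB instance and a database named "zhihu".
        self.client = pymongo.MongoClient("mongodb://localhost:27017/")
        self.db = self.client["zhihu"]

    def data_find(self, col):
        # Return a cursor over every document in the named collection.
        return self.db[col].find()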
Example #3
from m_queue import TaskQueue
from req import Sin_Req
from lxml import etree

tt = TaskQueue()
url = "https://sz.lianjia.com/ershoufang/pg2/"


def A(response):
    # The callback gets the meta dict; the page text is stored under "response".
    html = etree.HTML(response["response"])
    title = html.xpath('//title/text()')
    print(title)


# Queue a synchronous request, pop it back off, and execute it.
rr = Sin_Req(url=url, callback=A)
tt.add_task(rr)
a = tt.pop_task()
a.get()
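Example #3's Sin_Req is not shown; judging from a.get() and the callback signature, it is a synchronous counterpart of Aio_Req. A minimal sketch under that assumption, using requests; this is an inferred implementation, not the project's actual one:

import random
import requests
from settings import agents  # assumed: a list of User-Agent strings


class Sin_Req(object):
    def __init__(self, url, callback, meta=None):
        self.url = url
        self.callback = callback
        self.meta = meta if meta is not None else {}
        self.headers = {"User-Agent": random.choice(agents)}

    def get(self):
        # Fetch the page, attach its text to meta, and hand off to the callback.
        resp = requests.get(self.url, headers=self.headers)
        self.meta["response"] = resp.text
        self.callback(self.meta)

The m_queue.TaskQueue used throughout (add_task, pop_task, old_task) is also project-local. The redis and pickle imports in Example #1 suggest pending requests are pickled into Redis; a sketch built on that guess, with the key names and connection details as assumptions:

import pickle
import redis


class TaskQueue(object):
    def __init__(self):
        # Assumed: a local Redis instance on the default port.
        self.r = redis.StrictRedis(host="localhost", port=6379)

    def add_task(self, req):
        # Serialize the request object and push it onto the pending list.
        self.r.lpush("task_queue", pickle.dumps(req))

    def pop_task(self):
        # Pop the oldest pending request, or None when the queue is empty.
        data = self.r.rpop("task_queue")
        return pickle.loads(data) if data else None

    def old_task(self, url):
        # Record a finished URL so it will not be crawled again.
        self.r.sadd("old_tasks", url)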