#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Created on : 2019-03-14 18:49
# @Author : zpy
# @Software: PyCharm

# TODO: flesh this module out

from conf.config import mongo_storage
from plogger import get_logger

log = get_logger("storage")


class BaseBackend(object):
    """Base class for storage backends bound to a single app object."""

    def __init__(self, app):
        """
        Remember the app this backend stores data for.

        :param app: object the backend is bound to; subclasses in this
            module read ``str(app)`` and ``app.group`` from it.
        """
        self.app = app

    def pre_check(self):
        """
        Check the configuration and parameters passed in.

        Currently a placeholder that does nothing.

        :return: None
        """
        pass


class MongoBackend(BaseBackend):
    """Backend that writes documents through ``mongo_storage``."""

    def save(self, data):
        """
        Insert a batch of documents for the bound app.

        The target is ``mongo_storage[str(self.app)][self.app.group]``
        (presumably database name, then collection name -- confirm against
        ``conf.config``).

        :param data: sized collection of documents to insert. An empty or
            ``None`` batch is logged and skipped instead of being sent to
            the server.
        :return: None
        """
        # TODO: should the data itself be passed in here, or something else?
        app_name = str(self.app)
        group = self.app.group

        # Assuming a pymongo collection: insert_many() rejects an empty
        # document list, so an empty batch is treated as a no-op.
        if not data:
            log.info("{} {} insert {}".format(app_name, group, 0))
            return

        log.info("{} {} insert {}".format(app_name, group, len(data)))
        mongo_storage[app_name][group].insert_many(data)
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Created on : 2019-03-11 22:25
# @Author : zpy
# @Software: PyCharm

from plogger import get_logger
from conf.config import redis_client
from backend.storage import MongoBackend

log = get_logger('core_task')

# 1 sdk -> n sources -> n groups

# TODO: config at sub-task granularity (celery settings) and at the
# granularity of one concrete batch of tasks (result settings, backup
# settings, etc.)


class Task(object):
    """
    A thin layer on top of celery; task scheduling, execution and
    dispatch are all meant to go through here.
    """

    # Class-level default; nothing in this block assigns it.
    app = None

    def __init__(self, **kwargs):
        """
        Build a task from keyword arguments.

        :param kwargs: must contain ``tasks``, ``group`` and ``source``;
            each is stored on the instance under the same name.
        :raises KeyError: if any of the three keys is missing.
        """
        # TODO: interface design
        # Looked up in this fixed order so the first missing key is the
        # one reported.
        for field in ('tasks', 'group', 'source'):
            setattr(self, field, kwargs[field])
        self.code = -1
        log.info(('init', kwargs))
#!/usr/bin/python3 # -*- coding: utf-8 -*- # @Created on : 2019-03-06 17:17 # @Author : zpy # @Software: PyCharm # 处理请求相关 import requests from requests_html import HTMLSession import random from conf.config import PROXY as proxy from plogger import get_logger log = get_logger('prequest') def get_proxy(): """ 返回一个代理 :return: """ return requests.get(proxy).json()['proxy'] def ua(): headers_list = [ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36", "Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36", "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36", "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36",
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Created on : 2019-03-06 17:19
# @Author : zpy
# @Software: PyCharm

import abc
from abc import ABCMeta
from plogger import get_logger, func_time_logger
from exceptions import SpiderException
from requests.exceptions import Timeout, ConnectionError
from collections import deque
from spider.prequest import Msession

log = get_logger('pspider')

# TODO: finish v1
# First draft of request parsing
# Chained requests are not considered for now


class Pspider(metaclass=ABCMeta):
    """Abstract spider base class; concrete spiders must implement ``task``."""

    def __init__(self):
        """Start with no tasks, no session and an empty result mapping."""
        self.tasks = list()
        self.session = None
        self.result = dict()

    @abc.abstractmethod
    def task(self):
        """Abstract hook; the base implementation does nothing and returns None."""
        return None
#!/usr/bin/python3 # -*- coding: utf-8 -*- # @Created on : 2019-03-12 15:42 # @Author : zpy # @Software: PyCharm from celery import Celery from kombu import Queue, Exchange from conf.config import celery_broker from plogger import get_logger log = get_logger('celery_init') capp = Celery('app', broker=celery_broker) # 保证任务是可靠的执行了 capp.conf.update(CELERY_REJECT_ON_WORKER_LOST=True, CELERY_ACKS_LATE=True) def init_sdks(): from app.register import _all_sdk_ from app import tasks queues = [] for s in _all_sdk_: s.app = capp name = s.__str__() log.info("load %s", name) tasks.__dict__[name] = s.ptask(name, rate_limit='10/m') queues.append( Queue(name, exchange=Exchange(name, type='direct'),
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Created on : 2019-03-12 17:02
# @Author : zpy
# @Software: PyCharm

from app.task import Task
from example.testspider import LagouSpider
from example.zhihuspider import BihuSpider
from conf.config import redis_client, mongo_storage
import time
from plogger import get_logger

log = get_logger('testsdks')


class TestTask(Task):
    """Minimal task used for testing: prints its arguments and returns a marker."""

    def start(self, **kwargs):
        """
        Print the received keyword arguments.

        :param kwargs: arbitrary keyword arguments, only echoed to stdout.
        :return: the string ``'test'``.
        """
        print('instance start', kwargs)
        return 'test'


class LagouTask(Task):
    """Task that runs a ``LagouSpider`` over this task's ``tasks``."""

    def start(self):
        """
        Run the spider and print every row exported from its ``'job'`` result.

        :return: None
        """
        crawler = LagouSpider()
        crawler.tasks = self.tasks
        crawler.start()

        exported = crawler.result['job'].export_sql('test.test')
        for row in exported:
            print(row)