def __init__(self, settings=None, spidercls=None):
        if isinstance(settings, dict) or settings is None:
            settings = Setting(settings)
        self.settings = settings

        self.lfs = load_object(self.settings['LOG_FORMATTER_CLASS'])
        self.lfm = self.lfs.from_settings(self.settings)

        logger.info(*self.lfm.crawled("CrawlerRunner", '', 'initialized'))

        self.spider_loder = []
        # set holding the Crawler instances
        self._crawlers = set()
        # set holding the Deferred objects
        self._active = set()
        # number of child spiders
        self.MAX_CHILD_NUM = 9
        if spidercls:
            # class of the child spider
            self.spidercls = spidercls
        else:
            # load the child spider class from settings
            self.spidercls = load_object(self.settings['SPIDER_CHILD_CLASS'])

        # flag marking task completion
        self.task_finish = False
        self.slot = None

        self.running = False
        self._task_schedule = queue.Queue()
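
A minimal usage sketch for this constructor. The class name CrawlerRunner is taken from the log message above; the settings keys shown are the ones this snippet reads, but their values here are assumptions for illustration, not from the source:

# Hedged sketch: construct the runner from a plain dict, as the
# isinstance(settings, dict) branch above allows.
settings = {
    'LOG_FORMATTER_CLASS': 'test.framework.log.log.LogFormat',  # module seen in Example #5
    'SPIDER_CHILD_CLASS': 'myproject.spiders.ChildSpider',      # hypothetical dotted path
}
runner = CrawlerRunner(settings=settings)  # spidercls=None -> class loaded from settings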
Example #2
def __init__(self, tasks, settings=None, spidercls=None, name=None, logformat=None, middlewares=None):
        if isinstance(settings, dict) or settings is None:
            settings = Setting(settings)
        self.settings = settings
        self.middlewares = middlewares
        if logformat:
            self.lfm = logformat
        else:
            self.lfs = load_object(self.settings['LOG_FORMATTER_CLASS'])
            self.lfm = self.lfs.from_settings(self.settings)
        self.name = name if name else ''

        logger.info(*self.lfm.crawled("CrawlerRunner", self.name, 'initialized'))
        if isinstance(tasks, dict):
            self.tasks = iter_dict(tasks)
        else:
            self.tasks = tasks
        self.spider_loder = []
        # set holding the Crawler instances
        self._crawlers = set()
        # set holding the Deferred objects
        self._active = set()
        self._active_finish = False
        # number of child spiders (the cap is the same whether or not the runner is named)
        self.MAX_CHILD_NUM = 4
        # whether child spider names are chosen from settings
        # self.SPIDER_NAME_CHOICE = self.settings['SPIDER_NAME_CHOICE']
        self.SPIDER_NAME_CHOICE = False

        # maximum number of buffered addresses
        # self.MAX_SCHEDULE_NUM = 10
        if spidercls:
            # class of the child spider
            self.spidercls = spidercls
        else:
            # load the child spider class from settings
            self.spidercls = load_object(self.settings['SPIDER_CHILD_CLASS'])

        # task completion flags
        # self.filter_task = FilterTask(self.SPIDER_NAME_CHOICE)
        # maximum number of tasks loaded into the queue
        self.filter_task = 10
        self._push_task_finish = False
        self._pull_task_finish = False
        self._next_task = None
        self.fifer = FilterTask(settings)

        self.slot = None
        self._closewait = None

        self.running = False
        self._pause = False
        self._task_schedule = queue.Queue()
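
iter_dict is not shown in the source; judging from its use here, it presumably flattens a dict of tasks into a plain iterable. A hedged sketch of that assumption:

def iter_dict(d):
    # Assumed behaviour: yield (key, item) pairs so that a dict of task
    # lists becomes a flat stream the runner can pull from one at a time.
    for key, items in d.items():
        for item in items:
            yield key, item

# e.g. iter_dict({'news': ['u1', 'u2']}) yields ('news', 'u1'), ('news', 'u2')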
Example #3
def __init__(self, settings=None):
    if isinstance(settings, dict) or settings is None:
        settings = Setting(settings)
    self.settings = settings
    logger.debug(type(self.settings))
    self.spider_loder = _get_spider_loader(settings)
    # set holding the Crawler instances
    self._crawlers = set()
    # set holding the Deferred objects
    self._active = set()
Example #4
def __init__(self, tasks, settings=None):
    if isinstance(settings, dict) or settings is None:
        settings = Setting(settings)
    self.settings = settings
    self.spider_loder = []
    # set holding the Crawler instances
    self._crawlers = set()
    # set holding the Deferred objects
    self._active = set()
    # number of child spiders
    self._childNum = 3
    self.task_schedule = tasks
    self.task_finish = False
    self.slot = None
    self.running = False
    self.searchRes = tasks
    self.child_task = queue.Queue()
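
This variant keeps pending work in a queue.Queue (child_task). A minimal sketch of the push/pull pattern such a runner would use; the loop below is illustrative, not from the source:

import queue

child_task = queue.Queue()

# producer side: enqueue tasks for the child spiders
for task in ["url-a", "url-b", "url-c"]:
    child_task.put(task)

# consumer side: drain the queue, as a runner loop might
while not child_task.empty():
    task = child_task.get()
    print("dispatch to child spider:", task)
    child_task.task_done()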
Example #5
import logging
import pprint

from test.framework.log.log import LogFormat
from test.framework.setting import Setting
from test.framework.log.logfilter import ErrorFilter
s = Setting()
a = LogFormat(s)

logger = logging.getLogger("lala")  # create a logger instance

logger.debug(*a.crawled("Spider", "lala",
                        'an error occurred',
                        {'function': 'Scraper', 'request': "baba", 'time': 333.333}),
             extra={'extra_info': 'error'})


logger.info(*a.crawled("Spider", 'works',
                       'info',
                       {'time': 6.7777777, 'function': "engine"}),
            extra={'extra_info': " {:d} tasks remaining in progress".format(3)})


logger.info(*a.crawled("Spider", "lala",
                       'an error occurred',
                       {'function': 'Scraper', 'request': "baba"}))
Example #6
from twisted.internet import reactor

from test.framework.setting import Setting
# Request, HTTPDownloadHandler, Spider1, url, headers and request_callback
# are assumed to be defined or imported elsewhere in this test module.


def request_errback(content):
    print("request_and_response errback")
    print(content)
    return content


def agent_print(content):
    print("agent_print")
    print(type(content))
    print(content)


request = Request(url=url,
                  callback=request_callback,
                  method='get',
                  headers=headers,
                  errback=request_errback,
                  meta={"download_timeout": 2})

settings = Setting()

spider = Spider1.update_settings(settings)

httphandler = HTTPDownloadHandler(settings)
agent = httphandler.download_request(request, spider)
agent.addCallback(agent_print)
agent.addErrback(request_errback)
agent.addBoth(lambda _: reactor.stop())

reactor.run()
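
The addCallback/addErrback/addBoth chain above is standard Twisted Deferred usage. A self-contained sketch of the same pattern using plain Twisted, with no framework code:

from twisted.internet import defer, reactor

def on_success(result):
    print("got:", result)
    return result

def on_failure(failure):
    print("failed:", failure)
    return failure

d = defer.Deferred()
d.addCallback(on_success)            # runs on a successful result
d.addErrback(on_failure)             # runs if an error propagates
d.addBoth(lambda _: reactor.stop())  # always runs, like a finally clause

reactor.callLater(0, d.callback, "page body")  # fire the chain once running
reactor.run()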
Example #7
from test.framework.core.crawler import _get_spider_loader
from test.framework.setting import Setting, overridden_or_new_settings
import logging

logging.basicConfig(level=logging.DEBUG)

s = Setting()

_active = set()
new_s = s.copy()
cls = _get_spider_loader(new_s)

new_and_over_setting = ["TEST2 = t2", "TEST3 = t3"]

temp = [x.split('=', 1) for x in new_and_over_setting]

tm = dict(temp)

for k, v in tm.items():
    new_s.set(k.strip(), v.strip(), "project")

d = dict(overridden_or_new_settings(new_s))

logging.info("Overridden settings:\n %(settings)r", {'settings': d})


# Push the merged settings into every loaded spider class.
for name, module in cls._spiders.items():
    module.update_settings(new_s)
    overridden_or_new_settings(new_s)
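
overridden_or_new_settings is consumed above as an iterable of (name, value) pairs, but its implementation is not shown in the source. A hedged sketch of the idea, assuming a Scrapy-style settings object whose values live in an attributes mapping:

def overridden_or_new_settings_sketch(settings, defaults=None):
    # Assumption: yield (name, value) for any setting that is new or
    # differs from the defaults; the .attributes mapping is assumed.
    defaults = defaults if defaults is not None else Setting()
    for name in settings.attributes:
        value = settings[name]
        if name not in defaults.attributes or defaults[name] != value:
            yield name, value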