def parse(self, response):
    """Scrapy success callback: wrap the response and signal it for storage.

    The originating task rides along in ``response.request.meta['item']``
    as a ``(task, ...)`` pair set by the request builder.
    """
    TDDCLogging.debug('Download Success. ' + response.url)
    task, _ = response.request.meta.get('item')
    payload = {'rsp': [response.url, response.status],
               'content': response.body}
    callback = self.signals_callback
    if callback:
        callback(self, SingleSpider.SIGNAL_STORAGE, [task, payload])
def _get_status(self):
    """Monitor loop: tally per-platform status-hash sizes and re-queue
    tasks that have sat in a status hash for more than 20 seconds.

    Runs forever; sleeps 60s between scans and dumps the collected
    ``self._status`` snapshot after each pass.
    """
    while True:
        # BUG FIX: the original hard-coded ``cur_time = 1495087998`` with
        # ``time.time()`` left in a comment. The real call could not work
        # because the inner loop re-bound the name ``time`` (shadowing the
        # time module for the whole function). The shadowing local is
        # renamed ``ts`` below, so the module call is restored here.
        cur_time = time.time()
        keys = self.keys(MonitorSite.STATUS_HSET_PREFIX + '.*')
        for key in keys:
            h_len = self.hlen(key)
            # Key layout: <prefix>.<platform>.<status>
            platform, status = key.split('.')[-2:]
            if not self._status.get(platform):
                self._status[platform] = {}
            self._status[platform][status] = h_len
            item = self.hscan_iter(key)
            for index, (url, task) in enumerate(item):
                task = json.loads(task)
                task = Task(**task)
                ts = task.timestamp
                if int(ts) < cur_time - 20:
                    # Stale task: push it onto the exception queue so it
                    # gets crawled again.
                    MonitorQueues.EXCEPTION_TASK.put(task)
                    TDDCLogging.debug(
                        str(index) + ' : ' + task.platform + ' : ' + url +
                        ' : ' + str(task.status) + ' : ' + str(ts) +
                        ' : ' + 'Crawl Again.')
                    # BUG FIX: delete the entry from the hash it was read
                    # from (``key``); the original passed the bare prefix,
                    # which names no existing hash, so stale entries were
                    # never actually removed and were re-queued every pass.
                    self.hdel(key, url)
        gevent.sleep(60)
        TDDCLogging.debug(
            json.dumps(self._status, sort_keys=True, indent=4))
def _task_status_update(self):
    """Consume parser task-status updates forever, logging and tallying.

    Blocks on ``ParserQueues.TASK_STATUS`` and bumps both success
    counters per item (``_successed_pre_min`` is presumably reset by a
    per-minute reporter elsewhere — not visible in this file).
    """
    while True:
        finished = ParserQueues.TASK_STATUS.get()
        msg = '[{}:{}:{}]'.format(finished.platform,
                                  finished.url,
                                  finished.status)
        TDDCLogging.debug(msg)
        self._successed_num += 1
        self._successed_pre_min += 1
def add_task(self, task, is_retry=False, times=1):
    """Build a GET or POST request for *task* and schedule it on the engine.

    Args:
        task: task object providing at least ``url`` and ``method``.
        is_retry: True when re-scheduling a previously attempted task
            (suppresses the "Add New Task" log line).
        times: attempt counter forwarded to the request builders.
    """
    if not is_retry:
        TDDCLogging.debug('Add New Task: ' + task.url)
    headers = self._init_request_headers(task)
    # BUG FIX: the original called the free function ``upper(...)``, which
    # only exists as Python 2's ``string.upper`` (removed in Python 3).
    # The str method is the portable equivalent. Missing/empty method
    # still defaults to GET.
    if not task.method or task.method.upper() == 'GET':
        req = self._make_get_request(task, headers, times)
    else:
        req = self._make_post_request(task, headers, times)
    self.crawler.engine.schedule(req, self)
def _generator(self):
    """Cookie-refresh loop: regenerate cookies for every platform whose
    registered generator entry has passed its expiry window.

    ``self._generators`` maps platform -> [last_run_time, generator_cls];
    the first slot is updated in place after each refresh (entries must
    therefore be mutable sequences).
    """
    cookies_info = {}
    while True:
        TDDCLogging.debug('Generating.')
        now = time.time()
        for platform, (last_run, generator_cls) in self._generators.items():
            # Note: attribute name EXPRIED (sic) is defined by the
            # generator classes elsewhere; kept as-is.
            if now - last_run <= generator_cls.EXPRIED:
                continue
            TDDCLogging.debug('Generating Cookies [%s].' % platform)
            cookies_info[platform] = generator_cls().cookies
            self._generators[platform][0] = now
            TDDCLogging.debug('Generated Cookies [%s].' % platform)
            TDDCLogging.debug(json.dumps(cookies_info))
        TDDCLogging.debug('Generated.')
        gevent.sleep(5)
def _push_parse_task(self):
    """Producer loop: forward successfully crawled tasks to the parse topic.

    Pulls ``(task, status)`` pairs from ``CrawlerQueues.PARSE``, pushes a
    snapshot copy to ``CrawlerSite.PARSE_TOPIC``, and on success queues
    the copy for status-hash removal and bumps the success counters.
    """
    TDDCLogging.info('--->Parse Task Producer Was Ready.')
    while True:
        task, status = CrawlerQueues.PARSE.get()
        # BUG FIX: validate the item's type BEFORE touching it. The
        # original built ``Task(**task.__dict__)`` and mutated
        # ``task.status`` first, so a malformed queue item would crash
        # the loop instead of being skipped by this guard.
        if not isinstance(task, Task):
            TDDCLogging.error('')  # NOTE(review): empty message in original — consider a real one
            continue
        # Snapshot before flipping the status, preserving the original
        # ordering: the pushed copy keeps the pre-success status.
        tmp = Task(**task.__dict__)
        task.status = Task.Status.CRAWL_SUCCESS
        if not self._push_task(CrawlerSite.PARSE_TOPIC, tmp):
            TDDCLogging.error('')  # NOTE(review): empty message in original — consider a real one
        else:
            CrawlerQueues.TASK_STATUS_REMOVE.put(tmp)
            TDDCLogging.debug('[%s:%s] Crawled Successed(%d).'
                              % (task.platform, task.row_key, status))
            self._successed_num += 1
            self._successed_pre_min += 1