Example #1
 def parse(self, response):
     # Scrapy download callback: unpack the originating task from the
     # request meta and hand the raw response to the storage handler.
     TDDCLogging.debug('Download Success. ' + response.url)
     task, _ = response.request.meta.get('item')
     rsp_info = {'rsp': [response.url, response.status],
                 'content': response.body}
     if self.signals_callback:
         self.signals_callback(self, SingleSpider.SIGNAL_STORAGE,
                               [task, rsp_info])
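
The callback above assumes the request carried the originating task under meta['item']. A minimal sketch of how such a request might be built with Scrapy; the (task, None) tuple layout and the dont_filter flag are assumptions, not taken from the project:

 import scrapy

 def make_request(task):
     # Hypothetical helper: stash the task (plus a spare slot) in meta
     # so parse() can unpack it with task, _ = meta.get('item').
     return scrapy.Request(url=task.url,
                           meta={'item': (task, None)},
                           dont_filter=True)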
Example #2
 def _get_status(self):
     while True:
         cur_time = time.time()
         # Each status hash is keyed '<prefix>.<platform>.<status>' and
         # maps url -> JSON-serialized Task.
         keys = self.keys(MonitorSite.STATUS_HSET_PREFIX + '.*')
         for key in keys:
             h_len = self.hlen(key)
             platform, status = key.split('.')[-2:]
             if not self._status.get(platform):
                 self._status[platform] = {}
             self._status[platform][status] = h_len
             items = self.hscan_iter(key)
             for index, (url, task) in enumerate(items):
                 task = Task(**json.loads(task))
                 timestamp = task.timestamp  # avoid shadowing the time module
                 if int(timestamp) < cur_time - 20:
                     # Stuck for more than 20 seconds: requeue the task
                     # and drop the stale entry from this hash.
                     MonitorQueues.EXCEPTION_TASK.put(task)
                     TDDCLogging.debug(
                         '{} : {} : {} : {} : {} : Crawl Again.'.format(
                             index, task.platform, url,
                             task.status, timestamp))
                     self.hdel(key, url)
         gevent.sleep(60)
         TDDCLogging.debug(
             json.dumps(self._status, sort_keys=True, indent=4))
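
For context, a sketch of the hash layout the monitor scans, written with redis-py; the key shape is inferred from key.split('.')[-2:], but the prefix string and field contents shown here are assumptions:

 import json
 import time
 import redis

 r = redis.Redis(decode_responses=True)
 # Hypothetical writer side: one hash per (platform, status) pair,
 # each field mapping a url to its JSON-serialized task.
 key = 'tddc.task.status' + '.zhihu.100'  # <prefix>.<platform>.<status>
 r.hset(key, 'http://example.com/item/1',
        json.dumps({'url': 'http://example.com/item/1',
                    'platform': 'zhihu',
                    'status': 100,
                    'timestamp': int(time.time())}))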
Example #3
 def _task_status_update(self):
     # Drain the parser's status queue and keep running success counters.
     while True:
         task = ParserQueues.TASK_STATUS.get()
         TDDCLogging.debug('[{}:{}:{}]'.format(task.platform, task.url,
                                               task.status))
         self._successed_num += 1
         self._successed_pre_min += 1
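
The _successed_pre_min counter suggests a companion loop that reports and resets it once a minute; that loop is not shown in the source, so this is only a sketch under that assumption:

 import gevent

 def _per_min_statistics(self):
     # Hypothetical reporting greenlet: log and reset the per-minute
     # success counter every 60 seconds.
     while True:
         gevent.sleep(60)
         TDDCLogging.info('Parsed %d task(s) in the last minute.'
                          % self._successed_pre_min)
         self._successed_pre_min = 0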
Example #4
 def add_task(self, task, is_retry=False, times=1):
     if not is_retry:
         TDDCLogging.debug('Add New Task: ' + task.url)
     headers = self._init_request_headers(task)
     # Build a GET request unless the task explicitly asks for
     # another method.
     if not task.method or task.method.upper() == 'GET':
         req = self._make_get_request(task, headers, times)
     else:
         req = self._make_post_request(task, headers, times)
     self.crawler.engine.schedule(req, self)
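
_make_get_request and _make_post_request are not shown in the source; with Scrapy they might look roughly like the following, where the meta layout mirrors the meta['item'] convention from Example #1 and the task.data attribute is a guess:

 from scrapy import FormRequest, Request

 def _make_get_request(self, task, headers, times):
     # Hypothetical: carry the task and the attempt count in meta.
     return Request(url=task.url, headers=headers,
                    meta={'item': (task, times)}, dont_filter=True)

 def _make_post_request(self, task, headers, times):
     return FormRequest(url=task.url, method='POST', headers=headers,
                        formdata=getattr(task, 'data', {}),
                        meta={'item': (task, times)}, dont_filter=True)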
Example #5
 def _generator(self):
     cookies_info = {}
     while True:
         TDDCLogging.debug('Generating.')
         cur_time = time.time()
         # self._generators maps platform -> [last_refresh_time, cls];
         # refresh a platform's cookies once its TTL (cls.EXPRIED) lapses.
         for platform, (pre_time, cls) in self._generators.items():
             if cur_time - pre_time > cls.EXPRIED:
                 TDDCLogging.debug('Generating Cookies [%s].' % platform)
                 cookies_info[platform] = cls().cookies
                 self._generators[platform][0] = cur_time
                 TDDCLogging.debug('Generated Cookies [%s].' % platform)
                 TDDCLogging.debug(json.dumps(cookies_info))
         TDDCLogging.debug('Generated.')
         gevent.sleep(5)
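
Each value in self._generators is a [last_refresh_time, generator_cls] pair, and the class is expected to expose an EXPRIED interval (the project's own spelling) plus a cookies attribute on instances. A minimal sketch of a conforming generator class; everything beyond those two names is an assumption:

 import requests

 class ExampleCookiesGenerator(object):
     EXPRIED = 600  # seconds before cached cookies are considered stale

     def __init__(self):
         # Hypothetical: hit a placeholder login page and keep the
         # session cookies as a plain dict.
         rsp = requests.get('http://example.com/login')
         self.cookies = rsp.cookies.get_dict()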
Example #6
 def _push_parse_task(self):
     TDDCLogging.info('--->Parse Task Producer Was Ready.')
     while True:
         task, status = CrawlerQueues.PARSE.get()
         # Validate before touching task.__dict__ so bad items are
         # skipped early instead of after the copy.
         if not isinstance(task, Task):
             TDDCLogging.error('Invalid Task In PARSE Queue: %s' % type(task))
             continue
         tmp = Task(**task.__dict__)
         task.status = Task.Status.CRAWL_SUCCESS
         if not self._push_task(CrawlerSite.PARSE_TOPIC, tmp):
             TDDCLogging.error('Push Parse Task Failed: [%s:%s]' %
                               (task.platform, task.row_key))
         else:
             CrawlerQueues.TASK_STATUS_REMOVE.put(tmp)
             TDDCLogging.debug('[%s:%s] Crawled Successfully(%d).' %
                               (task.platform, task.row_key, status))
             self._successed_num += 1
             self._successed_pre_min += 1
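
_push_task is not shown; if CrawlerSite.PARSE_TOPIC names a Kafka topic, the helper might look roughly like this sketch using kafka-python (the _producer attribute and the JSON-over-UTF-8 encoding are assumptions):

 import json

 def _push_task(self, topic, task):
     # Hypothetical: self._producer is a kafka.KafkaProducer created
     # elsewhere with bootstrap_servers=... .
     try:
         self._producer.send(topic,
                             json.dumps(task.__dict__).encode('utf-8'))
         return True
     except Exception as e:
         TDDCLogging.error('Push Task Exception: %s' % e)
         return False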