Ejemplo n.º 1
0
 def _push_task(self):
     """Forward queued output tasks to the output topic, forever.

     Each task taken from ``self._queues.TASK_OUTPUT`` is handed to
     ``self._push`` together with ``self._site.TASK_OUTPUT_TOPIC``.
     ``self.pushed`` is called for every task whose push reported success;
     an error is logged for the others.  The loop has no exit condition.
     """
     TDDCLogging.info('--->Task Output Producer Was Ready.')
     while True:
         outgoing = self._queues.TASK_OUTPUT.get()
         delivered = self._push(self._site.TASK_OUTPUT_TOPIC, outgoing)
         if delivered:
             self.pushed(outgoing)
             continue
         # A task that could not be pushed is only logged; it is not
         # put back on the queue here.
         TDDCLogging.error('Push Task Failed.')
Ejemplo n.º 2
0
 def _parse(self):
     if not self._json_dict.get('success'):
         TDDCLogging.warning('Crawled[{}:{}] Failed.'.format(
             self._task.platform, self._task.url))
         return
     data = self._json_dict.get('data')
     if not data:
         TDDCLogging.warning('Crawled[{}:{}] Exception.'.format(
             self._task.platform, self._task.url))
         return
     self._get_detail_extra_info(data)
Ejemplo n.º 3
0
 def _consume_msg_exp(self, exp_type, info, exception=None):
     if 'JSON_ERR' in exp_type:
         TDDCLogging.error('*' * 5 + exp_type + '*' * 5 + '\nException: ' +
                           info + '\n' + exception.message + '\n' + '*' *
                           (10 + len(exp_type)) + '\n')
     elif 'TASK_ERR' in exp_type or 'EVENT_ERR' in exp_type:
         TDDCLogging.error('*' * 5 + exp_type + '*' * 5 + '\nException: ' +
                           'item={item}\n'.format(item=info) +
                           'item_type={item_type}\n'.format(
                               item_type=type(info)) + '*' *
                           (10 + len(exp_type)) + '\n')
Ejemplo n.º 4
0
 def _parse(self):
     if not self._json_dict.get('success'):
         TDDCLogging.warning('Crawled[{}:{}] Failed.'.format(self._task.platform,
                                                             self._task.url))
         return
     data = self._json_dict.get('data')
     if not data:
         TDDCLogging.warning('Crawled[{}:{}] Exception.'.format(self._task.platform,
                                                                self._task.url))
         return
     if data.get('pageIndex') == 1:
         self._make_bid_list_tasks(data)
     self._make_detail_task(data.get('data'))
Ejemplo n.º 5
0
 def _push(self, topic, task, times=0):
     if not task:
         return False
     msg = json.dumps(task.__dict__)
     if msg:
         try:
             if self.ready_to_push(task):
                 self._task_output_producer.send(topic, msg)
         except Exception, e:
             TDDCLogging.warning('Push Task Field: ' + e.message)
             gevent.sleep(1)
             if times == 10:
                 return False
             return self._push(topic, task, times + 1)
         else:
             return True
Ejemplo n.º 6
0
 def _make_detail_task(self, data):
     referer_base_url = 'https://www.weidai.com.cn/bid/showBidDetail?hash={hash}' 
     base_url = 'https://www.weidai.com.cn/bid/bidDetail?hash={hash}&bid='
     for detail_info in data:
         path = detail_info.get('hash')
         if not path:
             TDDCLogging.warning('Path Is None.')
             return
         task = Task()
         task.url = base_url.format(hash=path)
         task.platform = self.platform
         task.feature = 'weidai.bid_detail'
         task.headers = {'Referer': referer_base_url.format(hash=path),
                         'X-Requested-With': 'XMLHttpRequest'}
         self._md5_mk.update(task.url)
         task.row_key = self._md5_mk.hexdigest()
         self.tasks.append(task)
Ejemplo n.º 7
0
 def _fetch_task(self):
     """Poll the task input consumer forever, with queue-based back-pressure.

     While ``self._queues.TASK_INPUT`` holds more than
     ``self._site.LOCAL_TASK_QUEUE_SIZE`` items the consumer is committed
     and unsubscribed (once) and the loop just sleeps.  After a pause, the
     consumer is re-subscribed to ``self._site.TASK_INPUT_TOPIC`` as soon as
     the queue has dropped below half of that size.  Every record returned
     by ``poll`` is handed to ``self._record_proc``.
     """
     TDDCLogging.info('--->Task Input Consumer Was Ready.')
     paused = False
     while True:
         queue_limit_exceeded = (self._queues.TASK_INPUT.qsize() >
                                 self._site.LOCAL_TASK_QUEUE_SIZE)
         if queue_limit_exceeded:
             if not paused:
                 # Commit before unsubscribing.
                 self._task_input_consumer.commit()
                 self._task_input_consumer.unsubscribe()
                 paused = True
                 TDDCLogging.info('Task Input Consumer Was Paused.')
             gevent.sleep(1)
             continue
         # NOTE(review): while paused and the queue sits between half and
         # full size, execution still reaches poll() on the unsubscribed
         # consumer -- confirm that is harmless.
         if paused and (self._queues.TASK_INPUT.qsize() <
                        self._site.LOCAL_TASK_QUEUE_SIZE / 2):
             self._task_input_consumer.subscribe(
                 self._site.TASK_INPUT_TOPIC)
             paused = False
             TDDCLogging.info('Task Input Consumer Was Resumed.')
         batches = self._task_input_consumer.poll(2000, 16)
         if not batches:
             # Nothing fetched: back off briefly before polling again.
             gevent.sleep(1)
             continue
         for batch in batches.values():
             for record in batch:
                 self._record_proc(record)