def search(self, agent, task):
    '''Download a shop detail page and assemble its parsed info dict.

    Twisted inlineCallbacks-style generator: yields the download, then
    fires its Deferred with ``result`` via returnValue.

    :param agent: HTTP agent passed through to self.getContent.
    :param task: task object passed through to self.getContent.
    :returns: dict merging basic/intro/extra info; empty dict on failure.
    '''
    result = {}
    try:
        data, url = yield self.getContent(agent, task)
        el = lxml.etree.HTML(data)
        # The page is expected to carry (at least) three 'content_detail'
        # divs: basic info, intro info, extra info — each parsed
        # independently so a missing section degrades to {} instead of
        # aborting the whole parse.
        mc = el.xpath("//div[@class='r_sub_box']/div[@class='content_detail']")
        try:
            basic_info = NodeService.parse_basic_info(mc[0])
        except IndexError:
            basic_info = {}
        try:
            intro_info = NodeService.parse_intro_info(mc[1])
        except IndexError:
            intro_info = {}
        try:
            extra_info = NodeService.parse_extra_info(mc[2])
            # NOTE(review): assumes parse_extra_info always returns a dict
            # with an 'extra' list of dicts carrying 'link' keys — a KeyError
            # here is caught by the outer handler, not the IndexError below.
            for u in extra_info['extra']:
                # resolve relative links against the page URL
                u['link'] = urllib2.urlparse.urljoin(url, u['link'])
        except IndexError:
            extra_info = {}
        result.update(basic_info)
        result.update(intro_info)
        result.update(extra_info)
    except Exception as msg:
        # best-effort: any download/parse failure is logged and an empty
        # (or partial) result is returned instead of propagating
        log.debug("Got Something Wrong with Task: %s Error: %s" % (repr(task), repr(msg)))
    returnValue(result)
def gotResult(self, data, task, ttype): ''' 获取数据。任务分2种。 1. 商铺信息,需要抓取商铺的商品列表 2. 商品信息,需要抓取商品的基本信息 ''' # TODO refactor this if data: if ttype == 'extract': total_page, hrefs = json.loads(data) total_page = int(total_page) hrefs = json.loads(hrefs) tids = check_duplicate(self.redis, hrefs) #save_tasks(self.redis, tids) for h in hrefs: tmp_tid = self.new_task_id() log.info(h) tmp_tbody = {'task': h} tmp_task = BaseTask(tmp_tid, tmp_tbody) self.redis.push_list_data('task_queue', cPickle.dumps(tmp_task)) task = cPickle.loads(task) page = task.tbody.get('page', 1) if page == 1 and page < total_page: tmp_tbody = task.tbody for p in xrange(page, total_page): tmp_tid = self.new_task_id() tmp_tbody['page'] = p+1 tmp_task = BaseTask(tmp_tid, tmp_tbody) self.redis.push_list_data('extract_queue', cPickle.dumps(tmp_task)) else: save_items(json.loads(data)) else: log.debug('Got an invalid task: %s when taking task: %s' % (task, ttype))
def json_loads(s):
    '''json.loads wrapper that logs the offending payload before re-raising.

    :param s: JSON text to decode.
    :returns: the decoded object.
    :raises ValueError: when ``s`` is not valid JSON (logged first).
    '''
    try:
        return json.loads(s)
    except ValueError:
        # json.loads signals malformed input with ValueError; catching only
        # that (instead of the original bare ``except:``) keeps
        # KeyboardInterrupt/SystemExit and programming errors visible.
        log.debug("error when json decode: " + s)
        raise
def getContent(self, agent, task):
    '''Build the paginated listing URL from the task body and download it.

    The body's prefix/suffix are joined into a %-format template which is
    filled with the category code and page number.
    '''
    body = task.tbody
    template = urllib2.urlparse.urljoin(body.get('prefix'), body.get('suffix'))
    target = template % (body.get('ccode'), body.get('page'))
    log.debug('Getting data with url: %s' % target)
    page = yield request(agent, target)
    returnValue(page)
def request(agent, url, headers=None, body=None):
    '''Issue an HTTP request through a Twisted agent.

    GET when ``body`` is None, otherwise POST with the body streamed via a
    FileBodyProducer. ``url`` is coerced to str because twisted's Agent
    rejects unicode URLs.

    :returns: a Deferred firing with the response processed by _cbRequest.
    '''
    log.debug('begin request ' + url)
    # (removed leftover debug statement: ``print agent, agent.request``)
    if body is None:
        d = agent.request('GET', str(url), headers)
    else:
        d = agent.request('POST', str(url), headers,
                          client.FileBodyProducer(StringIO(body)))
    d.addCallback(_cbRequest)
    return d
def getContent(self, agent, task):
    '''Unpickle a queued task and download its target URL.

    :param task: pickled BaseTask whose tbody carries 'task' (the URL)
        and 'prefix'.
    :returns: (response, prefix) via returnValue.
    '''
    job = cPickle.loads(task)
    body = job.tbody
    target = body.get('task')
    log.debug('Getting data with url: %s' % target)
    page = yield request(agent, target)
    returnValue((page, body.get('prefix')))
def search(self, agent, task):
    '''Fetch one shop listing page and extract pagination plus item links.

    :returns: (page_count, JSON-encoded list of absolute hrefs); on any
        failure the defaults (-1, "null") are returned after logging.
    '''
    task = cPickle.loads(task)
    pages, hrefs = -1, None
    try:
        data = yield self.getContent(agent, task)
        root = lxml.etree.HTML(data)
        content = root.xpath("//div[@class='r_sub_box']/div[@class='middle_content']/div[@class='page_content clearfix']")[0]
        pages = NodeService.parse_pages(content)
        prefix = task.tbody.get('prefix')
        # resolve each relative item link against the task's prefix
        hrefs = [urllib2.urlparse.urljoin(prefix, item)
                 for item in NodeService.parse_items(content)]
    except Exception as msg:
        log.debug("Got Something Wrong with url: %s Error: %s" % (repr(task), repr(msg)))
    returnValue((pages, json.dumps(hrefs)))
def search(self, agent, task):
    '''Download a shop listing page; return its page count and item links.

    :returns: (page_count, JSON-encoded hrefs list) — (-1, "null") when
        download or parsing fails.
    '''
    task = cPickle.loads(task)
    pages, hrefs = -1, None
    listing_xpath = ("//div[@class='r_sub_box']"
                     "/div[@class='middle_content']"
                     "/div[@class='page_content clearfix']")
    try:
        data = yield self.getContent(agent, task)
        tree = lxml.etree.HTML(data)
        node = tree.xpath(listing_xpath)[0]
        pages = NodeService.parse_pages(node)

        def absolutize(href):
            # resolve a relative item link against the task's prefix
            return urllib2.urlparse.urljoin(task.tbody.get('prefix'), href)

        hrefs = map(absolutize, NodeService.parse_items(node))
    except Exception as msg:
        log.debug("Got Something Wrong with url: %s Error: %s" % (repr(task), repr(msg)))
    returnValue((pages, json.dumps(hrefs)))
def gotResult(self, data, task, ttype):
    '''Dispatch a finished download. Two task kinds:

    1. shop info ('extract'): enqueue one item task per link and fan out
       the remaining listing pages.
    2. item info (anything else): persist the decoded items.
    '''
    if not data:
        log.debug('Got an invalid task: %s when taking task: %s' % (task, ttype))
        return
    if ttype != 'extract':
        save_items(json.loads(data))
        return
    # listing branch: data is a JSON pair, the href list doubly encoded
    page_count, encoded = json.loads(data)
    page_count = int(page_count)
    links = json.loads(encoded)
    tids = check_duplicate(self.redis, links)
    #save_tasks(self.redis, tids)
    for href in links:
        new_id = self.new_task_id()
        log.info(href)
        item_task = BaseTask(new_id, {'task': href})
        self.redis.push_list_data('task_queue', cPickle.dumps(item_task))
    task = cPickle.loads(task)
    current = task.tbody.get('page', 1)
    # only the first page fans out the rest, so resumed tasks don't
    # re-enqueue the whole range
    if current == 1 and current < page_count:
        body = task.tbody
        for p in xrange(current, page_count):
            new_id = self.new_task_id()
            body['page'] = p + 1
            page_task = BaseTask(new_id, body)
            self.redis.push_list_data('extract_queue', cPickle.dumps(page_task))
def searchLoop(self, agent):
    '''Worker loop: pull 'extract' tasks and run them until the agent dies.

    Each iteration fetches the next request from the controller, runs the
    search, and reports the (possibly None) result back. An
    InfiniteLoginError marks the task failed and ends the loop after the
    result is sent; any other error is logged and the loop continues.
    '''
    needbreak = False
    while 1:
        result = None
        if agent.remove:
            self.agent_pool.removeAgent(agent)
            break
        reqid, task = yield self.callController('nextRequest', 'extract')
        log.info(repr(task))
        try:
            result = yield self.search(agent, task)
            log.debug('Got data %s' % repr(result))
        except InfiniteLoginError:
            log.exception()
            yield self.callController("fail", task=task)
            needbreak = True
        except Exception:
            # was a bare ``except:`` — inside a generator that also
            # swallows GeneratorExit and breaks cleanup; Exception keeps
            # the best-effort behavior without that hazard
            log.exception()
        self.callController('sendResult', reqid, task, json.dumps(result))
        if needbreak:
            break
def searchLoop(self, agent):
    '''Worker loop: pull 'data' tasks and run them until the agent dies.

    Each iteration fetches the next request from the controller, runs the
    search, and reports the (possibly None) result back. An
    InfiniteLoginError marks the task failed and ends the loop after the
    result is sent; any other error is logged and the loop continues.
    '''
    needbreak = False
    while 1:
        result = None
        if agent.remove:
            self.agent_pool.removeAgent(agent)
            break
        reqid, task = yield self.callController('nextRequest', 'data')
        log.info('Got Task %s with reqid: %s' % (repr(task), reqid))
        try:
            result = yield self.search(agent, task)
            log.debug('Got data %s' % repr(result))
        except InfiniteLoginError:
            log.exception()
            yield self.callController("fail", task=task)
            needbreak = True
        except Exception:
            # was a bare ``except:`` — inside a generator that also
            # swallows GeneratorExit and breaks cleanup; Exception keeps
            # the best-effort behavior without that hazard
            log.exception()
        self.callController('sendResult', reqid, task, json.dumps(result))
        if needbreak:
            break
def __getattr__(self, option):
    '''Expose config options as attributes via self.get().

    Unknown options are logged and yield None instead of raising.
    '''
    try:
        value = self.get(option)
    except NoOptionError:
        log.debug("Got an unknown config: %s" % option)
        return None
    return value