Example #1
0
    def search(self, agent, task):
        ''' Download a shop detail page and merge its parsed sections into one dict.

        The page is split into 'content_detail' sections; the first three are
        handed to NodeService.parse_basic_info / parse_intro_info /
        parse_extra_info respectively. A section that raises IndexError
        (typically because it is absent) contributes nothing. Any other error
        is logged and an empty dict is returned.
        '''
        merged = {}
        try:
            page, base_url = yield self.getContent(agent, task)
            sections = lxml.etree.HTML(page).xpath(
                "//div[@class='r_sub_box']/div[@class='content_detail']")

            try:
                basic = NodeService.parse_basic_info(sections[0])
            except IndexError:
                basic = {}

            try:
                intro = NodeService.parse_intro_info(sections[1])
            except IndexError:
                intro = {}

            try:
                extra = NodeService.parse_extra_info(sections[2])
                # Rewrite every extra link relative to the URL getContent returned.
                for entry in extra['extra']:
                    entry['link'] = urllib2.urlparse.urljoin(base_url, entry['link'])
            except IndexError:
                extra = {}

            # Merge only after all three sections were processed, so a failure
            # above leaves the result empty rather than half-filled.
            for part in (basic, intro, extra):
                merged.update(part)
        except Exception as err:
            log.debug("Got Something Wrong with Task: %s Error: %s" % (repr(task), repr(err)))

        returnValue(merged)
Example #2
0
    def gotResult(self, data, task, ttype):
        '''
            Handle the result a worker reported for ``task``.

            Tasks come in two kinds:
            1. 'extract' (shop listing): ``data`` decodes to a
               ``[total_page, hrefs]`` pair, ``hrefs`` being itself JSON text.
               Every href is queued on 'task_queue'; when the task was for
               page 1 and more pages exist, one task per remaining page is
               queued on 'extract_queue'.
            2. anything else (item detail): the decoded ``data`` is passed to
               save_items.

            data:  JSON text; a falsy value is only logged.
            task:  pickled task object (unpickled here for 'extract' only).
            ttype: task type string; 'extract' selects the listing branch.
        '''
        # TODO refactor this
        if not data:
            log.debug('Got an invalid task: %s when taking task: %s' % (task, ttype))
            return

        if ttype != 'extract':
            save_items(json.loads(data))
            return

        total_page, hrefs = json.loads(data)
        total_page = int(total_page)
        # The href payload can be JSON null, which decodes to None and is not
        # iterable; treat it as "no items found".
        hrefs = json.loads(hrefs) or []
        # NOTE(review): the return value is unused while save_tasks stays
        # disabled; the call is kept in case check_duplicate has side effects.
        check_duplicate(self.redis, hrefs)
        #save_tasks(self.redis, tids)
        for h in hrefs:
            log.info(h)
            item_task = BaseTask(self.new_task_id(), {'task': h})
            self.redis.push_list_data('task_queue', cPickle.dumps(item_task))

        # NOTE(review): unpickling runs arbitrary code if the payload is not
        # trusted -- confirm only our own controller feeds this value.
        task = cPickle.loads(task)
        page = task.tbody.get('page', 1)
        if page == 1 and page < total_page:
            # Only the first page fans out the remaining ones, so each page
            # is scheduled exactly once.
            page_tbody = task.tbody
            for next_page in xrange(page + 1, total_page + 1):
                # Reusing one tbody is safe: it is pickled before the next
                # iteration overwrites 'page'.
                page_tbody['page'] = next_page
                page_task = BaseTask(self.new_task_id(), page_tbody)
                self.redis.push_list_data('extract_queue', cPickle.dumps(page_task))
Example #3
0
def json_loads(s):
    ''' Decode the JSON text ``s`` and return the resulting object.

    On failure the offending input is logged at debug level and the original
    exception is re-raised unchanged.
    '''
    try:
        return json.loads(s)
    except Exception:
        # Format instead of concatenating: "..." + s raises a second
        # TypeError for non-string input and would hide the real error.
        log.debug("error when json decode: %s" % (s,))
        raise
Example #4
0
def json_loads(s):
    ''' Parse ``s`` as JSON and return the decoded value.

    If decoding fails, the input is written to the debug log and the
    exception raised by the decoder propagates to the caller.
    '''
    try:
        decoded = json.loads(s)
    except Exception:
        # %-formatting copes with None / non-string input, whereas string
        # concatenation would raise TypeError here and mask the decode error.
        log.debug("error when json decode: %s" % (s,))
        raise
    return decoded
Example #5
0
    def getContent(self, agent, task):
        ''' download the target page

        The URL is built by joining the task body's 'prefix' and 'suffix' and
        then %-formatting the joined string with ('ccode', 'page'), so it must
        contain two placeholders. The value produced by request() is handed
        back through returnValue.
        '''
        tbody = task.tbody

        url_template = urllib2.urlparse.urljoin(tbody.get('prefix'),
                                                tbody.get('suffix'))
        # A task without an explicit 'page' means the first page; default to 1
        # instead of formatting None into the URL.
        url = url_template % (tbody.get('ccode'), tbody.get('page', 1))
        log.debug('Getting data with url: %s' % url)
        result = yield request(agent, url)
        returnValue(result)
Example #6
0
    def getContent(self, agent, task):
        ''' download the target page

        Joins the task body's 'prefix' and 'suffix' into a URL template, fills
        its two %-placeholders with 'ccode' and 'page', requests that URL and
        passes the response on via returnValue.
        '''
        tbody = task.tbody

        req_url = urllib2.urlparse.urljoin(tbody.get('prefix'),
                                           tbody.get('suffix'))
        # Fall back to page 1 when the task carries no 'page' entry, so the
        # URL never ends up with a literal None in it.
        page = tbody.get('page', 1)
        url = req_url % (tbody.get('ccode'), page)
        log.debug('Getting data with url: %s' % url)
        result = yield request(agent, url)
        returnValue(result)
Example #7
0
def request(agent, url, headers=None, body=None):
    ''' Send an HTTP request through ``agent`` and return its deferred result.

    agent:   object exposing request(method, url, headers[, bodyProducer]).
    url:     target URL; converted with str() before being passed on.
    headers: passed through to agent.request unchanged (may be None).
    body:    when None a GET is issued; otherwise a POST whose body is
             wrapped in a FileBodyProducer.

    Returns the object produced by agent.request, with _cbRequest attached
    as a callback.
    '''
    log.debug('begin request ' + url)
    # Diagnostics go through the logger instead of a bare print so they do
    # not pollute stdout.
    log.debug('agent: %s request: %s' % (agent, agent.request))
    if body is None:
        d = agent.request('GET', str(url), headers)
    else:
        d = agent.request('POST', str(url), headers,
                          client.FileBodyProducer(StringIO(body)))
    d.addCallback(_cbRequest)
    return d
Example #8
0
    def getContent(self, agent, task):
        ''' download the target page

        ``task`` is a pickled task object whose body holds the page URL under
        'task'. Yields (via returnValue) a ``(response, base_url)`` pair;
        ``base_url`` is the body's 'prefix' or, when that is missing, the
        page URL itself.
        '''
        # NOTE(review): unpickling executes arbitrary code if the payload is
        # attacker-controlled -- confirm tasks only come from a trusted queue.
        task = cPickle.loads(task)
        #task = json.loads(task)
        tbody = task.tbody

        url = tbody.get('task')
        log.debug('Getting data with url: %s' % url)

        result = yield request(agent, url)
        # Task bodies may carry no 'prefix'; fall back to the page's own URL
        # so callers still get a usable base for resolving relative links.
        returnValue((result, tbody.get('prefix') or url))
Example #9
0
    def search(self, agent, task):
        ''' Fetch one shop listing page and extract its item links.

        ``task`` is a pickled task object. Passes back, via returnValue, a
        pair ``(pages, hrefs_json)``: ``pages`` is the value of
        NodeService.parse_pages and ``hrefs_json`` a JSON-encoded list of item
        links joined onto the task's 'prefix'. If anything fails the error is
        logged and the defaults ``-1`` / ``"[]"`` are reported for whatever
        was not yet extracted.
        '''
        task = cPickle.loads(task)
        # Default to an empty list (not None) so the JSON payload is always a
        # list the consumer can iterate over.
        pages, hrefs = -1, []
        try:
            data = yield self.getContent(agent, task)
            el = lxml.etree.HTML(data)
            mc = el.xpath("//div[@class='r_sub_box']/div[@class='middle_content']/div[@class='page_content clearfix']")[0]
            pages = NodeService.parse_pages(mc)
            prefix = task.tbody.get('prefix')
            hrefs = [urllib2.urlparse.urljoin(prefix, item)
                     for item in NodeService.parse_items(mc)]
        except Exception as msg:
            log.debug("Got Something Wrong with url: %s Error: %s" % (repr(task), repr(msg)))

        returnValue((pages, json.dumps(hrefs)))
Example #10
0
    def search(self, agent, task):
        ''' Download a shop listing page and collect the links of its items.

        ``task`` arrives pickled. The result handed to returnValue is
        ``(pages, hrefs_json)`` where ``pages`` comes from
        NodeService.parse_pages and ``hrefs_json`` is a JSON-encoded list of
        links, each joined onto the task's 'prefix'. Errors are logged; the
        values not yet extracted stay at their defaults ``-1`` and ``"[]"``.
        '''
        task = cPickle.loads(task)
        # An empty list keeps the encoded payload iterable on failure;
        # None would be encoded as JSON null.
        pages, hrefs = -1, []
        try:
            data = yield self.getContent(agent, task)
            el = lxml.etree.HTML(data)
            mc = el.xpath(
                "//div[@class='r_sub_box']/div[@class='middle_content']/div[@class='page_content clearfix']"
            )[0]
            pages = NodeService.parse_pages(mc)
            base = task.tbody.get('prefix')
            hrefs = [
                urllib2.urlparse.urljoin(base, link)
                for link in NodeService.parse_items(mc)
            ]
        except Exception as msg:
            log.debug("Got Something Wrong with url: %s Error: %s" %
                      (repr(task), repr(msg)))

        returnValue((pages, json.dumps(hrefs)))
Example #11
0
    def gotResult(self, data, task, ttype):
        '''
            Process the result reported for ``task``.

            Two kinds of task exist:
            1. 'extract' (shop listing): ``data`` decodes to
               ``[total_page, hrefs]`` with ``hrefs`` again JSON text. Each
               href becomes a task on 'task_queue'; if this was page 1 of
               several, the remaining pages are queued on 'extract_queue'.
            2. any other type (item detail): the decoded ``data`` goes to
               save_items.

            data:  JSON text; falsy values are logged and ignored.
            task:  pickled task object (only unpickled for 'extract').
            ttype: task type string.
        '''
        # TODO refactor this
        if not data:
            log.debug('Got an invalid task: %s when taking task: %s' %
                      (task, ttype))
            return

        if ttype != 'extract':
            save_items(json.loads(data))
            return

        total_page, hrefs = json.loads(data)
        total_page = int(total_page)
        # JSON null decodes to None, which cannot be iterated below;
        # normalise it to an empty list.
        hrefs = json.loads(hrefs) or []
        # NOTE(review): result is unused while save_tasks is commented out;
        # the call stays in case check_duplicate has side effects.
        check_duplicate(self.redis, hrefs)
        #save_tasks(self.redis, tids)
        for h in hrefs:
            log.info(h)
            item_task = BaseTask(self.new_task_id(), {'task': h})
            self.redis.push_list_data('task_queue',
                                      cPickle.dumps(item_task))

        # NOTE(review): cPickle.loads is unsafe on untrusted input -- verify
        # that only the controller itself produces this payload.
        task = cPickle.loads(task)
        page = task.tbody.get('page', 1)
        if page == 1 and page < total_page:
            # Fan out pages 2..total_page from the first page only.
            page_tbody = task.tbody
            for next_page in xrange(page + 1, total_page + 1):
                # The shared tbody is serialised right away, so mutating it
                # on the next iteration does not affect queued tasks.
                page_tbody['page'] = next_page
                page_task = BaseTask(self.new_task_id(), page_tbody)
                self.redis.push_list_data('extract_queue',
                                          cPickle.dumps(page_task))
Example #12
0
    def searchLoop(self, agent):
        ''' Worker loop that pulls 'extract' requests and reports their results.

        Each round asks the controller for the next request, runs
        self.search on it and sends the JSON-encoded result back (``null``
        when the search raised). The loop ends when the agent is flagged for
        removal, in which case it is removed from the agent pool, or after an
        InfiniteLoginError, which is additionally reported via "fail".
        '''
        needbreak = False
        while not needbreak:
            if agent.remove:
                self.agent_pool.removeAgent(agent)
                break

            reqid, task = yield self.callController('nextRequest', 'extract')
            log.info(repr(task))

            result = None
            try:
                result = yield self.search(agent, task)
                log.debug('Got data %s' % repr(result))
            except InfiniteLoginError:
                log.exception()
                yield self.callController("fail", task=task)
                needbreak = True
            except Exception:
                # Not a bare except: GeneratorExit / KeyboardInterrupt raised
                # at the yield above must not be swallowed.
                log.exception()
            # The result is reported even on failure; this call is not waited on.
            self.callController('sendResult', reqid, task, json.dumps(result))
Example #13
0
    def searchLoop(self, agent):
        ''' Worker loop that pulls 'data' requests and reports their results.

        Every iteration fetches the next request from the controller, runs
        self.search on it and sends the JSON-encoded result back (``null``
        if the search raised). The loop stops once the agent is flagged for
        removal, in which case it is removed from the agent pool, or after an
        InfiniteLoginError, which is also reported through "fail".
        '''
        needbreak = False
        while not needbreak:
            if agent.remove:
                self.agent_pool.removeAgent(agent)
                break

            reqid, task = yield self.callController('nextRequest', 'data')
            log.info('Got Task %s with reqid: %s' % (repr(task), reqid))

            result = None
            try:
                result = yield self.search(agent, task)
                log.debug('Got data %s' % repr(result))
            except InfiniteLoginError:
                log.exception()
                yield self.callController("fail", task=task)
                needbreak = True
            except Exception:
                # Deliberately narrower than a bare except so GeneratorExit
                # and KeyboardInterrupt still propagate out of the generator.
                log.exception()
            # Sent regardless of success; the call's outcome is not awaited.
            self.callController('sendResult', reqid, task, json.dumps(result))
Example #14
0
 def __getattr__(self, option):
     ''' Resolve a missing attribute via self.get(option).

     Returns None, after a debug log entry, when self.get raises
     NoOptionError.
     '''
     value = None
     try:
         value = self.get(option)
     except NoOptionError:
         log.debug("Got an unknown config: %s" % option)
     return value
Example #15
0
 def __getattr__(self, option):
     ''' Look up ``option`` through self.get when normal attribute access fails.

     A NoOptionError is logged at debug level and turned into a None result.
     '''
     found = None
     try:
         found = self.get(option)
     except NoOptionError:
         log.debug("Got an unknown config: %s" % option)
     return found