Exemple #1
0
    def parse(self, response):
        """Yield one ``XQItem`` per entry of the JSON list carried by ``response``.

        On a 200 response that is not the captcha page, the body is decoded
        as JSON and the entries of ``body[0]['list']`` are walked from last
        to first. Each entry is tagged with ``symbol`` and ``cube_type`` from
        ``response.meta`` and yielded inside a fresh item. The generator stops
        at the first entry whose ``date`` is lower than ``self.start_time``.

        On a 302 response, or when the URL is the captcha page, the original
        request is yielded again with ``change_proxy`` set in its meta.

        Any exception is logged as a warning and swallowed.
        """
        captcha_url = "https://xueqiu.com/service/captcha"
        try:
            on_captcha = str(response.url) == captcha_url

            if response.status == 200 and not on_captcha:
                # NOTE(review): this strips whitespace everywhere, including
                # inside JSON string values; kept as-is because downstream
                # consumers may rely on the stripped form.
                raw = re.sub(r'\s', '', response.body.decode('utf-8'))
                body = json.loads(raw)

                if body:
                    for content in reversed(body[0]['list']):
                        if content['date'] < self.start_time:
                            return
                        content['cube_symbol'] = response.meta['symbol']
                        content['cube_type'] = response.meta['cube_type']
                        # A new item per entry: re-using one item object
                        # would make every yielded item alias the same data.
                        item = XQItem()
                        item['url'] = response.url
                        item['content'] = content
                        item['fp'] = request_fingerprint(response.request)
                        yield item

            if response.status == 302 or on_captcha:
                self.logger.error('CAPTURE ERROR: %s',
                                  response.meta['symbol'])
                oldmeta = response.request.meta
                oldmeta["change_proxy"] = True
                # NOTE(review): the retry targets the same URL and does not
                # set dont_filter; confirm the project's dupefilter lets it
                # through.
                yield Request(url=response.request.url,
                              meta=oldmeta,
                              callback=self.parse)
        except Exception as ex:
            self.logger.warning('Parse Exception: %s %s', ex, response.url)
Exemple #2
0
    def parse(self, response):
        """Extract the ``SNB.cubeInfo`` JSON embedded in a page's script tag.

        * 200 (not the captcha page): the text of every ``<script>``
          containing ``cubeInfo`` is joined, stripped of whitespace, and the
          object assigned to ``SNB.cubeInfo`` is parsed. It is stamped with
          ``lastcrawl`` (current epoch seconds) and ``cube_type`` from
          ``response.meta`` and yielded inside an ``XQItem``.
        * 404 (not the captcha page): an item carrying only ``fp`` and
          ``url`` is yielded.
        * Captcha page: an error is logged.

        Any exception is logged as a warning and swallowed.
        """
        captcha_url = "https://xueqiu.com/service/captcha"
        try:
            on_captcha = str(response.url) == captcha_url

            if response.status == 200 and not on_captcha:
                hxs = Selector(response)
                script_parts = hxs.xpath(
                    '//script[contains(., "cubeInfo")]//text()').extract()
                # Whitespace is removed so the \S+? below can span the whole
                # object literal.
                info_script = re.sub(r"\s", "", ''.join(script_parts))
                m = re.search(r"SNB\.cubeInfo=(\{\S+?\});SNB\.cube",
                              info_script)
                if m:
                    content = json.loads(m.group(1).strip())
                    content['lastcrawl'] = int(time.time())
                    content['cube_type'] = response.meta['cube_type']
                    item = XQItem()
                    item['content'] = content
                    item['fp'] = request_fingerprint(response.request)
                    item['url'] = response.url
                    yield item
                else:
                    # Previously dropped silently; surface it so pages with
                    # an unexpected layout can be diagnosed.
                    self.logger.warning('cubeInfo not found: %s',
                                        response.url)
            # A 404 that is not the captcha page means the cube symbol does
            # not exist; these URLs are still written to redis so they are
            # not crawled again next time.
            elif response.status == 404 and not on_captcha:
                item = XQItem()
                item['fp'] = request_fingerprint(response.request)
                item['url'] = response.url
                yield item

            elif on_captcha:
                self.logger.error('CAPTURE ERROR: %s', response.url)

        except Exception as ex:
            self.logger.warning('Parse Exception: %s %s', ex, response.url)
Exemple #3
0
    def parse(self, response):
        """Wrap the JSON body of ``response`` in an ``XQItem``.

        On a 200 response that is not the captcha page, the decoded JSON is
        tagged with ``user_id`` from ``response.meta`` and yielded together
        with the response URL and the request fingerprint. When the URL is
        the captcha page an error naming the user id is logged instead.

        Any exception is logged as a warning and swallowed.
        """
        captcha_url = "https://xueqiu.com/service/captcha"
        try:
            on_captcha = str(response.url) == captcha_url

            if response.status == 200 and not on_captcha:
                content = json.loads(response.body.decode('utf-8'))
                content['user_id'] = response.meta['user_id']
                item = XQItem()
                item['url'] = response.url
                item['content'] = content
                item['fp'] = request_fingerprint(response.request)
                yield item

            if on_captcha:
                # The meta key used by this callback is 'user_id'; the former
                # 'owner_id' lookup raised KeyError and hid the captcha error
                # behind a generic parse exception.
                self.logger.error('CAPTURE ERROR: User ID %s',
                                  response.meta.get('user_id'))

        except Exception as ex:
            self.logger.warning('Parse Exception: %s %s', ex, response.url)
Exemple #4
0
    def parse_status(self, response):
        """Wrap one page of a statuses JSON response in an ``XQItem``.

        The decoded body must provide ``statuses``, ``total``, ``maxPage``
        and ``page``; ``user_id`` is taken from ``response.meta``. The item
        carries the assembled content and the request fingerprint.

        Any exception (invalid JSON, missing key, ...) is logged as a
        warning and swallowed, in which case nothing is yielded.
        """
        try:
            body = json.loads(response.body.decode('utf-8'))
            content = {
                'user_id': response.meta['user_id'],
                'statuses': body['statuses'],
                'total': body['total'],
                'max_page': body['maxPage'],
                'page': body['page'],
            }

            item = XQItem()
            item['content'] = content
            item['fp'] = request_fingerprint(response.request)
            yield item

        except Exception as ex:
            # Logger.warn is deprecated; warning() with lazy args emits the
            # same message.
            self.logger.warning('Parse Exception: %s %s', ex, response.url)
Exemple #5
0
    def parse(self, response):
        """Yield a page of statuses and, if more pages remain, the next request.

        On a 200 response that is not the captcha page, the JSON body is
        decoded. When ``statuses`` is non-empty, the ``created_at`` of its
        first entry (divided by 1000 and passed to ``time.gmtime``) is
        compared with ``self.start_time``; if it is earlier the generator
        stops. Otherwise an ``XQItem`` holding the page content is yielded,
        and when ``page < maxPage`` a request for the next page is yielded,
        built by rewriting the ``&page=N`` parameter of the current URL.

        When the URL is the captcha page an error is logged.
        Any exception is logged as a warning and swallowed.
        """
        captcha_url = "https://xueqiu.com/service/captcha"
        try:
            on_captcha = str(response.url) == captcha_url

            if response.status == 200 and not on_captcha:
                body = json.loads(response.body.decode('utf-8'))
                # Always bind max_page: it used to be set only when maxPage
                # was truthy, so a falsy maxPage made the pagination check
                # below raise UnboundLocalError.
                max_page = body['maxPage'] or 0

                if body['statuses']:
                    page_first_time = body['statuses'][0]['created_at']
                    page_first_time = time.gmtime(page_first_time / 1000)
                    if page_first_time < self.start_time:
                        return

                    page = body['page']
                    content = {
                        'user_id': response.meta['user_id'],
                        'statuses': body['statuses'],
                        'total': body['total'],
                        'max_page': body['maxPage'],
                        'page': page,
                    }

                    item = XQItem()
                    item['content'] = content
                    yield item

                    # Follow-up page
                    if page < max_page:
                        page_string = '&page=' + str(page + 1)
                        url = re.sub(r'&page=(\d+)', page_string, response.url)
                        yield Request(
                            url=url,
                            meta={'user_id': response.meta['user_id']},
                            callback=self.parse)

            elif on_captcha:
                self.logger.error('CAPTURE ERROR: User ID %s',
                                  response.meta['user_id'])

        except Exception as ex:
            self.logger.warning('Parse Exception: %s %s', ex, response.url)
Exemple #6
0
    def parse(self, response):
        """Yield one ``XQItem`` per list entry and, if needed, the next page request.

        ``cube_type``, ``symbol`` and ``page`` are read from
        ``response.meta``. On a 200 response that is not the captcha page,
        the body is stripped of whitespace and decoded as JSON. When ``list``
        is non-empty, the ``updated_at`` of its first entry (divided by 1000
        and passed to ``time.gmtime``) is compared with ``self.start_time``;
        if it is earlier the generator stops. Otherwise every entry is tagged
        with the cube symbol and type and yielded, and when
        ``page < maxPage`` a request for the next page is yielded, built by
        rewriting the ``&page=N`` parameter of the current URL.

        When the URL is the captcha page an error is logged.
        Any exception is logged as an error and swallowed.
        """
        captcha_url = "https://xueqiu.com/service/captcha"
        try:
            on_captcha = str(response.url) == captcha_url

            if response.status == 200 and not on_captcha:
                cube_type = response.meta['cube_type']
                symbol = response.meta['symbol']
                page = response.meta['page']

                # NOTE(review): this strips whitespace everywhere, including
                # inside JSON string values; kept as-is because downstream
                # consumers may rely on the stripped form.
                raw = re.sub(r'\s', '', response.body.decode('utf-8'))
                body = json.loads(raw)

                # Always bind max_page: it used to be set only when maxPage
                # was truthy, so a falsy maxPage made the pagination check
                # below raise UnboundLocalError.
                max_page = body['maxPage'] or 0

                if body['list']:
                    page_first_time = body['list'][0]['updated_at']
                    page_first_time = time.gmtime(page_first_time / 1000)
                    if page_first_time < self.start_time:
                        return

                    for entry in body['list']:
                        # entry is a dict decoded from the JSON list
                        entry['cube_symbol'] = symbol
                        entry['cube_type'] = cube_type
                        item = XQItem()
                        item['url'] = response.url
                        item['content'] = entry
                        item['fp'] = request_fingerprint(response.request)
                        yield item

                    # Follow-up page
                    if page < max_page:
                        page = page + 1
                        page_string = '&page=' + str(page)
                        url = re.sub(r'&page=(\d+)', page_string, response.url)
                        yield Request(
                            url=url,
                            meta={'cube_type': cube_type,
                                  'symbol': symbol,
                                  'page': page},
                            callback=self.parse)
            elif on_captcha:
                self.logger.error('CAPTURE ERROR: %s', response.meta['symbol'])

        except Exception as ex:
            self.logger.error('Parse Exception: %s %s', ex, response.url)
Exemple #7
0
    def parse_gz(self, response):
        """Yield an ``XQItem`` listing the ids found under ``users`` in the body.

        The decoded JSON body must contain a ``users`` list whose entries
        each have an ``id``. The item content holds ``user_id`` (from
        ``response.meta``), ``follow`` (the collected ids) and ``lastcrawl``
        (current epoch seconds); the item also carries the response URL and
        the request fingerprint.

        Any exception is logged as a warning and swallowed.
        """
        try:
            body = json.loads(response.body.decode('utf-8'))
            content = {'user_id': response.meta['user_id']}
            content['follow'] = [user['id'] for user in body['users']]
            content['lastcrawl'] = int(time.time())

            item = XQItem()
            item['url'] = response.url
            item['content'] = content
            item['fp'] = request_fingerprint(response.request)
            yield item

        except Exception as ex:
            # Logger.warn is deprecated; warning() with lazy args emits the
            # same message.
            self.logger.warning('Parse Exception: %s %s', ex, response.url)
Exemple #8
0
    def parse(self, response):
        """Yield the first page of statuses and requests for the other pages.

        On a 200 response that is not the captcha page, the JSON body is
        decoded. Nothing happens unless ``maxPage`` is truthy. When the
        reported ``page`` is 1, an ``XQItem`` holding that page's content is
        yielded. When ``maxPage`` exceeds 1, one request per page from 2 to
        ``maxPage`` is yielded, each built by appending ``&page=N`` to the
        current URL and handled by ``self.parse_status``.

        When the URL is the captcha page an error is logged.
        Any exception is logged as a warning and swallowed.
        """
        captcha_url = "https://xueqiu.com/service/captcha"
        try:
            on_captcha = str(response.url) == captcha_url

            if response.status == 200 and not on_captcha:
                body = json.loads(response.body.decode('utf-8'))
                if body['maxPage']:
                    max_page = body['maxPage']
                    page = body['page']
                    user_id = response.meta['user_id']

                    # First page
                    if page == 1:
                        content = {
                            'user_id': user_id,
                            'statuses': body['statuses'],
                            'total': body['total'],
                            'max_page': body['maxPage'],
                            'page': body['page'],
                        }

                        item = XQItem()
                        item['content'] = content
                        yield item

                    # Remaining pages
                    if max_page > 1:
                        for page_no in range(2, max_page + 1):
                            yield Request(
                                url=response.url + '&page=' + str(page_no),
                                meta={'user_id': user_id},
                                callback=self.parse_status)

            elif on_captcha:
                self.logger.error('CAPTURE ERROR: User ID %s',
                                  response.meta['user_id'])

        except Exception as ex:
            # Logger.warn is deprecated; warning() with lazy args emits the
            # same message.
            self.logger.warning('Parse Exception: %s %s', ex, response.url)