def parse(self, response):
    """Yield one ``XQItem`` per record of the JSON payload, last entry first.

    On a 200 response that is not the captcha page, the body is stripped of
    all whitespace, decoded as JSON, and the records in ``body[0]['list']``
    are walked from the last index to the first.  Iteration stops (the
    generator returns) at the first record whose ``date`` is smaller than
    ``self.start_time``.  Every other record is tagged with the ``symbol``
    and ``cube_type`` taken from ``response.meta`` and yielded.

    On a 302 response, or when the response URL is the captcha page, an
    error is logged, ``change_proxy`` is set in the request meta and the
    request URL is scheduled again with this method as callback.

    Any exception raised while parsing is logged as a warning and swallowed.
    """
    captcha_url = "https://xueqiu.com/service/captcha"
    try:
        response_url = str(response.url)
        if response.status == 200 and response_url != captcha_url:
            # All whitespace is removed from the raw payload before decoding.
            raw = re.sub(r'[\s]', '', response.body.decode('utf-8'))
            body = json.loads(raw)
            if body:
                for content in reversed(body[0]['list']):
                    if content['date'] < self.start_time:
                        return
                    content['cube_symbol'] = response.meta['symbol']
                    content['cube_type'] = response.meta['cube_type']
                    # A fresh item per record: re-yielding one shared,
                    # mutated instance would make every yielded item alias
                    # the same object.
                    item = XQItem()
                    item['url'] = response.url
                    item['content'] = content
                    item['fp'] = request_fingerprint(response.request)
                    yield item
        if response.status == 302 or response_url == captcha_url:
            self.logger.error('CAPTURE ERROR: %s', response.meta['symbol'])
            # The request's own meta dict is updated in place and reused.
            retry_meta = response.request.meta
            retry_meta["change_proxy"] = True
            yield Request(url=response.request.url,
                          meta=retry_meta,
                          callback=self.parse)
    except Exception as ex:
        self.logger.warning('Parse Exception: %s %s', ex, response.url)
def parse(self, response):
    """Extract the ``SNB.cubeInfo`` JSON object embedded in a page script.

    * 200 and not the captcha page: the text of every ``<script>`` element
      containing ``cubeInfo`` is joined, stripped of whitespace and searched
      for ``SNB.cubeInfo=...;SNB.cube``.  On a match the captured JSON is
      decoded, stamped with ``lastcrawl`` (current epoch seconds) and the
      ``cube_type`` from ``response.meta``, and yielded in an ``XQItem``.
    * 404 and not the captcha page: an item carrying only ``fp`` and ``url``
      is yielded.
    * captcha page: an error is logged.

    Any exception raised while parsing is logged as a warning and swallowed.
    """
    captcha_url = "https://xueqiu.com/service/captcha"
    try:
        on_captcha = str(response.url) == captcha_url
        if response.status == 200 and not on_captcha:
            selector = Selector(response)
            script_parts = selector.xpath(
                '//script[contains(., "cubeInfo")]//text()').extract()
            script_text = re.sub(r"[\s]", "", ''.join(script_parts))
            match = re.search(r"SNB.cubeInfo=({\S+?});SNB.cube", script_text)
            if match:
                content = json.loads(match.group(1).strip())
                content['lastcrawl'] = int(time.time())
                content['cube_type'] = response.meta['cube_type']
                item = XQItem()
                item['content'] = content
                item['fp'] = request_fingerprint(response.request)
                item['url'] = response.url
                yield item
        # A 404 that is not the captcha page means the cube symbol does not
        # exist; these URLs must still be written to redis so that they are
        # not crawled again next time.
        elif response.status == 404 and not on_captcha:
            item = XQItem()
            item['fp'] = request_fingerprint(response.request)
            item['url'] = response.url
            yield item
        elif on_captcha:
            self.logger.error('CAPTURE ERROR: %s', response.url)
    except Exception as ex:
        self.logger.warning('Parse Exception: %s %s', ex, response.url)
def parse(self, response):
    """Wrap the decoded JSON body of a user response in an ``XQItem``.

    On a 200 response that is not the captcha page, the body is decoded as
    JSON, tagged with the ``user_id`` from ``response.meta`` and yielded
    together with the response URL and the request fingerprint.

    When the response URL is the captcha page an error naming the user is
    logged.  The id is read from ``meta['owner_id']`` when that key is
    present and from ``meta['user_id']`` (the key the success path uses)
    otherwise, so the captcha message is not lost to a ``KeyError`` when
    only ``user_id`` is set.

    Any exception raised while parsing is logged as a warning and swallowed.
    """
    captcha_url = "https://xueqiu.com/service/captcha"
    try:
        on_captcha = str(response.url) == captcha_url
        if response.status == 200 and not on_captcha:
            content = json.loads(response.body.decode('utf-8'))
            content['user_id'] = response.meta['user_id']
            item = XQItem()
            item['url'] = response.url
            item['content'] = content
            item['fp'] = request_fingerprint(response.request)
            yield item
        if on_captcha:
            meta = response.meta
            user_id = meta['owner_id'] if 'owner_id' in meta else meta['user_id']
            self.logger.error('CAPTURE ERROR: User ID %s', user_id)
    except Exception as ex:
        self.logger.warning('Parse Exception: %s %s', ex, response.url)
def parse_status(self, response):
    """Yield one ``XQItem`` holding the statuses page found in the JSON body.

    The item's ``content`` carries the ``user_id`` from ``response.meta``
    plus the ``statuses``, ``total``, ``maxPage`` and ``page`` fields of the
    decoded body; ``fp`` is the fingerprint of the originating request.

    Any exception raised while parsing (invalid JSON, missing key, ...) is
    logged as a warning and swallowed, in which case nothing is yielded.
    """
    try:
        body = json.loads(response.body.decode('utf-8'))
        content = {
            'user_id': response.meta['user_id'],
            'statuses': body['statuses'],
            'total': body['total'],
            'max_page': body['maxPage'],
            'page': body['page'],
        }
        item = XQItem()
        item['content'] = content
        item['fp'] = request_fingerprint(response.request)
        yield item
    except Exception as ex:
        self.logger.warning('Parse Exception: %s %s', ex, response.url)
def parse(self, response):
    """Yield the statuses page of a user and schedule the following page.

    On a 200 response that is not the captcha page the body is decoded as
    JSON.  When ``maxPage`` is truthy:

    * if ``statuses`` is non-empty, the ``created_at`` of its first entry
      (divided by 1000 and converted with ``time.gmtime``) is compared with
      ``self.start_time``; if it is smaller the generator returns at once,
      otherwise an ``XQItem`` with the page content is yielded;
    * if ``page`` is smaller than ``maxPage``, the ``&page=<n>`` part of the
      response URL is replaced by the next page number and a request for it
      is yielded with this method as callback.

    When the response URL is the captcha page an error is logged.
    Any exception raised while parsing is logged as a warning and swallowed.
    """
    captcha_url = "https://xueqiu.com/service/captcha"
    try:
        on_captcha = str(response.url) == captcha_url
        if response.status == 200 and not on_captcha:
            body = json.loads(response.body.decode('utf-8'))
            if not body['maxPage']:
                return
            max_page = body['maxPage']
            page = body['page']
            if body['statuses']:
                first_created = body['statuses'][0]['created_at']
                # The timestamp is divided by 1000 before the conversion.
                first_time = time.gmtime(first_created / 1000)
                if first_time < self.start_time:
                    return
                item = XQItem()
                item['content'] = {
                    'user_id': response.meta['user_id'],
                    'statuses': body['statuses'],
                    'total': body['total'],
                    'max_page': body['maxPage'],
                    'page': body['page'],
                }
                yield item
            # Second and later pages.
            if page < max_page:
                next_page = '&page=' + str(page + 1)
                next_url = re.sub(r'&page=(\d+)', next_page, response.url)
                yield Request(
                    url=next_url,
                    meta={'user_id': response.meta['user_id']},
                    callback=self.parse)
        elif on_captcha:
            self.logger.error('CAPTURE ERROR: User ID %s',
                              response.meta['user_id'])
    except Exception as ex:
        self.logger.warning('Parse Exception: %s %s', ex, response.url)
def parse(self, response):
    """Yield one ``XQItem`` per entry of a paged list and follow the next page.

    ``cube_type``, ``symbol`` and ``page`` are read from ``response.meta``.
    On a 200 response that is not the captcha page the body is stripped of
    all whitespace and decoded as JSON.  When ``maxPage`` is truthy:

    * if ``list`` is non-empty, the ``updated_at`` of its first entry
      (divided by 1000 and converted with ``time.gmtime``) is compared with
      ``self.start_time``; if it is smaller the generator returns at once,
      otherwise every entry is tagged with the symbol and cube type and
      yielded in its own item;
    * if ``page`` is smaller than ``maxPage``, the ``&page=<n>`` part of the
      response URL is replaced by the next page number and a request for it
      is yielded with this method as callback.

    When the response URL is the captcha page an error is logged.
    Any exception raised while parsing is logged as an error and swallowed.
    """
    captcha_url = "https://xueqiu.com/service/captcha"
    try:
        on_captcha = str(response.url) == captcha_url
        if response.status == 200 and not on_captcha:
            cube_type = response.meta['cube_type']
            symbol = response.meta['symbol']
            page = response.meta['page']
            # All whitespace is removed from the raw payload before decoding.
            raw = re.sub(r'[\s]', '', response.body.decode('utf-8'))
            body = json.loads(raw)
            if not body['maxPage']:
                return
            max_page = body['maxPage']
            if body['list']:
                first_updated = body['list'][0]['updated_at']
                first_time = time.gmtime(first_updated / 1000)
                if first_time < self.start_time:
                    return
                for entry in body['list']:
                    # Each entry is a dict that becomes the item content.
                    entry['cube_symbol'] = symbol
                    entry['cube_type'] = cube_type
                    item = XQItem()
                    item['url'] = response.url
                    item['content'] = entry
                    item['fp'] = request_fingerprint(response.request)
                    yield item
            # Second and later pages.
            if page < max_page:
                page = page + 1
                next_url = re.sub(r'&page=(\d+)', '&page=' + str(page),
                                  response.url)
                yield Request(
                    url=next_url,
                    meta={'cube_type': cube_type, 'symbol': symbol, 'page': page},
                    callback=self.parse)
        elif on_captcha:
            self.logger.error('CAPTURE ERROR: %s', response.meta['symbol'])
    except Exception as ex:
        self.logger.error('Parse Exception: %s %s', ex, response.url)
def parse_gz(self, response):
    """Yield one ``XQItem`` listing the ids found under ``users`` in the body.

    The item's ``content`` holds the ``user_id`` from ``response.meta``, the
    ``id`` of every entry of ``body['users']`` under ``follow``, and
    ``lastcrawl`` set to the current epoch seconds.  ``url`` and ``fp`` are
    taken from the response and its originating request.

    Any exception raised while parsing (invalid JSON, missing key, ...) is
    logged as a warning and swallowed, in which case nothing is yielded.
    """
    try:
        body = json.loads(response.body.decode('utf-8'))
        content = {'user_id': response.meta['user_id']}
        content['follow'] = [user['id'] for user in body['users']]
        content['lastcrawl'] = int(time.time())
        item = XQItem()
        item['url'] = response.url
        item['content'] = content
        item['fp'] = request_fingerprint(response.request)
        yield item
    except Exception as ex:
        self.logger.warning('Parse Exception: %s %s', ex, response.url)
def parse(self, response):
    """Yield the first statuses page and schedule requests for the others.

    On a 200 response that is not the captcha page the body is decoded as
    JSON.  When ``maxPage`` is truthy:

    * if ``page`` equals 1, an ``XQItem`` whose ``content`` carries the
      ``user_id`` from ``response.meta`` and the ``statuses``, ``total``,
      ``maxPage`` and ``page`` fields of the body is yielded;
    * if ``maxPage`` is greater than 1, one request per page from 2 to
      ``maxPage`` is yielded, built by appending ``&page=<n>`` to the
      response URL and handled by ``self.parse_status``.

    When the response URL is the captcha page an error is logged.
    Any exception raised while parsing is logged as a warning and swallowed.
    """
    captcha_url = "https://xueqiu.com/service/captcha"
    try:
        on_captcha = str(response.url) == captcha_url
        if response.status == 200 and not on_captcha:
            body = json.loads(response.body.decode('utf-8'))
            if not body['maxPage']:
                return
            max_page = body['maxPage']
            page = body['page']
            # First page.
            if page == 1:
                item = XQItem()
                item['content'] = {
                    'user_id': response.meta['user_id'],
                    'statuses': body['statuses'],
                    'total': body['total'],
                    'max_page': body['maxPage'],
                    'page': body['page'],
                }
                yield item
            # Second and later pages.
            if max_page > 1:
                for page_number in range(2, max_page + 1):
                    yield Request(
                        url=response.url + '&page=' + str(page_number),
                        meta={'user_id': response.meta['user_id']},
                        callback=self.parse_status)
        elif on_captcha:
            self.logger.error('CAPTURE ERROR: User ID %s',
                              response.meta['user_id'])
    except Exception as ex:
        self.logger.warning('Parse Exception: %s %s', ex, response.url)