def process_item(self, item, spider):
    try:
        if isinstance(item, XueqiuItem):
            if item.get('article_content'):
                item['article_content'] = self.parse_content(item['article_content'])
                # Fall back to the title when the cleaned content is empty
                if item['article_content'] is None or item['article_content'] == "":
                    item['article_content'] = item['article_title'].strip()
            else:
                item['article_content'] = item['article_title'].strip()
            if item.get('article_title'):
                item['article_title'] = item['article_title'].strip()[0:200]
            if item.get('release_time'):
                item['release_time'] = item['release_time'].strip()
                item['release_time'] = self.parse_time(item['release_time'])
            # Default missing author fields ("未知" = unknown, "暂无" = none yet)
            if item['author_province'] is None or item['author_province'] == '':
                item['author_province'] = u"未知"
            if item['author_city'] is None or item['author_city'] == "":
                item['author_city'] = u"未知"
            if item['author_description'] is None or item['author_description'] == "":
                item['author_description'] = u'暂无'
        return item
    except Exception as ee:
        log_error("XueqiuPipeline.process_item Error:{}".format(ee))
        log_error(traceback.format_exc())
def parse_article_url(self, response):
    try:
        response_data = json.loads(response.text)
        article_lists = response_data['statuses']
        for article_dict in article_lists:
            # Engagement counters for the article
            widget_info = {}
            widget_info['like_count'] = article_dict['like_count']
            widget_info['comment_count'] = article_dict['reply_count']
            widget_info['forward_count'] = article_dict['retweet_count']
            widget_info['article_id'] = article_dict['id']
            widget_info['user_id'] = article_dict['user_id']
            id = article_dict['id']
            user_id = article_dict['user_id']
            # Collect the author's profile information
            author_info = {}
            user_list = article_dict['user']
            author_info['author_name'] = user_list['screen_name']
            author_info['author_province'] = user_list['province']
            author_info['author_city'] = user_list['city']
            author_info['author_description'] = user_list['description']
            author_info['friends_count'] = user_list['friends_count']
            author_info['followers_count'] = user_list['followers_count']
            author_info['author_id'] = user_list['id']
            author_info['author_type'] = 1
            request_url = 'https://xueqiu.com/{0}/{1}'.format(user_id, id)
            yield Request(url=request_url,
                          callback=self.parse_article_page,
                          meta={
                              'widget_info': widget_info,
                              'author_info': author_info
                          },
                          dont_filter=settings.TYPE1_FILTER)
    except Exception as ee:
        log_error("Spider xueqiu.parse_article_url Error:{}".format(ee))
        log_error(traceback.format_exc())
def process_item(self, item, spider):
    try:
        # Record the crawled URL in the Redis hash so it is not fetched again
        self.r.hset(self.redis_data_dict, item['article_url'], 0)
        return item
    except Exception as ee:
        log_error("RedisPipeline.process_item Error:{}".format(ee))
        log_error(traceback.format_exc())
def open_spider(self, spider):
    try:
        self.r = redis.StrictRedis(host=self.redis_host, port=self.redis_port, db=self.redis_db)
        self.redis_data_dict = 'url_hash'  # name of the Redis hash that stores seen URLs
    except Exception as ee:
        log_error("RedisPipeline.open_spider Error:{}".format(ee))
        log_error(traceback.format_exc())
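The connection attributes `self.redis_host`, `self.redis_port` and `self.redis_db` are not defined in this excerpt; a minimal sketch of how they could be pulled from the Scrapy settings, assuming setting names `REDIS_HOST`, `REDIS_PORT` and `REDIS_DB` and a no-argument constructor:

# Sketch only, not from the original project: wire the Redis parameters via from_crawler.
@classmethod
def from_crawler(cls, crawler):
    pipeline = cls()
    pipeline.redis_host = crawler.settings.get('REDIS_HOST', '127.0.0.1')
    pipeline.redis_port = crawler.settings.getint('REDIS_PORT', 6379)
    pipeline.redis_db = crawler.settings.getint('REDIS_DB', 0)
    return pipeline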
def deal_time(self, original_time):
    try:
        # "今天 HH:MM" (today) -> prepend today's date
        if re.match(u'今天.*', original_time):
            date_time = re.match(u'今天(.*)', original_time).group(1).strip()
            today = datetime.date.today()
            date_time = today.strftime('%Y-%m-%d') + ' ' + date_time
        # "昨天 HH:MM" (yesterday) -> prepend yesterday's date
        elif re.match(u'昨天.*', original_time):
            date_time = re.match(u'昨天(.*)', original_time).group(1).strip()
            today = datetime.date.today()
            yesterday = today - datetime.timedelta(days=1)
            date_time = yesterday.strftime('%Y-%m-%d') + ' ' + date_time
        # "N小时前" (N hours ago) -> subtract N hours from now
        elif re.match(u'.*小时前', original_time):
            date_time = re.match(u'(.*)小时前', original_time).group(1).strip()
            current_time = datetime.datetime.now()
            real_time = current_time - datetime.timedelta(hours=int(date_time))
            date_time = real_time.strftime('%Y-%m-%d %H:%M')
        # "N分钟前" (N minutes ago) -> subtract N minutes from now
        elif re.match(u'.*分钟前', original_time):
            date_time = re.match(u'(.*)分钟前', original_time).group(1).strip()
            current_time = datetime.datetime.now()
            real_time = current_time - datetime.timedelta(minutes=int(date_time))
            date_time = real_time.strftime('%Y-%m-%d %H:%M')
        # Short strings such as "05-20 10:08" lack a year -> prepend the current year
        elif len(original_time) < 12:
            date_time = time.strftime("%Y", time.localtime()) + '-' + original_time
        # Anything else is assumed to already be a full timestamp
        else:
            date_time = original_time
        return date_time
    except Exception as ee:
        log_error("XueqiuPipeline.deal_time Error:{}".format(ee))
        log_error(traceback.format_exc())
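For reference, a minimal sketch of the normalisation the branches above perform; the sample strings are illustrative, and it assumes the pipeline can be constructed without arguments:

# Illustrative only: the relative formats Xueqiu uses and what deal_time() returns for them.
pipeline = XueqiuPipeline()
pipeline.deal_time(u'今天 10:30')    # -> '<today> 10:30'
pipeline.deal_time(u'昨天 21:05')    # -> '<yesterday> 21:05'
pipeline.deal_time(u'3小时前')       # -> '%Y-%m-%d %H:%M' three hours before now
pipeline.deal_time(u'15分钟前')      # -> '%Y-%m-%d %H:%M' fifteen minutes before now
pipeline.deal_time(u'05-20 10:08')   # -> '<current year>-05-20 10:08'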
def open_spider(self, spider):
    try:
        self.connect = pymysql.connect(**self.dbparams)
        self.cursor = self.connect.cursor()
    except Exception as ee:
        log_error("MysqlPipeline.open_spider Error:{}".format(ee))
        log_error(traceback.format_exc())
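`self.dbparams` is not shown in this excerpt; a sketch of the kind of mapping `pymysql.connect()` expects here (every value below is a placeholder, not the project's real configuration):

# Placeholder connection parameters for illustration only.
import pymysql

dbparams = dict(
    host='127.0.0.1',
    port=3306,
    user='crawler',
    password='change-me',
    database='xueqiu',
    charset='utf8mb4',
)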
def start_requests(self):
    try:
        # 13 random digits stand in for the millisecond timestamp the site appends as "_"
        random_id = ''.join(str(random.choice(range(10))) for _ in range(13))
        request_url = 'https://xueqiu.com/recommend/user/industry.json?detail=1&_={0}'.format(random_id)
        yield Request(url=request_url,
                      callback=self.parse_people_list,
                      dont_filter=settings.TYPE1_FILTER)
    except Exception as ee:
        log_error("Spider xueqiu.start_requests Error:{}".format(ee))
        log_error(traceback.format_exc())
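The same 13-digit cache-busting value can be derived from the real clock instead of random digits; a minimal alternative sketch, assuming the endpoint only needs any 13-digit number:

# Sketch: build the "_" cache-buster from the current time.
import time

cache_buster = str(int(time.time() * 1000))  # 13-digit millisecond timestamp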
def parse_content(self, article_content):
    try:
        if isinstance(article_content, list):
            # Strip non-breaking spaces, tabs and newlines from each fragment, then join
            for i, item in enumerate(article_content):
                article_content[i] = item.replace(u'\xa0', '').replace('\t', '').replace('\n', '').strip()
            content = ''.join(article_content)
        else:
            content = article_content
        return content
    except Exception as ee:
        log_error("XueqiuPipeline.parse_content Error:{}".format(ee))
        log_error(traceback.format_exc())
def parse_time(self, date_time):
    try:
        # Strip the "发布于" ("published at") / "修改于" ("edited at") prefixes before normalising
        if re.match(u'发布于.*', date_time):
            re_data_time = re.match(u'发布于(.*)', date_time).group(1)
            date_time = self.deal_time(re_data_time)
        if re.match(u'修改于.*', date_time):
            re_data_time = re.match(u'修改于(.*)', date_time).group(1)
            date_time = self.deal_time(re_data_time)
        return date_time
    except Exception as ee:
        log_error("XueqiuPipeline.parse_time Error:{}".format(ee))
        log_error(traceback.format_exc())
def parse_people_page(self, response):
    try:
        response_data = json.loads(response.text)
        response_url = response.url
        max_page = response_data['maxPage']
        for page in range(int(max_page)):
            # Replace the trailing 13-digit cache-buster and append the page number
            random_id = ''.join(str(random.choice(range(10))) for _ in range(13))
            based_url = ''.join(list(response_url)[:-13]) + random_id
            request_url = based_url + '&page=' + str(page + 1)
            yield Request(url=request_url,
                          callback=self.parse_article_url,
                          dont_filter=settings.TYPE1_FILTER)
    except Exception as ee:
        log_error("Spider xueqiu.parse_people_page Error:{}".format(ee))
        log_error(traceback.format_exc())
def parse_people_url(self, response):
    try:
        response_data = json.loads(response.text)
        user_lists = response_data['industries'][0]['users']
        for user_list in user_lists:
            # Build the timeline URL for each author
            id = user_list['id']
            random_id = ''.join(str(random.choice(range(10))) for _ in range(13))
            request_url = 'https://xueqiu.com/v4/statuses/user_timeline.json?&user_id={0}&type=2&_={1}'.format(id, random_id)
            yield Request(url=request_url,
                          callback=self.parse_people_page,
                          dont_filter=settings.TYPE1_FILTER)
    except Exception as ee:
        log_error("Spider xueqiu.parse_people_url Error:{}".format(ee))
        log_error(traceback.format_exc())
def parse_people_list(self, response):
    try:
        response_data = json.loads(response.text)
        recommend_lists = response_data["list"]
        for recommend_dict in recommend_lists:
            # Build the per-industry recommendation URL for each list entry
            id = recommend_dict['id']
            random_id = ''.join(str(random.choice(range(10))) for _ in range(13))
            request_url = "https://xueqiu.com/recommend/user/industry.json?id={0}&_={1}".format(id, random_id)
            # logging.debug(request_url)
            yield Request(url=request_url,
                          callback=self.parse_people_url,
                          dont_filter=settings.TYPE1_FILTER)
    except Exception as ee:
        log_error("Spider xueqiu.parse_people_list Error:{}".format(ee))
        log_error(traceback.format_exc())
def process_item(self, item, spider):
    author_info = {
        'author_id': item['author_id'],
        'author_name': item['author_name'],
        'author_province': item['author_province'],
        'author_city': item['author_city'],
        'author_description': item['author_description'],
        'author_friends_count': item['friends_count'],
        'author_followers_count': item['followers_count'],
        'author_type': item['author_type'],
    }
    author_info_keys = ','.join(author_info.keys())
    author_info_values = ','.join(['%s'] * len(author_info))
    article_info = {
        'index_id': item['index_id'],
        'author_id': item['author_id'],
        'article_id': item['article_id'],
        'article_url': item['article_url'],
        'article_title': item['article_title'],
        'article_content': item['article_content'],
        'release_time': item['release_time'],
        'content_source': item['content_source'],
        'forward_count': item['forward_count'],
        'comment_count': item['comment_count'],
        'like_count': item['like_count'],
    }
    article_info_keys = ','.join(article_info.keys())
    article_info_values = ','.join(['%s'] * len(article_info))
    try:
        # Insert the author row; "ON DUPLICATE KEY UPDATE author_id=author_id" makes the
        # statement a no-op when the author already exists
        insert_author_sql = 'insert into author_info(%s) values (%s) ON duplicate KEY UPDATE author_id=author_id' % (author_info_keys, author_info_values)
        self.cursor.execute(insert_author_sql, tuple(author_info.values()))
        # Insert the article row, likewise skipping duplicates keyed on index_id
        insert_article_sql = 'insert into article_info(%s) values (%s) ON duplicate KEY UPDATE index_id=index_id' % (article_info_keys, article_info_values)
        self.cursor.execute(insert_article_sql, tuple(article_info.values()))
        self.connect.commit()
        log_info("Success save to MySQL")
        return item
    except Exception as ee:
        log_error("MysqlPipeline.process_item Error:{}".format(ee))
        log_error(traceback.format_exc())
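For context, a hedged sketch of how the three pipelines above might be chained in settings.py; the module path `xueqiu.pipelines` and the priority numbers are assumptions (lower numbers run first, so the cleaning pipeline should precede the Redis and MySQL writers):

# Assumed wiring, not taken from the original project settings.
ITEM_PIPELINES = {
    'xueqiu.pipelines.XueqiuPipeline': 300,   # clean/normalise fields
    'xueqiu.pipelines.RedisPipeline': 400,    # mark article_url as seen
    'xueqiu.pipelines.MysqlPipeline': 500,    # persist author and article rows
}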
def parse_article_page(self, response):
    try:
        # Engagement counters passed along from parse_article_url
        widget_info = response.meta['widget_info']
        # Release time of the article
        release_time = response.xpath(
            "//div[@class='avatar__subtitle']/a/text()").extract_first()
        # Source of the article: the "from" block if present, otherwise the author name
        isinstance_source = response.xpath(
            "//article[@class='article__bd']/div[@class='article__bd__from']")
        if isinstance_source:
            content_source = isinstance_source.xpath('a/text()').extract_first()
        else:
            content_source = response.xpath(
                "//div[@class='avatar__name']/a/text()").extract_first()
        # Like, comment and forward counts
        like_count = widget_info['like_count']
        comment_count = widget_info['comment_count']
        forward_count = widget_info['forward_count']
        # User id, article id and the combined index id for this article
        user_id = widget_info['user_id']
        article_id = widget_info['article_id']
        index_id = str(user_id) + '-' + str(article_id)
        isinstance_title = response.xpath("//article[@class='article__bd']/h1")
        isinstance_content = response.xpath("//div[@class='article__bd__detail']/p")
        # Extract the article title and content
        if isinstance_title:
            article_title = ''.join(isinstance_title.xpath('text()').extract())
            if isinstance_content:
                article_content = response.xpath(
                    "//div[@class='article__bd__detail']/p/text()").extract()
            else:
                article_content = response.xpath(
                    "//div[@class='article__bd__detail']/text()").extract()
        else:
            # No <h1>: treat the first text node as the title and the rest as the body
            is_title_exist = response.xpath(
                "//div[@class='article__bd__detail']/text()").extract()
            if is_title_exist:
                try:
                    article_title = ''.join(is_title_exist[0])
                    article_content = is_title_exist[1:]
                except IndexError:
                    article_title = 'Wrong article {0}'.format(response.url)
                    article_content = 'Wrong article {0}'.format(response.url)
                    self.logger.error('Failed to get content {0}'.format(response.url))
            else:
                article_title = 'None'
                article_content = response.xpath(
                    "//div[@class='article__bd__detail']/p/text()").extract()
        article_url = response.url
        # Author information passed along from parse_article_url
        author_info = response.meta['author_info']
        author_name = author_info['author_name']
        author_province = author_info['author_province']
        author_city = author_info['author_city']
        author_description = author_info['author_description']
        friends_count = author_info['friends_count']
        followers_count = author_info['followers_count']
        author_id = author_info['author_id']
        author_type = author_info['author_type']
        xueqiu_item = XueqiuItem()
        # Copy every local variable whose name matches an item field into the item
        for field in xueqiu_item.fields:
            try:
                xueqiu_item[field] = eval(field)
            except NameError:
                self.logger.error('Field is Not Defined')
        yield xueqiu_item
    except Exception as ee:
        log_error("Spider xueqiu.parse_article_page Error:{}".format(ee))
        log_error(traceback.format_exc())
def process_response(self, request, response, spider):
    if response.status == 400:
        try:
            log_error('Cookie Invalidation url:{0}'.format(response.url))
            self.logger.warning('Cookie Invalidation url:{0}'.format(response.url))
        except Exception as ee:
            log_error('Cookie Invalidation ERROR {}'.format(ee))
            log_error(traceback.format_exc())
    if response.status == 404:
        try:
            log_error('ERROR 404 url:{0}'.format(response.url))
            self.logger.warning('ERROR 404 url:{0}'.format(response.url))
        except Exception as ee:
            log_error('404 Error {}'.format(ee))
            log_error(traceback.format_exc())
    if response.status in [501, 502, 503, 504]:
        # The IP has been used too heavily; wait before rescheduling the same request
        log_info("IP used too much, entering wait. url:{}".format(response.url))
        time.sleep(300)
        return request
    if response.status in [302]:
        # A 302 here means the IP has been blocked; reschedule the request
        self.logger.warning("302 Error IP Invalidation")
        log_error("302 Error IP Invalidation")
        return request
    else:
        return response
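A hedged sketch of how a downloader middleware containing this process_response() could be enabled; the dotted path and class name are placeholders, not the project's actual names:

# Assumed registration; adjust the path to wherever the middleware class lives.
DOWNLOADER_MIDDLEWARES = {
    'xueqiu.middlewares.XueqiuRetryMiddleware': 543,
}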