Example #1
    def process_item(self, item, spider):
        try:
            if isinstance(item, XueqiuItem):
                if item.get('article_content'):
                    item['article_content'] = self.parse_content(item['article_content'])
                    # Fall back to the title when the body is empty after cleaning.
                    if not item['article_content']:
                        item['article_content'] = item['article_title'].strip()
                else:
                    item['article_content'] = item['article_title'].strip()
                if item.get('article_title'):
                    item['article_title'] = item['article_title'].strip()[:200]
                if item.get('release_time'):
                    item['release_time'] = self.parse_time(item['release_time'].strip())

                # Default missing author fields to "未知" (unknown) / "暂无" (none yet).
                if not item.get('author_province'):
                    item['author_province'] = u'未知'
                if not item.get('author_city'):
                    item['author_city'] = u'未知'
                if not item.get('author_description'):
                    item['author_description'] = u'暂无'
            return item
        except Exception as ee:
            log_error("XueqiuPipeline.process_item Error:{}".format(ee))
            log_error(traceback.format_exc())
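For context, the pipelines in these examples would be wired together in the project's settings. A minimal sketch, where the module path xueqiu.pipelines and the priority values are assumptions, not taken from the source:

    # Sketch: hypothetical pipeline ordering in settings.py.
    ITEM_PIPELINES = {
        'xueqiu.pipelines.XueqiuPipeline': 300,  # clean and normalize fields
        'xueqiu.pipelines.MysqlPipeline': 400,   # persist to MySQL
        'xueqiu.pipelines.RedisPipeline': 500,   # record the URL as crawled
    }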
Example #2
 def parse_article_url(self, response):
     try:
         response_data = json.loads(response.text)
         article_lists = response_data['statuses']
         for article_dict in article_lists:
             # Collect the engagement counts for the article.
             widget_info = {}
             widget_info['like_count'] = article_dict['like_count']
             widget_info['comment_count'] = article_dict['reply_count']
             widget_info['forward_count'] = article_dict['retweet_count']
             widget_info['article_id'] = article_dict['id']
             widget_info['user_id'] = article_dict['user_id']
             article_id = article_dict['id']
             user_id = article_dict['user_id']
             # Collect the author's profile information.
             author_info = {}
             user_info = article_dict['user']
             author_info['author_name'] = user_info['screen_name']
             author_info['author_province'] = user_info['province']
             author_info['author_city'] = user_info['city']
             author_info['author_description'] = user_info['description']
             author_info['friends_count'] = user_info['friends_count']
             author_info['followers_count'] = user_info['followers_count']
             author_info['author_id'] = user_info['id']
             author_info['author_type'] = 1
             request_url = 'https://xueqiu.com/{0}/{1}'.format(user_id, article_id)
             yield Request(url=request_url,
                           callback=self.parse_article_page,
                           meta={
                               'widget_info': widget_info,
                               'author_info': author_info
                           },
                           dont_filter=settings.TYPE1_FILTER)
     except Exception as ee:
         log_error("Spider xueqiu.parse_article_url Error:{}".format(ee))
         log_error(traceback.format_exc())
Example #3
 def process_item(self, item, spider):
     try:
         # Record the article URL in the Redis hash so it is not crawled again.
         self.r.hset(self.redis_data_dict, item['article_url'], 0)
         return item
     except Exception as ee:
         log_error("RedisPipeline.process_item Error:{}".format(ee))
         log_error(traceback.format_exc())
Example #4
 def open_spider(self, spider):
     try:
         # Open one Redis connection per spider run; all seen URLs live in
         # the 'url_hash' hash.
         self.r = redis.StrictRedis(host=self.redis_host, port=self.redis_port, db=self.redis_db)
         self.redis_data_dict = 'url_hash'
     except Exception as ee:
         log_error("RedisPipeline.open_spider Error:{}".format(ee))
         log_error(traceback.format_exc())
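The hash written by RedisPipeline is presumably consulted before scheduling a request; exactly where the project does this is not shown, so the following is only a sketch of such a check:

    # Sketch: skip URLs already recorded in the 'url_hash' hash
    # (hypothetical usage; article_url stands for a candidate URL).
    if not self.r.hexists(self.redis_data_dict, article_url):
        yield Request(url=article_url, callback=self.parse_article_page)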
Example #5
 def deal_time(self, original_time):
     try:
         if re.match(u'今天.*', original_time):
             # "今天 HH:MM" (today): prepend today's date.
             date_time = re.match(u'今天(.*)', original_time).group(1).strip()
             today = datetime.date.today()
             date_time = today.strftime('%Y-%m-%d') + ' ' + date_time
         elif re.match(u'昨天.*', original_time):
             # "昨天 HH:MM" (yesterday): prepend yesterday's date.
             date_time = re.match(u'昨天(.*)', original_time).group(1).strip()
             yesterday = datetime.date.today() - datetime.timedelta(days=1)
             date_time = yesterday.strftime('%Y-%m-%d') + ' ' + date_time
         elif re.match(u'.*小时前', original_time):
             # "N小时前" (N hours ago): subtract N hours from now.
             hours = re.match(u'(.*)小时前', original_time).group(1).strip()
             real_time = datetime.datetime.now() - datetime.timedelta(hours=int(hours))
             date_time = real_time.strftime('%Y-%m-%d %H:%M')
         elif re.match(u'.*分钟前', original_time):
             # "N分钟前" (N minutes ago): subtract N minutes from now.
             minutes = re.match(u'(.*)分钟前', original_time).group(1).strip()
             real_time = datetime.datetime.now() - datetime.timedelta(minutes=int(minutes))
             date_time = real_time.strftime('%Y-%m-%d %H:%M')
         elif len(original_time) < 12:
             # Short forms like "MM-DD HH:MM" omit the year; prepend the current one.
             date_time = time.strftime("%Y", time.localtime()) + '-' + original_time
         else:
             date_time = original_time
         return date_time
     except Exception as ee:
         log_error("XueqiuPipeline.deal_time Error:{}".format(ee))
         log_error(traceback.format_exc())
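To illustrate the formats deal_time normalizes, here are hypothetical inputs and the shape of the outputs; the actual dates depend on when it runs, and pipeline stands for an XueqiuPipeline instance:

    # pipeline.deal_time(u'今天 10:30')    -> '2020-01-02 10:30'  (today's date prepended)
    # pipeline.deal_time(u'昨天 22:15')    -> '2020-01-01 22:15'  (yesterday's date prepended)
    # pipeline.deal_time(u'3小时前')       -> '2020-01-02 07:30'  (now minus 3 hours)
    # pipeline.deal_time(u'15分钟前')      -> '2020-01-02 10:15'  (now minus 15 minutes)
    # pipeline.deal_time(u'06-18 09:00')  -> '2020-06-18 09:00'  (current year prepended)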
Example #6
 def open_spider(self, spider):
     try:
         # Open the MySQL connection and cursor once when the spider starts.
         self.connect = pymysql.connect(**self.dbparams)
         self.cursor = self.connect.cursor()
     except Exception as ee:
         log_error("MysqlPipeline.open_spider Error:{}".format(ee))
         log_error(traceback.format_exc())
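The snippet assumes self.dbparams was filled in earlier, for example from the project settings. A minimal sketch of the shape pymysql.connect(**self.dbparams) expects; the values here are placeholders, not taken from the project:

    dbparams = {
        'host': '127.0.0.1',
        'port': 3306,
        'user': 'spider',
        'password': '******',
        'db': 'xueqiu',
        'charset': 'utf8mb4',
    }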
Example #7
 def start_requests(self):
     try:
         # 13 random digits stand in for the millisecond-timestamp
         # cache-buster the site appends as the "_" query parameter.
         random_id = ''.join(
             str(random.choice(range(10))) for _ in range(13))
         request_url = 'https://xueqiu.com/recommend/user/industry.json?detail=1&_={0}'.format(
             random_id)
         yield Request(url=request_url,
                       callback=self.parse_people_list,
                       dont_filter=settings.TYPE1_FILTER)
     except Exception as ee:
         log_error("Spider xueqiu.start_requests Error:{}".format(ee))
         log_error(traceback.format_exc())
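Since the 13 digits only imitate a millisecond timestamp, an equivalent sketch would use the real one:

    import time
    random_id = str(int(time.time() * 1000))  # 13 digits, same shape as the "_" param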
Example #8
 def parse_content(self, article_content):
     try:
         if isinstance(article_content, list):
             # Strip non-breaking spaces, tabs, and newlines from each
             # fragment, then join the fragments into one string.
             for i, fragment in enumerate(article_content):
                 article_content[i] = fragment.replace(u'\xa0', '').replace('\t', '').replace('\n', '').strip()
             content = ''.join(article_content)
         else:
             content = article_content
         return content
     except Exception as ee:
         log_error("XueqiuPipeline.parse_content Error:{}".format(ee))
         log_error(traceback.format_exc())
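A small illustration of the cleaning, assuming the fragments came from response.xpath(...).extract():

    # parse_content([u'Hello\xa0', u'\tworld\n']) -> u'Helloworld'
    # parse_content(u'already a string')          -> u'already a string'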
Example #9
 def parse_time(self, date_time):
     try:
         # Strip the "发布于" (published at) / "修改于" (edited at) prefix,
         # then normalize the remainder with deal_time.
         if re.match(u'发布于.*', date_time):
             date_time = self.deal_time(re.match(u'发布于(.*)', date_time).group(1))
         if re.match(u'修改于.*', date_time):
             date_time = self.deal_time(re.match(u'修改于(.*)', date_time).group(1))
         return date_time
     except Exception as ee:
         log_error("XueqiuPipeline.parse_time Error:{}".format(ee))
         log_error(traceback.format_exc())
Example #10
 def parse_people_page(self, response):
     try:
         response_data = json.loads(response.text)
         response_url = response.url
         max_page = response_data['maxPage']
         for page in range(int(max_page)):
             # Swap the trailing 13-digit cache-buster for a fresh one and
             # request each page of the user's timeline.
             random_id = ''.join(
                 str(random.choice(range(10))) for _ in range(13))
             based_url = response_url[:-13] + random_id
             request_url = based_url + '&page=' + str(page + 1)
             yield Request(url=request_url,
                           callback=self.parse_article_url,
                           dont_filter=settings.TYPE1_FILTER)
     except Exception as ee:
         log_error("Spider xueqiu.parse_people_page Error:{}".format(ee))
         log_error(traceback.format_exc())
Example #11
 def parse_people_url(self, response):
     try:
         response_data = json.loads(response.text)
         user_lists = response_data['industries'][0]['users']
         for user_list in user_lists:
             # Build the timeline URL for each recommended author.
             user_id = user_list['id']
             random_id = ''.join(
                 str(random.choice(range(10))) for _ in range(13))
             request_url = 'https://xueqiu.com/v4/statuses/user_timeline.json?&user_id={0}&type=2&_={1}'.format(
                 user_id, random_id)
             yield Request(url=request_url,
                           callback=self.parse_people_page,
                           dont_filter=settings.TYPE1_FILTER)
     except Exception as ee:
         log_error("Spider xueqiu.parse_people_url Error:{}".format(ee))
         log_error(traceback.format_exc())
Example #12
 def parse_people_list(self, response):
     try:
         response_data = json.loads(response.text)
         recommend_lists = response_data["list"]
         for recommend_dict in recommend_lists:
             # Request the recommended users for each industry.
             industry_id = recommend_dict['id']
             random_id = ''.join(
                 str(random.choice(range(10))) for _ in range(13))
             request_url = "https://xueqiu.com/recommend/user/industry.json?id={0}&_={1}".format(
                 industry_id, random_id)
             yield Request(url=request_url,
                           callback=self.parse_people_url,
                           dont_filter=settings.TYPE1_FILTER)
     except Exception as ee:
         log_error("Spider xueqiu.parse_people_list Error:{}".format(ee))
         log_error(traceback.format_exc())
Example #13
    def process_item(self, item, spider):
        author_info = {
            'author_id': item['author_id'],
            'author_name': item['author_name'],
            'author_province': item['author_province'],
            'author_city': item['author_city'],
            'author_description': item['author_description'],
            'author_friends_count': item['friends_count'],
            'author_followers_count': item['followers_count'],
            'author_type': item['author_type']
        }
        author_info_keys = ','.join(author_info.keys())
        author_info_values = ','.join(['%s'] * len(author_info))

        article_info = {
            'index_id': item['index_id'],
            'author_id': item['author_id'],
            'article_id': item['article_id'],
            'article_url': item['article_url'],
            'article_title': item['article_title'],
            'article_content': item['article_content'],
            'release_time': item['release_time'],
            'content_source': item['content_source'],
            'forward_count': item['forward_count'],
            'comment_count': item['comment_count'],
            'like_count': item['like_count'],
        }
        article_info_keys = ','.join(article_info.keys())
        article_info_values = ','.join(['%s'] * len(article_info))
        try:
            # Store the author record; the self-assignment in ON DUPLICATE KEY
            # UPDATE turns duplicate inserts into no-ops.
            insert_author_sql = 'insert into author_info(%s) values (%s) ON DUPLICATE KEY UPDATE author_id=author_id' % (author_info_keys, author_info_values)
            self.cursor.execute(insert_author_sql, tuple(author_info.values()))
            # Store the article record the same way.
            insert_article_sql = 'insert into article_info(%s) values (%s) ON DUPLICATE KEY UPDATE index_id=index_id' % (article_info_keys, article_info_values)
            self.cursor.execute(insert_article_sql, tuple(article_info.values()))
            self.connect.commit()
            log_info("Success save to MySQL")
            return item
        except Exception as ee:
            log_error("MysqlPipeline.process_item Error:{}".format(ee))
            log_error(traceback.format_exc())
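The two ON DUPLICATE KEY UPDATE clauses only take effect if author_id and index_id carry PRIMARY KEY or UNIQUE constraints. A sketch of DDL that would satisfy the author upsert; the column types are guesses, not taken from the project, and article_info would need the same property on index_id:

    # Hypothetical DDL; real column types are not shown in the source.
    create_author_sql = """
        CREATE TABLE IF NOT EXISTS author_info (
            author_id BIGINT PRIMARY KEY,
            author_name VARCHAR(200),
            author_province VARCHAR(50),
            author_city VARCHAR(50),
            author_description TEXT,
            author_friends_count INT,
            author_followers_count INT,
            author_type TINYINT
        )
    """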
Example #14
    def parse_article_page(self, response):
        try:
            # Engagement counts passed along from parse_article_url.
            widget_info = response.meta['widget_info']
            # Release time of the article.
            release_time = response.xpath(
                "//div[@class='avatar__subtitle']/a/text()").extract_first()
            # Source of the article: an explicit "from" block if present,
            # otherwise the author's display name.
            isinstance_source = response.xpath(
                "//article[@class='article__bd']/div[@class='article__bd__from']"
            )
            if isinstance_source:
                content_source = isinstance_source.xpath(
                    'a/text()').extract_first()
            else:
                content_source = response.xpath(
                    "//div[@class='avatar__name']/a/text()").extract_first()
            # Like, comment, and forward counts.
            like_count = widget_info['like_count']
            comment_count = widget_info['comment_count']
            forward_count = widget_info['forward_count']
            # User id and article id, combined into a composite index id.
            user_id = widget_info['user_id']
            article_id = widget_info['article_id']
            index_id = str(user_id) + '-' + str(article_id)
            isinstance_title = response.xpath(
                "//article[@class='article__bd']/h1")
            isinstance_content = response.xpath(
                "//div[@class='article__bd__detail']/p")
            # Extract the title and body, handling pages with and without
            # an explicit <h1> title.
            if isinstance_title:
                article_title = ''.join(
                    isinstance_title.xpath('text()').extract())
                if isinstance_content:
                    article_content = response.xpath(
                        "//div[@class='article__bd__detail']/p/text()"
                    ).extract()
                else:
                    article_content = response.xpath(
                        "//div[@class='article__bd__detail']/text()").extract()
            else:
                is_title_exist = response.xpath(
                    "//div[@class='article__bd__detail']/text()").extract()
                if is_title_exist:
                    try:
                        # Treat the first text node as the title, the rest as the body.
                        article_title = ''.join(is_title_exist[0])
                        article_content = is_title_exist[1:]
                    except IndexError:
                        article_title = 'Wrong article {0}'.format(
                            response.url)
                        article_content = 'Wrong article {0}'.format(
                            response.url)
                        self.logger.error('Failed to get content {0}'.format(
                            response.url))
                else:
                    article_title = 'None'
                    article_content = response.xpath(
                        "//div[@class='article__bd__detail']/p/text()"
                    ).extract()
            article_url = response.url

            # Author profile passed along from parse_article_url.
            author_info = response.meta['author_info']
            author_name = author_info['author_name']
            author_province = author_info['author_province']
            author_city = author_info['author_city']
            author_description = author_info['author_description']
            friends_count = author_info['friends_count']
            followers_count = author_info['followers_count']
            author_id = author_info['author_id']
            author_type = author_info['author_type']

            xueqiu_item = XueqiuItem()
            # Copy each like-named local variable into the item via eval().
            for field in xueqiu_item.fields:
                try:
                    xueqiu_item[field] = eval(field)
                except NameError:
                    self.logger.error('Field is Not Defined')
            yield xueqiu_item
        except Exception as ee:
            log_error("Spider xueqiu.parse_article_page Error:{}".format(ee))
            log_error(traceback.format_exc())
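The eval(field) loop works because every item field has a like-named local variable in scope; a less fragile equivalent, sketched here, looks the names up in locals() instead of evaluating them:

    # Sketch: same effect as the eval() loop, without eval.
    local_vars = locals()
    for field in xueqiu_item.fields:
        if field in local_vars:
            xueqiu_item[field] = local_vars[field]
        else:
            self.logger.error('Field is Not Defined')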
Example #15
    def process_response(self, request, response, spider):
        if response.status == 400:
            try:
                # A 400 here indicates the cookie has expired.
                log_error('Cookie Invalidation url:{0}'.format(response.url))
                self.logger.warning('Cookie Invalidation url:{0}'.format(
                    response.url))
            except Exception as ee:
                log_error('Cookie Invalidation ERROR {}'.format(ee))
                log_error(traceback.format_exc())
        if response.status == 404:
            try:
                log_error('ERROR 404 url:{0}'.format(response.url))
                self.logger.warning('ERROR 404 url:{0}'.format(response.url))
            except Exception as ee:
                log_error('404 Error {}'.format(ee))
                log_error(traceback.format_exc())
        if response.status in [501, 502, 503, 504]:
            # Server-side throttling: wait five minutes, then retry the
            # request. Note that time.sleep blocks the whole process.
            log_info("IP used too often, entering wait. url:{}".format(
                response.url))
            time.sleep(300)
            return request
        if response.status == 302:
            # A redirect here means the IP has been banned; retry the request.
            self.logger.warning("302 Error IP Invalidation")
            log_error("302 Error IP Invalidation")
            return request
        else:
            return response
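A downloader middleware like this one has to be registered in the project settings; a minimal sketch, where the class path and priority value are assumptions, not taken from the source:

    # Sketch: hypothetical registration in settings.py.
    DOWNLOADER_MIDDLEWARES = {
        'xueqiu.middlewares.CheckStatusMiddleware': 543,
    }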