예제 #1
0
 def parse_user(self, response):
     """Parse a user's blog-list page and yield one GECnUserBlogPost per post.

     Each post block is a ``div.day`` node; the footer text is split on
     spaces to recover author, time, view count and comment count.
     """
     day_nodes = Selector(response).xpath('//div[@class="day"]')
     for index, day in enumerate(day_nodes):
         # Skip the first node — presumably a non-post block (kept from
         # the original behavior; TODO confirm against a live page).
         if index == 0:
             continue
         title = list_first_str(
             day.xpath('div[@class="postTitle"]/a/text()').extract())
         link = list_first_str(
             day.xpath('div[@class="postTitle"]/a/@href').extract())
         brief = ''.join(
             day.xpath('div[@class="postCon"]/div/text()').extract())
         footer_parts = list_first_str(
             day.xpath('div[@class="postDesc"]/text()').extract()).split(' ')

         post = GECnUserBlogPost()
         post['title'] = title
         post['post_link'] = link
         post['brief'] = brief
         post['username'] = footer_parts[4]
         post['time'] = footer_parts[2] + ' ' + footer_parts[3]
         # footer fields look like "xx(NN)" — strip the 3-char label and
         # the closing parenthesis.
         post['view_num'] = footer_parts[5].strip()[3:-1]
         post['comment_num'] = footer_parts[6].strip()[3:-1]
         post['user_url'] = response.url
         post['post_id'] = get_linkmd5id(post['post_link'])
         logging.info(
             'GECnBlogPopularUserSpider: post\'s info %s is crawled successfully',
             title)
         yield post
예제 #2
0
    def prase_activity(self, response):
        """Parse a user's activity-feed page, yielding one
        GECnBlogUserActivity per feed entry, then follow pagination.

        NOTE(review): the method name keeps the historical 'prase' typo
        because it is referenced as a callback target elsewhere; renaming
        it would break callers outside this view.
        """
        selector = Selector(response)
        item_selector = selector.xpath('//ul[@id="feed_list"]').css(
            'li.feed_item')
        for subselector in item_selector:
            activity = GECnBlogUserActivity()
            title_selector = subselector.xpath('div/div[@class="feed_title"]')
            name = list_first_item(
                title_selector.xpath('string(a[1]/text())').extract()).strip()
            # Renamed from `type` to avoid shadowing the builtin.
            act_type = list_first_item(
                title_selector.xpath('text()').extract()).strip()[:-1]
            event = list_first_item(
                title_selector.xpath('string(a[2]/text())').extract()).strip()
            event_url = list_first_item(
                title_selector.xpath('string(a[2]/@href)').extract()).strip()
            activity['name'], activity['event_url'] = name, event_url
            activity['activity_id'] = get_linkmd5id(activity['event_url'])
            if act_type == '评论博客' or act_type == '发表博客':
                # Blog comment / blog post entries carry a timestamp span
                # and a description div.
                act_time = list_first_item(
                    title_selector.xpath('span/text()').extract()).strip()
                activity['type'], activity['event'], activity['time'] = \
                    act_type, event, act_time
                desc = list_first_item(
                    subselector.xpath('div/div[@class="feed_desc"]/text()').
                    extract()).strip()
                activity["desc"] = desc
            else:
                # Topic entries: the "type" text actually holds the
                # timestamp, and no separate description exists.
                activity['type'], activity['event'], activity[
                    'time'], activity['desc'] = "话题", event, act_type, event
            # BUG FIX: log message typo 'acitity' -> 'activity'.
            logging.info(
                'GECnBlogPopularUserSpider: activity\'s info %s is crawled successfully',
                name)
            yield activity

        # Pagination: either a lone "next" arrow link, or a pager block
        # whose last link may read "Next >".
        next_selector = selector.xpath('//div[@class="block_arrow"]/a')
        pager_selector = selector.xpath(
            '//div[@class="block_arrow"]/div[@class="pager"]')
        if list_first_item(next_selector.extract()) is not None:
            nexturl = list_first_item(
                next_selector.xpath('@href').extract()).strip()
            nexturl = CNBLOG_USER_HOME_URL + nexturl
            yield Request(url=nexturl,
                          callback=self.prase_activity,
                          headers=CNBLOG_MAIN_POST_HEADERS,
                          cookies=CNBOLG_COOKIE)
        elif list_first_item(pager_selector.extract()) is not None:
            next_page_href = str(pager_selector.xpath('a/@href').extract()[-1])
            next_page_text = pager_selector.xpath(
                'a/text()').extract()[-1][:-2]
            if next_page_text == 'Next':
                next_link = CNBLOG_USER_HOME_URL + next_page_href
                yield Request(url=next_link,
                              callback=self.prase_activity,
                              cookies=CNBOLG_COOKIE,
                              headers=CNBLOG_MAIN_POST_HEADERS)
예제 #3
0
    def parse(self, response):
        """Parse the popular-user ranking page.

        Yields one GECnBlogUser per table cell, collects each user's blog
        link, then schedules blog-list and profile-detail requests.
        """
        cells = Selector(response).css('td')
        for i, cell in enumerate(cells):
            # The first cell is the table header, not a user entry.
            if i == 0:
                continue
            user = GECnBlogUser()
            # "<rank>." -> drop the trailing dot.
            ranking = list_first_item(
                cell.xpath('small[1]/text()').extract()).strip()[:-1]
            # "(<post_num>,<last_post_time>,<score>)" -> strip parens, split.
            content = list_first_item(
                cell.xpath(
                    'small[2]/text()').extract()).strip()[1:-1].split(',')
            post_num = content[0].strip()
            last_post_time = content[1].strip()
            score = content[2].strip()
            link = list_first_item(
                cell.xpath("string(a[1]/@href)").extract()).strip()
            name = list_first_item(
                cell.xpath("string(a[1]/text())").extract()).strip()
            rss_url = list_first_item(
                cell.xpath("string(a[2]/@href)").extract()).strip()
            user['nickname'], user['link'], user['rss_url'] = name, link, rss_url
            user['ranking'], user['score'] = int(ranking), int(score)
            user['post_num'], user['last_post_time'] = int(post_num), last_post_time
            user['user_id'] = get_linkmd5id(user['link'])
            self.user_urls.append(user['link'])
            logging.info(
                'GECnBlogPopularUserSpider: user\'s info %s is crawled successfully',
                name)
            yield user

        for link in self.user_urls:
            # Crawl the user's blog posts.
            # BUG FIX: was `link is not ''` — identity comparison with a
            # string literal only works by interning accident and raises
            # SyntaxWarning on CPython >= 3.8.
            if link != '':
                yield Request(url=link,
                              callback=self.parse_user,
                              headers=CNBLOG_MAIN_POST_HEADERS)
            # Crawl the user's profile details.
            if link.split('/')[-2] != '':
                next_link = (link.split('/')[-2]).encode(response.encoding)
                detail_url = clean_url(CNBLOG_USER_HOME_URL + '/u/', next_link,
                                       response.encoding)
                yield Request(url=detail_url,
                              callback=self.parse_user_detail,
                              headers=CNBLOG_MAIN_POST_HEADERS,
                              cookies=CNBOLG_COOKIE)
예제 #4
0
 def item_completed(self, results, item, info):
     """Record the downloaded avatar path on the item and in MongoDB.

     Drops the item when no image was downloaded; otherwise stores the
     absolute icon path both on the item and in the matching user doc.
     """
     if item is None:
         return
     if 'icon' in item.keys():
         downloaded = [res['path'] for ok, res in results if ok]
         if list_first_item(downloaded) is None:
             item['icon_path'] = ''
             raise DropItem("Item contains no images")
         linkmd5id = get_linkmd5id(item['link'])
         user = self.db['users'].find_one({"user_id": linkmd5id})
         if user is not None:
             relative_path = list_first_str(downloaded)
             if relative_path:
                 item['icon_path'] = os.path.join(
                     os.path.abspath(self.images_store), relative_path)
             else:
                 item['icon_path'] = ""
             self.db['users'].update_one(
                 {"user_id": linkmd5id},
                 {'$set': {
                     'icon_path': item['icon_path']
                 }}, True)
             logging.info(
                 'GECNBLOGUserCoverImage: item is updated successfully')
             # The image path could also be stored in the database here.
             return item
예제 #5
0
    def parse_user_detail(self, response):
        """Parse a user's profile page into a GECnBlogUser item, then
        schedule a request for the user's activity feed.

        Yields the user item first, then a Request for the feed page.
        """
        # Profile-row label -> item field, for the rows whose value is the
        # row's own text node. Two labels below need special handling.
        simple_fields = {
            '姓名': 'name',
            '家乡': 'hometown',
            '现居住地': 'residence',
            '座右铭': 'motto',
            '自我介绍': 'intro',
            '婚姻': 'marriage',
            '工作状况': 'work_condition',
            '感兴趣的技术': 'interest',
            '最近目标': 'goal',
            'QQ': 'qq',
            '职位': 'work_position',
            '单位': 'work_unit',
            '出生日期': 'birthday',
        }

        page = Selector(response)
        user = GECnBlogUser()
        user['follow_num'] = list_first_item(
            page.xpath('//a[@id="following_count"]/text()').extract()).strip()
        user['fans_num'] = list_first_item(
            page.xpath('//a[@id="follower_count"]/text()').extract()).strip()
        user['icon'] = 'https:' + list_first_item(
            page.xpath('//img[@class="img_avatar"]/@src').extract()).strip()
        nickname = list_first_item(
            page.xpath('//h1[@class="display_name"]/text()').extract()).strip()
        user['nickname'] = nickname

        rows = page.xpath('//ul[@class="user_profile"]//li')
        for index, row in enumerate(rows):
            # Skip the first row (kept from the original behavior).
            if index == 0:
                continue
            # Labels end with a colon-like character; drop it.
            label = list_first_item(
                row.css('span::text').extract()).strip()[:-1]
            if label == "园龄":
                # NOTE(review): this absolute //span[2] path ignores `row`;
                # kept as-is to preserve the original behavior.
                user['use_time'] = list_first_item(
                    row.xpath('string(//span[2]/text())').extract()).strip()
            elif label == "博客":
                user['link'] = list_first_item(
                    row.xpath('a/@href').extract()).strip()
                user['user_id'] = get_linkmd5id(user['link'])
            elif label in simple_fields:
                user[simple_fields[label]] = list_first_item(
                    row.xpath('text()').extract()).strip()
        logging.info(
            'GECnBlogPopularUserSpider: user\'s info %s is crawled successfully',
            nickname)
        yield user

        # Crawl the user's activity feed next.
        next_link = (user['link'].split('/')[-2] + "/feed/1.html").encode(
            response.encoding)
        activity_url = clean_url(response.url, next_link, response.encoding)
        yield Request(url=activity_url,
                      callback=self.prase_activity,
                      headers=CNBLOG_MAIN_POST_HEADERS,
                      cookies=CNBOLG_COOKIE)
예제 #6
0
 def process_item(db, item, spider):
     """Persist a crawled item into MongoDB.

     Distinguishes activity, post and user items by which id field is
     present.  Returns the stored item, or for an already-known user the
     refreshed user document.
     """
     if 'activity_id' in item.keys():
         activity_detail = {
             'activity_id': item.get('activity_id'),
             'name': item.get('name'),
             'type': item.get('type'),
             'event': item.get('event'),
             'event_url': item.get('event_url'),
             'desc': item.get('desc'),
             'time': item.get('time'),
         }
         # BUG FIX: collection.insert() is deprecated/removed in pymongo;
         # insert_one returns an InsertOneResult, so the stored id is
         # result.inserted_id (str(result) would store the object repr).
         result = db['activities'].insert_one(activity_detail)
         item['mongodb_id'] = str(result.inserted_id)
         logging.info('GECnBlogUserPipeline: item is added successfully')
         return item
     elif 'post_id' in item.keys():
         post_detail = {
             'post_id': item.get('post_id'),
             'title': item.get('title'),
             'post_link': item.get('post_link'),
             'username': item.get('username'),
             'user_url': item.get('user_url'),
             'brief': item.get('brief'),
             'time': item.get('time'),
             'view_num': item.get('view_num'),
             'comment_num': item.get('comment_num')
         }
         result = db['user_post'].insert_one(post_detail)
         item['mongodb_id'] = str(result.inserted_id)
         logging.info('GECnBlogUserPipeline: item is added successfully')
         return item
     elif 'user_id' in item.keys():  # default case: user info
         linkmd5id = get_linkmd5id(item['link'])
         user = db['users'].find_one({"user_id": linkmd5id})
         if user is None:
             user_detail = {
                 'user_id': item.get('user_id'),
                 'nickname': item.get('nickname'),
                 'name': item.get('name', ''),
                 'link': item.get('link'),
                 'icon': item.get('icon', ''),
                 'sex': item.get('sex', ''),
                 'birthday': item.get('birthday', ''),
                 'ranking': item.get('ranking', 0),
                 'score': item.get('score', 0),
                 'rss_url': item.get('rss_url', ''),
                 'post_num': item.get('post_num', 0),
                 'last_post_time': item.get('last_post_time', ''),
                 'hometown': item.get('hometown', ''),
                 'residence': item.get('residence', ''),
                 'work_condition': item.get('work_condition', ''),
                 'work_position': item.get('work_position', ''),
                 'work_unit': item.get('work_unit', ''),
                 'marriage': item.get('marriage', ''),
                 'interest': item.get('interest', ''),
                 'goal': item.get('goal', ''),
                 'motto': item.get('motto', ''),
                 'intro': item.get('intro', ''),
                 'qq': item.get('qq', ''),
                 'use_time': item.get('use_time', ''),
                 'follow_num': item.get('follow_num', 0),
                 'fans_num': item.get('fans_num', 0)
             }
             result = db['users'].insert_one(user_detail)
             # BUG FIX: was str(result) — the InsertOneResult repr, not
             # the inserted document id. (This id need not be persisted.)
             item['mongodb_id'] = str(result.inserted_id)
             logging.info(
                 'GECnBlogUserPipeline: item is added successfully')
             return item
         else:
             # Update only fields that changed and carry real data.
             for key in item.keys():
                 # BUG FIX: skip the synthetic 'mongodb_id' key up front —
                 # it is never stored, so the old user[key] lookup raised
                 # KeyError; user.get() guards any other item-only key.
                 if key == 'mongodb_id':
                     continue
                 value = item[key]
                 if value != 0 and value != '' and value != user.get(key):
                     db['users'].update_one({"user_id": linkmd5id},
                                            {'$set': {
                                                key: value
                                            }})
             logging.info(
                 'GECnBlogUserPipeline: item is updated successfully')
             return db['users'].find_one({"user_id": linkmd5id})