Example #1
0
    def prase_activity(self, response):
        """Parse one page of a user's activity feed and follow pagination.

        Yields a ``GECnBlogUserActivity`` for every ``li.feed_item`` under
        ``ul#feed_list``, then at most one ``Request`` for the next feed
        page, handled by this same callback.
        """
        selector = Selector(response)
        feed_items = selector.xpath('//ul[@id="feed_list"]').css(
            'li.feed_item')
        for subselector in feed_items:
            activity = GECnBlogUserActivity()
            title_selector = subselector.xpath('div/div[@class="feed_title"]')
            name = list_first_item(
                title_selector.xpath('string(a[1]/text())').extract()).strip()
            # Bare text of the title node with its last character dropped.
            activity_type = list_first_item(
                title_selector.xpath('text()').extract()).strip()[:-1]
            event = list_first_item(
                title_selector.xpath('string(a[2]/text())').extract()).strip()
            event_url = list_first_item(
                title_selector.xpath('string(a[2]/@href)').extract()).strip()

            activity['name'] = name
            activity['event_url'] = event_url
            activity['activity_id'] = get_linkmd5id(activity['event_url'])

            if activity_type in ('评论博客', '发表博客'):
                # Blog comment / blog post entries carry their own timestamp
                # span and a separate description block.
                posted_at = list_first_item(
                    title_selector.xpath('span/text()').extract()).strip()
                activity['type'] = activity_type
                activity['event'] = event
                activity['time'] = posted_at
                activity['desc'] = list_first_item(
                    subselector.xpath('div/div[@class="feed_desc"]/text()').
                    extract()).strip()
            else:
                # Everything else is stored as a "话题" entry; the title text
                # is stored as the time and the event doubles as description.
                activity['type'] = "话题"
                activity['event'] = event
                activity['time'] = activity_type
                activity['desc'] = event
            logging.info(
                'GECnBlogPopularUserSpider: acitity\'s info %s is crawled successfully',
                name)
            yield activity

        next_selector = selector.xpath('//div[@class="block_arrow"]/a')
        pager_selector = selector.xpath(
            '//div[@class="block_arrow"]/div[@class="pager"]')
        if list_first_item(next_selector.extract()) is not None:
            # A direct link sits inside the arrow block: follow it.
            nexturl = list_first_item(
                next_selector.xpath('@href').extract()).strip()
            yield Request(url=CNBLOG_USER_HOME_URL + nexturl,
                          callback=self.prase_activity,
                          headers=CNBLOG_MAIN_POST_HEADERS,
                          cookies=CNBOLG_COOKIE)
        elif list_first_item(pager_selector.extract()) is not None:
            hrefs = pager_selector.xpath('a/@href').extract()
            labels = pager_selector.xpath('a/text()').extract()
            # A pager without anchors means there is nothing to follow;
            # indexing [-1] unguarded would raise IndexError.
            if hrefs and labels and labels[-1][:-2] == 'Next':
                yield Request(url=CNBLOG_USER_HOME_URL + str(hrefs[-1]),
                              callback=self.prase_activity,
                              cookies=CNBOLG_COOKIE,
                              headers=CNBLOG_MAIN_POST_HEADERS)
Example #2
0
    def parse(self, response):
        """Parse the user ranking table and schedule per-user requests.

        Yields a ``GECnBlogUser`` for every ``td`` cell except the first,
        records each user's link in ``self.user_urls``, then yields for every
        recorded link a blog request (``parse_user``) and, when the link has
        a non-empty second-to-last path segment, a profile request
        (``parse_user_detail``).
        """
        cells = Selector(response).css('td')
        for i, subselector in enumerate(cells):
            if i == 0:
                # The first cell is skipped, as in the original layout logic.
                continue
            user = GECnBlogUser()
            ranking = list_first_item(
                subselector.xpath('small[1]/text()').extract()).strip()[:-1]
            # Second <small> holds "<post_num>,<last_post_time>,<score>"
            # wrapped in one leading and one trailing character.
            content = list_first_item(
                subselector.xpath(
                    'small[2]/text()').extract()).strip()[1:-1].split(',')
            post_num = content[0].strip()
            last_post_time = content[1].strip()
            score = content[2].strip()
            link = list_first_item(
                subselector.xpath("string(a[1]/@href)").extract()).strip()
            name = list_first_item(
                subselector.xpath("string(a[1]/text())").extract()).strip()
            rss_url = list_first_item(
                subselector.xpath("string(a[2]/@href)").extract()).strip()

            user['nickname'] = name
            user['link'] = link
            user['ranking'] = int(ranking)
            user['score'] = int(score)
            user['rss_url'] = rss_url
            user['post_num'] = int(post_num)
            user['last_post_time'] = last_post_time
            user['user_id'] = get_linkmd5id(user['link'])
            self.user_urls.append(user['link'])
            logging.info(
                'GECnBlogPopularUserSpider: user\'s info %s is crawled successfully',
                name)
            yield user

        for link in self.user_urls:
            # Compare by value: `is not ''` tests object identity and is not
            # a reliable emptiness check.
            if not link:
                continue
            # Crawl the user's blog.
            yield Request(url=link,
                          callback=self.parse_user,
                          headers=CNBLOG_MAIN_POST_HEADERS)
            # Crawl the user's profile page; the second-to-last path segment
            # of the blog link is used as the profile identifier.
            segments = link.split('/')
            if len(segments) >= 2 and segments[-2] != '':
                next_link = segments[-2].encode(response.encoding)
                detail_url = clean_url(CNBLOG_USER_HOME_URL + '/u/', next_link,
                                       response.encoding)
                yield Request(url=detail_url,
                              callback=self.parse_user_detail,
                              headers=CNBLOG_MAIN_POST_HEADERS,
                              cookies=CNBOLG_COOKIE)
Example #3
0
    def parse(self, response):
        """Parse a post listing page and follow the "Next" pager link.

        Yields a ``GECnMainBlogPost`` for every ``div.post_item`` and, when
        the last pager anchor is labelled ``Next``, one ``Request`` for the
        following page handled by this same callback.
        """
        self.log('Hi, this is an item page! %s' % response.url)
        selector = Selector(response)
        for subselector in selector.xpath('//div[@class="post_item"]'):
            post = GECnMainBlogPost()
            title_anchor = subselector.css('a.titlelnk')
            post["recommend_num"] = list_first_item(
                subselector.css('span.diggnum').xpath('text()').extract())
            post["title"] = list_first_item(
                title_anchor.xpath('text()').extract())
            post["post_link"] = list_first_item(
                title_anchor.xpath('@href').extract())

            summary_content = subselector.css(
                'p.post_item_summary').xpath('text()').extract()
            # Prefer the second text node when there is one; fall back to the
            # first, and to an empty string when the summary has no text
            # (indexing [0] on an empty list would raise IndexError).
            if len(summary_content) > 1:
                summary = summary_content[1]
            elif summary_content:
                summary = summary_content[0]
            else:
                summary = ''
            post["brief"] = summary.strip()[:-4]

            footer = subselector.css('div.post_item_foot')
            author_anchor = footer.css('a.lightblue')
            post["username"] = list_first_item(
                author_anchor.xpath('text()').extract())
            post["user_link"] = list_first_item(
                author_anchor.xpath('@href').extract())
            # Second footer text node, minus a 4-character prefix.
            post["time"] = footer.xpath('text()').extract()[1].strip()[4:]
            # Counts are embedded in the anchor text: drop a 3-character
            # prefix and the final character before converting.
            post["comment_num"] = int(list_first_item(
                footer.css('span.article_comment a').xpath(
                    'text()').extract()).strip()[3:-1])
            post["view_num"] = int(list_first_item(
                footer.css('span.article_view a').xpath(
                    'text()').extract()).strip()[3:-1])
            # NOTE: following post["post_link"] with parse_detail is
            # currently disabled; only the listing data is emitted.
            yield post

        page_selector = selector.xpath(
            '//div[@id="pager_bottom"]/div[@id="paging_block"]/div[@class="pager"]')
        hrefs = page_selector.xpath('a/@href').extract()
        labels = page_selector.xpath('a/text()').extract()
        if not hrefs or not labels:
            # No pager anchors on this page: nothing to follow.
            return
        next_page_href = str(hrefs[-1].split('/')[-1])
        next_page_text = labels[-1][:-2]

        if next_page_text == 'Next':
            # '\\?' keeps the original runtime value (backslash + '?') without
            # relying on an invalid escape sequence.
            # NOTE(review): there is no '&' between '%22PostList%22' and
            # 'PageIndex=' -- looks like a typo in the query string; confirm
            # against the server before changing it.
            next_link = ('\\?CategoryId=808&CategoryType=%22SiteHome%22&ItemListActionName=%22PostList%22'
                         'PageIndex=' + next_page_href + '&ParentCategoryId=0').encode(response.encoding)
            next_link = clean_url(response.url, next_link, response.encoding)
            yield Request(url=next_link, callback=self.parse, cookies=CNBOLG_COOKIE, headers=CNBLOG_MAIN_POST_HEADERS,
                          body=json.dumps(getPageList(CNBLOG_MAIN_POST_PAYLOAD, next_page_href)))
    def parse_follower_item(self, response):
        """Print follower links found in the response (debugging helper).

        For each selector produced by iterating ``Selector(response)``,
        prints the ``div.avatar_name`` matches, the raw markup, and the
        cleaned URL of the first ``div.avatar_name a`` link, if any.
        Nothing is returned or yielded.
        """
        print("follower")
        selector = Selector(response)
        # NOTE(review): this iterates the Selector itself rather than the
        # result of an xpath()/css() query -- confirm that is intended.
        for subselector in selector:
            print(subselector.xpath('//div[@class="avatar_name"]'))
            print(subselector.extract())

            # extract() is required so that a string (not a selector object)
            # reaches list_first_item and the .encode() call below.
            user_url = list_first_item(
                subselector.css('div.avatar_name a').xpath('@href').extract())
            if user_url:
                user_url = user_url.encode(response.encoding)
                user_url = clean_url(response.url, user_url, response.encoding)
                print(user_url)
Example #5
0
    def parse(self, response):
        """Parse a question listing page and follow the "Next" pager link.

        Yields a ``GECnBlogQuestion`` for every ``div.one_entity`` and, when
        the last anchor in ``div#pager`` is labelled ``Next``, one ``Request``
        for the following page handled by this same callback.
        """
        self.log('Hi, this is an item page! %s' % response.url)
        selector = Selector(response)
        for subselector in selector.xpath('//div[@class="one_entity"]'):
            question = GECnBlogQuestion()
            question["reply_num"] = list_first_int(
                subselector.xpath('div[1]/div/div[1]/text()').extract())
            item_selector = subselector.xpath('div[2]')

            # The score span is optional; default to 0 when it is absent.
            has_score = list_first_item(
                item_selector.xpath('h2/span').extract()) is not None
            if has_score:
                question["score"] = list_first_int(
                    item_selector.xpath('h2/span/text()').extract())
            else:
                question["score"] = 0

            question["title"] = list_first_str(
                item_selector.xpath('h2/a/text()').extract())
            question["title_link"] = CNBLOG_QUESTION_URL + list_first_str(
                item_selector.xpath('h2/a/@href').extract())
            question["desc"] = list_first_str(
                item_selector.xpath(
                    'div[@class="news_summary"]/text()').extract())

            item_footer_selector = item_selector.xpath(
                'div[@class="news_footer"]')
            question["username"] = list_first_str(
                item_footer_selector.xpath('div[2]/a[2]/text()').extract())
            # Second text node of the footer, minus a 3-character prefix and
            # the final character.
            # NOTE(review): a string (not a list) is handed to
            # list_first_int here -- confirm the helper accepts that.
            question["view_num"] = list_first_int(
                item_footer_selector.xpath('div[2]/text()').extract()
                [1].strip()[3:-1])
            question["time"] = list_first_str(
                item_footer_selector.xpath('div[2]/span/text()').extract())

            # Tags are stored as one string, each tag followed by '|'.
            question['tag'] = ''.join(
                list_first_str(tag_selector.xpath('text()').extract()) + '|'
                for tag_selector in item_footer_selector.xpath('div[1]/a'))
            yield question

        page_selector = selector.xpath('//div[@id="pager"]')
        hrefs = page_selector.xpath('a/@href').extract()
        labels = page_selector.xpath('a/text()').extract()
        if not hrefs or not labels:
            # No pager anchors: indexing [-1] would raise IndexError.
            return
        next_page_href = hrefs[-1].strip()
        next_page_text = labels[-1].strip()[:-2]
        if next_page_text == 'Next':
            yield Request(url=CNBLOG_QUESTION_URL + next_page_href,
                          callback=self.parse)
Example #6
0
 def parse_detail(self, response):
     """Yield the first ``div#cnblogs_post_body`` match of the response.

     The value yielded is whatever ``list_first_item`` returns for the
     extracted matches of that XPath.
     """
     body_nodes = Selector(response).xpath(u'//div[@id="cnblogs_post_body"]')
     extracted = body_nodes.extract()
     yield list_first_item(extracted)
Example #7
0
    def parse_user_detail(self, response):
        """Parse a user's profile page and schedule the activity feed crawl.

        Yields a ``GECnBlogUser`` filled from the page header (follow/fan
        counts, avatar, nickname) and from the ``ul.user_profile`` rows, then
        -- when the profile exposed a blog link -- a ``Request`` for the
        first page of the user's feed, handled by ``prase_activity``.
        """
        # Profile row labels whose value is the row's own first text node,
        # mapped to the item field that stores it.
        text_fields = {
            '姓名': 'name',
            '家乡': 'hometown',
            '现居住地': 'residence',
            '座右铭': 'motto',
            '自我介绍': 'intro',
            '婚姻': 'marriage',
            '工作状况': 'work_condition',
            '感兴趣的技术': 'interest',
            '最近目标': 'goal',
            'QQ': 'qq',
            '职位': 'work_position',
            '单位': 'work_unit',
            '出生日期': 'birthday',
        }

        user = GECnBlogUser()
        selector = Selector(response)
        follow_count = list_first_item(
            selector.xpath(
                '//a[@id="following_count"]/text()').extract()).strip()
        fans_count = list_first_item(
            selector.xpath(
                '//a[@id="follower_count"]/text()').extract()).strip()
        icon = list_first_item(
            selector.xpath(
                '//img[@class="img_avatar"]/@src').extract()).strip()
        nickname = list_first_item(
            selector.xpath(
                '//h1[@class="display_name"]/text()').extract()).strip()
        user['follow_num'] = follow_count
        user['fans_num'] = fans_count
        # The avatar src is prefixed with a scheme before being stored.
        user['icon'] = 'https:' + icon
        user['nickname'] = nickname

        blog_link = None
        li_selector = selector.xpath('//ul[@class="user_profile"]//li')
        for i, subselector in enumerate(li_selector):
            if i == 0:
                # The first row is skipped.
                continue
            # Row label: first span text with its last character dropped.
            key = list_first_item(
                subselector.css('span::text').extract()).strip()[:-1]
            if key == "园龄":
                # NOTE(review): '//span[2]' is an absolute path evaluated
                # against the whole document, not this row -- confirm it
                # selects the intended span.
                user['use_time'] = list_first_item(
                    subselector.xpath(
                        'string(//span[2]/text())').extract()).strip()
            elif key == "博客":
                blog_link = list_first_item(
                    subselector.xpath('a/@href').extract()).strip()
                user['link'] = blog_link
                user['user_id'] = get_linkmd5id(user['link'])
            elif key in text_fields:
                user[text_fields[key]] = list_first_item(
                    subselector.xpath('text()').extract()).strip()
        logging.info(
            'GECnBlogPopularUserSpider: user\'s info %s is crawled successfully',
            nickname)
        yield user

        # Crawl the user's activity feed. Without a usable blog link there is
        # no identifier to build the feed URL from, so stop here instead of
        # failing on the missing field.
        if not blog_link:
            return
        segments = blog_link.split('/')
        if len(segments) < 2:
            return
        next_link = (segments[-2] + "/feed/1.html").encode(
            response.encoding)
        activity_url = clean_url(response.url, next_link, response.encoding)
        yield Request(url=activity_url,
                      callback=self.prase_activity,
                      headers=CNBLOG_MAIN_POST_HEADERS,
                      cookies=CNBOLG_COOKIE)