import json
import logging

from scrapy import Request
from scrapy.selector import Selector

# Project-level names used below (GECnBlogUser, GECnBlogUserActivity,
# GECnMainBlogPost, GECnBlogQuestion, list_first_item, list_first_str,
# list_first_int, get_linkmd5id, clean_url, getPageList, and the CNBLOG_* /
# CNBOLG_COOKIE constants) are assumed to be imported elsewhere in this
# module.


def parse_activity(self, response):
    """Parse one page of a user's activity feed, then follow pagination."""
    selector = Selector(response)
    item_selector = selector.xpath('//ul[@id="feed_list"]').css('li.feed_item')
    for subselector in item_selector:
        activity = GECnBlogUserActivity()
        title_selector = subselector.xpath('div/div[@class="feed_title"]')
        name = list_first_item(
            title_selector.xpath('string(a[1]/text())').extract()).strip()
        # The bare text node between the links ends with a separator
        # character; drop it.
        activity_type = list_first_item(
            title_selector.xpath('text()').extract()).strip()[:-1]
        event = list_first_item(
            title_selector.xpath('string(a[2]/text())').extract()).strip()
        event_url = list_first_item(
            title_selector.xpath('string(a[2]/@href)').extract()).strip()
        activity['name'], activity['event_url'] = name, event_url
        activity['activity_id'] = get_linkmd5id(activity['event_url'])
        if activity_type in ('评论博客', '发表博客'):
            time = list_first_item(
                title_selector.xpath('span/text()').extract()).strip()
            activity['type'], activity['event'], activity['time'] = (
                activity_type, event, time)
            activity['desc'] = list_first_item(
                subselector.xpath(
                    'div/div[@class="feed_desc"]/text()').extract()).strip()
        else:
            # Topic entries carry no separate timestamp or description.
            activity['type'], activity['event'] = '话题', event
            activity['time'], activity['desc'] = activity_type, event
        logging.info(
            "GECnBlogPopularUserSpider: activity's info %s is crawled successfully",
            name)
        yield activity

    # Pagination: the feed exposes either a "next" arrow or a numbered pager.
    next_selector = selector.xpath('//div[@class="block_arrow"]/a')
    pager_selector = selector.xpath(
        '//div[@class="block_arrow"]/div[@class="pager"]')
    if list_first_item(next_selector.extract()) is not None:
        nexturl = list_first_item(
            next_selector.xpath('@href').extract()).strip()
        nexturl = CNBLOG_USER_HOME_URL + nexturl
        yield Request(url=nexturl,
                      callback=self.parse_activity,
                      headers=CNBLOG_MAIN_POST_HEADERS,
                      cookies=CNBOLG_COOKIE)
    elif list_first_item(pager_selector.extract()) is not None:
        next_page_href = str(pager_selector.xpath('a/@href').extract()[-1])
        next_page_text = pager_selector.xpath('a/text()').extract()[-1][:-2]
        if next_page_text == 'Next':
            next_link = CNBLOG_USER_HOME_URL + next_page_href
            yield Request(url=next_link,
                          callback=self.parse_activity,
                          cookies=CNBOLG_COOKIE,
                          headers=CNBLOG_MAIN_POST_HEADERS)
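# `list_first_item` and `get_linkmd5id` are project helpers that are not
# defined in this file. The sketches below are assumptions inferred from the
# call sites above, not the project's actual code.

def list_first_item(lst):
    # Return the first element of a list, or None when it is empty.
    return lst[0] if lst else None


def get_linkmd5id(link):
    # Derive a stable item ID from a URL via its MD5 hex digest
    # (hypothetical sketch; the real helper may normalize the URL first).
    import hashlib
    return hashlib.md5(link.encode('utf-8')).hexdigest()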
def parse(self, response):
    """Parse the popular-user ranking table, then follow each user's blog
    and profile pages."""
    selector = Selector(response).css('td')
    for i, subselector in enumerate(selector):
        if i == 0:  # Skip the first cell (table header).
            continue
        user = GECnBlogUser()
        ranking = list_first_item(
            subselector.xpath('small[1]/text()').extract()).strip()[:-1]
        # The second <small> holds "(post_num, last_post_time, score)".
        content = list_first_item(
            subselector.xpath('small[2]/text()').extract()
        ).strip()[1:-1].split(',')
        post_num = content[0].strip()
        last_post_time = content[1].strip()
        score = content[2].strip()
        link = list_first_item(
            subselector.xpath('string(a[1]/@href)').extract()).strip()
        name = list_first_item(
            subselector.xpath('string(a[1]/text())').extract()).strip()
        rss_url = list_first_item(
            subselector.xpath('string(a[2]/@href)').extract()).strip()
        user['nickname'], user['link'], user['rss_url'] = name, link, rss_url
        user['ranking'], user['score'] = int(ranking), int(score)
        user['post_num'], user['last_post_time'] = int(post_num), last_post_time
        user['user_id'] = get_linkmd5id(user['link'])
        self.user_urls.append(user['link'])
        logging.info(
            "GECnBlogPopularUserSpider: user's info %s is crawled successfully",
            name)
        yield user

    for link in self.user_urls:
        # Crawl the user's blog. (The original `link is not ''` compared
        # identity, not equality.)
        if link != '':
            yield Request(url=link,
                          callback=self.parse_user,
                          headers=CNBLOG_MAIN_POST_HEADERS)
        # Crawl the user's profile details.
        if link.split('/')[-2] != '':
            next_link = (link.split('/')[-2]).encode(response.encoding)
            detail_url = clean_url(CNBLOG_USER_HOME_URL + '/u/', next_link,
                                   response.encoding)
            yield Request(url=detail_url,
                          callback=self.parse_user_detail,
                          headers=CNBLOG_MAIN_POST_HEADERS,
                          cookies=CNBOLG_COOKIE)
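# `clean_url` is another undefined project helper; from its call sites it
# resolves a (possibly byte-encoded) link against a base URL. A minimal
# sketch under that assumption:

def clean_url(base_url, link, encoding):
    # Decode bytes using the response encoding, then resolve the link
    # relative to the base URL.
    from urllib.parse import urljoin
    if isinstance(link, bytes):
        link = link.decode(encoding)
    return urljoin(base_url, link)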
def parse(self, response):
    """Parse the cnblogs front-page post list and follow pagination."""
    self.log('Hi, this is an item page! %s' % response.url)
    selector = Selector(response)
    for subselector in selector.xpath('//div[@class="post_item"]'):
        post = GECnMainBlogPost()
        post['recommend_num'] = list_first_item(
            subselector.css('span.diggnum').xpath('text()').extract())
        post['title'] = list_first_item(
            subselector.css('a.titlelnk').xpath('text()').extract())
        post['post_link'] = list_first_item(
            subselector.css('a.titlelnk').xpath('@href').extract())
        # Prefer the second text node of the summary when present (the
        # first may be whitespace around the avatar link).
        summary_content = subselector.css(
            'p.post_item_summary').xpath('text()').extract()
        if len(summary_content) > 1:
            summary = summary_content[1]
        else:
            summary = summary_content[0]
        post['brief'] = summary.strip()[:-4]
        footer = subselector.css('div.post_item_foot')
        post['username'] = list_first_item(
            footer.css('a.lightblue').xpath('text()').extract())
        post['user_link'] = list_first_item(
            footer.css('a.lightblue').xpath('@href').extract())
        # Slice off the leading label and surrounding parentheses, e.g.
        # "发布于 ...", "评论(n)", "阅读(n)".
        post['time'] = footer.xpath('text()').extract()[1].strip()[4:]
        post['comment_num'] = int(list_first_item(
            footer.css('span.article_comment a').xpath('text()').extract()
        ).strip()[3:-1])
        post['view_num'] = int(list_first_item(
            footer.css('span.article_view a').xpath('text()').extract()
        ).strip()[3:-1])
        # if post['post_link']:
        #     yield Request(url=post['post_link'], callback=self.parse_detail)
        yield post

    page_selector = selector.xpath(
        '//div[@id="pager_bottom"]/div[@id="paging_block"]'
        '/div[@class="pager"]')
    next_page_href = str(
        page_selector.xpath('a/@href').extract()[-1].split('/')[-1])
    next_page_text = page_selector.xpath('a/text()').extract()[-1][:-2]
    if next_page_text == 'Next':
        # The original literal had a stray "\?" escape and was missing the
        # "&" before PageIndex.
        next_link = ('?CategoryId=808&CategoryType=%22SiteHome%22'
                     '&ItemListActionName=%22PostList%22'
                     '&PageIndex=' + next_page_href +
                     '&ParentCategoryId=0').encode(response.encoding)
        next_link = clean_url(response.url, next_link, response.encoding)
        yield Request(url=next_link,
                      callback=self.parse,
                      cookies=CNBOLG_COOKIE,
                      headers=CNBLOG_MAIN_POST_HEADERS,
                      body=json.dumps(
                          getPageList(CNBLOG_MAIN_POST_PAYLOAD,
                                      next_page_href)))
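# `getPageList` builds the JSON payload for the post-list POST request; the
# exact payload shape is project-specific and not shown here. A plausible
# sketch, assuming the payload is a dict template whose page index gets
# overwritten:

def getPageList(payload_template, page_index):
    # Copy the template so repeated requests do not share state.
    payload = dict(payload_template)
    payload['PageIndex'] = int(page_index)
    return payload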
def parse_follower_item(self, response):
    """Parse a page of a user's followers.

    Work in progress: extracted user URLs are only printed for inspection,
    not yet followed. (The original also dropped `.extract()` on the href
    query and iterated the Selector itself rather than the entries.)
    """
    selector = Selector(response)
    for subselector in selector.xpath('//div[@class="avatar_name"]'):
        user_url = list_first_item(
            subselector.css('a').xpath('@href').extract())
        if user_url:
            user_url = user_url.encode(response.encoding)
            user_url = clean_url(response.url, user_url, response.encoding)
            print(user_url)
def parse(self, response):
    """Parse the Q&A list page and follow pagination."""
    self.log('Hi, this is an item page! %s' % response.url)
    selector = Selector(response)
    for subselector in selector.xpath('//div[@class="one_entity"]'):
        question = GECnBlogQuestion()
        question['reply_num'] = list_first_int(
            subselector.xpath('div[1]/div/div[1]/text()').extract())
        item_selector = subselector.xpath('div[2]')
        # A <span> in the heading marks a bounty score; default to 0.
        if list_first_item(
                item_selector.xpath('h2/span').extract()) is not None:
            question['score'] = list_first_int(
                item_selector.xpath('h2/span/text()').extract())
        else:
            question['score'] = 0
        question['title'] = list_first_str(
            item_selector.xpath('h2/a/text()').extract())
        question['title_link'] = CNBLOG_QUESTION_URL + list_first_str(
            item_selector.xpath('h2/a/@href').extract())
        question['desc'] = list_first_str(
            item_selector.xpath(
                'div[@class="news_summary"]/text()').extract())
        item_footer_selector = item_selector.xpath(
            'div[@class="news_footer"]')
        question['username'] = list_first_str(
            item_footer_selector.xpath('div[2]/a[2]/text()').extract())
        question['view_num'] = list_first_int(
            item_footer_selector.xpath(
                'div[2]/text()').extract()[1].strip()[3:-1])
        question['time'] = list_first_str(
            item_footer_selector.xpath('div[2]/span/text()').extract())
        # Join the question's tags with "|" as the delimiter.
        tag_str = ''
        for tag_selector in item_footer_selector.xpath('div[1]/a'):
            tag_str += list_first_str(
                tag_selector.xpath('text()').extract()) + '|'
        question['tag'] = tag_str
        yield question

    page_selector = selector.xpath('//div[@id="pager"]')
    next_page_href = page_selector.xpath('a/@href').extract()[-1].strip()
    next_page_text = page_selector.xpath(
        'a/text()').extract()[-1].strip()[:-2]
    if next_page_text == 'Next':
        next_url = CNBLOG_QUESTION_URL + next_page_href
        yield Request(url=next_url, callback=self.parse)
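# `list_first_str` and `list_first_int` are also project helpers. Sketches
# consistent with their use above (note that one call site passes a bare
# string rather than a list, so the int variant tolerates both):

def list_first_str(lst):
    # First element, stripped; empty string when the list is empty.
    item = lst[0] if lst else None
    return item.strip() if item else ''


def list_first_int(value):
    # Coerce the first element (or a bare string/number) to int, default 0.
    if isinstance(value, list):
        value = value[0] if value else None
    try:
        return int(str(value).strip())
    except (TypeError, ValueError):
        return 0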
def parse_detail(self, response):
    """Extract the raw HTML body of a single post.

    Note: this yields a bare string, which Scrapy rejects (callbacks must
    return Requests, items, dicts, or None); wrapping it in an item is
    still a TODO.
    """
    response_selector = Selector(response)
    yield list_first_item(
        response_selector.xpath(
            '//div[@id="cnblogs_post_body"]').extract())
# Mapping from the profile label on the page to the corresponding item
# field; all of these entries share the same `text()` extraction, so the
# original long if/elif chain collapses to one lookup.
PROFILE_FIELDS = {
    '姓名': 'name',
    '家乡': 'hometown',
    '现居住地': 'residence',
    '座右铭': 'motto',
    '自我介绍': 'intro',
    '婚姻': 'marriage',
    '工作状况': 'work_condition',
    '感兴趣的技术': 'interest',
    '最近目标': 'goal',
    'QQ': 'qq',
    '职位': 'work_position',
    '单位': 'work_unit',
    '出生日期': 'birthday',
}


def parse_user_detail(self, response):
    """Parse a user's profile page, then follow the activity feed."""
    user = GECnBlogUser()
    selector = Selector(response)
    follow_count = list_first_item(
        selector.xpath('//a[@id="following_count"]/text()').extract()).strip()
    fans_count = list_first_item(
        selector.xpath('//a[@id="follower_count"]/text()').extract()).strip()
    icon = list_first_item(
        selector.xpath('//img[@class="img_avatar"]/@src').extract()).strip()
    nickname = list_first_item(
        selector.xpath('//h1[@class="display_name"]/text()').extract()).strip()
    user['follow_num'], user['fans_num'] = follow_count, fans_count
    user['icon'], user['nickname'] = 'https:' + icon, nickname

    li_selector = selector.xpath('//ul[@class="user_profile"]//li')
    for i, subselector in enumerate(li_selector):
        if i == 0:  # Skip the first entry; it is not a label/value row.
            continue
        # Label text minus its trailing "：" separator.
        key = list_first_item(
            subselector.css('span::text').extract()).strip()[:-1]
        if key == '园龄':
            # Relative path: the original absolute '//span[2]' would have
            # matched the document's second <span>, not this item's.
            user['use_time'] = list_first_item(
                subselector.xpath('string(span[2])').extract()).strip()
        elif key == '博客':
            user['link'] = list_first_item(
                subselector.xpath('a/@href').extract()).strip()
            user['user_id'] = get_linkmd5id(user['link'])
        elif key in PROFILE_FIELDS:
            user[PROFILE_FIELDS[key]] = list_first_item(
                subselector.xpath('text()').extract()).strip()
    logging.info(
        "GECnBlogPopularUserSpider: user's info %s is crawled successfully",
        nickname)
    yield user

    # Crawl the activity feed.
    next_link = (user['link'].split('/')[-2] + '/feed/1.html').encode(
        response.encoding)
    activity_url = clean_url(response.url, next_link, response.encoding)
    yield Request(url=activity_url,
                  callback=self.parse_activity,
                  headers=CNBLOG_MAIN_POST_HEADERS,
                  cookies=CNBOLG_COOKIE)
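# The GECnBlog* items referenced above are Scrapy Items declared elsewhere.
# For orientation, here is a sketch of GECnBlogUser with the fields assigned
# in this file; the field list is inferred from the assignments, not the
# real definition.

import scrapy


class GECnBlogUser(scrapy.Item):
    user_id = scrapy.Field()
    nickname = scrapy.Field()
    name = scrapy.Field()
    link = scrapy.Field()
    rss_url = scrapy.Field()
    icon = scrapy.Field()
    ranking = scrapy.Field()
    score = scrapy.Field()
    post_num = scrapy.Field()
    last_post_time = scrapy.Field()
    follow_num = scrapy.Field()
    fans_num = scrapy.Field()
    use_time = scrapy.Field()
    hometown = scrapy.Field()
    residence = scrapy.Field()
    motto = scrapy.Field()
    intro = scrapy.Field()
    marriage = scrapy.Field()
    work_condition = scrapy.Field()
    work_position = scrapy.Field()
    work_unit = scrapy.Field()
    interest = scrapy.Field()
    goal = scrapy.Field()
    qq = scrapy.Field()
    birthday = scrapy.Field()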