def parse_user(self, response):
    """Parse a user's blog listing page and yield one item per post.

    The leading ``div.day`` node is skipped, mirroring the page layout
    this spider targets; every remaining node describes one post.
    """
    sel = Selector(response)
    day_nodes = sel.xpath('//div[@class="day"]')
    for node in day_nodes[1:]:
        item = GECnUserBlogPost()
        heading = node.xpath('div[@class="postTitle"]/a/text()').extract()
        href = node.xpath('div[@class="postTitle"]/a/@href').extract()
        summary = ''.join(
            node.xpath('div[@class="postCon"]/div/text()').extract())
        footer_fields = list_first_str(
            node.xpath('div[@class="postDesc"]/text()').extract()).split(' ')
        item['title'] = list_first_str(heading)
        item['post_link'] = list_first_str(href)
        item['brief'] = summary
        # Footer is space-separated: date/time at indexes 2-3, author at 4,
        # view and comment counters at 5 and 6 (label stripped via [3:-1]).
        item['username'] = footer_fields[4]
        item['time'] = footer_fields[2] + ' ' + footer_fields[3]
        item['view_num'] = footer_fields[5].strip()[3:-1]
        item['comment_num'] = footer_fields[6].strip()[3:-1]
        item['user_url'] = response.url
        item['post_id'] = get_linkmd5id(item['post_link'])
        logging.info(
            'GECnBlogPopularUserSpider: post\'s info %s is crawled successfully',
            item['title'])
        yield item
def prase_activity(self, response):
    """Parse one page of a user's activity feed, then follow pagination.

    NOTE(review): the method name contains a typo ("prase"); it is kept
    unchanged because the method schedules itself as its own callback.
    """
    sel = Selector(response)
    feed_nodes = sel.xpath('//ul[@id="feed_list"]').css('li.feed_item')
    for node in feed_nodes:
        activity = GECnBlogUserActivity()
        title_sel = node.xpath('div/div[@class="feed_title"]')
        name = list_first_item(
            title_sel.xpath('string(a[1]/text())').extract()).strip()
        act_type = list_first_item(
            title_sel.xpath('text()').extract()).strip()[:-1]
        event = list_first_item(
            title_sel.xpath('string(a[2]/text())').extract()).strip()
        event_url = list_first_item(
            title_sel.xpath('string(a[2]/@href)').extract()).strip()
        activity['name'] = name
        activity['event_url'] = event_url
        activity['activity_id'] = get_linkmd5id(activity['event_url'])
        if act_type in ('评论博客', '发表博客'):
            # Blog-comment / blog-post activities carry their own timestamp
            # in a <span> plus a separate description block.
            when = list_first_item(
                title_sel.xpath('span/text()').extract()).strip()
            activity['type'] = act_type
            activity['event'] = event
            activity['time'] = when
            activity['desc'] = list_first_item(
                node.xpath(
                    'div/div[@class="feed_desc"]/text()').extract()).strip()
        else:
            # Any other entry is recorded as a topic; the raw type string is
            # reused as the timestamp and the event doubles as description.
            activity['type'] = "话题"
            activity['event'] = event
            activity['time'] = act_type
            activity['desc'] = event
        logging.info(
            'GECnBlogPopularUserSpider: acitity\'s info %s is crawled successfully',
            name)
        yield activity
    # Pagination: a "next" arrow link takes priority over the numeric pager.
    next_sel = sel.xpath('//div[@class="block_arrow"]/a')
    pager_sel = sel.xpath(
        '//div[@class="block_arrow"]/div[@class="pager"]')
    if list_first_item(next_sel.extract()) is not None:
        nexturl = CNBLOG_USER_HOME_URL + list_first_item(
            next_sel.xpath('@href').extract()).strip()
        yield Request(url=nexturl,
                      callback=self.prase_activity,
                      headers=CNBLOG_MAIN_POST_HEADERS,
                      cookies=CNBOLG_COOKIE)
    elif list_first_item(pager_sel.extract()) is not None:
        last_href = str(pager_sel.xpath('a/@href').extract()[-1])
        last_text = pager_sel.xpath('a/text()').extract()[-1][:-2]
        if last_text == 'Next':
            yield Request(url=CNBLOG_USER_HOME_URL + last_href,
                          callback=self.prase_activity,
                          cookies=CNBOLG_COOKIE,
                          headers=CNBLOG_MAIN_POST_HEADERS)
def parse(self, response):
    """Parse the popular-user ranking table and yield one user item per row.

    After the table is consumed, schedules two follow-up requests for each
    collected user link: one for the blog listing (``parse_user``) and one
    for the profile page (``parse_user_detail``).
    """
    selector = Selector(response).css('td')
    for i, subselector in enumerate(selector):
        if i == 0:
            # First <td> is the table header row.
            continue
        user = GECnBlogUser()
        ranking = list_first_item(
            subselector.xpath('small[1]/text()').extract()).strip()[:-1]
        # "<post_num>, <last_post_time>, <score>" wrapped in brackets.
        content = list_first_item(
            subselector.xpath(
                'small[2]/text()').extract()).strip()[1:-1].split(',')
        post_num, last_post_time, score = (
            content[0].strip(), content[1].strip(), content[2].strip())
        link = list_first_item(
            subselector.xpath("string(a[1]/@href)").extract()).strip()
        name = list_first_item(
            subselector.xpath("string(a[1]/text())").extract()).strip()
        rss_url = list_first_item(
            subselector.xpath("string(a[2]/@href)").extract()).strip()
        user['nickname'], user['link'], user['ranking'], user['score'], \
            user['rss_url'] = name, link, int(ranking), int(score), rss_url
        user['post_num'], user['last_post_time'] = int(post_num), last_post_time
        user['user_id'] = get_linkmd5id(user['link'])
        self.user_urls.append(user['link'])
        logging.info(
            'GECnBlogPopularUserSpider: user\'s info %s is crawled successfully',
            name)
        yield user
    for link in self.user_urls:
        # BUGFIX: was `if link is not ''` — an identity comparison with a
        # string literal, which is not a reliable emptiness test. An empty
        # link also used to fall through to `link.split('/')[-2]` below and
        # raise IndexError; skip empty links entirely instead.
        if not link:
            continue
        # Crawl the user's blog.
        yield Request(url=link,
                      callback=self.parse_user,
                      headers=CNBLOG_MAIN_POST_HEADERS)
        # Crawl the user's profile page.
        user_slug = link.split('/')[-2]
        if user_slug != '':
            next_link = user_slug.encode(response.encoding)
            detail_url = clean_url(CNBLOG_USER_HOME_URL + '/u/', next_link,
                                   response.encoding)
            yield Request(url=detail_url,
                          callback=self.parse_user_detail,
                          headers=CNBLOG_MAIN_POST_HEADERS,
                          cookies=CNBOLG_COOKIE)
def item_completed(self, results, item, info):
    """Image-pipeline completion hook: record the downloaded icon path.

    Drops the item when an icon was expected but no image came back, and
    mirrors the resolved absolute path into the ``users`` collection.
    """
    if item is None:
        return
    if 'icon' not in item:
        return item
    downloaded = [data['path'] for ok, data in results if ok]
    if list_first_item(downloaded) is None:
        item['icon_path'] = ''
        raise DropItem("Item contains no images")
    user_key = get_linkmd5id(item['link'])
    stored_user = self.db['users'].find_one({"user_id": user_key})
    if stored_user is not None:
        rel_path = list_first_str(downloaded)
        if rel_path:
            item['icon_path'] = os.path.join(
                os.path.abspath(self.images_store), rel_path)
        else:
            item['icon_path'] = ""
        # Upsert the resolved icon path into the stored user document.
        self.db['users'].update_one(
            {"user_id": user_key},
            {'$set': {'icon_path': item['icon_path']}},
            True)
        logging.info(
            'GECNBLOGUserCoverImage: item is updated successfully')
    # The image path can also be persisted alongside the item itself.
    return item
def parse_user_detail(self, response):
    """Parse a user's profile page, yield the user item, then schedule the
    activity-feed crawl.

    The profile is a ``ul.user_profile`` list of "<label>: <value>" rows.
    Most rows map one-to-one onto item fields and are handled through a
    lookup table instead of the original 13-branch if/elif chain; only the
    two rows with special extraction ('园龄', '博客') keep explicit branches.
    """
    # Label -> item-field mapping for the uniform rows whose value is the
    # <li>'s own text().
    field_by_label = {
        '姓名': 'name',
        '家乡': 'hometown',
        '现居住地': 'residence',
        '座右铭': 'motto',
        '自我介绍': 'intro',
        '婚姻': 'marriage',
        '工作状况': 'work_condition',
        '感兴趣的技术': 'interest',
        '最近目标': 'goal',
        'QQ': 'qq',
        '职位': 'work_position',
        '单位': 'work_unit',
        '出生日期': 'birthday',
    }
    user = GECnBlogUser()
    selector = Selector(response)
    follow_count = list_first_item(
        selector.xpath(
            '//a[@id="following_count"]/text()').extract()).strip()
    fans_count = list_first_item(
        selector.xpath(
            '//a[@id="follower_count"]/text()').extract()).strip()
    icon = list_first_item(
        selector.xpath(
            '//img[@class="img_avatar"]/@src').extract()).strip()
    nickname = list_first_item(
        selector.xpath(
            '//h1[@class="display_name"]/text()').extract()).strip()
    user['follow_num'], user['fans_num'], user['icon'], user['nickname'] = \
        follow_count, fans_count, 'https:' + icon, nickname
    li_selector = selector.xpath('//ul[@class="user_profile"]//li')
    for i, subselector in enumerate(li_selector):
        if i == 0:
            # First row is skipped, as in the original layout handling.
            continue
        # Row label, with the trailing colon removed.
        key = list_first_item(
            subselector.css('span::text').extract()).strip()[:-1]
        if key == "园龄":
            # NOTE(review): '//span[2]' is an absolute path, so this reads
            # from the whole document rather than this <li> — kept as-is,
            # but confirm it is intended.
            user['use_time'] = list_first_item(
                subselector.xpath(
                    'string(//span[2]/text())').extract()).strip()
        elif key == "博客":
            user['link'] = list_first_item(
                subselector.xpath('a/@href').extract()).strip()
            user['user_id'] = get_linkmd5id(user['link'])
        elif key in field_by_label:
            user[field_by_label[key]] = list_first_item(
                subselector.xpath('text()').extract()).strip()
    logging.info(
        'GECnBlogPopularUserSpider: user\'s info %s is crawled successfully',
        nickname)
    yield user
    # Crawl the user's activity feed. Assumes the '博客' row was present so
    # user['link'] is set — a missing row raises KeyError, as before.
    next_link = (user['link'].split('/')[-2] + "/feed/1.html").encode(
        response.encoding)
    activity_url = clean_url(response.url, next_link, response.encoding)
    yield Request(url=activity_url,
                  callback=self.prase_activity,
                  headers=CNBLOG_MAIN_POST_HEADERS,
                  cookies=CNBOLG_COOKIE)
def process_item(db, item, spider):
    """Persist a crawled item, dispatching on which id field it carries.

    Distinguishes activity items, post items, and (by default) user items.
    Returns the item, or — for an already-stored user that was updated —
    the stored user document.
    """
    if 'activity_id' in item:
        activity_detail = {
            'activity_id': item.get('activity_id'),
            'name': item.get('name'),
            'type': item.get('type'),
            'event': item.get('event'),
            'event_url': item.get('event_url'),
            'desc': item.get('desc'),
            'time': item.get('time'),
        }
        # BUGFIX: was the deprecated Collection.insert(); insert_one() with
        # .inserted_id is the supported pymongo equivalent.
        result = db['activities'].insert_one(activity_detail)
        item['mongodb_id'] = str(result.inserted_id)
        logging.info('GECnBlogUserPipeline: item is added successfully')
        return item
    elif 'post_id' in item:
        post_detail = {
            'post_id': item.get('post_id'),
            'title': item.get('title'),
            'post_link': item.get('post_link'),
            'username': item.get('username'),
            'user_url': item.get('user_url'),
            'brief': item.get('brief'),
            'time': item.get('time'),
            'view_num': item.get('view_num'),
            'comment_num': item.get('comment_num')
        }
        # BUGFIX: deprecated Collection.insert() replaced (see above).
        result = db['user_post'].insert_one(post_detail)
        item['mongodb_id'] = str(result.inserted_id)
        logging.info('GECnBlogUserPipeline: item is added successfully')
        return item
    elif 'user_id' in item:
        # Default case: user info.
        linkmd5id = get_linkmd5id(item['link'])
        user = db['users'].find_one({"user_id": linkmd5id})
        if user is None:
            user_detail = {
                'user_id': item.get('user_id'),
                'nickname': item.get('nickname'),
                'name': item.get('name', ''),
                'link': item.get('link'),
                'icon': item.get('icon', ''),
                'sex': item.get('sex', ''),
                'birthday': item.get('birthday', ''),
                'ranking': item.get('ranking', 0),
                'score': item.get('score', 0),
                'rss_url': item.get('rss_url', ''),
                'post_num': item.get('post_num', 0),
                'last_post_time': item.get('last_post_time', ''),
                'hometown': item.get('hometown', ''),
                'residence': item.get('residence', ''),
                'work_condition': item.get('work_condition', ''),
                'work_position': item.get('work_position', ''),
                'work_unit': item.get('work_unit', ''),
                'marriage': item.get('marriage', ''),
                'interest': item.get('interest', ''),
                'goal': item.get('goal', ''),
                'motto': item.get('motto', ''),
                'intro': item.get('intro', ''),
                'qq': item.get('qq', ''),
                'use_time': item.get('use_time', ''),
                'follow_num': item.get('follow_num', 0),
                'fans_num': item.get('fans_num', 0)
            }
            result = db['users'].insert_one(user_detail)
            # BUGFIX: str(result) stringified the InsertOneResult wrapper
            # object, not the document id — use .inserted_id like the
            # other branches. (The id is not written back to the DB.)
            item['mongodb_id'] = str(result.inserted_id)
            logging.info(
                'GECnBlogUserPipeline: item is added successfully')
            return item
        else:
            # Push every non-empty, changed field onto the stored document.
            for key in item.keys():
                if key == 'mongodb_id':
                    # BUGFIX: this guard ran last, after user[key] had
                    # already been evaluated — which raised KeyError when
                    # the stored document lacked the field; user.get(key)
                    # below also tolerates missing fields.
                    continue
                if item[key] != user.get(key) and item[key] != 0 \
                        and item[key] != '':
                    db['users'].update_one({"user_id": linkmd5id},
                                           {'$set': {key: item[key]}})
            logging.info(
                'GECnBlogUserPipeline: item is updated successfully')
            return db['users'].find_one({"user_id": linkmd5id})