def get_user_by_search(user, number=1, start=0):
    """Search Zhihu for users matching a keyword.

    Requests the search endpoint ten results at a time, beginning at offset
    ``start``, until ``number`` results are collected or a page comes back
    with fewer than ten entries.

    Args:
        user: Keyword to search for.
        number: Maximum number of results wanted; values <= 0 are treated as 1.
        start: Offset of the first search result to request.

    Returns:
        A ``(user_tokens, user_infos)`` tuple. ``user_tokens`` holds the
        ``data-token`` attribute of every result's ``li`` element.
        ``user_infos`` holds, per result, a list of HTML fragments rendered
        as strings: the first ``img``, the ``author-link`` anchor, the
        ``bio`` span, followed by the last three ``a`` tags.

    Raises:
        MethodParamError: If ``user`` is not a ``str`` or ``number`` is not
            an ``int``.
    """
    if not isinstance(user, str):
        raise MethodParamError('Parameter \'user\' must be an instance of \'str\'!')
    if not isinstance(number, int):
        raise MethodParamError('Parameter \'number\' must be an instance of \'int\'!')
    if number <= 0:
        number = 1

    keyword = quote(user)
    position = start
    user_tokens = []
    user_htmls = []
    while len(user_tokens) < number:
        response = requests.get(
            zhihu_search_url.format(key=keyword, offset=position),
            headers=zhihu_header)
        # A response without an 'htmls' entry is treated as an empty page
        # instead of crashing on None.
        page_htmls = response.json().get('htmls') or []
        for html in page_htmls:
            bs = BeautifulSoup(html, 'lxml')
            user_tokens.append(bs.li.attrs['data-token'])
            user_htmls.append(bs.li)
        if len(page_htmls) < 10:
            # A short page means the search has no further results.
            break
        position += 10

    # The last page may have pushed us past the requested amount.
    user_tokens = user_tokens[:number]
    user_htmls = user_htmls[:number]

    user_infos = []
    for user_li in user_htmls:
        # attrs must be a dict; the previous set literals were matched as a
        # list of alternative class names.
        item = [
            str(user_li.find('img')),
            str(user_li.find('a', {'class': 'author-link'})),
            str(user_li.find('span', {'class': 'bio'})),
        ]
        item += [str(it) for it in user_li.find_all('a')[-3:]]
        user_infos.append(item)
    return user_tokens, user_infos
def scrape_user_follows(self, id, number=0):
    """Scrape the accounts followed by a Weibo user.

    Args:
        id: Numeric Weibo user id.
        number: How many follows to collect. Values <= 0 mean 10; positive
            values are capped by the ``count`` reported on the first page.

    Returns:
        A list of ``WeiboUserItem``. The list is also stored in
        ``self.scraped_follows[id]``.

    Raises:
        MethodParamError: If ``id`` or ``number`` is not an ``int``.
    """
    if not isinstance(id, int):
        raise MethodParamError(
            'Parameter \'id\' isn\'t an instance of type \'int\'!')
    if not isinstance(number, int):
        raise MethodParamError(
            'Parameter \'number\' isn\'t an instance of type \'int\'!')
    if log_weibo:
        logging.info('Scraping follows of weibo user: %d...' % id)

    # The first page is requested up front only to read the reported total.
    response = requests.get(weibo_user_follow_url.format(uid=id, page=1))
    total = response.json().get('count')
    if number <= 0:
        need_count = 10
    else:
        need_count = min(number, total)

    follows = []
    position = 0
    while len(follows) < need_count:
        position += 1
        response = requests.get(
            weibo_user_follow_url.format(uid=id, page=position))
        cards = response.json().get('cards')
        if not cards:
            # No more pages: stop here, otherwise the loop would keep
            # requesting ever-higher page numbers forever.
            break
        for card in cards:
            if len(follows) >= need_count:
                break
            user = card.get('user')
            item = WeiboUserItem()
            item.id = user.get('id')
            item.profile_url = 'https://weibo.com/u/{uid}'.format(
                uid=item.id)
            item.name = user.get('screen_name')
            item.gender = '男' if user.get('gender') == 'm' else '女'
            item.avatar_url = user.get('profile_image_url')
            item.description = user.get('description')
            item.weibo_count = user.get('statuses_count')
            item.follow_count = user.get('follow_count')
            item.fans_count = user.get('followers_count')

            # Location and sign-up time come from the separate info endpoint.
            response_info = requests.get(
                weibo_user_info_url.format(uid1=item.id, uid2=item.id))
            for info_card in response_info.json().get('cards'):
                if info_card.get('card_type') != 11:
                    continue
                for card_inner in info_card.get('card_group'):
                    if card_inner.get('card_type') != 41:
                        continue
                    item_name = card_inner.get('item_name')
                    item_content = card_inner.get('item_content')
                    if item_name == '所在地':
                        item.location = item_content
                    elif item_name == '注册时间':
                        item.signup_time = item_content
            follows.append(item)

    if log_weibo:
        logging.info('Succeed in scraping follows of weibo user: %d.' % id)
    self.scraped_follows[id] = follows
    return follows
def scrape_user_posts(self, user, before=None, after=None, number=10):
    """Collect posts made by a Tieba user inside a time window.

    Args:
        user: Tieba user name.
        before: Upper bound (timestamp) for a post's creation time; defaults
            to the current time.
        after: Lower bound (timestamp); defaults to 0. Reaching a post older
            than this stops the scrape.
        number: Maximum number of posts to return; values <= 0 mean 10.

    Returns:
        A list of ``TiebaPostItem``.

    Raises:
        MethodParamError: If ``user`` is not a ``str`` or ``number`` is not
            an ``int``.
    """
    if not isinstance(user, str):
        raise MethodParamError(
            "Parameter 'user' isn't an instance of type 'str'!")
    if not isinstance(number, int):
        raise MethodParamError(
            "Parameter 'number' isn't an instance of type 'int'!")

    if before is None:
        before = int(time.time())
    else:
        before = int(before)
    if after is None:
        after = 0
    else:
        after = int(after)

    if log_tieba:
        logging.info('Scraping posts of tieba user: %s...' % user)
    if number <= 0:
        number = 10

    posts = []
    page = 1
    reached_lower_bound = False
    while len(posts) < number:
        page_url = tieba_user_post_url.format(user=user, page=page)
        response = requests.get(page_url)
        # An HTML document here is Tieba's 404 page; wait and ask again.
        while response.text.startswith('<!DOCTYPE html>'):
            time.sleep(3)
            response = requests.get(page_url)
        payload = response.json()

        for thread in payload.get('data').get('thread_list'):
            if len(posts) >= number:
                break
            item = TiebaPostItem()
            item.time = int(thread.get('create_time'))
            if item.time > before:
                continue
            if item.time < after:
                reached_lower_bound = True
                break

            item.title = thread.get('title')
            if re.match(r'^回复:', item.title):
                # Drop the three-character reply prefix.
                item.title = item.title[3:]
            thread_id = thread.get('thread_id')
            post_id = thread.get('post_id')
            item.title_url = 'https://tieba.baidu.com/p/{tid}'.format(
                tid=thread_id)
            item.content = thread.get('content')
            item.content_url = 'http://tieba.baidu.com/p/{tid}?pid={pid}&cid=#{cid}'.format(
                tid=thread_id, pid=post_id, cid=post_id)
            item.forum = thread.get('forum_name')
            item.forum_url = 'http://tieba.baidu.com/f?kw={kw}'.format(
                kw=quote(item.forum))
            posts.append(item)

        page += 1
        if len(posts) >= number:
            break
        if not payload.get('data').get('has_more'):
            break
        if reached_lower_bound:
            break

    if log_tieba:
        logging.info('Succeed in scraping posts of tieba user: %s.' % user)
    return posts
def scrape_answers_by_user(self, user, number=0):
    """Scrape answers written by a Zhihu user.

    Answer ids are gathered page by page (20 per request, addressed by
    offset) and each id is then resolved with ``self.scrape_answer_by_id``.

    Args:
        user: Zhihu user identifier placed into the answers URL.
        number: How many answers to collect. Values <= 0 mean 10; positive
            values are capped by the total reported in the paging data.

    Returns:
        A list with one ``scrape_answer_by_id`` result per collected id, or
        an empty list on a 404 response. The list is also stored in
        ``self.scraped_user_answers[user]``.

    Raises:
        MethodParamError: If ``user`` is not a ``str`` or ``number`` is not
            an ``int``.
    """
    if not isinstance(user, str):
        raise MethodParamError(
            "Parameter 'user' isn't an instance of type 'str'!")
    if not isinstance(number, int):
        raise MethodParamError(
            "Parameter 'number' isn't an instance of type 'int'!")
    if log_zhihu:
        logging.info('Scraping answers of zhihu user: %s...' % user)

    first_page = zhihu_user_answers_url.format(user=user, offset=0, limit=20)
    response = requests.get(first_page, headers=zhihu_header)
    if response.status_code == 404:
        # The user does not exist or the account has been blocked.
        if log_zhihu:
            logging.warning(
                "404 error. The user doesn't exist or has been blocked.")
        return []

    result = response.json()
    total = result.get('paging').get('totals')
    if number <= 0:
        need_count = 10
    elif number < total:
        need_count = number
    else:
        need_count = total

    answer_ids = [entry.get('id') for entry in result.get('data')[:need_count]]
    if len(answer_ids) < need_count:
        position = 0
        while (not result.get('paging').get('is_end')
               and len(answer_ids) < need_count):
            position += 20
            next_page = zhihu_user_answers_url.format(
                user=user, offset=position, limit=20)
            result = requests.get(next_page, headers=zhihu_header).json()
            missing = need_count - len(answer_ids)
            answer_ids.extend(
                entry.get('id') for entry in result.get('data')[:missing])

    answers = [self.scrape_answer_by_id(id=answer_id)
               for answer_id in answer_ids]

    if log_zhihu:
        logging.info('Succeed in scraping answers of zhihu user: %s.' % user)
    self.scraped_user_answers[user] = answers
    return answers
def scrape_user_fans(self, user, number=0):
    """Scrape the followers of a Zhihu user.

    Follower url tokens are gathered by following the ``next`` link in the
    paging data, and each token is then resolved with
    ``self.scrape_user_info``.

    Args:
        user: Zhihu user identifier placed into the followers URL.
        number: How many followers to collect. Values <= 0 mean 10; positive
            values are capped by the total reported in the paging data.

    Returns:
        A list with one ``scrape_user_info`` result per collected token, or
        an empty list on a 404 response. The list is also stored in
        ``self.scraped_followers[user]``.

    Raises:
        MethodParamError: If ``user`` is not a ``str`` or ``number`` is not
            an ``int``.
    """
    if not isinstance(user, str):
        raise MethodParamError(
            "Parameter 'user' isn't an instance of type 'str'!")
    if not isinstance(number, int):
        raise MethodParamError(
            "Parameter 'number' isn't an instance of type 'int'!")
    if log_zhihu:
        logging.info('Scraping followers of zhihu user: %s...' % user)

    first_page = zhihu_user_followers_url.format(
        user=user, include=zhihu_followers_query, offset=0, limit=20)
    response = requests.get(first_page, headers=zhihu_header)
    if response.status_code == 404:
        # The user does not exist or the account has been blocked.
        if log_zhihu:
            logging.warning(
                "404 error. The user doesn't exist or has been blocked.")
        return []

    result = response.json()
    total = result.get('paging').get('totals')
    if number <= 0:
        need_count = 10
    elif number < total:
        need_count = number
    else:
        need_count = total

    url_tokens = [entry.get('url_token')
                  for entry in result.get('data')[:need_count]]
    if len(url_tokens) < need_count:
        while (not result.get('paging').get('is_end')
               and len(url_tokens) < need_count):
            next_page = result.get('paging').get('next')
            result = requests.get(next_page, headers=zhihu_header).json()
            missing = need_count - len(url_tokens)
            url_tokens.extend(
                entry.get('url_token')
                for entry in result.get('data')[:missing])

    fans = [self.scrape_user_info(user=url_token) for url_token in url_tokens]

    if log_zhihu:
        logging.info('Succeed in scraping followers of zhihu user: %s.' %
                     user)
    self.scraped_followers[user] = fans
    return fans
def scrape_answers_by_question(self, id, number=0):
    """Scrape answers posted under a Zhihu question.

    Answer ids are gathered by following the ``next`` link in the paging
    data, and each id is then resolved with ``self.scrape_answer_by_id``.

    Args:
        id: Numeric question id.
        number: How many answers to collect. Values <= 0 mean 10; positive
            values are capped by the total reported in the paging data.

    Returns:
        A list with one ``scrape_answer_by_id`` result per collected id, or
        an empty list on a 404 response. The list is also stored in
        ``self.scraped_question_answers[id]``.

    Raises:
        MethodParamError: If ``id`` or ``number`` is not an ``int``.
    """
    if not isinstance(id, int):
        raise MethodParamError(
            "Parameter 'id' isn't an instance of type 'int'!")
    if not isinstance(number, int):
        raise MethodParamError(
            "Parameter 'number' isn't an instance of type 'int'!")
    if log_zhihu:
        logging.info('Scraping answers of question: %d...' % id)

    first_page = zhihu_question_answers_url.format(id=id, offset=0, limit=20)
    response = requests.get(first_page, headers=zhihu_header)
    if response.status_code == 404:
        # The question does not exist.
        if log_zhihu:
            logging.warning("404 error. The question doesn't exist.")
        return []

    result = response.json()
    total = result.get('paging').get('totals')
    if number <= 0:
        need_count = 10
    elif number < total:
        need_count = number
    else:
        need_count = total

    answer_ids = [entry.get('id') for entry in result.get('data')[:need_count]]
    if len(answer_ids) < need_count:
        while (not result.get('paging').get('is_end')
               and len(answer_ids) < need_count):
            next_page = result.get('paging').get('next')
            result = requests.get(next_page, headers=zhihu_header).json()
            missing = need_count - len(answer_ids)
            answer_ids.extend(
                entry.get('id') for entry in result.get('data')[:missing])

    answers = [self.scrape_answer_by_id(id=answer_id)
               for answer_id in answer_ids]

    if log_zhihu:
        logging.info('Succeed in scraping answers of question: %d.' % id)
    self.scraped_question_answers[id] = answers
    return answers
def get_user_by_homepage(url):
    """Resolve a Weibo homepage URL to a user id and its search-result HTML.

    The page is loaded in PhantomJS to read the element with class
    ``username``; that name is then looked up via ``get_user_by_account``.

    Args:
        url: Homepage URL to open.

    Returns:
        ``(user_id, user_html)`` for the first search hit, or ``([], [])``
        when the page does not show a user name within 10 seconds or the
        search finds nothing.

    Raises:
        MethodParamError: If ``url`` is not a ``str``.
    """
    # Validate before starting the browser so a bad argument costs nothing.
    if not isinstance(url, str):
        raise MethodParamError(
            'Parameter \'url\' must be an instance of \'str\'!')

    driver_path = 'phantomjs.exe'
    if not DEBUG:
        driver_path = settings.BASE_DIR + '/closends/spider/phantomjs.exe'
    driver = webdriver.PhantomJS(executable_path=driver_path,
                                 service_log_path=os.path.devnull)
    try:
        driver.get(url)
        try:
            WebDriverWait(driver, 10).until(
                ec.visibility_of_element_located((By.CLASS_NAME, 'username')))
        except TimeoutException:
            # Slow network or a wrong link.
            return [], []
        username = driver.find_element_by_class_name('username').text
    finally:
        # quit() ends the driver process on every exit path, including
        # unexpected exceptions; close() would only close the window.
        driver.quit()

    user_ids, user_htmls = get_user_by_account(user=username, number=1)
    if len(user_ids) > 0 and len(user_htmls) > 0:
        return user_ids[0], user_htmls[0]
    return [], []
def scrape_answer_by_id(self, id):
    """Scrape a single Zhihu answer.

    Metadata comes from the answer API; the content is extracted from the
    answer's HTML page.

    Args:
        id: Numeric answer id.

    Returns:
        A ``ZhihuAnswerItem``, or ``None`` on a 404 response. The item is
        also stored in ``self.scraped_answers[id]``. ``item.content`` is
        only assigned when the content node is found on the page.

    Raises:
        MethodParamError: If ``id`` is not an ``int``.
    """
    if not isinstance(id, int):
        raise MethodParamError(
            'Parameter \'id\' isn\'t an instance of type \'int\'!')
    if log_zhihu:
        logging.info('Scraping answer of id: %d...' % id)

    response = requests.get(zhihu_answer_url.format(
        id=id, include=zhihu_answer_query), headers=zhihu_header)
    if response.status_code == 404:
        if log_zhihu:
            logging.warning('404 error. The answer doesn\'t exist.')
        return None

    result = response.json()
    item = ZhihuAnswerItem()
    item.id = result.get('id')
    item.author = result.get('author').get('name')
    item.question_id = result.get('question').get('id')
    item.create_time = result.get('created_time')
    item.update_time = result.get('updated_time')

    page = requests.get('https://www.zhihu.com/question/%d/answer/%d' %
                        (item.question_id, id), headers=zhihu_header)
    bs = BeautifulSoup(page.text, 'lxml')
    # Every step of the lookup may come back empty when the page layout
    # differs; in that case the content is skipped rather than raising, as
    # scrape_question_by_id does for a missing content div.
    rich_div = bs.find('div', {'class': 'RichContent'})
    content_span = None
    if rich_div is not None and rich_div.div is not None:
        content_span = rich_div.div.span
    if content_span is not None:
        matched = re.search(r'<span.*?>(.*)</span>', str(content_span))
        if matched is not None:
            item.content = matched.group(1)

    item.voteup_count = result.get('voteup_count')
    item.comment_count = result.get('comment_count')

    if log_zhihu:
        logging.info('Succeed in scraping answer of id: %d.' % id)
    self.scraped_answers[id] = item
    return item
def get_user_by_account(user=None, number=1):
    """Search Weibo for accounts matching a name, using PhantomJS.

    Args:
        user: Account name to search for.
        number: Maximum number of results to return; values <= 0 mean 1.

    Returns:
        A ``(user_ids, user_htmls)`` tuple: the integer ``uid`` of each hit
        and the ``outerHTML`` of its ``list_person`` element. Both lists are
        empty when the result list does not become visible within 3 seconds.

    Raises:
        MethodParamError: If ``user`` is not a ``str`` or ``number`` is not
            an ``int``.
    """
    # Validate before starting the browser so a bad argument costs nothing.
    if not isinstance(user, str):
        raise MethodParamError(
            'Parameter \'user\' must be an instance of \'str\'!')
    if not isinstance(number, int):
        raise MethodParamError(
            'Parameter \'number\' must be an instance of \'int\'!')
    if number <= 0:
        number = 1

    driver_path = 'phantomjs.exe'
    if not DEBUG:
        driver_path = settings.BASE_DIR + '/closends/spider/phantomjs.exe'
    driver = webdriver.PhantomJS(executable_path=driver_path,
                                 service_log_path=os.path.devnull)
    try:
        driver.get(weibo_search_url.format(user=quote(user)))
        try:
            WebDriverWait(driver, 3).until(
                ec.visibility_of_element_located(
                    (By.CLASS_NAME, 'pl_personlist')))
            user_divs = driver.find_elements_by_class_name('list_person')
        except (TimeoutException, NoSuchElementException):
            # No results were found, or the network is too slow.
            return [], []

        user_ids = []
        user_htmls = []
        # Keep only the first `number` search results.
        for user_div in user_divs[:number]:
            user_id = user_div.find_element_by_class_name(
                'person_name').find_element_by_tag_name('a').get_attribute(
                    'uid')
            user_ids.append(int(user_id))
            user_htmls.append(user_div.get_attribute('outerHTML'))
        return user_ids, user_htmls
    finally:
        # quit() ends the driver process on every exit path, including
        # unexpected exceptions; close() would only close the window.
        driver.quit()
def scrape_user_info(self, id):
    """Scrape profile information of a Weibo user.

    Three endpoints are queried in turn: the profile (follow count, fan
    count, avatar), the detailed info cards (name, gender, location,
    description, sign-up time) and the weibo list (total weibo count).

    Args:
        id: Numeric Weibo user id.

    Returns:
        The populated ``WeiboUserItem``; it is also stored in
        ``self.scraped_infos[id]``.

    Raises:
        MethodParamError: If ``id`` is not an ``int``.
    """
    if not isinstance(id, int):
        raise MethodParamError(
            "Parameter 'id' isn't an instance of type 'int'!")
    if log_weibo:
        logging.info('Scraping info of weibo user: %d...' % id)

    item = WeiboUserItem()
    item.id = id
    item.profile_url = 'https://weibo.com/u/{uid}'.format(uid=id)

    # Profile request: follow count, fan count and avatar URL.
    profile = requests.get(
        weibo_user_profile_url.format(uid1=id, uid2=id)).json()
    user_info = profile.get('userInfo')
    item.follow_count = user_info.get('follow_count')
    item.fans_count = user_info.get('followers_count')
    item.avatar_url = user_info.get('profile_image_url')

    # Detailed-info request: each labelled entry maps onto one item field.
    attribute_for_label = {
        '昵称': 'name',
        '性别': 'gender',
        '所在地': 'location',
        '简介': 'description',
        '注册时间': 'signup_time',
    }
    details = requests.get(
        weibo_user_info_url.format(uid1=id, uid2=id)).json()
    for card in details.get('cards'):
        if card.get('card_type') != 11:
            continue
        for entry in card.get('card_group'):
            if entry.get('card_type') != 41:
                continue
            label = entry.get('item_name')
            value = entry.get('item_content')
            attribute = attribute_for_label.get(label)
            if attribute is not None:
                setattr(item, attribute, value)

    # Weibo-list request: total number of weibos.
    listing = requests.get(
        weibo_user_weibo_url.format(uid1=id, uid2=id, page=1)).json()
    item.weibo_count = listing.get('cardlistInfo').get('total')

    if log_weibo:
        logging.info('Succeed in scraping info of weibo user: %d.' % id)
    self.scraped_infos[id] = item
    return item
def scrape_user_info(self, user):
    """Scrape profile information of a Tieba user from the profile page.

    Args:
        user: Tieba user name.

    Returns:
        A ``TiebaUserItem`` with name, sex, Tieba age, avatar URL, follow
        and fan counts, followed-forum count and post count.

    Raises:
        MethodParamError: If ``user`` is not a ``str``.
    """
    if not isinstance(user, str):
        raise MethodParamError(
            "Parameter 'user' isn't an instance of type 'str'!")
    if log_tieba:
        logging.info('Scraping info of tieba user: %s...' % user)

    profile_url = tieba_user_profile_url.format(user=quote(user))
    bs = BeautifulSoup(requests.get(profile_url).text, 'lxml')

    item = TiebaUserItem()
    item.name = user

    # Any profile without the male marker is recorded as female.
    male_marker = bs.find('span', {'class': 'userinfo_sex_male'})
    item.sex = 'female' if male_marker is None else 'male'

    # The spans nested in the user-name block carry the age (index 2) and
    # the post count (index 4).
    name_spans = bs.find('span', {'class': 'user_name'}).find_all('span')
    age_text = name_spans[2].get_text()
    item.tieba_age = float(re.search(r'吧龄:(.*)年', age_text).group(1))

    item.avatar_url = bs.find('a', {'class': 'userinfo_head'}).img.attrs['src']

    concern_spans = bs.find_all('span', {'class': 'concern_num'})
    item.follow_count = int(concern_spans[0].find('a').get_text())
    item.fans_count = int(concern_spans[1].find('a').get_text())

    # Followed forums are split between the always-visible group and the
    # panel that has to be expanded to be shown in full.
    visible_forums = bs.find('div', {'id': 'forum_group_wrap'})
    folded_forums = bs.find('div', {'class': 'j_panel_content'})
    for container in (visible_forums, folded_forums):
        if container is not None:
            item.forum_count += len(
                container.find_all('a', {'class': 'unsign'}))

    post_text = name_spans[4].get_text()
    item.post_count = int(re.search(r'发贴:(\d+)', post_text).group(1))

    if log_tieba:
        logging.info('Succeed in scraping info of tieba user: %s.' % user)
    return item
def get_user_by_homepage(url):
    """Resolve a Zhihu homepage URL to a user token and its search-result HTML.

    The user token is taken from the URL, the display name is read from the
    user's activities page, and the search results for that name are paged
    through until an entry with the same token is found.

    Args:
        url: Homepage URL of the form ``https://www.zhihu.com/people/<token>``.

    Returns:
        ``(user_token, user_html)`` on success, or ``(None, None)`` when the
        URL is not a valid homepage, the user does not exist, the page has
        no name element, or the search never yields the token.

    Raises:
        MethodParamError: If ``url`` is not a ``str``.
    """
    if not isinstance(url, str):
        raise MethodParamError('Parameter \'url\' must be an instance of \'str\'!')

    # One match both validates the URL and extracts the token; stopping at
    # '/', '?' or '#' keeps sub-paths and query strings out of the token.
    matched = re.match(r'https://www\.zhihu\.com/people/([^/?#]+)', url)
    if matched is None:
        # Not a valid homepage address.
        return None, None
    user = matched.group(1)

    response = requests.get('https://www.zhihu.com/people/' + user + '/activities',
                            headers=zhihu_header)
    if response.status_code == 404:
        # The user does not exist.
        return None, None

    bs = BeautifulSoup(response.text, 'lxml')
    name_span = bs.find('span', {'class': 'ProfileHeader-name'})
    if name_span is None:
        # The page carries no profile header, so there is no name to search.
        return None, None
    user_name = name_span.get_text()

    start = 0
    while True:
        user_tokens, user_htmls = get_user_by_search(user=user_name, number=10, start=start)
        if len(user_tokens) == 0:
            break
        for user_token, user_html in zip(user_tokens, user_htmls):
            if user_token == user:
                return user_token, user_html
        if len(user_tokens) < 10:
            # A short batch means the search is exhausted; skip the extra request.
            break
        start += 10
    return None, None
def save_user_fans(self, user, directory='./products/'):
    """Write the scraped followers of a Zhihu user to a CSV file.

    The file is ``<directory><user>-followers.csv``; ``directory`` is
    concatenated as-is, so it must end with a path separator.

    Args:
        user: Key under which the followers were stored in
            ``self.scraped_followers``.
        directory: Output directory prefix.

    Returns:
        None. Nothing is written when no followers have been scraped at all.

    Raises:
        MethodParamError: If followers were scraped for some users but not
            for ``user``.
    """
    if not self.scraped_followers:
        if log_zhihu:
            logging.warning(
                'Haven\'t scraped followers of any zhihu user.')
        return
    infos = self.scraped_followers.get(user)
    if not isinstance(infos, list):
        raise MethodParamError(
            'Haven\'t scraped followers of zhihu user: %s' % user)

    gender_labels = {0: '女', 1: '男'}
    # newline='' is what the csv module expects of its file object, and the
    # with-block closes the file even if writing a row fails.
    # NOTE(review): the platform default encoding is kept; confirm whether a
    # fixed encoding is wanted for the non-ASCII header row.
    with open(directory + str(user) + '-followers.csv', 'w',
              newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(
            ('ID', '用户名', '性别', '头像链接', '行业', '一句话描述', '个人介绍', '提问数', '回答数',
             '文章数', '被赞同数', '被感谢数', '被收藏数', '关注数', '粉丝数', '关注话题数',
             '关注专栏数', '关注问题数', '关注收藏夹数', '教育经历', '职业经历', '居住地'))
        for info in infos:
            # Entries that are not user items are skipped.
            if not isinstance(info, ZhihuUserItem):
                continue
            gender = gender_labels.get(info.gender, '未知')
            writer.writerow(
                (info.id, info.name, gender, info.avatar_url, info.business,
                 info.headline, info.description, info.question_count,
                 info.answer_count, info.article_count, info.voteup_count,
                 info.thanked_count, info.favorited_count,
                 info.following_count, info.follower_count,
                 info.following_topic_count, info.following_column_count,
                 info.following_question_count,
                 info.following_favlist_count,
                 '; '.join(str(edu) for edu in info.educations),
                 '; '.join(str(emp) for emp in info.employments),
                 '; '.join(str(loc) for loc in info.locations)))

    if log_zhihu:
        logging.info('Succeed in saving followers of zhihu user: %s.' %
                     user)
def scrape_user_forums(self, user):
    """List the forums a Tieba user follows, read from the profile page.

    Args:
        user: Tieba user name.

    Returns:
        A list of forum names: first those in the ``forum_group_wrap``
        block, then those in the ``j_panel_content`` block.

    Raises:
        MethodParamError: If ``user`` is not a ``str``.
    """
    if not isinstance(user, str):
        raise MethodParamError(
            "Parameter 'user' isn't an instance of type 'str'!")
    if log_tieba:
        logging.info('Scraping forums of tieba user: %s...' % user)

    profile_url = tieba_user_profile_url.format(user=quote(user))
    bs = BeautifulSoup(requests.get(profile_url).text, 'lxml')

    visible_block = bs.find('div', {'id': 'forum_group_wrap'})
    # Followed forums are only shown in full once this panel is expanded.
    folded_block = bs.find('div', {'class': 'j_panel_content'})

    forums = []
    if visible_block is not None:
        # Here the name sits in a span inside each link.
        forums.extend(
            link.span.get_text()
            for link in visible_block.find_all('a', {'class': 'unsign'}))
    if folded_block is not None:
        # Here the link text itself is the name.
        forums.extend(
            link.get_text()
            for link in folded_block.find_all('a', {'class': 'unsign'}))

    if log_tieba:
        logging.info('Succeed in scraping forums of tieba user: %s.' % user)
    return forums
def scrape_question_by_id(self, id=0):
    """Scrape a single Zhihu question.

    Metadata comes from the question API; the content is extracted from the
    question's HTML page when a ``QuestionRichText`` div is present.

    Args:
        id: Numeric question id.

    Returns:
        A ``ZhihuQuestionItem``, or ``None`` on a 404 response. The item is
        also stored in ``self.scraped_questions[id]``.

    Raises:
        MethodParamError: If ``id`` is not an ``int``.
    """
    if not isinstance(id, int):
        raise MethodParamError(
            "Parameter 'id' isn't an instance of type 'int'!")
    if log_zhihu:
        logging.info('Scraping question of id: %d...' % id)

    api_url = zhihu_question_url.format(id=id, include=zhihu_question_query)
    response = requests.get(api_url, headers=zhihu_header)
    if response.status_code == 404:
        if log_zhihu:
            logging.warning("404 error. The question doesn't exist.")
        return None

    payload = response.json()
    item = ZhihuQuestionItem()
    item.id = payload.get('id')
    item.title = payload.get('title')
    item.create_time = payload.get('created')
    item.update_time = payload.get('updated_time')

    html_page = requests.get('https://www.zhihu.com/question/%d' % id,
                             headers=zhihu_header)
    soup = BeautifulSoup(html_page.text, 'lxml')
    rich_text = soup.find('div', {'class': 'QuestionRichText'})
    if rich_text is not None:
        span_html = str(rich_text.div.span)
        item.content = re.search(r'<span.*?>(.*)</span>', span_html).group(1)

    item.follower_count = payload.get('follower_count')
    item.visit_count = payload.get('visit_count')
    item.comment_count = payload.get('comment_count')

    topic_entries = payload.get('topics')
    if topic_entries is not None:
        for entry in topic_entries:
            item.topics.append(entry.get('name'))

    if log_zhihu:
        logging.info('Succeed in scraping question of id: %d.' % id)
    self.scraped_questions[id] = item
    return item
def scrape_user_activities(self, user, before=None, after=None, number=10):
    """Scrape recent activities of a Zhihu user.

    Activities are requested ten at a time. Each following page is requested
    with the id of the last collected activity as its ``after`` value.

    Args:
        user: Zhihu user identifier placed into the activity URL.
        before: Value sent as ``after`` in the first request; defaults to
            the current time.
        after: Lower bound compared against each activity id; defaults to 0.
            Reaching an activity below it stops the scrape.
        number: Maximum number of activities to return; values <= 0 mean 10.

    Returns:
        A list of ``ZhihuActivityItem``; empty on a 404 response.

    Raises:
        MethodParamError: If ``user`` is not a ``str`` or ``number`` is not
            an ``int``.
    """

    def fill_author(item, target):
        """Copy the target's author fields onto the activity item."""
        author = target.get('author')
        item.target_user_name = author.get('name')
        item.target_user_avatar = author.get('avatar_url')
        item.target_user_headline = author.get('headline')
        item.target_user_url = 'https://www.zhihu.com/people/{user}/activities'.format(
            user=author.get('url_token'))

    if not isinstance(user, str):
        raise MethodParamError(
            'Parameter \'user\' isn\'t an instance of type \'str\'!')
    if not isinstance(number, int):
        raise MethodParamError(
            'Parameter \'number\' isn\'t an instance of type \'int\'!')
    before = int(time.time()) if before is None else int(before)
    after = 0 if after is None else int(after)
    if number <= 0:
        number = 10
    if log_zhihu:
        logging.info('Scraping activities of zhihu user: %s...' % user)

    response = requests.get(zhihu_user_activity_url.format(
        user=user, limit=10, after=before), headers=zhihu_header)
    if response.status_code == 404:
        # Same handling as the other Zhihu user scrapers: the user does not
        # exist or the account has been blocked.
        if log_zhihu:
            logging.warning(
                '404 error. The user doesn\'t exist or has been blocked.')
        return []
    result = response.json()

    activities = []
    stop_flag = False
    while len(activities) < number:
        page_items = result.get('data') or []
        for data in page_items:
            item = ZhihuActivityItem()
            item.id = int(data.get('id'))
            if item.id < after:
                stop_flag = True
                break
            item.verb = data.get('verb')
            item.create_time = data.get('created_time')
            item.actor = data.get('actor').get('url_token')
            target = data.get('target')

            if item.verb in ('QUESTION_CREATE', 'QUESTION_FOLLOW'):
                # Followed a question / asked a question.
                fill_author(item, target)
                item.target_title = target.get('title')
                item.target_title_url = 'https://www.zhihu.com/question/{id}'.format(
                    id=target.get('id'))
            elif item.verb in ('ANSWER_VOTE_UP', 'ANSWER_CREATE'):
                # Upvoted an answer / answered a question.
                fill_author(item, target)
                question = target.get('question')
                item.target_title = question.get('title')
                item.target_title_url = 'https://www.zhihu.com/question/{id}'.format(
                    id=question.get('id'))
                item.target_content = target.get('excerpt')
                item.target_content_url = 'https://www.zhihu.com/question/{qid}/answer/{aid}'.format(
                    qid=question.get('id'), aid=target.get('id'))
                item.thumbnail = target.get('thumbnail')
            elif item.verb in ('MEMBER_VOTEUP_ARTICLE',
                               'MEMBER_CREATE_ARTICLE'):
                # Upvoted an article / published an article.
                fill_author(item, target)
                item.target_title = target.get('title')
                item.target_title_url = 'https://zhuanlan.zhihu.com/p/{id}'.format(
                    id=target.get('id'))
                item.target_content = target.get('excerpt')
                item.target_content_url = 'https://zhuanlan.zhihu.com/p/{id}'.format(
                    id=target.get('id'))
                item.thumbnail = target.get('image_url')
            elif item.verb in ('TOPIC_FOLLOW', 'TOPIC_CREATE'):
                # Followed a topic / created a topic.
                item.target_title = target.get('name')
                item.target_title_url = 'https://www.zhihu.com/topic/{id}'.format(
                    id=target.get('id'))
                item.thumbnail = target.get('avatar_url')
            elif item.verb in ('MEMBER_FOLLOW_COLUMN',
                               'MEMBER_CREATE_COLUMN'):
                # Followed a column / created a column.
                fill_author(item, target)
                item.target_title = target.get('title')
                item.target_title_url = 'https://zhuanlan.zhihu.com/{id}'.format(
                    id=target.get('id'))
                item.thumbnail = target.get('image_url')
            elif item.verb in ('MEMBER_CREATE_PIN', 'MEMBER_FOLLOW_PIN'):
                # Published a pin / followed a pin.
                fill_author(item, target)
                item.target_content = target.get('excerpt_new')
                item.target_content_url = 'https://www.zhihu.com/pin/{id}'.format(
                    id=target.get('id'))

            item.action_text = data.get('action_text')
            activities.append(item)
            if len(activities) >= number:
                break

        # An empty page gives no new id to page from: stopping here avoids
        # an IndexError on activities[-1] and an endless repeat of the same
        # request.
        if not page_items:
            break
        if len(activities) >= number or result.get('paging').get(
                'is_end') or stop_flag:
            break
        response = requests.get(zhihu_user_activity_url.format(
            user=user, limit=10, after=activities[-1].id),
            headers=zhihu_header)
        result = response.json()

    if log_zhihu:
        logging.info('Succeed in scraping activities of zhihu user: %s.' %
                     user)
    return activities
def scrape_user_weibo(self, id, before=None, after=None, number=0):
    """Scrape weibos posted by a Weibo user inside a time window.

    Args:
        id: Numeric Weibo user id.
        before: Upper bound (timestamp) for a weibo's creation time;
            defaults to the current time.
        after: Lower bound (timestamp); defaults to 0. Reaching an older,
            non-pinned weibo stops the scrape.
        number: How many weibos to collect. Values <= 0 mean 10; positive
            values are capped by the total reported on the first page.

    Returns:
        A list of ``WeiboContentItem`` / ``WeiboRepostContentItem``. The
        list is also stored in ``self.scraped_weibos[id]``.

    Raises:
        MethodParamError: If ``id`` or ``number`` is not an ``int``.
    """

    def fill_media(item, blog):
        """Copy picture URLs and attached-page info from a blog dict."""
        # .keys() is kept deliberately: a missing blog raises the
        # AttributeError that the caller treats as "skip this weibo".
        if 'pics' in blog.keys():
            for pic in blog.get('pics'):
                item.pictures.append(pic.get('url'))
        if 'page_info' in blog.keys():
            page_info = blog.get('page_info')
            item.media_pic = page_info.get('page_pic').get('url')
            page_url = page_info.get('page_url')
            article = re.match(
                r'http://media\.weibo\.cn/article\?.*id=(\d+)', page_url)
            if article:
                # Mobile article links do not open; use the PC link instead.
                item.media_url = 'https://weibo.com/ttarticle/p/show?id={id}'.format(
                    id=article.group(1))
            else:
                item.media_url = page_url

    if not isinstance(id, int):
        raise MethodParamError(
            'Parameter \'id\' isn\'t an instance of type \'int\'!')
    if not isinstance(number, int):
        raise MethodParamError(
            'Parameter \'number\' isn\'t an instance of type \'int\'!')
    before = int(time.time()) if before is None else int(before)
    after = 0 if after is None else int(after)
    if log_weibo:
        logging.info('Scraping weibos of weibo user: %d...' % id)

    # The first page is requested up front only to read the reported total.
    response = requests.get(
        weibo_user_weibo_url.format(uid1=id, uid2=id, page=1))
    total = response.json().get('data').get('cardlistInfo').get('total')
    if number <= 0:
        need_count = 10
    else:
        need_count = min(number, total)

    weibos = []
    position = 0
    stop_flag = False
    while len(weibos) < need_count:
        position += 1
        response = requests.get(
            weibo_user_weibo_url.format(uid1=id, uid2=id, page=position))
        cards = response.json().get('data').get('cards')
        if not cards:
            # No more pages: stop here, otherwise the loop would keep
            # requesting ever-higher page numbers forever.
            break
        for card in cards:
            try:
                if len(weibos) >= need_count:
                    break
                if card.get('card_type') != 9:
                    continue
                res = requests.get(card.get('scheme'))
                if '微博-出错了' in res.text:
                    # This weibo has been deleted.
                    continue
                time_lst = re.search(r'"created_at": "(.*?)"',
                                     res.text).group(1).split()
                time_lst.pop(-2)  # drop the timezone field
                time_value = time.mktime(
                    time.strptime(' '.join(time_lst),
                                  '%a %b %d %H:%M:%S %Y'))
                mblog = card.get('mblog')
                if time_value > before:
                    continue
                if time_value < after:
                    if not mblog.get('isTop'):
                        stop_flag = True
                        break
                    # A pinned weibo may be out of order, so it does not
                    # end the scan.
                    continue

                if 'retweeted_status' in mblog.keys():
                    # Reposted weibo.
                    item = WeiboRepostContentItem()
                    retweet = mblog.get('retweeted_status')
                    item.content = retweet.get('text')
                    item.source_id = retweet.get('bid')
                    fill_media(item, retweet)
                    source_user = retweet.get('user')
                    if source_user is not None:
                        # The original weibo may have been deleted.
                        item.source_url = 'https://weibo.com/{uid}/{bid}'.format(
                            uid=source_user.get('id'), bid=item.source_id)
                        item.source_owner.id = source_user.get('id')
                        item.source_owner.name = source_user.get(
                            'screen_name')
                        item.source_owner.avatar_url = source_user.get(
                            'profile_image_url')
                        item.source_owner.profile_url = 'https://weibo.com/u/{uid}'.format(
                            uid=item.source_owner.id)
                    item.repost_reason = mblog.get('text')
                else:
                    item = WeiboContentItem()
                    item.content = mblog.get('text')
                    fill_media(item, mblog)

                owner = mblog.get('user')
                item.id = mblog.get('bid')
                item.owner.id = owner.get('id')
                item.owner.name = owner.get('screen_name')
                item.owner.avatar_url = owner.get('profile_image_url')
                item.owner.profile_url = 'https://weibo.com/u/{uid}'.format(
                    uid=item.owner.id)
                item.url = 'https://weibo.com/{uid}/{bid}'.format(
                    uid=item.owner.id, bid=item.id)
                item.time = time_value
                item.source = mblog.get('source')
                weibos.append(item)
            except AttributeError:
                # Best effort: a card missing an expected field is skipped.
                continue
        if len(weibos) >= need_count or stop_flag:
            break

    if log_weibo:
        logging.info('Succeed in scraping weibos of weibo user: %d.' % id)
    self.scraped_weibos[id] = weibos
    return weibos
def scrape_user_info(self, user):
    """Scrape profile information of a Zhihu user.

    Args:
        user: Zhihu user identifier placed into the info URL.

    Returns:
        A ``ZhihuUserItem``, or ``None`` on a 404 response or when the
        response carries an ``error`` entry. The item is also stored in
        ``self.scraped_infos[user]``.

    Raises:
        MethodParamError: If ``user`` is not a ``str``.
    """
    if not isinstance(user, str):
        raise MethodParamError(
            'Parameter \'user\' isn\'t an instance of type \'str\'!')
    if log_zhihu:
        logging.info('Scraping info of zhihu user: %s...' % user)

    response = requests.get(zhihu_user_info_url.format(
        user=user, include=zhihu_user_query), headers=zhihu_header)
    if response.status_code == 404:
        # The user does not exist or the account has been blocked.
        if log_zhihu:
            logging.warning(
                '404 error. The user doesn\'t exist or has been blocked.')
        return None
    result = response.json()
    if result.get('error') is not None:
        # The identity has not been verified.
        if log_zhihu:
            logging.warning('Your identity hasn\'t been confirmed.')
        return None

    item = ZhihuUserItem()
    item.id = result.get('id')
    item.name = result.get('name')
    item.gender = result.get('gender')
    item.avatar_url = result.get('avatar_url')
    # Nested objects are optional: each is copied only when present, so a
    # missing or null entry leaves the item's existing value untouched.
    business = result.get('business')
    if business is not None:
        item.business = business.get('name')
    item.headline = result.get('headline')
    item.description = result.get('description')
    item.question_count = result.get('question_count')
    item.answer_count = result.get('answer_count')
    item.article_count = result.get('articles_count')
    item.voteup_count = result.get('voteup_count')
    item.thanked_count = result.get('thanked_count')
    item.favorited_count = result.get('favorited_count')
    item.following_count = result.get('following_count')
    item.follower_count = result.get('follower_count')
    item.following_topic_count = result.get('following_topic_count')
    item.following_column_count = result.get('following_columns_count')
    item.following_question_count = result.get('following_question_count')
    item.following_favlist_count = result.get('following_favlists_count')

    for education in result.get('educations') or []:
        edu_item = ZhihuEducationItem()
        # 'school' is guarded like 'major': an entry may carry only one.
        school = education.get('school')
        if school is not None:
            edu_item.school = school.get('name')
        major = education.get('major')
        if major is not None:
            edu_item.major = major.get('name')
        item.educations.append(edu_item)

    for employment in result.get('employments') or []:
        emp_item = ZhihuEmploymentItem()
        company = employment.get('company')
        if company is not None:
            emp_item.company = company.get('name')
        job = employment.get('job')
        if job is not None:
            emp_item.job = job.get('name')
        item.employments.append(emp_item)

    for location in result.get('locations') or []:
        item.locations.append(location.get('name'))

    if log_zhihu:
        logging.info('Succeed in scraping info of zhihu user: %s.' % user)
    self.scraped_infos[user] = item
    return item