def get_fans_or_followers_names(name, crawl_type):
    """Crawl a user's followees or followers page by page.

    :param name: user name
    :param crawl_type: crawl type; 'followees' for followings, 'followers' for fans
    :return: None (results are persisted via SeedUser.insert_many)
    """
    PAGE_SIZE = 20
    max_follow_page = get_max_follow_page()
    cur_page = 1
    finished = False
    while not finished and cur_page < max_follow_page:
        offset = (cur_page - 1) * PAGE_SIZE
        html = get_page(FOLLOW_URL.format(name, crawl_type, offset, PAGE_SIZE))
        user_names, finished = get_fans_or_follows(html, name)
        storage.info(
            f"get {name} {crawl_type}: user_names: {user_names}, is_end:{finished}"
        )
        SeedUser.insert_many(user_names)
        cur_page += 1
        storage.info(
            f"get {name} page={cur_page}, max_follow_page={max_follow_page}, is_end={finished}"
        )
def add(cls, data):
    """Insert one ORM entity in a fresh session.

    :param data: a mapped entity to persist
    :return: True on success, False when the insert violates an
        integrity constraint (e.g. duplicate primary key)
    """
    session = new_session()
    try:
        session.add(data)
        session.commit()
        return True
    except SqlalchemyIntegrityError as e:
        # roll back the failed transaction so the underlying
        # connection is left in a clean, reusable state
        session.rollback()
        storage.info(e)
        return False
    finally:
        # always release the session (and its connection) — the
        # original leaked it on both the success and failure paths
        session.close()
def get_url_from_web(user_id):
    """Fetch and store a user's profile from the web by uid.

    Domain 100505 is a normal user whose first page already carries the
    info; domains 103505/100306 (writers, i.e. special users) require a
    second request on the real domain; any other domain is treated as an
    enterprise/service account and its home page is parsed instead.

    :param user_id: user id
    :return: user entity, or None when the uid is empty, the page is a
        404, no detail could be parsed, or no user name was found
    """
    if not user_id:
        return None

    html = get_page(BASE_URL.format('100505', user_id), auth_level=1)
    if is_404(html):
        return None

    domain = public.get_userdomain(html)
    if domain in ('103505', '100306'):
        # writers (special users): one more request on their real domain
        html = get_page(BASE_URL.format(domain, user_id))
        user = get_user_detail(user_id, html)
    elif domain == '100505':
        # normal users
        user = get_user_detail(user_id, html)
        samefollow_uid = get_samefollow_uid()
        if samefollow_uid.strip() != '':
            uid_list = samefollow_uid.split(',')
            isFanHtml = get_page(SAMEFOLLOW_URL.format(user_id), auth_level=2)
            person.get_isFan(isFanHtml, uid_list, user_id)
    else:
        # enterprise or service accounts
        user = get_enterprise_detail(user_id, html)

    if user is None:
        return None

    user.name = public.get_username(html)
    user.head_img = public.get_headimg(html)
    user.verify_type = public.get_verifytype(html)
    user.verify_info = public.get_verifyreason(html, user.verify_type)
    user.level = public.get_level(html)

    if not user.name:
        return None
    UserOper.add_one(user)
    storage.info('Has stored user {id} info successfully'.format(id=user_id))
    return user
def get_url_from_web(user_id):
    """Crawl a user's info page by uid and persist the parsed entity.

    Routing depends on the user's domain: 100505 is an ordinary user
    (first response suffices), 103505/100306 are writers needing one
    extra request, anything else is an enterprise/service account.

    :param user_id: user id
    :return: user entity on success, otherwise None
    """
    if not user_id:
        return None

    first_html = get_page(BASE_URL.format('100505', user_id), auth_level=1)
    if is_404(first_html):
        return None

    html = first_html
    domain = public.get_userdomain(html)

    if domain == '103505' or domain == '100306':
        # writers (special users) — re-request on the detected domain
        html = get_page(BASE_URL.format(domain, user_id))
        user = get_user_detail(user_id, html)
    elif domain == '100505':
        # normal users
        user = get_user_detail(user_id, html)
        samefollow_uid = get_samefollow_uid()
        if samefollow_uid.strip() != '':
            isFanHtml = get_page(SAMEFOLLOW_URL.format(user_id), auth_level=2)
            person.get_isFan(isFanHtml, samefollow_uid.split(','), user_id)
    else:
        # enterprise or service accounts
        user = get_enterprise_detail(user_id, html)

    if user is None:
        return None

    # fill the common fields extracted from whichever page we ended on
    user.name = public.get_username(html)
    user.head_img = public.get_headimg(html)
    user.verify_type = public.get_verifytype(html)
    user.verify_info = public.get_verifyreason(html, user.verify_type)
    user.level = public.get_level(html)

    if not user.name:
        return None
    UserOper.add_one(user)
    storage.info('Has stored user {id} info successfully'.format(id=user_id))
    return user
def get_user_profile(user_id):
    """Return the user entity for *user_id*, crawling it on a cache miss.

    :param user_id: uid
    :return: user entity (possibly None when the crawl fails)
    """
    user = UserOper.get_user_by_uid(user_id)
    if not user:
        return get_url_from_web(user_id)
    storage.info('user {id} has already crawled'.format(id=user_id))
    return user
def get_user_profile(user_id):
    """Look up *user_id* in the database first; crawl the web otherwise.

    :param user_id: uid
    :return: user entity (possibly None when the crawl fails)
    """
    cached = UserOper.get_user_by_uid(user_id)
    if cached:
        storage.info('user {id} has already crawled'.format(id=user_id))
        return cached
    return get_url_from_web(user_id)
def get_hot_list_from_web(title):
    """Crawl the hot-list page for *title* and persist parsed entries.

    :param title: hot-list title used to build the request URL
    :return: parsed entries (falsy when nothing parsed), or None for an
        empty title
    """
    if not title:
        return None
    page_html = get_page(HOT_LIST_URL.format(title))
    entries = parse_hot_list(title, page_html)
    if entries:
        CommonOperate.add_all(entries)
        storage.info(f"Has stored hot_list {title} info successfully")
    return entries
def get_user_info_from_web(user_name):
    """Crawl user info from the web by user name and persist it.

    :param user_name: user name
    :return: user entity, or None when *user_name* is empty or parsing
        yields nothing
    """
    if not user_name:
        return None
    home_html = get_page(USER_HOME_URL.format(user_name))
    entity = get_user_detail(user_name, home_html)
    if entity:
        CommonOperate.add_one(entity)
        storage.info(f"Has stored user {user_name} info successfully")
    return entity
def get_profile(user_id):
    """Return (user, is_crawled) for *user_id*, crawling on a cache miss.

    :param user_id: uid
    :return: tuple of user entity (or None) and a flag — 1 when the user
        was already in the database, 0 when it had to be fetched now
    """
    user = UserOper.get_user_by_uid(user_id)
    if user:
        storage.info('user {id} has already crawled'.format(id=user_id))
        SeedidsOper.set_seed_crawled(user_id, 1)
        return user, 1
    user = get_url_from_web(user_id)
    # seed status: 1 marks a successful crawl, 2 a failed one
    SeedidsOper.set_seed_crawled(user_id, 1 if user is not None else 2)
    return user, 0
def get_profile(user_id):
    """Fetch the profile for *user_id*, preferring the local database.

    :param user_id: uid
    :return: (user entity or None, is_crawled flag — 1 if it was already
        stored, 0 if freshly crawled)
    """
    stored = UserOper.get_user_by_uid(user_id)
    if stored:
        storage.info('user {id} has already crawled'.format(id=user_id))
        SeedidsOper.set_seed_crawled(user_id, 1)
        return stored, 1

    fetched = get_url_from_web(user_id)
    if fetched is not None:
        SeedidsOper.set_seed_crawled(user_id, 1)
    else:
        # 2 records that crawling this seed failed
        SeedidsOper.set_seed_crawled(user_id, 2)
    return fetched, 0
def get_profile(user_name):
    """Return (user, other_crawled) for *user_name*, crawling on a miss.

    :param user_name: user name
    :return: tuple of user entity (or None) and the seed row's
        other_crawled flag
    """
    user = User.get_user_by_name(user_name)
    if user:
        storage.info(f"user {user_name} has already crawled")
        SeedUser.set_home_crawled(user_name, 1)
    else:
        storage.info(f"user {user_name} not exist, start crawling...")
        user = get_user_info_from_web(user_name)
        # 1 marks a successful crawl, 2 a failed one
        SeedUser.set_home_crawled(user_name, 1 if user else 2)
    # NOTE(review): assumes get_seed_by_name always finds a row here —
    # a None return would raise AttributeError; confirm against callers
    other_crawled = SeedUser.get_seed_by_name(user_name).other_crawled
    storage.info(f"{user_name} other_crawled {other_crawled}")
    return user, other_crawled
def get_hot_list(title):
    """Crawl and return the hot list for *title*, logging the result."""
    result = get_hot_list_from_web(title)
    storage.info(f"hot_list: {result}")
    return result
def get_user_detail(user_name, html):
    """Parse user detail from *html*; thin logging wrapper over get_detail."""
    storage.info("get_detail")
    return get_detail(user_name, html)
# -*- coding: utf8 -*- """ test logger """ import sys import os.path sys.path.append(os.path.join(os.path.dirname(__file__), '..')) from logger import crawler, storage crawler.info('crawler') storage.info('database connect error')