def get_url_from_web(user_id):
    """
    Crawl a user's home page, build the user entity and store it.

    The page is first requested under domain 100505. The domain parsed from
    that page then selects how the details are extracted:

    - 103505 / 100306: the page is requested again under that domain and
      the details are parsed from the second response;
    - 100505: the details are parsed from the first response, and the
      same-follow page is requested when a same-follow uid list is configured;
    - anything else: the page is parsed as an enterprise/service home page.

    :param user_id: user id
    :return: the stored user entity, or None when the id is falsy, the page
        is a 404, no detail could be parsed, or the parsed user has no name
    """
    if not user_id:
        return None

    page = get_page(BASE_URL.format('100505', user_id), auth_level=1)
    if is_404(page):
        return None

    domain = public.get_userdomain(page)

    if domain in ('103505', '100306'):
        # writers (special users): their info lives under their own domain,
        # and the common fields below are read from this second page
        page = get_page(BASE_URL.format(domain, user_id))
        user = get_user_detail(user_id, page)
    elif domain == '100505':
        # normal users
        user = get_user_detail(user_id, page)
        raw_samefollow = get_samefollow_uid()
        if raw_samefollow.strip() != '':
            samefollow_uids = raw_samefollow.split(',')
            fan_page = get_page(SAMEFOLLOW_URL.format(user_id), auth_level=2)
            person.get_isFan(fan_page, samefollow_uids, user_id)
    else:
        # enterprise or service
        user = get_enterprise_detail(user_id, page)

    if user is None:
        return None

    # fields shared by every account type
    user.name = public.get_username(page)
    user.head_img = public.get_headimg(page)
    user.verify_type = public.get_verifytype(page)
    user.verify_info = public.get_verifyreason(page, user.verify_type)
    user.level = public.get_level(page)

    # a user without a name is not stored
    if not user.name:
        return None

    UserOper.add_one(user)
    storage.info(
        'Has stored user {id} info successfully'.format(id=user_id))
    return user
def get_url_from_web(user_id):
    """
    Get user info from the web according to user id, and store it.

    The home page is requested under domain 100505 first; the domain read
    from that page decides the parsing path. Domains 103505 and 100306 need
    one more request under their own domain, domain 100505 is parsed
    directly (plus an optional same-follow lookup), and every other domain
    is handled as an enterprise/service home page.

    :param user_id: user id
    :return: user entity on success; None if the id is falsy, the page is a
        404, parsing yields no user, or the user name is empty
    """
    if not user_id:
        return None

    first_url = BASE_URL.format('100505', user_id)
    html = get_page(first_url, auth_level=1)
    if is_404(html):
        return None

    user_domain = public.get_userdomain(html)
    is_writer = user_domain == '103505' or user_domain == '100306'

    if is_writer:
        # writers (special users): request again under the real domain;
        # the re-fetched page replaces the first one for all later parsing
        domain_url = BASE_URL.format(user_domain, user_id)
        html = get_page(domain_url)
        entity = get_user_detail(user_id, html)
    elif user_domain == '100505':
        # normal users
        entity = get_user_detail(user_id, html)
        configured_uids = get_samefollow_uid()
        if configured_uids.strip() != '':
            uid_list = configured_uids.split(',')
            samefollow_url = SAMEFOLLOW_URL.format(user_id)
            samefollow_html = get_page(samefollow_url, auth_level=2)
            person.get_isFan(samefollow_html, uid_list, user_id)
    else:
        # enterprise or service
        entity = get_enterprise_detail(user_id, html)

    if entity is None:
        return None

    entity.name = public.get_username(html)
    entity.head_img = public.get_headimg(html)
    entity.verify_type = public.get_verifytype(html)
    entity.verify_info = public.get_verifyreason(html, entity.verify_type)
    entity.level = public.get_level(html)

    if entity.name:
        UserOper.add_one(entity)
        storage.info('Has stored user {id} info successfully'.format(id=user_id))
        return entity
    return None
def test_user_oper(self):
    """Bulk-insert the fake users, then check table contents and uid lookup."""
    fake_users = [User(fake_id) for fake_id in FAKE_IDS]
    UserOper.add_all(fake_users)

    # the user table must contain rows after the bulk insert
    query = 'select * from {}'.format(wbuser.name)
    rs = db_session.execute(query)
    assert rs.rowcount > 0

    # an unknown uid yields None, a known fake uid yields a record
    assert UserOper.get_user_by_uid('5') is None
    assert UserOper.get_user_by_uid(FAKE_ID) is not None
def get_newcard_by_name(user_name):
    """
    Get a user by user name through the newcard method.

    The database is consulted first. On a miss, the newcard page for the
    name is requested, the uid is parsed out of it, and the profile is
    loaded through get_profile.

    Arguments:
        user_name {str} -- user's name

    Returns:
        (user, is_crawled) -- the user object and a crawled flag;
        (None, 0) when the newcard page is blank or no uid can be parsed
    """
    stored_user = UserOper.get_user_by_name(user_name)
    if stored_user:
        return stored_user, 1

    # second placeholder is the current time in milliseconds
    timestamp_ms = int(round(time.time() * 1000))
    newcard_url = NEWCARD_URL.format(quote(user_name), timestamp_ms)
    card_page = get_page(newcard_url)
    if card_page.strip() == '':
        return None, 0

    uid = person.get_uid_and_samefollow_by_new_card(card_page)
    if uid == -1:
        # -1 is treated as "no uid found"
        return None, 0

    user, is_crawled = get_profile(uid)
    return user, is_crawled
def get_user_profile(user_id):
    """
    Return the user for a uid, preferring the stored record over a crawl.

    :param user_id: uid
    :return: the stored user when one exists, otherwise whatever
        get_url_from_web returns for this uid
    """
    stored_user = UserOper.get_user_by_uid(user_id)
    if not stored_user:
        return get_url_from_web(user_id)

    storage.info('user {id} has already crawled'.format(id=user_id))
    return stored_user
def get_profile(user_id):
    """
    Load a user by uid and record the crawl status of the seed id.

    :param user_id: uid
    :return: (user, is_crawled) -- is_crawled is 1 when the user was already
        stored and 0 when a web crawl was attempted
    """
    stored_user = UserOper.get_user_by_uid(user_id)
    if stored_user:
        storage.info('user {id} has already crawled'.format(id=user_id))
        SeedidsOper.set_seed_crawled(user_id, 1)
        return stored_user, 1

    fetched = get_url_from_web(user_id)
    if fetched:
        UserOper.set_user(fetched)

    # seed status: 1 when the crawl returned a user, 2 when it returned None
    seed_status = 2 if fetched is None else 1
    SeedidsOper.set_seed_crawled(user_id, seed_status)
    return fetched, 0
def get_uid_by_name(user_name):
    """
    Get a user's uid by user name.

    The database is consulted first. On a miss, the s.weibo.com topsuggest
    endpoint is queried and the uid of the first suggested user is returned.

    :param user_name: user's name
    :return: the uid, or None when the response cannot be parsed or holds
        no user entry
    """
    user = UserOper.get_user_by_name(user_name)
    if user:
        return user.uid

    url = "http://s.weibo.com/ajax/topsuggest.php?key={}&_k=14995588919022710&uid=&_t=1&_v=STK_14995588919022711"
    url = url.format(quote(user_name))
    info = requests.get(url).content.decode()

    # The payload is the argument of a call wrapped in ``try{...}catch``;
    # the capture group extracts that argument.
    pattern = re.compile(r'try\{.*\((.*)\).*\}catch.*')
    matched = pattern.match(info)
    if matched is None:
        # An unexpected response shape is a lookup failure, not a crash.
        print('unexpected topsuggest response for {}'.format(user_name))
        return None

    try:
        payload = json.loads(matched.group(1))
        return payload["data"]["user"][0]['u_id']
    except (ValueError, KeyError, IndexError, TypeError) as e:
        # ValueError covers malformed JSON; the rest cover a payload that
        # lacks the expected data -> user -> [0] -> u_id path.
        print(e)
        return None
def get_profile(user_id):
    """
    Load a user by uid and record the crawl status of the seed id.

    :param user_id: uid
    :return: (user, is_crawled) -- is_crawled is 1 when the user was already
        stored and 0 when a web crawl was attempted
    """
    known_user = UserOper.get_user_by_uid(user_id)
    if known_user:
        storage.info('user {id} has already crawled'.format(id=user_id))
        SeedidsOper.set_seed_crawled(user_id, 1)
        return known_user, 1

    crawled_user = get_url_from_web(user_id)
    if crawled_user is None:
        # the crawl produced nothing: mark the seed with status 2
        SeedidsOper.set_seed_crawled(user_id, 2)
    else:
        SeedidsOper.set_seed_crawled(user_id, 1)
    return crawled_user, 0
import csv
import datetime

import xlwt
from jieba import analyse
from jieba.analyse import tfidf

from db.dao import UserOper

if __name__ == '__main__':
    # Export selected attributes of every stored user to a timestamped
    # .xls file in the current directory.
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet('test', cell_overwrite_ok=True)

    # Column titles double as the attribute names read from each record,
    # so the header row and the data cells cannot drift apart.
    columns = [
        'gender',
        'birthday',
        'location',
        'verify_type',
    ]

    # Row 0 holds the header.
    for col, title in enumerate(columns):
        sheet.write(0, col, title)

    infos = UserOper.get_all()
    nowTime = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')

    # Data rows start at row 1, directly below the header.
    for row, info in enumerate(infos, start=1):
        for col, attr in enumerate(columns):
            sheet.write(row, col, getattr(info, attr))

    book.save(r'file-' + nowTime + '.xls')