# BASE_URL, get_page, is_404, public and the *_detail helpers come from the
# surrounding crawler module.
import json


def get_user_info(user_id):
    """
    Get user info by user id.
    If the user's domain is 100505, the profile url is simply 100505 + user id.
    If the domain is 103505 or 100306, one more request is needed to get the info.
    If the user is an enterprise or service account, only the home page info is crawled.
    :param user_id: user id
    :return: user entity, or None if the user cannot be fetched
    """
    if not user_id:
        return None

    url = BASE_URL.format('100505', user_id)
    html = get_page(url)
    if is_404(html):
        return None

    domain = public.get_userdomain(html)
    if domain == '103505' or domain == '100306':
        # writers (special users): re-request the profile under the real domain
        url = BASE_URL.format(domain, user_id)
        html = get_page(url)
        user = get_user_detail(user_id, html)
    elif domain == '100505':
        # normal users
        user = get_user_detail(user_id, html)
    else:
        # enterprise or service accounts
        user = get_enterprise_detail(user_id, html)

    if user is None:
        return None

    # user.name = public.get_username(html)
    user.head_img = public.get_headimg(html)
    # user.verify_type = public.get_verifytype(html)
    # user.verify_info = public.get_verifyreason(html, user.verify_type)
    # user.level = public.get_level(html)

    if not user.name:
        return None

    print(user.name)
    with open(user.name + '.json', 'w', encoding='utf-8') as file:
        json.dump(user, file, default=lambda obj: obj.__dict__)
    return user
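
# Hypothetical usage (the uid is a placeholder borrowed from the sample request
# in get_pagebar below): fetch a profile; on success the entity is also dumped
# to <name>.json as a side effect.
user = get_user_info('1340724027')
if user is not None:
    print(user.head_img)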
# get_page, get_userid, get_data, get_max_num and get_pagebar come from the
# surrounding crawler module.
def get_person_one_full_page(pagenum, userid=None, name=None):
    """Fetch one full page of a user's weibos, preferring the uid over the screen name."""
    page_weibo = []
    page = pagenum
    base_url_uid = 'https://www.weibo.com/u'
    base_url_name = 'https://www.weibo.com'
    if userid:
        base_url = base_url_uid
        para = userid
    elif name:
        base_url = base_url_name
        para = name
    else:
        return page_weibo

    assembled_url = '{}/{}?page={}'.format(base_url, para, page)
    first_req_html = get_page(assembled_url)
    userid = userid if userid else get_userid(first_req_html)
    page_weibo.extend(get_data(first_req_html, userid))

    page_sum = get_max_num(first_req_html)
    # page_sum == -1 means the total page count could not be parsed; the rest of
    # the page is then loaded through the lazy-loading pagebar ajax requests.
    if page_sum == -1 and len(page_weibo) > 0:
        bar = 0
        weibo_html_0, weibo_list_0 = get_pagebar(userid, page, bar)
        weibo_html_1, weibo_list_1 = get_pagebar(userid, page, bar + 1)
        # identical payloads mean the second chunk is a repeat of the first,
        # so only keep the results when the two responses differ
        if len(weibo_html_0) != len(weibo_html_1):
            weibo_list_0.extend(weibo_list_1)
            page_weibo.extend(weibo_list_0)
    return page_weibo
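
# Hypothetical usage (uid and screen name are the placeholders from the sample
# request in get_pagebar below): fetch the first full page of a profile either
# by uid or by screen name.
weibos_by_uid = get_person_one_full_page(1, userid='1340724027')
weibos_by_name = get_person_one_full_page(1, name='tiancaixinxin')
print(len(weibos_by_uid), len(weibos_by_name))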
import re


def get_repost_weibo(feed_list_item):
    """
    Get the original weibo that a repost points to.
    :param feed_list_item: html under the feed_list_item node of a single weibo
    :return: the source weibo (an empty WeiboData if it cannot be resolved)
    """
    handle = feed_list_item.find(attrs={'action-type': "fl_forward"})
    repost_wb_data = WeiboData()
    if handle:
        # the forward handle embeds the original weibo's url as rooturl=...
        pattern = re.compile(r'rooturl=(https://weibo\.com/\d+/\w+)')
        m = re.search(pattern, str(handle))
        if m:
            ori_url = m.group(1)
            html = get_page(ori_url)
            ori_weibo = get_data(html)
            if len(ori_weibo) == 1:
                repost_wb_data = ori_weibo[0]
    return repost_wb_data
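
# Minimal check of the rooturl pattern on a made-up handle string (the uid and
# mid below are invented for illustration):
import re

sample = '<a action-type="fl_forward" rooturl=https://weibo.com/1234567890/AbCdEf123>'
m = re.search(r'rooturl=(https://weibo\.com/\d+/\w+)', sample)
print(m.group(1) if m else 'no rooturl')  # -> https://weibo.com/1234567890/AbCdEf123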
# generate_rnd, get_page and get_weibo_list come from the surrounding crawler module.
import json


def get_pagebar(userid, pagenum, bar):
    """Fetch one lazy-loaded feed chunk through the mbloglist ajax endpoint."""
    more_feed_base_url = 'https://weibo.com/p/aj/v6/mblog/mbloglist'
    ajwvr = 6
    domain = 100505
    # refer_flag = '1005055013_'  # optional
    is_all = 1
    pagebar = bar  # key parameter: selects the first (0) or second (1) lazy-loaded chunk
    # pl_name = 'Pl_Official_MyProfileFeed__19'  # optional
    uid = userid
    # script_uri = username  # optional
    feed_type = 0
    page = pagenum
    pre_page = page
    domain_op = domain
    rnd = generate_rnd()
    # a real request looks like:
    # ?ajwvr=6&domain=100505&refer_flag=1005055013_&is_all=1&pagebar=1&pl_name=Pl_Official_MyProfileFeed__19&
    # id=1005051340724027&script_uri=/tiancaixinxin&feed_type=0&page=2&pre_page=2&domain_op=100505&__rnd=1544605567059
    # note: the random parameter is __rnd (double underscore), per the sample request above
    assembled_more_feed_url = (
        '{}?ajwvr={}&domain={}&is_all={}&pagebar={}&id={}&feed_type={}'
        '&page={}&pre_page={}&domain_op={}&__rnd={}'.format(
            more_feed_base_url, ajwvr, domain, is_all, pagebar,
            str(domain) + str(uid), feed_type, page, pre_page, domain_op, rnd))
    html = get_page(assembled_more_feed_url)
    weibo_html = json.loads(html)['data']
    weibos = get_weibo_list(weibo_html)
    return weibo_html, weibos
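
# Illustrative check (values taken from the sample request above): the id
# parameter is just the domain concatenated with the uid.
print(str(100505) + str(1340724027))  # -> 1005051340724027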
#!/usr/bin/env python3
"""
Testing the expiring web cache and tracker.
"""
from web import get_page

URL = ('http://slowwly.robertomurray.co.uk/delay/5000/url/'
       'https://web.ics.purdue.edu/~gchopra/class/public/pages/webdesign/'
       '05_simple.html')

# first fetch: slowwly delays the response by ~5 seconds
print(get_page(URL))
print("=================================================")
print("=================================================")
print("=================================================")
# second fetch: should come back immediately from the still-fresh cache
print(get_page(URL))
# base_url, file_type, album_url and latest are module-level config defined
# elsewhere in the original file.
import threading

import web


class GetThread(threading.Thread):
    """Worker thread that resolves one picture page to a direct image url."""

    def __init__(self, picture, output):
        super().__init__()
        self.picture = picture
        self.output = output

    def run(self):
        page_url = base_url + self.picture
        if web.is_page_reachable(page_url):
            img_page = web.get_page(page_url)
            # keep only the anchor links that end with the expected file type
            _imgs = filter(lambda s: s.endswith(file_type),
                           (link for link in web.get_element(img_page, 'a', '')))
            img = next(_imgs, '')
            if img != '':
                name = img.split('/')[-1].split(file_type)[0]
                self.output[name] = img
                print('Added ' + name)


page = web.get_page(base_url + album_url)
wallpapers = web.get_element(page, 'a', 'icon')
imgs = {}
threads = []

# Get all wallpaper urls, capped by the `latest` counter
for wallpaper in wallpapers:
    latest -= 1
    if latest < 1:
        break
    thread = GetThread(wallpaper, imgs)
    thread.start()
    threads.append(thread)

for thread in threads:
    thread.join()
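
# Hypothetical follow-up (not in the original snippet): once the workers have
# joined, `imgs` maps each image name to its direct url.
for name, link in imgs.items():
    print(name + ' -> ' + link)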
#!/usr/bin/env python3
"""
Main file for testing.
"""
from web import get_page, r

url = 'https://www.google.com'
name = 'count: ' + '{' + url + '}'  # redis key under which accesses are counted

get_page(url)
get_page(url)
get_page(url)

count = r.get(name)
print(count)  # should report three accesses (e.g. b'3')
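
# A minimal sketch of the `web` module that the two test scripts above assume
# (an assumption, not the original implementation): get_page fetches a url with
# requests, counts accesses under the same "count: {url}" key the test reads
# back, and caches the body in Redis with a short expiry so the repeated
# slowwly fetch returns quickly. The "cached: {url}" key and the 10-second
# expiry are guesses.
import redis
import requests

r = redis.Redis()


def get_page(url: str) -> str:
    """Return the html of url, counting accesses and caching for 10 seconds."""
    r.incr('count: {' + url + '}')           # tracker key read by the test above
    cached = r.get('cached: {' + url + '}')  # hypothetical cache key
    if cached:
        return cached.decode('utf-8')
    html = requests.get(url).text
    r.setex('cached: {' + url + '}', 10, html)  # assumed 10-second expiry
    return html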
# base_url, imgsrc, wallpaper_size and wallpaper_url are module-level config
# defined elsewhere in the original file.
import re
import threading

import web


class GetThread(threading.Thread):
    """Worker thread that resolves one wallpaper page to a direct image link."""

    def __init__(self, wallpaper, output):
        super().__init__()
        self.wallpaper = wallpaper
        self.output = output

    def run(self):
        page_url = base_url + self.wallpaper
        if web.is_page_reachable(page_url):
            img_page = web.get_page(page_url)
            img_links = web.get_element(img_page, 'a', '')
            for img_link in img_links:
                # keep only non-empty direct image links of the requested size
                if img_link.startswith(imgsrc) and wallpaper_size in img_link:
                    if img_link != '':
                        self.output[self.wallpaper.split('/')[-2]] = img_link
                        print('Added ' + self.wallpaper.split('/')[-2])


page = web.get_page(base_url + wallpaper_url)
links = web.get_element(page, 'a', '')
wallpapers = []
pattern = re.compile(r'/image/\d+/wallpaper')
for link in links:
    if pattern.match(link):
        wallpapers.append(link)

imgs = {}
threads = []

# Get all wallpaper urls
for wallpaper in wallpapers:
    thread = GetThread(wallpaper, imgs)
    thread.start()
    threads.append(thread)
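
# The snippet ends before the workers are joined; mirroring the picture scraper
# above, a join loop would presumably follow so `imgs` is complete before use:
for thread in threads:
    thread.join()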