Example #1
def get_user_info(user_id):
    """
    Get user info according to user id.
    If user domain is 100505,the url is just 100505+userid;
    If user domain is 103505 or 100306, we need to request once more to get his info
    If user type is enterprise or service, we just crawl their home page info
    :param: user id
    :return: user entity
    """
    if not user_id:
        return None

    url = BASE_URL.format('100505', user_id)
    html = get_page(url)

    if not is_404(html):
        domain = public.get_userdomain(html)

        # writers(special users)
        if domain == '103505' or domain == '100306':
            url = BASE_URL.format(domain, user_id)
            html = get_page(url)
            user = get_user_detail(user_id, html)
        # normal users
        elif domain == '100505':
            user = get_user_detail(user_id, html)
        # enterprise or service
        else:
            user = get_enterprise_detail(user_id, html)

        if user is None:
            return None

        #user.name = public.get_username(html)
        user.head_img = public.get_headimg(html)
        #user.verify_type = public.get_verifytype(html)
        #user.verify_info = public.get_verifyreason(html, user.verify_type)
        #user.level = public.get_level(html)

        if user.name:
            print(user.name)
            with open(user.name + '.json', 'w', encoding='utf-8') as file:
                json.dump(user, file, default=lambda obj: obj.__dict__)

            return user
        else:
            return None

    else:
        return None
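A minimal usage sketch for the function above, assuming BASE_URL, get_page, public and the two detail helpers are importable from the surrounding crawler module; the uid below is made up.

# Hypothetical uid; on success get_user_info also writes <name>.json as a side effect.
user = get_user_info('1234567890')
if user is not None:
    print(user.head_img)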
Example #2
def get_person_one_full_page(pagenum, userid=None, name=None):
    page_weibo = []
    page = pagenum
    base_url_uid = 'https://www.weibo.com/u'
    base_url_name = 'https://www.weibo.com'
    if userid:
        base_url = base_url_uid
        para = userid
    elif name:
        base_url = base_url_name
        para = name
    else:
        return page_weibo
    assembled_url = '{}/{}?page={}'.format(base_url, para, page)
    first_req_html = get_page(assembled_url)
    userid = userid if userid else get_userid(first_req_html)
    page_weibo.extend(get_data(first_req_html, userid))
    page_sum = get_max_num(first_req_html)
    #page_sum = -1
    if page_sum == -1 and len(page_weibo) > 0:
        bar = 0
        weibo_html_0, weibo_list_0 = get_pagebar(userid, page, bar)
        weibo_html_1, weibo_list_1 = get_pagebar(
            userid, page, bar+1)
        if len(weibo_html_0) != len(weibo_html_1):
            weibo_list_0.extend(weibo_list_1)
            page_weibo.extend(weibo_list_0)
            # with open('weibohtml0.html', 'w', encoding='utf-8') as file:
            #     file.write(weibo_html_0)
            # with open('weibohtml1.html', 'w', encoding='utf-8') as file:
            #     file.write(weibo_html_1)

    return page_weibo
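A hedged usage example; the uid is hypothetical and get_page, get_userid, get_data, get_max_num and get_pagebar are assumed to be the module-level helpers referenced elsewhere in this listing.

# Fetch page 1 of a user's timeline by uid (hypothetical value).
weibos = get_person_one_full_page(1, userid='1234567890')
print(len(weibos))

# Alternatively, fetch by the personalised domain name instead of the uid.
weibos = get_person_one_full_page(1, name='tiancaixinxin')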
Example #3
    def run(self):
        page_url = base_url + self.picture
        if web.is_page_reachable(page_url):
            img_page = web.get_page(page_url)
            _imgs = filter(lambda s: s.endswith(file_type), web.get_element(img_page, 'a', ''))
            img = next(iter(_imgs), '')
            if img != '':
                name = img.split('/')[-1].split(file_type)[0]
                self.output[name] = img
                print('Added ' + name)
Example #4
    def run(self):
        page_url = base_url + self.wallpaper
        if web.is_page_reachable(page_url):
            img_page = web.get_page(page_url)
            img_links = web.get_element(img_page, 'a', '')
            for img_link in img_links:
                if img_link.startswith(imgsrc) and wallpaper_size in img_link:
                    if img_link != '':
                        self.output[self.wallpaper.split('/')[-2]] = img_link
                        print('Added ' + self.wallpaper.split('/')[-2])
Example #5
def get_repost_weibo(feed_list_item):
    """
    Get the original weibo that a repost points to.
    :param feed_list_item: html under the feed_list_item node of a single weibo
    :return: the source weibo
    """
    handle = feed_list_item.find(attrs={'action-type': "fl_forward"})
    repost_wb_data = WeiboData()
    if handle:
        pattern = re.compile(r'rooturl=(https://weibo.com/\d+/\w+)')
        m = re.search(pattern, str(handle))
        if m:
            ori_url = m.group(1)
            html = get_page(ori_url)
            ori_weibo = get_data(html)
            if len(ori_weibo) == 1:
                repost_wb_data = ori_weibo[0]
    return repost_wb_data
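A hedged sketch of how get_repost_weibo might be driven, assuming feed_list_item is a BeautifulSoup tag for a single weibo; the selector used below is an assumption, not taken from the project.

from bs4 import BeautifulSoup

# html would be fetched elsewhere, e.g. via get_page(...).
soup = BeautifulSoup(html, 'html.parser')
for item in soup.find_all(attrs={'action-type': 'feed_list_item'}):
    original = get_repost_weibo(item)  # WeiboData of the reposted source, or an empty one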
Example #6
def get_pagebar(userid, pagenum, bar):
    more_feed_base_url = 'https://weibo.com/p/aj/v6/mblog/mbloglist'
    ajwvr = 6
    domain = 100505
    # refer_flag = '1005055013_'  # not required
    is_all = 1
    pagebar = bar  # key parameter
    # pl_name = 'Pl_Official_MyProfileFeed__19'
    uid = userid
    # script_uri = username  # not required
    feed_type = 0
    page = pagenum
    pre_page = page
    domain_op = domain
    rnd = generate_rnd()
    assembled_more_feed_url = '{}?ajwvr={}&domain={}&is_all={}&pagebar={}&id={}&feed_type={}&page={}&pre_page={}&domain_op={}&_rnd={}'.format(
        more_feed_base_url, ajwvr, domain, is_all, pagebar, str(domain)+str(uid), feed_type, page, pre_page, domain_op, rnd)
    # ?ajwvr=6&domain=100505&refer_flag=1005055013_&is_all=1&pagebar=1&pl_name=Pl_Official_MyProfileFeed__19&
    # id=1005051340724027&script_uri=/tiancaixinxin&feed_type=0&page=2&pre_page=2&domain_op=100505&__rnd=1544605567059'
    html = get_page(assembled_more_feed_url)
    weibo_html = json.loads(html)['data']
    weibos = get_weibo_list(weibo_html)
    return weibo_html, weibos
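For reference, a hedged call of get_pagebar, mirroring how get_person_one_full_page above uses it (the profile page apparently lazy-loads two extra segments per page, pagebar 0 and 1); the uid is hypothetical.

html_0, weibos_0 = get_pagebar('1234567890', 1, 0)
html_1, weibos_1 = get_pagebar('1234567890', 1, 1)
print(len(weibos_0) + len(weibos_1))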
Example #7
#!/usr/bin/env python3
""" Testing expiring web cache and tracker """

from web import get_page

URL = "http://slowwly.robertomurray.co.uk/delay/5000/url/https://web.ics.purdue.edu/~gchopra/class/public/pages/webdesign/05_simple.html"

print(get_page(URL))

print("=================================================")
print("=================================================")
print("=================================================")

print(get_page(URL))
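The web module under test is not part of this listing; below is a minimal sketch of what its get_page could look like, assuming a local Redis instance, a 10-second cache expiry and per-URL access counting (the key names are assumptions).

import redis
import requests

r = redis.Redis()


def get_page(url: str) -> str:
    """Fetch url, cache the body in Redis for 10 seconds and count accesses."""
    r.incr('count:{}'.format(url))            # hypothetical counter key
    cached = r.get('cached:{}'.format(url))   # hypothetical cache key
    if cached is not None:
        return cached.decode('utf-8')
    html = requests.get(url).text
    r.setex('cached:{}'.format(url), 10, html)
    return html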
Example #8
        self.picture = picture
        self.output = output

    def run(self):
        page_url = base_url + self.picture
        if web.is_page_reachable(page_url):
            img_page = web.get_page(page_url)
            _imgs = filter(lambda s: s.endswith(file_type), web.get_element(img_page, 'a', ''))
            img = next(iter(_imgs), '')
            if img != '':
                name = img.split('/')[-1].split(file_type)[0]
                self.output[name] = img
                print('Added ' + name)


page = web.get_page(base_url + album_url)
wallpapers = web.get_element(page, 'a', 'icon')

imgs = {}
threads = []

# Get all wallpaper urls
for wallpaper in wallpapers:
    latest -= 1
    if latest < 1:
        break
    thread = GetThread(wallpaper, imgs)
    thread.start()
    threads.append(thread)
for thread in threads:
    thread.join()
Example #9
#!/usr/bin/env python3
"""
Main file for testing.
"""
from web import get_page, r


url = 'https://www.google.com'

name = 'count: ' + '{' + url + '}'

gp = get_page(url)
get_page(url)
get_page(url)
count = r.get(name)
print(count)
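Assuming r is the Redis client exported by web, the tracker's counters can also be inspected directly; the 'count*' key pattern below is this snippet's assumption.

# List whatever counter keys the tracker has written so far.
for key in r.keys('count*'):
    print(key, r.get(key))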
Example #10
        self.wallpaper = wallpaper
        self.output = output

    def run(self):
        page_url = base_url + self.wallpaper
        if web.is_page_reachable(page_url):
            img_page = web.get_page(page_url)
            img_links = web.get_element(img_page, 'a', '')
            for img_link in img_links:
                if img_link.startswith(imgsrc) and wallpaper_size in img_link:
                    if img_link != '':
                        self.output[self.wallpaper.split('/')[-2]] = img_link
                        print('Added ' + self.wallpaper.split('/')[-2])


page = web.get_page(base_url + wallpaper_url)
links = web.get_element(page, 'a', '')
wallpapers = []
pattern = re.compile(r'/image/\d+/wallpaper')
for link in links:
    if pattern.match(link):
        wallpapers.append(link)

imgs = {}
threads = []

# Get all wallpaper urls
for wallpaper in wallpapers:
    thread = GetThread(wallpaper, imgs)
    thread.start()
    threads.append(thread)
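As in the album example above, the spawned threads would presumably be joined before imgs is consumed; a hedged completion of that step:

for thread in threads:
    thread.join()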