Beispiel #1
0
def get_url_from_web(user_id):
    """
    Crawl a user's profile pages and hand the resulting entity to UserOper.

    The home page is first requested under the default domain 100505, then
    the real domain is read from that page:
    - 103505 or 100306: the page is requested once more under that domain;
    - 100505: the page already fetched is parsed directly;
    - anything else (enterprise or service): only home page info is parsed.
    :param user_id: user id
    :return: user entity, or None when the id is empty, the page is a 404,
             no detail could be parsed, or no user name was found
    """
    if not user_id:
        return None

    html = get_page(BASE_URL.format('100505', user_id), auth_level=1)
    if is_404(html):
        return None

    domain = public.get_userdomain(html)
    if domain in ('103505', '100306'):
        # writers (special users): request the page again under their domain
        html = get_page(BASE_URL.format(domain, user_id))
        user = get_user_detail(user_id, html)
    elif domain == '100505':
        # normal users
        user = get_user_detail(user_id, html)
        raw_uids = get_samefollow_uid()
        if raw_uids.strip() != '':
            uid_list = raw_uids.split(',')
            fan_html = get_page(SAMEFOLLOW_URL.format(user_id), auth_level=2)
            person.get_isFan(fan_html, uid_list, user_id)
    else:
        # enterprise or service
        user = get_enterprise_detail(user_id, html)

    if user is None:
        return None

    # These fields are always read from the last page fetched above.
    user.name = public.get_username(html)
    user.head_img = public.get_headimg(html)
    user.verify_type = public.get_verifytype(html)
    user.verify_info = public.get_verifyreason(html, user.verify_type)
    user.level = public.get_level(html)

    if not user.name:
        return None

    UserOper.add_one(user)
    storage.info('Has stored user {id} info successfully'.format(id=user_id))
    return user
Beispiel #2
0
def get_url_from_web(user_id):
    """
    Fetch a user's profile from the web and pass it to UserOper.add_one.

    The request always starts at the 100505 domain. If the page reports
    domain 103505 or 100306, a second request is made under that domain;
    if it reports 100505 the first page is used as is; any other domain is
    treated as enterprise/service and only the home page is parsed.
    :param user_id: user id
    :return: user entity, or None if the id is falsy, the page is a 404,
             no detail object was produced, or the user name is empty
    """
    if not user_id:
        return None

    default_url = BASE_URL.format('100505', user_id)
    page = get_page(default_url, auth_level=1)
    if is_404(page):
        return None

    domain = public.get_userdomain(page)
    is_writer = domain == '103505' or domain == '100306'

    if is_writer:
        # writers (special users)
        writer_url = BASE_URL.format(domain, user_id)
        page = get_page(writer_url)
        user = get_user_detail(user_id, page)
    elif domain == '100505':
        # normal users
        user = get_user_detail(user_id, page)
        configured_uids = get_samefollow_uid()
        if configured_uids.strip() != '':
            target_uids = configured_uids.split(',')
            samefollow_url = SAMEFOLLOW_URL.format(user_id)
            samefollow_page = get_page(samefollow_url, auth_level=2)
            person.get_isFan(samefollow_page, target_uids, user_id)
    else:
        # enterprise or service
        user = get_enterprise_detail(user_id, page)

    if user is None:
        return None

    # Public fields come from whichever page was fetched last.
    user.name = public.get_username(page)
    user.head_img = public.get_headimg(page)
    user.verify_type = public.get_verifytype(page)
    user.verify_info = public.get_verifyreason(page, user.verify_type)
    user.level = public.get_level(page)

    if not user.name:
        return None

    UserOper.add_one(user)
    storage.info('Has stored user {id} info successfully'.format(id=user_id))
    return user
Beispiel #3
0
    def test_user_oper(self):
        """Check that UserOper can bulk-insert users and look them up by uid."""
        UserOper.add_all([User(fake_id) for fake_id in FAKE_IDS])

        # rowcount is not a dependable row count for a SELECT (its value is
        # driver-dependent), so fetch the rows and count them instead.
        rs = db_session.execute('select * from {}'.format(wbuser.name))
        rows = rs.fetchall()
        assert len(rows) > 0

        assert UserOper.get_user_by_uid('5') is None
        assert UserOper.get_user_by_uid(FAKE_ID) is not None
    def test_user_oper(self):
        """Verify bulk insertion through UserOper and lookup of users by uid."""
        users = [User(fake_id) for fake_id in FAKE_IDS]
        UserOper.add_all(users)

        # A SELECT's rowcount is driver-dependent and may not reflect the
        # number of rows returned; materialize the rows to check for data.
        rs = db_session.execute('select * from {}'.format(wbuser.name))
        fetched = rs.fetchall()
        assert len(fetched) > 0

        assert UserOper.get_user_by_uid('5') is None
        assert UserOper.get_user_by_uid(FAKE_ID) is not None
Beispiel #5
0
def get_newcard_by_name(user_name):
    """
    Get user by user_name through newcard method.

    Although it requires login, it is less likely to get banned
    since it requests without s.weibo.com.

    Arguments:
        user_name {str} -- [user's name]
    Returns:
        tuple -- [(user object, is_crawled); (None, 0) when the newcard
                  page is blank or no uid could be extracted from it]
    """
    known_user = UserOper.get_user_by_name(user_name)
    if known_user:
        # Already known to UserOper, no request needed.
        return known_user, 1

    encoded_name = quote(user_name)
    millis = int(round(time.time() * 1000))
    page = get_page(NEWCARD_URL.format(encoded_name, millis))
    if page.strip() == '':
        return None, 0

    uid = person.get_uid_and_samefollow_by_new_card(page)
    if uid == -1:
        return None, 0

    profile, crawled = get_profile(uid)
    return profile, crawled
Beispiel #6
0
def get_newcard_by_name(user_name):
    """
    Resolve a user by name, using the newcard endpoint when needed.

    Although it requires login, it is less likely to get banned
    since it requests without s.weibo.com.

    Arguments:
        user_name {str} -- [user's name]
    Returns:
        tuple -- [(user object, is_crawled); (None, 0) if the newcard page
                  is empty or yields uid -1]
    """
    user = UserOper.get_user_by_name(user_name)
    if user:
        return user, 1

    newcard_url = NEWCARD_URL.format(
        quote(user_name),
        int(round(time.time() * 1000)),
    )
    newcard_page = get_page(newcard_url)
    if newcard_page.strip() == '':
        return None, 0

    found_uid = person.get_uid_and_samefollow_by_new_card(newcard_page)
    if found_uid == -1:
        return None, 0

    user, is_crawled = get_profile(found_uid)
    return user, is_crawled
Beispiel #7
0
def get_user_profile(user_id):
    """
    Return the user for a uid, crawling the web only when UserOper has none.
    :param user_id: uid
    :return: user info (whatever get_url_from_web returns if a crawl was needed)
    """
    cached = UserOper.get_user_by_uid(user_id)
    if not cached:
        return get_url_from_web(user_id)

    storage.info('user {id} has already crawled'.format(id=user_id))
    return cached
Beispiel #8
0
def get_user_profile(user_id):
    """
    Look the user up through UserOper first and fall back to a web crawl.
    :param user_id: uid
    :return: user info; the result of get_url_from_web when no user was found
    """
    existing = UserOper.get_user_by_uid(user_id)
    if existing:
        storage.info('user {id} has already crawled'.format(id=user_id))
        return existing

    return get_url_from_web(user_id)
Beispiel #9
0
def get_profile(user_id):
    """
    Return a user and whether it was already available before this call.

    The seed state is set to 1 when a user is available (found through
    UserOper or freshly crawled) and to 2 when the crawl returned None.
    :param user_id: uid
    :return: (user info, is_crawled) where is_crawled is 1 if UserOper
             already had the user and 0 if a web crawl was attempted
    """
    stored = UserOper.get_user_by_uid(user_id)
    if stored:
        storage.info('user {id} has already crawled'.format(id=user_id))
        SeedidsOper.set_seed_crawled(user_id, 1)
        return stored, 1

    crawled = get_url_from_web(user_id)
    if crawled:
        UserOper.set_user(crawled)

    seed_state = 1 if crawled is not None else 2
    SeedidsOper.set_seed_crawled(user_id, seed_state)
    return crawled, 0
Beispiel #10
0
def get_uid_by_name(user_name):
    """
    Get a user's uid from the user name.

    UserOper is consulted first; if it has no such user, the s.weibo.com
    top-suggest endpoint is queried and the uid of the first suggested user
    is returned.
    :param user_name: user's name
    :return: uid, or None when the response cannot be parsed or contains
             no suggested user
    """
    user = UserOper.get_user_by_name(user_name)
    if user:
        return user.uid
    url = "http://s.weibo.com/ajax/topsuggest.php?key={}&_k=14995588919022710&uid=&_t=1&_v=STK_14995588919022711"
    url = url.format(quote(user_name))
    info = requests.get(url).content.decode()

    # The payload is wrapped as "try{callback(<json>)}catch..."; pull out
    # the JSON argument. An unexpected body yields no match.
    matched = re.match(r'try\{.*\((.*)\).*\}catch.*', info)
    if matched is None:
        print('unexpected topsuggest response for {}'.format(user_name))
        return None

    try:
        payload = json.loads(matched.group(1))
        return payload["data"]["user"][0]['u_id']
    except (ValueError, KeyError, IndexError, TypeError) as e:
        # ValueError covers invalid JSON; the others cover a payload that
        # lacks the expected data/user/u_id structure.
        print(e)
        return None
Beispiel #11
0
def get_profile(user_id):
    """
    Fetch a user by uid and update the seed's crawl state.

    The seed state becomes 1 when a user is available and 2 when the web
    crawl returned None.
    :param user_id: uid
    :return: (user info, is_crawled); is_crawled is 1 when UserOper already
             had the user, otherwise 0
    """
    user = UserOper.get_user_by_uid(user_id)
    if user:
        storage.info('user {id} has already crawled'.format(id=user_id))
        SeedidsOper.set_seed_crawled(user_id, 1)
        return user, 1

    user = get_url_from_web(user_id)
    seed_state = 2 if user is None else 1
    SeedidsOper.set_seed_crawled(user_id, seed_state)
    return user, 0
Beispiel #12
0
from jieba.analyse import tfidf
import xlwt
from db.dao import UserOper
from jieba import analyse
import datetime
import csv
# Export selected attributes of every user returned by UserOper.get_all()
# into a timestamped .xls workbook in the current working directory.
if __name__ == '__main__':
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet('test', cell_overwrite_ok=True)

    # Column titles double as the attribute names read from each user, so
    # the header row and the data rows cannot drift apart.
    columns = [
        'gender',
        'birthday',
        'location',
        'verify_type',
    ]

    # Row 0 is the header. The loop variable is not called `list`, which
    # would rebind the builtin at module scope.
    for col, title in enumerate(columns):
        sheet.write(0, col, title)

    infos = UserOper.get_all()
    now_time = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')

    # Data rows start at row 1, directly below the header.
    for row, info in enumerate(infos, start=1):
        for col, attr in enumerate(columns):
            sheet.write(row, col, getattr(info, attr))

    book.save(r'file-' + now_time + '.xls')