Example 1
def crawl_follower_fans(uid):
    user, is_crawled = get_profile(uid)
    if user and user.verify_type == 2:
        SeedidsOper.set_seed_other_crawled(uid)
        return

    rs = get_fans_or_followers_ids(uid, 1, 1)
    rs.extend(get_fans_or_followers_ids(uid, 2, 1))
    datas = set(rs)
    for other_uid in datas:
        # use a different name so the seed uid passed in isn't shadowed
        get_profile(other_uid)
    # If the data already exists, just skip it
    # if datas:
    #     SeedidsOper.insert_seeds(datas)
    SeedidsOper.set_seed_other_crawled(uid)
Example 2
def crawl_person_infos(uid):
    """
    Crawl user info and their fans and followers.
    Because of limits in Weibo's backend, we can only crawl 5 pages of fans and followers.
    We also have no permission to view an enterprise user's followers and fans.
    :param uid: current user id
    :return: None
    """
    if not uid:
        return

    try:
        user, is_crawled = get_profile(uid)
        # If it's enterprise user, just skip it
        if user and user.verify_type == 2:
            SeedidsOper.set_seed_other_crawled(uid)
            return

        # Crawl fans and followers
        if not is_crawled:
            app.send_task('tasks.user.crawl_follower_fans',
                          args=(uid, ),
                          queue='fans_followers',
                          routing_key='for_fans_followers')

    # By adding '--soft-time-limit secs' when you start celery, this will resend task to broker
    # e.g. celery -A tasks.workers -Q user_crawler worker -l info -c 1 --soft-time-limit 10
    except SoftTimeLimitExceeded:
        crawler.error(
            "user SoftTimeLimitExceeded    uid={uid}".format(uid=uid))
        app.send_task('tasks.user.crawl_person_infos',
                      args=(uid, ),
                      queue='user_crawler',
                      routing_key='for_user_info')
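The comment above refers to Celery's soft time limit: when a task runs past '--soft-time-limit' seconds, the worker raises SoftTimeLimitExceeded inside the task so it can clean up or reschedule itself. A minimal self-contained sketch of the same resend-on-timeout pattern (the broker URL, task name, and sleep are placeholders, not this project's real configuration):

import time

from celery import Celery
from celery.exceptions import SoftTimeLimitExceeded

app = Celery('demo', broker='redis://localhost:6379/0')  # placeholder broker

@app.task(name='demo.slow_crawl')
def slow_crawl(uid):
    try:
        time.sleep(60)  # stands in for the real crawl work
    except SoftTimeLimitExceeded:
        # The soft limit fired: push the same task back onto the broker
        # instead of losing it, mirroring the pattern above.
        app.send_task('demo.slow_crawl', args=(uid,))

Started, for instance, with: celery -A demo worker -l info --soft-time-limit 10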
Example 3
def crawl_person_infos(uid):
    """
    Crawl user info and their fans and followers.
    Because of limits in Weibo's backend, we can only crawl 5 pages of fans and followers.
    We also have no permission to view an enterprise user's followers and fans.
    :param uid: current user id
    :return: None
    """
    if not uid:
        return

    try:
        user, is_crawled = get_profile(uid)
        # If it's enterprise user, just skip it
        if user and user.verify_type == 2:
            SeedidsOper.set_seed_other_crawled(uid)
            return

        # Crawl fans and followers
        if not is_crawled:
            app.send_task('tasks.user.crawl_follower_fans', args=(uid,), queue='fans_followers',
                          routing_key='for_fans_followers')

    # By adding '--soft-time-limit secs' when you start celery, this will resend task to broker
    # e.g. celery -A tasks.workers -Q user_crawler worker -l info -c 1 --soft-time-limit 10
    except SoftTimeLimitExceeded:
        crawler.error("user SoftTimeLimitExceeded    uid={uid}".format(uid=uid))
        app.send_task('tasks.user.crawl_person_infos', args=(uid, ), queue='user_crawler',
                      routing_key='for_user_info')
Example 4
def crawl_repost_page(mid, uid):
    limit = get_max_repost_page() + 1
    first_repost_data = crawl_repost_by_page(mid, 1)
    total_page = repost.get_total_page(first_repost_data[0])
    repost_datas = first_repost_data[1]

    if not repost_datas:
        return

    root_user, _ = get_profile(uid)

    if total_page < limit:
        limit = total_page + 1

    for page_num in range(2, limit):
        cur_repost_datas = crawl_repost_by_page(mid, page_num)[1]
        if cur_repost_datas:
            for index, repost_obj in enumerate(cur_repost_datas):
                user_id = IdNames.fetch_uid_by_name(
                    repost_obj.parent_user_name)
                if not user_id and root_user:
                    # If the uid lookup fails, fall back to the root (original) user
                    repost_obj.parent_user_id = root_user.uid
                    repost_obj.parent_user_name = root_user.name
                else:
                    repost_obj.parent_user_id = user_id
                # write back into the current page's list, not the first page's
                cur_repost_datas[index] = repost_obj
            repost_datas.extend(cur_repost_datas)

    # save once, after all pages have been merged
    RepostOper.add_all(repost_datas)
Example 5
def crawl_repost_page(mid, uid):
    limit = get_max_repost_page() + 1
    first_repost_data = crawl_repost_by_page(mid, 1)
    total_page = repost.get_total_page(first_repost_data[0])
    repost_datas = first_repost_data[1]

    if not repost_datas:
        return

    root_user, _ = get_profile(uid)

    if total_page < limit:
        limit = total_page + 1

    for page_num in range(2, limit):
        cur_repost_datas = crawl_repost_by_page(mid, page_num)[1]
        if cur_repost_datas:
            repost_datas.extend(cur_repost_datas)

    for index, repost_obj in enumerate(repost_datas):
        user_id = IdNames.fetch_uid_by_name(repost_obj.parent_user_name)
        if not user_id:
            # If the uid lookup fails, fall back to the root (original) user
            repost_obj.parent_user_id = root_user.uid
            repost_obj.parent_user_name = root_user.name
        else:
            repost_obj.parent_user_id = user_id
        repost_datas[index] = repost_obj

    RepostOper.add_all(repost_datas)
Example 6
def get_praise_list(html: str, wb_id: str):
    """[get praise list]
    
    Arguments:
        html {str} -- [web page]
        wb_id {str} -- [weibo mid]
    
    Raises:
        Exception -- [can't get wanted dom]
    
    Returns:
        WeiboPraise list -- [list contains praises in this html]
        ext_param -- [extra parameters to get next page]
    """

    cont = get_html_cont(html)
    if not cont:
        return list(), ''

    soup = BeautifulSoup(cont, 'html.parser')
    praise_list = list()
    praises = soup.find_all(attrs={'class': 'list_li S_line1 clearfix'})
    # pattern = re.compile(r'<li uid=\\"\d{10}\\">')
    # praises = pattern.findall(cont)

    for praise in praises:
        try:
            user_id = praise.find('img').get('usercard')[3:]
            get_profile(user_id)
            wb_praise = WeiboPraise(user_id, wb_id)
        except Exception as e:
            parser.error('failed to parse a praise, details: {}'.format(e))
        else:
            praise_list.append(wb_praise)

    like_loading = soup.find(attrs={'node-type': 'like_loading'})
    feed_like_more = soup.find(attrs={'action-type': 'feed_like_more'})
    if like_loading:
        action_data = like_loading.get('action-data', '')
    elif feed_like_more:
        action_data = feed_like_more.get('action-data', '')
    else:
        action_data = ''
    ext_param = htmllib.unescape(action_data)

    return praise_list, ext_param
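The usercard[3:] slice assumes the attribute value carries the uid behind an 'id=' prefix; a quick illustration of that assumption with fabricated markup:

from bs4 import BeautifulSoup

snippet = '<li><img usercard="id=1884334303"></li>'  # fabricated markup
img = BeautifulSoup(snippet, 'html.parser').find('img')
assert img.get('usercard')[3:] == '1884334303'  # [3:] strips the 'id=' prefix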
Example 7
def crawl_user_info(name):
    """抓取用户首页的信息
    :param name: 用户名
    :return: None
    """
    if not name:
        return None

    crawler.info(f"received task crawl_user_info {name}")
    user, other_crawled = get_profile(name)
    if not other_crawled:
        crawler.info(f"send task crawl_follower_fans {user.name}")
        app.send_task("tasks.user.crawl_follower_fans", args=(user.name, ))
Example 8
def crawl_person_infos(uid):
    """Crawl a user's info and, if not yet crawled, their fans and followers in the current process."""
    if not uid:
        return

    try:
        user, is_crawled = get_profile(uid)
        # If it's enterprise user, just skip it
        if user and user.verify_type == 2:
            SeedidsOper.set_seed_other_crawled(uid)
            return

        # Crawl fans and followers
        if not is_crawled:
            crawl_follower_fans(uid)
    except SoftTimeLimitExceeded:
        crawler.error(
            "user SoftTimeLimitExceeded    uid={uid}".format(uid=uid))
        crawl_person_infos(uid)
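Unlike Examples 2 and 3, this variant retries by calling itself directly, so a uid that keeps timing out recurses without bound. If the function is registered as a Celery task, a bounded alternative (a sketch, not this project's code; it reuses the surrounding app and crawler objects) is Celery's built-in retry:

from celery.exceptions import SoftTimeLimitExceeded

@app.task(bind=True, max_retries=3)
def crawl_person_infos(self, uid):
    if not uid:
        return
    try:
        ...  # same body as above
    except SoftTimeLimitExceeded as exc:
        crawler.error("user SoftTimeLimitExceeded    uid={uid}".format(uid=uid))
        # Let Celery re-enqueue the task with a capped number of attempts.
        raise self.retry(exc=exc, countdown=10)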
Example 9
def crawl_person_infos(uid):
    """
    Crawl user info and their fans and followers.
    Because of limits in Weibo's backend, we can only crawl 5 pages of fans and followers.
    We also have no permission to view an enterprise user's followers and fans.
    :param uid: current user id
    :return: None
    """
    if not uid:
        return

    user, is_crawled = get_profile(uid)
    # If it's enterprise user, just skip it
    if user and user.verify_type == 2:
        SeedidsOper.set_seed_other_crawled(uid)
        return

    # Crawl fans and followers
    if not is_crawled:
        app.send_task('tasks.user.crawl_follower_fans', args=(uid,), queue='fans_followers',
                      routing_key='for_fans_followers')
Example 10
def test_parse_user_info(uid, expect_name):
    user_info = get_profile(uid)[0]
    assert user_info.name == expect_name
    time.sleep(REQUEST_INTERNAL)
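Since the test takes (uid, expect_name) as arguments, it is presumably driven by pytest's parametrization; a sketch of how the fixture table could look (the uid/name pairs are placeholders, not real accounts):

import pytest

@pytest.mark.parametrize('uid, expect_name', [
    ('1234567890', 'alice'),  # placeholder pairs
    ('2345678901', 'bob'),
])
def test_parse_user_info(uid, expect_name):
    user_info = get_profile(uid)[0]
    assert user_info.name == expect_name
    time.sleep(REQUEST_INTERNAL)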
Example 11
def get_repost_list(html, mid):
    """
       Get repost details
       :param html: page source
       :param mid: weibo mid
       :return: list of repost infos
       """
    cont = get_html_cont(html)
    if not cont:
        return list()

    soup = BeautifulSoup(cont, 'html.parser')
    repost_list = list()
    reposts = soup.find_all(attrs={'action-type': 'feed_list_item'})

    for repost in reposts:
        wb_repost = WeiboRepost()
        try:
            repost_cont = repost.find(attrs={'class': 'WB_text'}).find(attrs={'node-type': 'text'}).text.strip().\
                split('//@')
            wb_repost.repost_cont = repost_cont[0].encode('gbk',
                                                          'ignore').decode(
                                                              'gbk', 'ignore')
            wb_repost.weibo_id = repost['mid']
            # TODO: add wb_repost.user_id to the crawl queue (seed_ids)
            wb_repost.user_id = repost.find(attrs={
                'class': 'WB_face W_fl'
            }).find('a').get('usercard')[3:]
            get_profile(wb_repost.user_id)
            wb_repost.user_name = repost.find(attrs={'class': 'list_con'}).find(attrs={'class': 'WB_text'}).find('a').\
                text
            wb_repost.repost_time = repost.find(attrs={
                'class': 'WB_from S_txt2'
            }).find('a').get('title')
            wb_repost.weibo_url = REPOST_URL.format(
                repost.find(attrs={
                    'class': 'WB_from S_txt2'
                }).find('a').get('href'))
            parents = repost.find(attrs={
                'class': 'WB_text'
            }).find(attrs={'node-type': 'text'})
            wb_repost.root_weibo_id = mid

            # Save the current repost user's name and id as the middle result
            IdNames.store_id_name(wb_repost.user_name, wb_repost.user_id)

            if not parents:
                wb_repost.parent_user_name = ''
            else:
                try:
                    # We can't get the parent's uid directly; the nickname is available, but nicknames can change
                    temp = parents.find(attrs={'extra-data': 'type=atname'})
                    if temp:
                        wb_repost.parent_user_name = temp.get('usercard')[5:]
                    else:
                        wb_repost.parent_user_name = ''
                except Exception as e:
                    parser.error(
                        "error occurred when parsing the parent's name, the detail is {}"
                        .format(e))
                    wb_repost.parent_user_name = ''

        except Exception as e:
            parser.error(
                'repost parse error occurred, the detail is {}'.format(e))
        else:
            repost_list.append(wb_repost)

    return repost_list
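The split('//@') at the top relies on Weibo's forwarding convention, where each repost appends its comment as '//@name: text'; a plain-Python view of what the parser sees (fabricated text):

text = 'nice shot//@alice: so true//@bob: original post'  # fabricated repost chain
parts = text.split('//@')
assert parts[0] == 'nice shot'       # the current repost's own comment
assert parts[1] == 'alice: so true'  # later parts each start with a parent's name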
Example 12
def test_crawl_userinfo(uid, expect):
    user = get_profile(uid)[0]
    assert user.name == expect
    time.sleep(REQUEST_INTERNAL)
Example 13
def get_comment_list(html, wb_id):
    """
    获取评论列表
    :param html: 
    :param wb_id: 
    :return: 
    """
    cont = get_html_cont(html)
    if not cont:
        return list()

    soup = BeautifulSoup(cont, 'html5lib')
    comment_list = list()
    comments = soup.find(attrs={
        'node-type': 'comment_list'
    }).find_all(attrs={'class': 'list_li S_line1 clearfix'})

    for comment in comments:
        wb_comment = WeiboComment()
        try:
            cont = []
            first_author = True
            first_colon = True
            for content in comment.find(attrs={'class': 'WB_text'}).contents:
                if not content:
                    continue
                if content.name == 'a':
                    if first_author:
                        first_author = False
                        continue
                    else:
                        if content.text:
                            cont.append(content.text)

                elif content.name == 'img':
                    img_title = content.get('title', '')
                    if img_title == '':
                        img_title = content.get('alt', '')
                    if img_title == '':
                        img_src = content.get('src', '')
                        img_src = img_src.split('/')[-1].split('.', 1)[0]
                        try:
                            img_title = parse_emoji.softband_to_utf8(img_src)
                        except Exception as e:
                            parser.error('failed to parse an emoji, details: {}, {}'.format(
                                e, comment))
                            img_title = ''
                    cont.append(img_title)

                else:
                    if first_colon:
                        if content.find(':') == 0:
                            cont.append(content.replace(':', '', 1))
                            first_colon = False
                    else:
                        cont.append(content)

            wb_comment.comment_cont = ''.join(cont)
            wb_comment.comment_screen_name = comment.find(attrs={
                'class': 'WB_text'
            }).find('a').text

            wb_comment.comment_id = comment['comment_id']
            # TODO: add wb_comment.user_id to the crawl queue (seed_ids)
            wb_comment.user_id = comment.find(attrs={
                'class': 'WB_text'
            }).find('a').get('usercard')[3:]
            # crawl the new user's basic info
            if wb_comment.user_id:
                get_profile(wb_comment.user_id)
            # normalize the date format
            create_time = comment.find(attrs={'class': 'WB_from S_txt2'}).text
            if '分钟前' in create_time:
                now = datetime.datetime.now()
                reduce_minute = create_time.strip().split('分钟')[0]
                delta = datetime.timedelta(minutes=int(reduce_minute))
                real_time = now - delta
                wb_comment.create_time = str(
                    real_time.strftime('%Y-%m-%d %H:%M'))
            elif '今天' in create_time:
                now = datetime.datetime.now().strftime('%Y-%m-%d')
                real_time = now + create_time.strip().split('今天')[-1]
                wb_comment.create_time = str(real_time)
            elif '楼' in create_time:
                wb_comment.create_time = str(re.sub(r'第\d*楼', '', create_time))
            else:
                wb_comment.create_time = create_time
            if not wb_comment.create_time.startswith('201'):
                wb_comment.create_time = str(
                    datetime.datetime.now().year) + wb_comment.create_time

            # convert the Chinese-style timestamp to the standard "%Y-%m-%d %H:%M" format
            create_time_copy = wb_comment.create_time
            if '月' in create_time_copy and '日' in create_time_copy:
                month = create_time_copy.split("年")[-1].split("月")[0]
                day = create_time_copy.split("年")[-1].split("月")[-1].split(
                    "日")[0]
                # zero-pad single-digit month/day
                if month and int(month) < 10:
                    wb_comment.create_time = wb_comment.create_time.replace(
                        str(month) + "月", "0" + str(month) + "月")
                if day and int(day) < 10:
                    wb_comment.create_time = wb_comment.create_time.replace(
                        str(day) + "日", "0" + str(day) + "日")
                wb_comment.create_time = wb_comment.create_time.replace(
                    "月", "-")
                wb_comment.create_time = wb_comment.create_time.replace(
                    "日", "")
                if '年' in wb_comment.create_time:
                    wb_comment.create_time = wb_comment.create_time.replace(
                        "年", "-")

            wb_comment.weibo_id = wb_id
        except Exception as e:
            parser.error('failed to parse a comment, details: {}'.format(e))
        else:
            comment_list.append(wb_comment)
    return comment_list
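The date handling above folds Weibo's relative timestamps ('N分钟前' for 'N minutes ago', '今天 HH:MM' for 'today') into absolute ones; a condensed standalone sketch of the same normalization (the zero-padding and year-prefix branches are omitted for brevity):

import datetime
import re

def normalize_comment_time(create_time):
    now = datetime.datetime.now()
    create_time = create_time.strip()
    if '分钟前' in create_time:  # 'N minutes ago'
        minutes = int(create_time.split('分钟')[0])
        return (now - datetime.timedelta(minutes=minutes)).strftime('%Y-%m-%d %H:%M')
    if '今天' in create_time:  # 'today HH:MM'
        return now.strftime('%Y-%m-%d') + create_time.split('今天')[-1]
    return re.sub(r'第\d*楼', '', create_time)  # drop any 'floor' marker

print(normalize_comment_time('5分钟前'))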