Example #1
def crawl_person_infos(uid):
    """
    Crawl user info and their fans and followers
    Due to limits in Weibo's backend, we can only crawl 5 pages of fans and followers.
    We also have no permission to view an enterprise account's followers and fans.
    :param uid: current user id
    :return: None
    """
    if not uid:
        return

    try:
        user, is_crawled = get_profile(uid)
        # If it's an enterprise user, just skip it
        if user and user.verify_type == 2:
            SeedidsOper.set_seed_other_crawled(uid)
            return

        # Crawl fans and followers
        if not is_crawled:
            app.send_task('tasks.user.crawl_follower_fans', args=(uid,), queue='fans_followers',
                          routing_key='for_fans_followers')

    # If celery is started with '--soft-time-limit secs', SoftTimeLimitExceeded is raised on
    # timeout and the task is resent to the broker, e.g.
    # celery -A tasks.workers -Q user_crawler worker -l info -c 1 --soft-time-limit 10
    except SoftTimeLimitExceeded:
        crawler.error("user SoftTimeLimitExceeded    uid={uid}".format(uid=uid))
        app.send_task('tasks.user.crawl_person_infos', args=(uid, ), queue='user_crawler',
                      routing_key='for_user_info')
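The comment above relies on the worker-wide --soft-time-limit flag. The same limit can also be declared on the task itself; the sketch below is only an illustration, assuming a Celery app exposed as tasks.workers.app (matching the CLI example) and a hypothetical wrapper name rather than the project's actual registration code.

# Sketch only: declare the soft time limit on the task instead of the worker.
# tasks.workers.app and the wrapper name are assumptions; with this limit set,
# the except SoftTimeLimitExceeded block above handles the requeue.
from tasks.workers import app

@app.task(name='tasks.user.crawl_person_infos', soft_time_limit=10)
def crawl_person_infos_task(uid):
    crawl_person_infos(uid)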
Example #2
def crawl_person_infos(uid):
    """
    Crawl user info and their fans and followers
    Due to limits in Weibo's backend, we can only crawl 5 pages of fans and followers.
    We also have no permission to view an enterprise account's followers and fans.
    :param uid: current user id
    :return: None
    """
    if not uid:
        return

    try:
        user, is_crawled = get_profile(uid)
        # If it's an enterprise user, just skip it
        if user and user.verify_type == 2:
            SeedidsOper.set_seed_other_crawled(uid)
            return

        # Crawl fans and followers
        if not is_crawled:
            app.send_task('tasks.user.crawl_follower_fans',
                          args=(uid, ),
                          queue='fans_followers',
                          routing_key='for_fans_followers')

    # If celery is started with '--soft-time-limit secs', SoftTimeLimitExceeded is raised on
    # timeout and the task is resent to the broker, e.g.
    # celery -A tasks.workers -Q user_crawler worker -l info -c 1 --soft-time-limit 10
    except SoftTimeLimitExceeded:
        crawler.error(
            "user SoftTimeLimitExceeded    uid={uid}".format(uid=uid))
        app.send_task('tasks.user.crawl_person_infos',
                      args=(uid, ),
                      queue='user_crawler',
                      routing_key='for_user_info')
Example #3
def crawl_follower_fans(uid):
    seed = SeedidsOper.get_seed_by_id(uid)
    if seed.other_crawled == 0:
        rs = get_fans_or_followers_ids(uid, 1, 1)
        rs.extend(get_fans_or_followers_ids(uid, 2, 1))
        datas = set(rs)
        # If data already exists, just skip it
        if datas:
            SeedidsOper.insert_seeds(datas)
        SeedidsOper.set_seed_other_crawled(uid)
Example #4
def crawl_follower_fans(uid):
    seed = SeedidsOper.get_seed_by_id(uid)
    if seed.other_crawled == 0:
        rs = get_fans_or_followers_ids(uid, 1, 1)
        rs.extend(get_fans_or_followers_ids(uid, 2, 1))
        datas = set(rs)
        # If data already exists, just skip it
        if datas:
            SeedidsOper.insert_seeds(datas)
        SeedidsOper.set_seed_other_crawled(uid)
Example #5
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = HOME_URL.format(uid, cur_page)
        if cur_page == 1:
            html = get_page(url, auth_level=1)
        else:
            html = get_page(url, auth_level=2)
        weibo_datas = get_data(html)

        if not weibo_datas:
            crawler.warning("user {} has no weibo".format(uid))
            return

        # Check whether the weibo was created after the time configured in spider.yaml
        timeafter = time.mktime(
            time.strptime(get_time_after(), '%Y-%m-%d %H:%M:%S'))
        length_weibo_datas = len(weibo_datas)
        for i in range(0, len(weibo_datas)):
            weibo_time = time.mktime(
                time.strptime(weibo_datas[i].create_time, '%Y-%m-%d %H:%M'))
            if weibo_time < timeafter:
                weibo_datas = weibo_datas[0:i]
                break

        WbDataOper.add_all(weibo_datas)

        # If not every weibo on this page was created after the given time, stop paginating
        if i != length_weibo_datas - 1:
            break

        domain = public.get_userdomain(html)
        cur_time = int(time.time()*1000)
        ajax_url_0 = AJAX_URL.format(domain, 0, domain, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = AJAX_URL.format(domain, 1, domain, uid, cur_page, cur_page, cur_time+100)

        if cur_page == 1:
            # here we use a local call to get the total page number
            total_page = get_total_page(crawl_ajax_page(ajax_url_1, 2))
            auth_level = 1
        else:
            auth_level = 2

        if total_page < limit:
            limit = total_page

        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0, auth_level), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1, auth_level), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
        cur_page += 1

    SeedidsOper.set_seed_home_crawled(uid)
Example #6
def crawl_follower_fans(uid):
    user, is_crawled = get_profile(uid)
    if user and user.verify_type == 2:
        SeedidsOper.set_seed_other_crawled(uid)
        return

    rs = get_fans_or_followers_ids(uid, 1, 1)
    rs.extend(get_fans_or_followers_ids(uid, 2, 1))
    datas = set(rs)
    for other_uid in datas:
        get_profile(other_uid)
    # If data already exists, just skip it
    # if datas:
    #     SeedidsOper.insert_seeds(datas)
    SeedidsOper.set_seed_other_crawled(uid)
Example #7
def crawl_comment_by_page(mid, page_num, session):
    try:
        cur_url = BASE_URL.format(mid, page_num)
        html = get_page(cur_url, auth_level=1, is_ajax=True)
        comment_datas, seed_ids = comment.get_comment_list(html, mid)
    except SoftTimeLimitExceeded:
        crawler.error(
            "comment SoftTimeLimitExceeded    mid={mid} page_num={page_num}".
            format(mid=mid, page_num=page_num))
        # retry with the session included and return its result, so the
        # variables below are never referenced while undefined
        return crawl_comment_by_page(mid, page_num, session)
    CommentOper.add_all(comment_datas, session)
    SeedidsOper.insert_seeds(seed_ids, session)
    if page_num == 1:
        WbDataOper.set_weibo_comment_crawled(mid, session)
    return html, comment_datas
Example #8
def execute_home_task():
    # There are many possible strategies for crawling users' home pages; here we pick
    # the uids in table seed_ids whose home_crawl is 0
    id_objs = SeedidsOper.get_home_ids()
    for id_obj in id_objs:
        app.send_task('tasks.home.crawl_weibo_datas', args=(id_obj.uid,), queue='home_crawler',
                      routing_key='home_info')
Example #9
def execute_home_task():
    # There are many possible strategies for crawling users' home pages; here we pick
    # the uids in table seed_ids whose home_crawl is 0
    id_objs = SeedidsOper.get_home_ids()
    for id_obj in id_objs:
        app.send_task('tasks.home.crawl_weibo_datas', args=(id_obj.uid,), queue='home_crawler',
                      routing_key='home_info')
Example #10
def execute_user_task():
    seeds = SeedidsOper.get_seed_ids()
    if seeds:
        for seed in seeds:
            app.send_task('tasks.user.crawl_person_infos',
                          args=(seed.uid, ),
                          queue='user_crawler',
                          routing_key='for_user_info')
Example #11
    def test_seedids_oper(self):
        SeedidsOper.insert_seeds(FAKE_IDS)
        assert len(SeedidsOper.get_seed_ids()) == 2
        assert SeedidsOper.get_seed_by_id(FAKE_ID) is not None

        SeedidsOper.set_seed_crawled(FAKE_ID, 1)
        assert len(SeedidsOper.get_seed_ids()) == 1
Example #12
    def test_seedids_oper(self):
        SeedidsOper.insert_seeds(FAKE_IDS)
        assert len(SeedidsOper.get_seed_ids()) == 2
        assert SeedidsOper.get_seed_by_id(FAKE_ID) is not None

        SeedidsOper.set_seed_crawled(FAKE_ID, 1)
        assert len(SeedidsOper.get_seed_ids()) == 1
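The test above depends on FAKE_ID / FAKE_IDS fixtures that are not shown on this page. A minimal sketch of what they could look like, inferred from insert_seeds() being fed a plain collection of uids in the crawl_follower_fans examples; the concrete values are placeholders.

# Hypothetical fixtures for the test above; the ids are placeholders, and
# FAKE_ID must be one of FAKE_IDS for both assertions to hold.
FAKE_ID = 1111111111
FAKE_IDS = [1111111111, 2222222222]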
Example #13
def crawl_person_infos(uid):

    if not uid:
        return

    try:
        user, is_crawled = get_profile(uid)
        # If it's an enterprise user, just skip it
        if user and user.verify_type == 2:
            SeedidsOper.set_seed_other_crawled(uid)
            return

        # Crawl fans and followers
        if not is_crawled:
            crawl_follower_fans(uid)
    except SoftTimeLimitExceeded:
        crawler.error(
            "user SoftTimeLimitExceeded    uid={uid}".format(uid=uid))
        crawl_person_infos(uid)
Example #14
def execute_user_task():
    # Fetch all user ids from the seed table whose is_crawled is 0
    seeds = SeedidsOper.get_seed_ids()
    if seeds:
        for seed in seeds:
            # Submit the crawl task
            app.send_task('tasks.user.crawl_person_infos',
                          args=(seed.uid, ),
                          queue='user_crawler',
                          routing_key='for_user_info')
Example #15
def crawl_person_infos(uid):
    """
    Crawl user info and their fans and followers
    Due to limits in Weibo's backend, we can only crawl 5 pages of fans and followers.
    We also have no permission to view an enterprise account's followers and fans.
    :param uid: current user id
    :return: None
    """
    if not uid:
        return

    user, is_crawled = get_profile(uid)
    # If it's an enterprise user, just skip it
    if user and user.verify_type == 2:
        SeedidsOper.set_seed_other_crawled(uid)
        return

    # Crawl fans and followers
    if not is_crawled:
        app.send_task('tasks.user.crawl_follower_fans', args=(uid,), queue='fans_followers',
                      routing_key='for_fans_followers')
Example #16
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = HOME_URL.format(uid, cur_page)
        if cur_page == 1:
            html = get_page(url, auth_level=1)
        else:
            html = get_page(url, auth_level=2)
        weibo_datas = get_data(html)

        if not weibo_datas:
            crawler.warning("user {} has no weibo".format(uid))
            return

        WbDataOper.add_all(weibo_datas)

        domain = public.get_userdomain(html)
        cur_time = int(time.time()*1000)
        ajax_url_0 = AJAX_URL.format(domain, 0, domain, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = AJAX_URL.format(domain, 1, domain, uid, cur_page, cur_page, cur_time+100)

        if cur_page == 1:
            # here we use a local call to get the total page number
            total_page = get_total_page(crawl_ajax_page(ajax_url_1, 2))
            auth_level = 1
        else:
            auth_level = 2

        if total_page < limit:
            limit = total_page

        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0, auth_level), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1, auth_level), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
        cur_page += 1

    SeedidsOper.set_seed_home_crawled(uid)
Example #17
def get_profile(user_id):
    """
    :param user_id: uid
    :return: user info and whether the user has already been crawled
    """
    user = UserOper.get_user_by_uid(user_id)

    if user:
        storage.info('user {id} has already been crawled'.format(id=user_id))
        SeedidsOper.set_seed_crawled(user_id, 1)
        is_crawled = 1
    else:
        user = get_url_from_web(user_id)
        if user is not None:
            SeedidsOper.set_seed_crawled(user_id, 1)
        else:
            SeedidsOper.set_seed_crawled(user_id, 2)
        is_crawled = 0

    return user, is_crawled
Example #18
def get_profile(user_id):
    """
    :param user_id: uid
    :return: user info and whether the user has already been crawled
    """
    user = UserOper.get_user_by_uid(user_id)

    if user:
        storage.info('user {id} has already been crawled'.format(id=user_id))
        SeedidsOper.set_seed_crawled(user_id, 1)
        is_crawled = 1
    else:
        user = get_url_from_web(user_id)
        if user is not None:
            SeedidsOper.set_seed_crawled(user_id, 1)
        else:
            SeedidsOper.set_seed_crawled(user_id, 2)
        is_crawled = 0

    return user, is_crawled
Example #19
from jieba.analyse import tfidf
import xlwt
from db.dao import CommentOper
from jieba import analyse
from db.dao import SeedidsOper
import datetime
import csv
if __name__ == '__main__':
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet('test', cell_overwrite_ok=True)
    lists = [
        'id', 'comment_id', 'comment_cont', 'comment_screen_name', 'weibo_id',
        'user_id', 'create_time'
    ]
    for col, header in enumerate(lists):
        sheet.write(0, col, header)
    infos = CommentOper.get_all_comment_by_weibo_id(4244968959004196)
    i = 1
    nowTime = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    for info in infos:
        SeedidsOper.set_seed_id(info.user_id)
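The export script above is cut off after the header row. A hedged sketch of how the loop could continue, writing one row per comment and saving the workbook; the attribute names simply mirror the header list and the output filename is an assumption.

    # Sketch: complete the export loop. Column order follows the header row
    # written above; the output filename is a placeholder.
    for info in infos:
        SeedidsOper.set_seed_id(info.user_id)
        sheet.write(i, 0, i)
        sheet.write(i, 1, info.comment_id)
        sheet.write(i, 2, info.comment_cont)
        sheet.write(i, 3, info.comment_screen_name)
        sheet.write(i, 4, info.weibo_id)
        sheet.write(i, 5, info.user_id)
        sheet.write(i, 6, str(info.create_time))
        i += 1
    book.save('weibo_comments_{}.xls'.format(nowTime))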
Example #20
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 36
    retry_count = 1
    while cur_page <= 36:
        crawler.warning("current page {}".format(cur_page))

        url = HOME_URL.format(uid, cur_page)
        #if cur_page == 1:
        #    html = get_page(url, auth_level=1)
        #else:
        html = get_page(url, auth_level=2)
        weibo_datas = get_data(html)

        if not weibo_datas:
            if retry_count < 10:
                crawler.warning("user {} has no weibo, retry".format(uid))
                retry_count = retry_count + 1
                #time.sleep(240)
                continue
            else:
                crawler.warning("user {} has no weibo, return".format(uid))
                return


        # Check whether the weibo was created after the time configured in spider.yaml
        # timeafter = time.mktime(
        #     time.strptime(get_time_after(), '%Y-%m-%d %H:%M:%S'))
        # length_weibo_datas = len(weibo_datas)
        # for i in range(0, len(weibo_datas)):
        #     weibo_time = time.mktime(
        #         time.strptime(weibo_datas[i].create_time, '%Y-%m-%d %H:%M'))
        #     if weibo_time < timeafter:
        #         weibo_datas = weibo_datas[0:i]
        #         break

        WbDataOper.add_all(weibo_datas)

        # # If the weibo isn't created after the given time, jump out the loop
        # if i != length_weibo_datas - 1:
        #     break

        domain = public.get_userdomain(html)
        cur_time = int(time.time()*1000)
        ajax_url_0 = AJAX_URL.format(domain, 0, domain, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = AJAX_URL.format(domain, 1, domain, uid, cur_page, cur_page, cur_time+100)

        # if cur_page == 1:
        #     # here we use local call to get total page number
        #     total_page = get_total_page(crawl_ajax_page(ajax_url_1, 2))
        #     auth_level = 1
        # else:
        auth_level = 2

        #if total_page < limit:
        #    limit = total_page

        crawler.warning("append tasks.home.crawl_ajax_page{}".format(uid));

        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_0, auth_level), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')

        app.send_task('tasks.home.crawl_ajax_page', args=(ajax_url_1, auth_level), queue='ajax_home_crawler',
                      routing_key='ajax_home_info')
        cur_page += 1

    SeedidsOper.set_seed_home_crawled(uid)
Example #21
def execute_relation_task():
    seeds = SeedidsOper.get_other_ids()
    if seeds:
        for seed in seeds:
            app.send_task('tasks.relation.crawl_follower_fans', args=(seed.uid,), queue='relation_crawler',
                          routing_key='for_relation_info')
Example #22
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = HOME_URL.format(uid, cur_page)
        if cur_page == 1:
            html = get_page(url, auth_level=1)
        else:
            html = get_page(url, auth_level=2)
        weibo_datas = get_data(html)

        if not weibo_datas:
            crawler.warning("user {} has no weibo".format(uid))
            return

        # Check whether the weibo was created after the time configured in spider.yaml
        length_weibo_datas = len(weibo_datas)
        timeafter = time.mktime(
            time.strptime(get_time_after(), '%Y-%m-%d %H:%M:%S'))
        weibo_datas = [
            weibo_datum for weibo_datum in weibo_datas
            if determine(weibo_datum, timeafter)
        ]

        WbDataOper.add_all(weibo_datas)

        # If not every weibo on this page was created after the given time, stop paginating
        if len(weibo_datas) != length_weibo_datas:
            break

        domain = public.get_userdomain(html)
        cur_time = int(time.time() * 1000)
        ajax_url_0 = AJAX_URL.format(domain, 0, domain, uid, cur_page,
                                     cur_page, cur_time)
        ajax_url_1 = AJAX_URL.format(domain, 1, domain, uid, cur_page,
                                     cur_page, cur_time + 100)

        if cur_page == 1:
            # here we use a local call to get the total page number
            total_page = get_total_page(crawl_ajax_page(ajax_url_1, 2))
            auth_level = 1

            if total_page < limit:
                limit = total_page

            # Since the second ajax of page 1 has already been crawled
            # in the code above and has been stored in the database,
            # we only have to crawl the first ajax of page 1
            crawl_ajax_page(ajax_url_0, auth_level)

        else:
            auth_level = 2

            # Still the same as before
        # if total_page != limit:
        #     limit = total_page
        #     crawler.warning("total pagenum is {}".format(total_page))
        crawl_ajax_page(ajax_url_0, auth_level)
        crawl_ajax_page(ajax_url_1, auth_level)

        cur_page += 1

    SeedidsOper.set_seed_home_crawled(uid)
Example #23
def crawl_weibo_datas(uid):
    limit = get_max_home_page()
    cur_page = 1
    while cur_page <= limit:
        url = HOME_URL.format(uid, cur_page)
        if cur_page == 1:
            html = get_page(url, auth_level=1)
        else:
            html = get_page(url, auth_level=2)
        weibo_datas = get_data(html)

        if not weibo_datas:
            crawler.warning("user {} has no weibo".format(uid))
            return

        # Check whether the weibo was created after the time configured in spider.yaml
        length_weibo_datas = len(weibo_datas)
        timeafter = time.mktime(
            time.strptime(get_time_after(), '%Y-%m-%d %H:%M:%S'))
        weibo_datas = [
            weibo_datum for weibo_datum in weibo_datas
            if determine(weibo_datum, timeafter)
        ]

        WbDataOper.add_all(weibo_datas)

        # If not every weibo on this page was created after the given time, stop paginating
        if len(weibo_datas) != length_weibo_datas:
            break

        domain = public.get_userdomain(html)
        cur_time = int(time.time()*1000)
        ajax_url_0 = AJAX_URL.format(domain, 0, domain, uid, cur_page, cur_page, cur_time)
        ajax_url_1 = AJAX_URL.format(domain, 1, domain, uid, cur_page, cur_page, cur_time+100)

        if cur_page == 1:
            # here we use a local call to get the total page number
            total_page = get_total_page(crawl_ajax_page(ajax_url_1, 2))
            auth_level = 1

            if total_page < limit:
                limit = total_page

            # Since the second ajax of page 1 has already been crawled
            # in the code above and has been stored in the database,
            # we only have to crawl the first ajax of page 1
            crawl_ajax_page(ajax_url_0, auth_level)

        else:
            auth_level = 2

            # Still the same as before
        # if total_page != limit:
        #     limit = total_page
        #     crawler.warning("total pagenum is {}".format(total_page))
        crawl_ajax_page(ajax_url_0, auth_level)
        crawl_ajax_page(ajax_url_1, auth_level)

        cur_page += 1

    SeedidsOper.set_seed_home_crawled(uid)
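The two crawl_weibo_datas variants above filter with a determine() helper that is not shown on this page. A minimal sketch of what it plausibly does, inferred from the explicit strptime/mktime comparison in Example #5; the real implementation may differ.

import time

def determine(weibo_datum, timeafter):
    # Sketch: keep only weibos created at or after the configured time,
    # mirroring the time comparison used in Example #5.
    weibo_time = time.mktime(
        time.strptime(weibo_datum.create_time, '%Y-%m-%d %H:%M'))
    return weibo_time >= timeafter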
Example #24
def execute_user_task():
    seeds = SeedidsOper.get_seed_ids()
    if seeds:
        for seed in seeds:
            crawl_person_infos(seed.uid)
Example #25
def execute_user_task():
    seeds = SeedidsOper.get_seed_ids()
    if seeds:
        for seed in seeds:
            app.send_task('tasks.user.crawl_person_infos', args=(seed.uid,), queue='user_crawler',
                          routing_key='for_user_info')
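The execute_* producers shown in these examples only enqueue work; something still has to call them periodically. A minimal sketch of one way to do that with celery beat, assuming the tasks.workers app from the CLI example above; the producer task names and intervals are assumptions, not the project's actual configuration.

# Sketch only: periodic scheduling of the producer tasks with celery beat.
# The app module, producer task names and intervals are all assumptions.
from datetime import timedelta
from tasks.workers import app

app.conf.beat_schedule = {
    'enqueue-user-info-crawls': {
        'task': 'tasks.user.execute_user_task',          # assumed name
        'schedule': timedelta(hours=1),
    },
    'enqueue-home-page-crawls': {
        'task': 'tasks.home.execute_home_task',          # assumed name
        'schedule': timedelta(hours=6),
    },
    'enqueue-relation-crawls': {
        'task': 'tasks.relation.execute_relation_task',  # assumed name
        'schedule': timedelta(hours=12),
    },
}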