Example #1
def get_origin_info(bw_id, logger):
    try:
        time.sleep(5)
        url = 'https://m.weibo.cn/statuses/show?id=' + str(bw_id)
        r = requests.get(url, headers=get_header(), proxies=get_proxy())
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        content = json.loads(r.text)
        if content.get('ok') == 1:
            # Assume the post is original by default
            origin = True
            # If the response contains retweeted_status, this post is a repost
            if 'retweeted_status' in r.text:
                origin = False
            # Repost count and the number of repost pages (10 reposts per page)
            rp_count = jsonpath(content, '$.data.reposts_count')[0]
            if rp_count > 0:
                rp_page = int(rp_count) // 10 + 1
            else:
                rp_page = 0
            # Information about the original poster
            origin_user = jsonpath(content, '$.data.user')[0]
            info_dict = {
                'bw_id': bw_id,
                'origin': origin,
                'rp_count': rp_count,
                'rp_page': rp_page,
                'origin_user': origin_user
            }
            return info_dict
        else:
            return False
    except Exception as e:
        logger.error(f"Cannot get details of weibo {bw_id}. {e}")
Example #2
def get_more_topic(query, epoch, topic_dir):
    topic_list = []
    page_count = 0
    # Get the total number of result pages for this keyword
    base_url = 'https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D38%26q%3D' + str(query) + '%26t%3D0&page_type=searchall'
    try:
        r = requests.get(base_url, headers=get_header(), proxies=get_proxy())
        r.raise_for_status()
        page = json.loads(r.text)['data']['cardlistInfo']['total'] // 10
        print(f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}]  EPOCH: {epoch}. Keyword: {query}. Get {page} pages of new topics.')
    except Exception:
        # Back off and retry; return the recursive call so `page` is never used uninitialized
        time.sleep(60)
        return get_more_topic(query, epoch, topic_dir)
    while page_count <= page:
        time.sleep(3)
        page_count += 1
        this_url = base_url + '&page=' + str(page_count)
        print(f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}]  Crawling Topic. Page {page_count} of keyword {query}')
        try:
            r = requests.get(this_url, headers=get_header(), proxies=get_proxy())
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            content = json.loads(r.text)
            if content['ok'] == 1:
                # jsonpath returns False when nothing matches, so fall back to []
                items = jsonpath(content, '$..card_group..title_sub') or []
                for item in items:
                    temp = item.strip('#')
                    if temp != query.strip():
                        topic_list.append([temp])
            else:
                continue
        except Exception:
            print(f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}]  Error happened on page ---> {page_count}')

    # Write the results to a file
    with open(topic_dir + 'Topics_' + str(epoch) + '.csv', 'a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(topic_list)

    # Report the collected topics
    print(f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}]  Finished Crawling Topic. Get {len(topic_list)} new topics for keyword {query}')
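get_more_topic appends its results to Topics_<epoch>.csv inside topic_dir. A minimal, hypothetical driver for it might look like this; the keyword list and output directory are placeholders, not from the original project:

import os

if __name__ == '__main__':
    topic_dir = './topics/'        # hypothetical output directory
    os.makedirs(topic_dir, exist_ok=True)
    seed_keywords = ['新冠疫苗']    # hypothetical seed keyword
    for epoch, query in enumerate(seed_keywords, start=1):
        # Each call appends newly found topics to Topics_<epoch>.csv in topic_dir
        get_more_topic(query, epoch, topic_dir)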
def get_query_info(wd, writer, logger, since_date=None):
    if_crawl = True
    page_count = 0
    error = {}
    # URL-encode the keyword and embed it into the search URL
    # Crawl the 'hot' tab of the keyword's search result page
    base_url = 'https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D60%26q%3D' + quote(
        wd) + '%26t%3D0&page_type=searchall'
    # Compute the total number of pages that can be fetched
    page = get_Page(wd, base_url, logger)
    # Fetch the weibo posts that contain the keyword
    while page_count <= page:
        result_list = []
        page_count += 1
        this_url = base_url + '&page=' + str(page_count)
        # logger.info(f'Page {page_count}: {this_url}')
        try:
            time.sleep(3)
            r = requests.get(this_url,
                             headers=get_header(),
                             proxies=get_proxy())
            logger.info(f'Crawling Query. Page {page_count} of keyword {wd}')
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            content = json.loads(r.text)
            if content.get('ok') == 1:
                mblogs = jsonpath(content, '$.data.cards..mblog') or []  # jsonpath returns False on no match
                for mblog in mblogs:
                    # An mblog containing this key is a repost, not an original post
                    if mblog.get('retweeted_status'):
                        continue
                    mblog['created_at'] = standardize_date(mblog['created_at'])
                    this_topic, this_text = getText(mblog)
                    this_dict = {
                        'keyword': str(wd),
                        'user_id': mblog['user']['id'],
                        'screen_name': mblog['user']['screen_name'],
                        'bw_id': mblog['id'],
                        'repost_count': mblog['reposts_count'],
                        'topic': this_topic,
                        'content': this_text,
                        'created_at': mblog['created_at']
                    }
                    if since_date:
                        # Parse into a local variable so the since_date argument
                        # is not overwritten with a datetime on later iterations
                        since_dt = datetime.strptime(since_date, '%Y-%m-%d')
                        created_at = datetime.strptime(mblog['created_at'],
                                                       '%Y-%m-%d')
                        if created_at > since_dt:
                            if_crawl = False
                    else:
                        if_crawl = False
                    if not if_crawl:
                        result_list.append(this_dict)
                # Write the posts on this page that fall within the required time range to csv
                writer.write_csv(result_list)
            else:
                continue
        except Exception as e:
            # On the first failure for this url, record it and back off for 60s
            if error.get(this_url) is None:
                error[this_url] = 1
                page_count -= 1
                time.sleep(60)
            # On the second failure, log the error and move on
            else:
                logger.error(f'Page {page_count} failed. {e}')
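get_query_info and get_repost_info both rely on standardize_date, which is not shown here but must return a date string that datetime.strptime can parse with '%Y-%m-%d'. A minimal sketch under that assumption; the exact relative-time formats handled by the original are not known:

from datetime import datetime, timedelta

def standardize_date(created_at):
    # Hypothetical sketch: normalize m.weibo.cn relative timestamps such as
    # "刚刚", "5分钟前", "2小时前", "昨天 12:30" or "03-15" into 'YYYY-MM-DD',
    # because the callers parse the result with '%Y-%m-%d'.
    now = datetime.now()
    if '刚刚' in created_at or '分钟' in created_at or '小时' in created_at:
        return now.strftime('%Y-%m-%d')
    if '昨天' in created_at:
        return (now - timedelta(days=1)).strftime('%Y-%m-%d')
    if created_at.count('-') == 1:
        # 'MM-DD' style: the current year is implied
        return f'{now.year}-{created_at[:5]}'
    # Already starts with a full 'YYYY-MM-DD' date
    return created_at[:10]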
Example #4
def get_repost_info(center_bw_id,
                    bw_id,
                    level,
                    writer,
                    logger,
                    temp_writer,
                    since_date=None):
    error = {}
    idList = []
    # Initialize the crawl flag so the since_date check below never reads it unset
    if_crawl = True
    # Fetch information about the original post and its author
    origin_info = get_origin_info(bw_id, logger)
    # If the fetch succeeded:
    if origin_info:
        # Whether the post is an original
        origin = origin_info['origin']
        # Original poster's user info
        origin_user = origin_info['origin_user']
        # Repost count and total number of repost pages
        rp_count = origin_info['rp_count']
        page = origin_info['rp_page']
    # The weibo may be deleted or unreachable; stop processing this bw_id
    else:
        return None
    # Crawl the reposts
    if page == 0:
        logger.info(
            f'Center bw : {center_bw_id}. level: {level}. No repost of this bw {bw_id}.'
        )
        writer.write_csv(None,
                         END=True,
                         center_bw_id=center_bw_id,
                         origin_info=origin_info,
                         level=level)
    else:
        logger.info(
            f'Center bw : {center_bw_id}. Get {page} pages of bw {bw_id}.')
        base_url = 'https://m.weibo.cn/api/statuses/repostTimeline?id=' + str(
            bw_id) + '&page='
        page_count = 0
        while page_count <= page:
            page_count += 1
            result_list = []
            try:
                time.sleep(7)
                this_url = base_url + str(page_count)
                logger.info(
                    f'Center bw : {center_bw_id}. level: {level}. Crawling page {page_count} of bw {bw_id}.'
                )
                r = requests.get(this_url,
                                 headers=get_header(),
                                 proxies=get_proxy())
                r.raise_for_status()
                r.encoding = r.apparent_encoding
                content = json.loads(r.text)
                if content.get('ok') == 1:
                    datas = jsonpath(content, '$.data.data.*') or []  # jsonpath returns False on no match
                    for data in datas:
                        data['created_at'] = standardize_date(
                            data['created_at'])
                        flag = checkLevel(level, origin_user['screen_name'],
                                          data['raw_text'])
                        if flag:
                            this_dict = {
                                'center_bw_id': center_bw_id,
                                'user_id': origin_user['id'],
                                'screen_name': origin_user['screen_name'],
                                'bw_id': bw_id,
                                'origin': origin,
                                'repost_count': rp_count,
                                'fs_count': origin_user['followers_count'],
                                'fs_user_id': data['user']['id'],
                                'fs_screen_name': data['user']['screen_name'],
                                'fs_bw_id': data['id'],
                                'fs_fans_count':
                                data['user']['followers_count'],
                                'level': level,
                                'raw_text': data['raw_text'],
                                'created_at': data['created_at']
                            }
                            # Queue this repost's id for the next crawling round (where it becomes an origin post)
                            idList.append({'bw_id': data['id']})
                            # Check whether the repost was created after the required date
                            if since_date:
                                # Parse into a local variable so since_date is
                                # not overwritten with a datetime object
                                since_dt = datetime.strptime(
                                    since_date, '%Y-%m-%d')
                                created_at = datetime.strptime(
                                    data['created_at'], '%Y-%m-%d')
                                if created_at > since_dt:
                                    if_crawl = False
                            else:
                                if_crawl = False
                            if not if_crawl:
                                result_list.append(this_dict)
                        else:
                            continue
                    # Write the reposts that fall within the required time range to csv
                    writer.write_csv(result_list)
                else:
                    continue
            except Exception as e:
                if error.get(this_url) is None:
                    error[this_url] = 1
                    page_count -= 1
                    time.sleep(60)
                else:
                    logger.error(
                        f"Cannot get page {page_count} of bw {bw_id}. {e}")
        # After all pages are crawled, write idList to the file for this level
        if idList:
            temp_writer.write_csv(idList)
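checkLevel is not shown either; it receives the crawl level, the screen name of the post's author, and the repost's raw_text, and returns whether the repost belongs to the current level. A hypothetical sketch, assuming the decision is based on the leading '//@' mention chain in raw_text; the original's exact rule, and how it uses level, is unknown:

def checkLevel(level, screen_name, raw_text):
    # Hypothetical sketch of the level filter used above; `level` is accepted for
    # signature compatibility, but how the original uses it is not shown here.
    # Assumption: a repost belongs to the current post when its text has no
    # '//@' chain (a direct repost) or its first '//@' mention names the author
    # of the post whose repost timeline is being crawled.
    if '//@' not in raw_text:
        return True
    first_mention = raw_text.split('//@', 1)[1]
    return (first_mention.startswith(screen_name + ':')
            or first_mention.startswith(screen_name + '：'))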
def get_Page(wd, base_url, logger):
    r = requests.get(base_url, headers=get_header(), proxies=get_proxy())
    r.raise_for_status()
    page = json.loads(r.text)['data']['cardlistInfo']['total'] // 10 + 1
    logger.info(f'Keyword: {wd}. Get {page} pages of returned weibo.')
    return page
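The writer and temp_writer objects passed around above only need a write_csv method that accepts a list of dict rows (plus the END keyword used by get_repost_info). A minimal sketch of such a writer, assuming a single CSV file with fixed columns; the END branch is only stubbed, since the original's behaviour there is not shown:

import csv

class CsvWriter:
    # Minimal sketch of the writer interface used above: rows are dicts keyed by
    # column name and get appended to a single CSV file.
    def __init__(self, path, fieldnames):
        self.path = path
        self.fieldnames = fieldnames
        with open(self.path, 'w', encoding='utf-8', newline='') as f:
            csv.DictWriter(f, fieldnames=self.fieldnames).writeheader()

    def write_csv(self, rows, END=False, **kwargs):
        # END=True marks a post with no reposts in get_repost_info; what the
        # original writes in that case is not shown, so nothing is written here.
        if END or not rows:
            return
        with open(self.path, 'a', encoding='utf-8', newline='') as f:
            csv.DictWriter(f, fieldnames=self.fieldnames).writerows(rows)

The temp_writer used for the next-level id list could be the same class constructed with fieldnames=['bw_id'].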