import csv
import json
import time
from datetime import datetime
from urllib.parse import quote

import requests
from jsonpath import jsonpath


def get_origin_info(bw_id, logger):
    """Fetch the detail page of weibo `bw_id` and return its origin/repost info."""
    try:
        time.sleep(5)
        url = 'https://m.weibo.cn/statuses/show?id=' + str(bw_id)
        r = requests.get(url, headers=get_header(), proxies=get_proxy())
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        content = json.loads(r.text)
        if content.get('ok') == 1:
            # Treat the weibo as original by default
            origin = True
            # If the response contains the reposted weibo, treat it as a repost
            if 'retweeted_status' in r.text:
                origin = False
            # Number of repost pages (10 reposts per page)
            rp_count = jsonpath(content, '$.data.reposts_count')[0]
            if rp_count > 0:
                rp_page = int(rp_count) // 10 + 1
            else:
                rp_page = 0
            # Information about the original poster
            origin_user = jsonpath(content, '$.data.user')[0]
            info_dict = {
                'bw_id': bw_id,
                'origin': origin,
                'rp_count': rp_count,
                'rp_page': rp_page,
                'origin_user': origin_user
            }
            return info_dict
        else:
            return False
    except Exception as e:
        logger.error(f"Cannot get details of weibo {bw_id}. {e}")
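
# The requests above rely on two project helpers, get_header() and get_proxy(),
# which are not defined in this file. A minimal sketch of what they might look
# like follows; the User-Agent string and the direct-connection fallback are
# placeholder assumptions, not the project's actual values.
def get_header():
    # Request headers for m.weibo.cn; a mobile User-Agent is usually sufficient.
    return {
        'User-Agent': ('Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) '
                       'AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 '
                       'Mobile/15E148 Safari/604.1'),
    }


def get_proxy():
    # Return a proxies dict for requests, or None to connect directly.
    # A real implementation would typically draw from a rotating proxy pool.
    return None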
def get_more_topic(query, epoch, topic_dir):
    """Crawl the topics related to `query` and append them to this epoch's CSV."""
    topic_list = []
    page_count = 0
    # Get the total number of result pages for this keyword
    base_url = ('https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D38%26q%3D'
                + str(query) + '%26t%3D0&page_type=searchall')
    try:
        r = requests.get(base_url, headers=get_header(), proxies=get_proxy())
        r.raise_for_status()
        page = json.loads(r.text)['data']['cardlistInfo']['total'] // 10
        print(f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}] '
              f'EPOCH: {epoch}. Keyword: {query}. Get {page} pages of new topics.')
    except Exception:
        # Back off and retry the whole call if the first request fails
        time.sleep(60)
        return get_more_topic(query, epoch, topic_dir)
    while page_count <= page:
        time.sleep(3)
        page_count += 1
        this_url = base_url + '&page=' + str(page_count)
        print(f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}] '
              f'Crawling Topic. Page {page_count} of keyword {query}')
        try:
            r = requests.get(this_url, headers=get_header(), proxies=get_proxy())
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            content = json.loads(r.text)
            if content['ok'] == 1:
                # jsonpath() returns False when nothing matches, hence the `or []`
                items = jsonpath(content, '$..card_group..title_sub')
                for item in items or []:
                    # Strip the surrounding '#' and skip the query keyword itself
                    temp = item.strip('#')
                    if temp != query.strip():
                        topic_list.append([temp])
                    else:
                        continue
        except Exception:
            print(f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}] '
                  f'Error happened on page {page_count}')
    # Write the collected topics to this epoch's CSV file
    with open(topic_dir + 'Topics_' + str(epoch) + '.csv', 'a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(topic_list)
    # Report how many new topics were collected
    print(f'[{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}] '
          f'Finished Crawling Topic. Get {len(topic_list)} new topics for keyword {query}')
def get_query_info(wd, writer, logger, since_date=None):
    """Crawl original weibo matching keyword `wd` and write them through `writer`."""
    page_count = 0
    error = {}
    # URL-encode the keyword and embed it into the request URL;
    # this crawls the "hot" tab of the search result page
    base_url = ('https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D60%26q%3D'
                + quote(wd) + '%26t%3D0&page_type=searchall')
    # Total number of pages that can be fetched
    page = get_Page(wd, base_url, logger)
    # Parse since_date once so it is not re-parsed (and re-bound) inside the loop
    since_dt = datetime.strptime(since_date, '%Y-%m-%d') if since_date else None
    # Fetch the weibo containing the keyword
    while page_count <= page:
        result_list = []
        page_count += 1
        this_url = base_url + '&page=' + str(page_count)
        # logger.info(f'Page {page_count}: {this_url}')
        try:
            time.sleep(3)
            r = requests.get(this_url, headers=get_header(), proxies=get_proxy())
            logger.info(f'Crawling Query. Page {page_count} of keyword {wd}')
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            content = json.loads(r.text)
            if content.get('ok') == 1:
                mblogs = jsonpath(content, '$.data.cards..mblog')
                for mblog in mblogs or []:
                    # The presence of this key means the weibo is a repost, not an original post
                    if mblog.get('retweeted_status'):
                        continue
                    mblog['created_at'] = standardize_date(mblog['created_at'])
                    this_topic, this_text = getText(mblog)
                    this_dict = {
                        'keyword': str(wd),
                        'user_id': mblog['user']['id'],
                        'screen_name': mblog['user']['screen_name'],
                        'bw_id': mblog['id'],
                        'repost_count': mblog['reposts_count'],
                        'topic': this_topic,
                        'content': this_text,
                        'created_at': mblog['created_at']
                    }
                    # Check whether the weibo was created after since_date.
                    # if_crawl is reset for every mblog; False means "keep this row".
                    if_crawl = True
                    if since_dt:
                        created_at = datetime.strptime(mblog['created_at'], '%Y-%m-%d')
                        if created_at > since_dt:
                            if_crawl = False
                    else:
                        if_crawl = False
                    if not if_crawl:
                        result_list.append(this_dict)
                # Write the rows on this page that fall within the required time range
                writer.write_csv(result_list)
            else:
                continue
        except Exception as e:
            # On the first failure of this URL, record it, step back one page and retry after 60 s
            if error.get(this_url) is None:
                error[this_url] = 1
                page_count -= 1
                time.sleep(60)
            # On the second failure, log the error and move on
            else:
                logger.error(f'Page {page_count} failed. {e}')
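
# get_query_info() also assumes two project helpers, standardize_date() and
# getText(), defined elsewhere. The sketches below only rely on what the caller
# requires: standardize_date() must return a '%Y-%m-%d' string (it is parsed
# with that format above) and getText() must return a (topics, text) pair built
# from mblog['text']. The concrete rules (relative-date handling, hashtag
# extraction) are illustrative guesses, not the project's actual code.
import re
from datetime import timedelta


def standardize_date(created_at):
    # Map m.weibo.cn timestamps ("刚刚", "x分钟前", "昨天 12:34", "03-15",
    # "2020-03-15", ...) onto a plain YYYY-MM-DD string.
    now = datetime.now()
    if '刚刚' in created_at or '分钟' in created_at or '小时' in created_at:
        dt = now
    elif '昨天' in created_at:
        dt = now - timedelta(days=1)
    elif '天前' in created_at:
        dt = now - timedelta(days=int(created_at.split('天前')[0].strip()))
    elif created_at.count('-') == 1:          # e.g. '03-15' (current year)
        dt = datetime.strptime(f'{now.year}-{created_at}', '%Y-%m-%d')
    else:                                     # e.g. '2020-03-15'
        dt = datetime.strptime(created_at[:10], '%Y-%m-%d')
    return dt.strftime('%Y-%m-%d')


def getText(mblog):
    # Strip HTML tags from the weibo body and collect '#...#' hashtags as topics.
    plain = re.sub(r'<[^>]+>', '', mblog.get('text', ''))
    topics = ';'.join(re.findall(r'#([^#]+)#', plain))
    return topics, plain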
def get_repost_info(center_bw_id, bw_id, level, writer, logger, temp_writer, since_date=None):
    """Crawl the repost timeline of weibo `bw_id` within the repost tree of `center_bw_id`."""
    error = {}
    idList = []
    # Fetch information about the original poster of bw_id
    origin_info = get_origin_info(bw_id, logger)
    if origin_info:
        # Whether bw_id is an original weibo
        origin = origin_info['origin']
        # Original poster's user info
        origin_user = origin_info['origin_user']
        # Repost count and total number of repost pages
        rp_count = origin_info['rp_count']
        page = origin_info['rp_page']
    # The weibo may have been deleted or be unreachable; stop processing this bw_id
    else:
        return None
    # Parse since_date once so it is not re-parsed inside the loop
    since_dt = datetime.strptime(since_date, '%Y-%m-%d') if since_date else None
    # Crawl the repost timeline
    if page == 0:
        logger.info(
            f'Center bw : {center_bw_id}. level: {level}. No repost of this bw {bw_id}.'
        )
        writer.write_csv(None, END=True, center_bw_id=center_bw_id,
                         origin_info=origin_info, level=level)
    else:
        logger.info(
            f'Center bw : {center_bw_id}. Get {page} pages of bw {bw_id}.')
        base_url = 'https://m.weibo.cn/api/statuses/repostTimeline?id=' + str(bw_id) + '&page='
        page_count = 0
        while page_count <= page:
            page_count += 1
            result_list = []
            try:
                time.sleep(7)
                this_url = base_url + str(page_count)
                logger.info(
                    f'Center bw : {center_bw_id}. level: {level}. Crawling page {page_count} of bw {bw_id}.'
                )
                r = requests.get(this_url, headers=get_header(), proxies=get_proxy())
                r.raise_for_status()
                r.encoding = r.apparent_encoding
                content = json.loads(r.text)
                if content.get('ok') == 1:
                    datas = jsonpath(content, '$.data.data.*')
                    for data in datas or []:
                        data['created_at'] = standardize_date(data['created_at'])
                        # checkLevel() (a project helper not shown in this file) decides
                        # whether this repost belongs to the current repost level
                        flag = checkLevel(level, origin_user['screen_name'], data['raw_text'])
                        if flag:
                            this_dict = {
                                'center_bw_id': center_bw_id,
                                'user_id': origin_user['id'],
                                'screen_name': origin_user['screen_name'],
                                'bw_id': bw_id,
                                'origin': origin,
                                'repost_count': rp_count,
                                'fs_count': origin_user['followers_count'],
                                'fs_user_id': data['user']['id'],
                                'fs_screen_name': data['user']['screen_name'],
                                'fs_bw_id': data['id'],
                                'fs_fans_count': data['user']['followers_count'],
                                'level': level,
                                'raw_text': data['raw_text'],
                                'created_at': data['created_at']
                            }
                            # Queue this repost's id for the next crawling round
                            # (where it is treated as an origin weibo)
                            idList.append({'bw_id': data['id']})
                            # Check whether the repost was created after since_date.
                            # if_crawl is reset per repost; False means "keep this row".
                            if_crawl = True
                            if since_dt:
                                created_at = datetime.strptime(data['created_at'], '%Y-%m-%d')
                                if created_at > since_dt:
                                    if_crawl = False
                            else:
                                if_crawl = False
                            if not if_crawl:
                                result_list.append(this_dict)
                        else:
                            continue
                    # Write the rows within the required time range to csv
                    writer.write_csv(result_list)
                else:
                    continue
            except Exception as e:
                # Retry the page once after 60 s; log the error on the second failure
                if error.get(this_url) is None:
                    error[this_url] = 1
                    page_count -= 1
                    time.sleep(60)
                else:
                    logger.error(f"Cannot get page {page_count} of bw {bw_id}. {e}")
    # After all pages are crawled, write idList to the file for the corresponding level
    if idList:
        temp_writer.write_csv(idList)
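
# get_query_info() and get_repost_info() write through `writer` / `temp_writer`
# objects exposing a write_csv() method. A minimal sketch of such a writer
# follows, assuming it simply appends dict rows to a CSV file; the class name,
# the header handling and the END=True branch (called when a weibo has no
# reposts) are guesses, not the project's actual implementation.
import os


class Writer:
    def __init__(self, file_path):
        self.file_path = file_path

    def write_csv(self, rows, END=False, center_bw_id=None, origin_info=None, level=None):
        if END:
            # Guessed behaviour: record the leaf weibo itself so the repost
            # tree still contains a row for it.
            rows = [{
                'center_bw_id': center_bw_id,
                'bw_id': origin_info['bw_id'],
                'repost_count': origin_info['rp_count'],
                'level': level,
            }]
        if not rows:
            return
        # Write a header row only when the file does not exist yet or is empty
        write_header = (not os.path.isfile(self.file_path)
                        or os.path.getsize(self.file_path) == 0)
        with open(self.file_path, 'a', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            if write_header:
                writer.writerow(rows[0].keys())
            writer.writerows(row.values() for row in rows)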
def get_Page(wd, base_url, logger):
    """Return the total number of result pages available for keyword `wd`."""
    r = requests.get(base_url, headers=get_header(), proxies=get_proxy())
    r.raise_for_status()
    # The API reports the total number of results; 10 results per page
    page = json.loads(r.text)['data']['cardlistInfo']['total'] // 10 + 1
    logger.info(f'Keyword: {wd}. Get {page} pages of returned weibo.')
    return page
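
# A hypothetical entry point showing how the functions above could be wired
# together; the logger setup, the Writer instances (from the sketch above), the
# file paths, keyword and weibo ids are placeholder assumptions. Running the
# commented repost step additionally requires the project's checkLevel()
# helper, which is not defined in this file.
if __name__ == '__main__':
    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger('weibo_crawler')

    query_writer = Writer('./query_result.csv')
    # 1. Collect related topics and the original weibo matching a keyword.
    get_more_topic('疫苗', epoch=1, topic_dir='./')
    get_query_info('疫苗', query_writer, logger, since_date='2021-01-01')

    # 2. Expand the repost tree of a collected weibo id (placeholder id):
    # repost_writer = Writer('./repost_result.csv')
    # temp_writer = Writer('./level_2_ids.csv')
    # get_repost_info(center_bw_id='450000000000000', bw_id='450000000000000',
    #                 level=1, writer=repost_writer, logger=logger,
    #                 temp_writer=temp_writer, since_date='2021-01-01')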