def test_parse_search_info(url, is_login, cookies, session):
    if is_login == 1:
        content = session.get(url).text
        assert len(search.get_search_info(content)) > 0
    else:
        content = requests.get(url, cookies=cookies).text
        assert len(search.get_search_info(content)) > 0
    time.sleep(REQUEST_INTERNAL)
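The test above takes url, is_login, cookies and session as arguments, which suggests it is driven by pytest fixtures and parametrization. A minimal sketch of how such a driver could look; SEARCH_URLS, COOKIES and the session fixture are assumptions for illustration, not names from the project:

import pytest
import requests

from page_parse import search  # the project's search-page parser

# Hypothetical test data; the real project supplies its own urls and cookies.
SEARCH_URLS = [
    ('http://s.weibo.com/weibo/test', 1),
    ('http://s.weibo.com/weibo/test&page=2', 0),
]
COOKIES = {}  # assumed: cookies captured from a logged-in session


@pytest.fixture
def session():
    # A plain requests session; the real project may wrap login logic here.
    with requests.Session() as s:
        yield s


@pytest.mark.parametrize('url,is_login', SEARCH_URLS)
def test_parse_search_info_param(url, is_login, session):
    # Mirrors the test above: fetch either through the logged-in session or directly.
    if is_login == 1:
        content = session.get(url).text
    else:
        content = requests.get(url, cookies=COOKIES).text
    assert len(search.get_search_info(content)) > 0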
def search_keyword(keyword, keyword_id):
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < limit:
        cur_url = url.format(encode_keyword, cur_page)
        search_page = get_page(cur_url)
        if not search_page:
            crawler.warning('No weibo results were fetched for keyword {}, the source page is {}'.format(
                keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)

        # Check whether each weibo already exists in the database; since results are sorted by
        # time, an existing record means it was already crawled, so we stop here
        for wb_data in search_list:
            rs = get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info('All new weibos for keyword {} have been fetched in this search'.format(keyword))
                return
            else:
                insert_weibo_data(wb_data)
                insert_keyword_wbid(keyword_id, wb_data.weibo_id)
                # Use a network call rather than a local call for now; weigh the pros and cons of both approaches
                app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid,),
                              queue='user_crawler', routing_key='for_user_info')

        # Check whether there is a next page
        if 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info('Search for keyword {} is done'.format(keyword))
            return
def search_keyword(keyword, keyword_id):
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < limit:
        cur_url = url.format(encode_keyword, cur_page)
        search_page = get_page(cur_url)
        if not search_page:
            crawler.warning('No result for keyword {}, the source page is {}'.format(keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)

        # Because the search results are sorted by time, if any result has been stored in mysql,
        # we need not crawl the same keyword in this turn
        for wb_data in search_list:
            rs = get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info('keyword {} has been crawled in last turn'.format(keyword))
                return
            else:
                insert_weibo_data(wb_data)
                insert_keyword_wbid(keyword_id, wb_data.weibo_id)
                # send task for crawling user info
                app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid,),
                              queue='user_crawler', routing_key='for_user_info')

        if 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info('keyword {} has been crawled in this turn'.format(keyword))
            return
def search_keyword(keyword, keyword_id):
    crawler.info('We are searching keyword "{}"'.format(keyword))
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < LIMIT:
        cur_url = URL.format(encode_keyword, cur_page)
        # currently we only crawl with login; crawling page one without login may come later
        search_page = get_page(cur_url, auth_level=2)
        if not search_page:
            crawler.warning('No search result for keyword {}, the source page is {}'.format(keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)

        # Because the search results are sorted by time, if any result has been stored in mysql,
        # we need not crawl the same keyword in this turn
        for wb_data in search_list:
            rs = WbDataOper.get_wb_by_mid(wb_data.weibo_id)
            KeywordsDataOper.insert_keyword_wbid(keyword_id, wb_data.weibo_id)
            if rs:
                crawler.info('Weibo {} has been crawled, skip it.'.format(wb_data.weibo_id))
                continue
            else:
                WbDataOper.add_one(wb_data)
                app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid,),
                              queue='user_crawler', routing_key='for_user_info')

        if cur_page == 1:
            cur_page += 1
        elif 'noresult_tit' not in search_page:
            cur_page += 1
        else:
            crawler.info('Keyword {} has been crawled in this turn'.format(keyword))
            return
def search_keyword(keyword, keyword_id):
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < limit:
        cur_url = url.format(encode_keyword, cur_page)
        search_page = get_page(cur_url)
        if not search_page:
            crawler.warning('No result for keyword {}, the source page is {}'.format(keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)

        # Because the search results are sorted by time, if any result has been stored in mysql,
        # we need not crawl the same keyword in this turn
        for wb_data in search_list:
            rs = get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info('keyword {} has been crawled in this turn'.format(keyword))
                return
            else:
                insert_weibo_data(wb_data)
                insert_keyword_wbid(keyword_id, wb_data.weibo_id)
                # send task for crawling user info
                app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid,),
                              queue='user_crawler', routing_key='for_user_info')

        if 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info('keyword {} has been crawled in this turn'.format(keyword))
            return
def search_items_v2(keyword, keyword_id, date_item):
    search_time_list = [
        "{}-{}:{}-{}".format(d, t, d, t + 2)
        for d, t in itertools.product([date_item], TIME_LIIT)
    ]
    for s_time in search_time_list:
        crawler.info('We are searching keyword "{}", {}'.format(keyword, s_time))
        cur_page = 1
        encode_keyword = url_parse.quote(keyword)
        while cur_page < LIMIT:
            cur_url = MAX_URL.format(encode_keyword, cur_page, s_time)
            # currently we only crawl with login; crawling page one without login may come later
            search_page = get_page(cur_url, auth_level=1, need_proxy=True)
            # "您可以尝试更换关键词,再次搜索" is Weibo's "no results, try another keyword" marker
            if "您可以尝试更换关键词,再次搜索" in search_page:
                break
            if not search_page:
                crawler.warning('No search result for keyword {}, the source page is {}'.format(
                    keyword, search_page))
                cur_page += 1
                continue

            search_list = parse_search.get_search_info(search_page)

            if cur_page == 1:
                cur_page += 1
            elif 'noresult_tit' not in search_page:
                cur_page += 1
            else:
                crawler.info('Keyword {} has been crawled in this turn'.format(keyword))
                return

            # Because the search results are sorted by time, if any result has been stored in mysql,
            # we don't need to crawl the same keyword in this turn
            for wb_data in search_list:
                rs = WbDataOper.get_wb_by_mid(wb_data.weibo_id)
                KeywordsDataOper.insert_keyword_wbid(keyword_id, wb_data.weibo_id)
                # todo: incremental crawling using time
                if rs:
                    crawler.info('Weibo {} has been crawled, skip it.'.format(wb_data.weibo_id))
                    continue
                else:
                    WbDataOper.add_one(wb_data)
                    # todo: only add seed ids and remove this task
                    app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid,),
                                  queue='user_crawler', routing_key='for_user_info')
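The search_time_list comprehension in search_items_v2 splits one day into two-hour windows that are substituted into MAX_URL's timescope parameter. A standalone sketch of that expansion, assuming TIME_LIIT is a range of even hours (the real constant lives in the project's config):

import itertools

date_item = '2018-07-01'      # assumed example date
TIME_LIIT = range(0, 24, 2)   # assumption: even hours 0, 2, ..., 22

search_time_list = [
    "{}-{}:{}-{}".format(d, t, d, t + 2)
    for d, t in itertools.product([date_item], TIME_LIIT)
]
# e.g. ['2018-07-01-0:2018-07-01-2', '2018-07-01-2:2018-07-01-4', ...]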
def test_add_search_cont(self):
    """
    Test inserting weibo search results in batch
    :return:
    """
    from db.wb_data import insert_weibo_datas
    from page_parse import search
    with open('tests/search.html', encoding='utf-8') as f:
        cont = f.read()
    infos = search.get_search_info(cont)
    insert_weibo_datas(infos)
def test_get_search_info(self):
    """
    Test parsing of the weibo search result page
    :return:
    """
    from page_parse import search
    with open('tests/search.html', encoding='utf-8') as f:
        cont = f.read()
    infos = search.get_search_info(cont)
    self.assertEqual(len(infos), 20)
def search_one(keyword, session):
    url = 'http://s.weibo.com/weibo/' + keyword + '&Refer=STopic_box'
    search_page = get_page(url, session, headers)
    if search_page:
        search_list = search.get_search_info(search_page)
        for s in search_list:
            s.keyword = keyword
            s.mk_primary = '_'.join([str(s.mid), keyword])
        add_search_cont(search_list)
    else:
        print('No search results were parsed from page: {page}'.format(page=search_page))
def test_add_search_cont(self):
    """
    Test inserting weibo search results in batch
    :return:
    """
    from db.wb_data import insert_weibo_datas
    from page_parse import search
    url = TEST_SERVER + 'search.html'
    resp = requests.get(url)
    resp.encoding = 'utf-8'
    cont = resp.text
    infos = search.get_search_info(cont)
    insert_weibo_datas(infos)
def test_get_search_info(self):
    """
    Test parsing of the weibo search result page
    :return:
    """
    from page_parse import search
    url = TEST_SERVER + 'search.html'
    resp = requests.get(url)
    resp.encoding = 'utf-8'
    cont = resp.text
    infos = search.get_search_info(cont)
    self.assertEqual(len(infos), 20)
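These two tests fetch the search.html fixture from TEST_SERVER instead of reading it from disk. One simple way to provide such a server during development is Python's built-in http.server; this is only an illustration, and the project may host its fixture pages differently:

# Serve the tests/ directory locally so TEST_SERVER can point at it,
# e.g. TEST_SERVER = 'http://localhost:8000/'
from functools import partial
from http.server import HTTPServer, SimpleHTTPRequestHandler

handler = partial(SimpleHTTPRequestHandler, directory='tests')
HTTPServer(('localhost', 8000), handler).serve_forever()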
def search_keyword(keyword, keyword_id):
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < LIMIT:
        cur_url = URL.format(encode_keyword, cur_page)
        if cur_page == 1:
            search_page = get_page(cur_url, auth_level=1)
        else:
            search_page = get_page(cur_url, auth_level=2)

        if not search_page:
            crawler.warning('No result for keyword {}, the source page is {}'.format(keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)

        # Because the search results are sorted by time, if any result has been stored in mysql,
        # we need not crawl the same keyword in this turn
        for wb_data in search_list:
            rs = WbDataOper.get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info('Keyword {} has been crawled in this turn'.format(keyword))
                return
            else:
                WbDataOper.add_one(wb_data)
                KeywordsDataOper.insert_keyword_wbid(keyword_id, wb_data.weibo_id)
                # send task for crawling user info
                app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid,),
                              queue='user_crawler', routing_key='for_user_info')

        if cur_page == 1:
            cur_page += 1
        elif 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info('Keyword {} has been crawled in this turn'.format(keyword))
            return
def search_keyword(row):
    cur_page = 1
    keyword = row.keyword
    if row.startTime:
        startTime = row.startTime.strftime('%Y-%m-%d')
        url = 'http://s.weibo.com/weibo/{}&scope=ori&suball=1&page={}&timescope=custom:{}'
    if row.endTime:
        endTime = row.endTime.strftime('%Y-%m-%d')
    encode_keyword = url_parse.quote(keyword)
    while cur_page < limit:
        if row.startTime and row.endTime:
            finalTime = startTime + ':' + endTime
            cur_url = url.format(encode_keyword, cur_page, finalTime)
        else:
            cur_url = url.format(encode_keyword, cur_page)
        search_page = get_page(cur_url)
        if not search_page:
            crawler.warning('No weibo results were fetched for keyword {}, the source page is {}'.format(
                keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)

        # Check whether each weibo already exists in the database; since results are sorted by
        # time, an existing record means it was already crawled, so we stop here
        for wb_data in search_list:
            rs = get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info('All new weibos for keyword {} have been fetched in this search'.format(keyword))
                return
            else:
                insert_weibo_data(wb_data)
                # Use a network call rather than a local call for now; weigh the pros and cons of both approaches
                app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid,),
                              queue='user_crawler', routing_key='for_user_info')

        # Check whether there is a next page
        if 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info('Search for keyword {} is done'.format(keyword))
            return
def search_keyword(keyword, keyword_id):
    crawler.info('We are searching keyword "{}"'.format(keyword))
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < LIMIT:
        cur_url = URL.format(encode_keyword, cur_page)
        # currently we only crawl with login; crawling page one without login may come later
        search_page = get_page(cur_url, auth_level=2)
        if not search_page:
            crawler.warning('No search result for keyword {}, the source page is {}'.format(keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)

        if cur_page == 1:
            cur_page += 1
        elif 'noresult_tit' not in search_page:
            cur_page += 1
        else:
            crawler.info('Keyword {} has been crawled in this turn'.format(keyword))
            return

        # Because the search results are sorted by time, if any result has been stored in mysql,
        # we don't need to crawl the same keyword in this turn
        for wb_data in search_list:
            rs = WbDataOper.get_wb_by_mid(wb_data.weibo_id)
            KeywordsDataOper.insert_keyword_wbid(keyword_id, wb_data.weibo_id)
            # todo: incremental crawling using time
            if rs:
                crawler.info('Weibo {} has been crawled, skip it.'.format(wb_data.weibo_id))
                continue
            else:
                WbDataOper.add_one(wb_data)
                # todo: only add seed ids and remove this task
                app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid,),
                              queue='user_crawler', routing_key='for_user_info')
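A typical driver for search_keyword would read the keywords to monitor from the database and dispatch one search task per keyword, mirroring how user-info crawling is dispatched above. A minimal sketch, assuming a hypothetical KeywordsDataOper.get_search_keywords() helper and illustrative task/queue/routing names (none of these are confirmed by the snippets):

def execute_search_task():
    # Hypothetical helper: returns (keyword, keyword_id) pairs stored by the user.
    for keyword, keyword_id in KeywordsDataOper.get_search_keywords():
        # Task name, queue and routing_key below are illustrative placeholders.
        app.send_task('tasks.search.search_keyword',
                      args=(keyword, keyword_id),
                      queue='search_crawler',
                      routing_key='for_search_info')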