Example 1
def test_parse_search_info(url, is_login, cookies, session):
    # Fetch the search page either through the logged-in session or with raw
    # cookies, then make sure the parser extracts at least one result.
    if is_login == 1:
        content = session.get(url).text
    else:
        content = requests.get(url, cookies=cookies).text
    assert len(search.get_search_info(content)) > 0
    time.sleep(REQUEST_INTERNAL)  # throttle between requests
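The test above takes url, is_login, cookies and session as arguments, which suggests it is driven by parametrization. A minimal sketch of how such a driver could look with pytest is given below; the sample URLs, cookie values and the session fixture are assumptions for illustration, not the project's actual test setup.

import pytest
import requests

# Hypothetical parameter sets; real values would come from the project's test data.
SEARCH_CASES = [
    # (url, is_login, cookies)
    ('http://s.weibo.com/weibo/test', 1, None),
    ('http://s.weibo.com/weibo/test', 0, {'SUB': 'dummy-cookie'}),
]


@pytest.fixture
def session():
    # Assumed fixture: a requests.Session that has already logged in to weibo.
    return requests.Session()


def pytest_generate_tests(metafunc):
    # Feed each (url, is_login, cookies) tuple to test_parse_search_info above.
    if metafunc.function.__name__ == 'test_parse_search_info':
        metafunc.parametrize('url, is_login, cookies', SEARCH_CASES)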
Example 2
def search_keyword(keyword, keyword_id):
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < limit:
        cur_url = url.format(encode_keyword, cur_page)

        search_page = get_page(cur_url)
        if not search_page:
            crawler.warning('No weibo results for keyword {}, the page source is {}'.format(
                keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)
        # Check whether the weibo already exists in the database first; if it does,
        # it has already been crawled (results are sorted by time by default), so exit the loop.
        for wb_data in search_list:
            rs = get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info('All new weibos for keyword {} have been fetched in this search'.format(keyword))
                return
            else:
                insert_weibo_data(wb_data)
                insert_keyword_wbid(keyword_id, wb_data.weibo_id)
                # For now use a network call rather than a local call; weigh the pros and cons of the two approaches.
                app.send_task('tasks.user.crawl_person_infos',
                              args=(wb_data.uid, ),
                              queue='user_crawler',
                              routing_key='for_user_info')

        # Check whether there is a next page
        if 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info('Search for keyword {} finished'.format(keyword))
            return
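search_keyword above depends on module-level url and limit values. A minimal sketch of what they might look like is given below; the template (a timescope-free variant of the URL used in the later time-scoped example) and the page limit are assumptions, not the project's real settings.

from urllib import parse as url_parse

# Assumed module-level settings for search_keyword.
url = 'http://s.weibo.com/weibo/{}&scope=ori&suball=1&page={}'
limit = 50  # maximum number of result pages to walk per keyword (assumed)

# Example: the URL requested for page 1 of the keyword "测试"
print(url.format(url_parse.quote('测试'), 1))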
Example 3
def search_keyword(keyword, keyword_id):
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    #crawler.info(limit)
    while cur_page < limit:
        cur_url = url.format(encode_keyword, cur_page)

        search_page = get_page(cur_url)
        #crawler.info(search_page)
        if not search_page:
            crawler.warning('No result for keyword {}, the source page is {}'.format(keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)

        # Because the search results are sorted by time, if any result has been stored in mysql,
        # we need not crawl the same keyword in this turn
        for wb_data in search_list:
            rs = get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info('keyword {} has been crawled in the last turn'.format(keyword))
                #continue
                return
            else:
                insert_weibo_data(wb_data)
                insert_keyword_wbid(keyword_id, wb_data.weibo_id)
                # send task for crawling user info
                app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid,), queue='user_crawler',
                              routing_key='for_user_info')

        if 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info('keyword {} has been crawled in this turn'.format(keyword))
            return
Example 4
def search_keyword(keyword, keyword_id):
    crawler.info('We are searching keyword "{}"'.format(keyword))
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < LIMIT:
        cur_url = URL.format(encode_keyword, cur_page)
        # Currently login is required; later we may crawl page one without login.
        search_page = get_page(cur_url, auth_level=2)
        if not search_page:
            crawler.warning('No search result for keyword {}, the source page is {}'.format(keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)

        # Because the search results are sorted by time, if any result has been stored in mysql,
        # We need not crawl the same keyword in this turn
        for wb_data in search_list:
            rs = WbDataOper.get_wb_by_mid(wb_data.weibo_id)
            KeywordsDataOper.insert_keyword_wbid(keyword_id, wb_data.weibo_id)

            if rs:
                crawler.info('Weibo {} has been crawled, skip it.'.format(wb_data.weibo_id))
                continue
            else:
                WbDataOper.add_one(wb_data)
                app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid,), queue='user_crawler',
                              routing_key='for_user_info')
        if cur_page == 1:
            cur_page += 1
        elif 'noresult_tit' not in search_page:
            cur_page += 1
        else:
            crawler.info('Keyword {} has been crawled in this turn'.format(keyword))
            return
Example 5
def search_keyword(keyword, keyword_id):
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < limit:
        cur_url = url.format(encode_keyword, cur_page)

        search_page = get_page(cur_url)
        if not search_page:
            crawler.warning('No result for keyword {}, the source page is {}'.format(keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)

        # Because the search results are sorted by time, if any result has been stored in mysql,
        # we need not crawl the same keyword in this turn
        for wb_data in search_list:
            rs = get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info('keyword {} has been crawled in this turn'.format(keyword))
                return
            else:
                insert_weibo_data(wb_data)
                insert_keyword_wbid(keyword_id, wb_data.weibo_id)
                # send task for crawling user info
                app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid,), queue='user_crawler',
                              routing_key='for_user_info')

        if 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info('keyword {} has been crawled in this turn'.format(keyword))
            return
Example 6
def search_items_v2(keyword, keyword_id, date_item):
    search_time_list = [
        "{}-{}:{}-{}".format(d, t, d, t + 2)
        for d, t in itertools.product([date_item], TIME_LIIT)
    ]

    for s_time in search_time_list:
        crawler.info('We are searching keyword "{}", {}'.format(
            keyword, s_time))
        cur_page = 1
        encode_keyword = url_parse.quote(keyword)
        while cur_page < LIMIT:
            cur_url = MAX_URL.format(encode_keyword, cur_page, s_time)
            # Currently login is required; later we may crawl page one without login.
            search_page = get_page(cur_url, auth_level=1, need_proxy=True)
            if "您可以尝试更换关键词,再次搜索" in search_page:
                break
            if not search_page:
                crawler.warning(
                    'No search result for keyword {}, the source page is {}'.
                    format(keyword, search_page))
                cur_page += 1
                continue
                # return

            search_list = parse_search.get_search_info(search_page)

            if cur_page == 1:
                cur_page += 1
            elif 'noresult_tit' not in search_page:
                cur_page += 1
            else:
                crawler.info(
                    'Keyword {} has been crawled in this turn'.format(keyword))
                return

            # Because the search results are sorted by time, if any result has been stored in mysql,
            # We don't need to crawl the same keyword in this turn
            for wb_data in search_list:
                # print(wb_data)
                rs = WbDataOper.get_wb_by_mid(wb_data.weibo_id)
                KeywordsDataOper.insert_keyword_wbid(keyword_id,
                                                     wb_data.weibo_id)
                # todo incremental crawling using time
                if rs:
                    crawler.info('Weibo {} has been crawled, skip it.'.format(
                        wb_data.weibo_id))
                    continue
                else:
                    WbDataOper.add_one(wb_data)
                    # todo: only add seed ids and remove this task
                    app.send_task('tasks.user.crawl_person_infos',
                                  args=(wb_data.uid, ),
                                  queue='user_crawler',
                                  routing_key='for_user_info')
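The search_time_list comprehension in search_items_v2 splits a single day into two-hour windows. The snippet below shows, with assumed values for date_item and TIME_LIIT (the real constant lives in the project's config), what the generated time scopes look like.

import itertools

date_item = '2018-01-01'   # assumed date string
TIME_LIIT = [0, 2, 4, 6]   # assumed start hours

search_time_list = [
    "{}-{}:{}-{}".format(d, t, d, t + 2)
    for d, t in itertools.product([date_item], TIME_LIIT)
]
print(search_time_list)
# ['2018-01-01-0:2018-01-01-2', '2018-01-01-2:2018-01-01-4',
#  '2018-01-01-4:2018-01-01-6', '2018-01-01-6:2018-01-01-8']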
Example 7
    def test_add_search_cont(self):
        """
        Test batch insertion of weibo search results.
        """
        from db.wb_data import insert_weibo_datas
        from page_parse import search
        with open('tests/search.html', encoding='utf-8') as f:
            cont = f.read()
        infos = search.get_search_info(cont)
        insert_weibo_datas(infos)
Example 8
    def test_get_search_info(self):
        """
        测试微博搜索结果页面解析功能
        :return: 
        """
        from page_parse import search
        with open('tests/search.html', encoding='utf-8') as f:
            cont = f.read()
        infos = search.get_search_info(cont)

        self.assertEqual(len(infos), 20)
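The tests only assert how many items were parsed, but the crawler examples access weibo_id, uid, mid, keyword and mk_primary on each parsed item. The sketch below outlines the kind of object search.get_search_info presumably returns; the field list and types are inferred from usage in these examples, not taken from the project's model definitions.

from dataclasses import dataclass

@dataclass
class WeiboSearchData:
    # Fields inferred from how parsed items are used in the surrounding examples.
    weibo_id: str = ''    # looked up via get_wb_by_mid / WbDataOper.get_wb_by_mid
    uid: str = ''         # passed to the tasks.user.crawl_person_infos task
    mid: str = ''         # combined with the keyword into mk_primary in search_one
    keyword: str = ''     # set by search_one after parsing
    mk_primary: str = ''  # '_'.join([str(mid), keyword])

# get_search_info(html) would then return a list of such objects, one per weibo
# on the result page (20 on the sample page used by these tests).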
Example 9
def search_one(keyword, session):
    url = 'http://s.weibo.com/weibo/' + keyword + '&Refer=STopic_box'
    search_page = get_page(url, session, headers)
    if search_page:
        search_list = search.get_search_info(search_page)
        for s in search_list:
            s.keyword = keyword
            s.mk_primary = '_'.join([str(s.mid), keyword])
        add_search_cont(search_list)
    else:
        print('No search results parsed: {page}'.format(page=search_page))
Example 10
    def test_add_search_cont(self):
        """
        Test batch insertion of weibo search results.
        """
        from db.wb_data import insert_weibo_datas
        from page_parse import search
        url = TEST_SERVER + 'search.html'
        resp = requests.get(url)
        resp.encoding = 'utf-8'
        cont = resp.text
        infos = search.get_search_info(cont)
        insert_weibo_datas(infos)
Example 11
    def test_get_search_info(self):
        """
        测试微博搜索结果页面解析功能
        :return: 
        """
        from page_parse import search
        url = TEST_SERVER + 'search.html'
        resp = requests.get(url)
        resp.encoding = 'utf-8'
        cont = resp.text
        infos = search.get_search_info(cont)

        self.assertEqual(len(infos), 20)
Example 12
def search_keyword(keyword, keyword_id):
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < LIMIT:
        cur_url = URL.format(encode_keyword, cur_page)
        if cur_page == 1:
            search_page = get_page(cur_url, auth_level=1)
        else:
            search_page = get_page(cur_url, auth_level=2)
        if not search_page:
            crawler.warning(
                'No result for keyword {}, the source page is {}'.format(
                    keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)

        # Because the search results are sorted by time, if any result has been stored in mysql,
        # we need not crawl the same keyword in this turn
        for wb_data in search_list:
            rs = WbDataOper.get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info(
                    'Keyword {} has been crawled in this turn'.format(keyword))
                return
            else:
                WbDataOper.add_one(wb_data)
                KeywordsDataOper.insert_keyword_wbid(keyword_id,
                                                     wb_data.weibo_id)
                # send task for crawling user info
                app.send_task('tasks.user.crawl_person_infos',
                              args=(wb_data.uid, ),
                              queue='user_crawler',
                              routing_key='for_user_info')
        if cur_page == 1:
            cur_page += 1
        elif 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info(
                'Keyword {} has been crawled in this turn'.format(keyword))
            return
Example 13
def search_keyword(row):
    cur_page = 1
    keyword = row.keyword
    if row.startTime:
        startTime = row.startTime.strftime('%Y-%m-%d')
        # Use a separate name for the time-scoped template so the module-level
        # `url` template stays available when no time range is given.
        time_url = 'http://s.weibo.com/weibo/{}&scope=ori&suball=1&page={}&timescope=custom:{}'
    if row.endTime:
        endTime = row.endTime.strftime('%Y-%m-%d')
    encode_keyword = url_parse.quote(keyword)
    while cur_page < limit:
        if row.startTime and row.endTime:
            finalTime = startTime + ':' + endTime
            cur_url = time_url.format(encode_keyword, cur_page, finalTime)
        else:
            cur_url = url.format(encode_keyword, cur_page)
        search_page = get_page(cur_url)
        if not search_page:
            crawler.warning(
                'No weibo results for keyword {}, the page source is {}'.format(keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)
        # Check whether the weibo already exists in the database first; if it does,
        # it has already been crawled (results are sorted by time by default), so exit the loop.
        for wb_data in search_list:
            rs = get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info('All new weibos for keyword {} have been fetched in this search'.format(keyword))
                return
            else:
                insert_weibo_data(wb_data)
                # For now use a network call rather than a local call; weigh the pros and cons of the two approaches.
                app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid,), queue='user_crawler',
                              routing_key='for_user_info')

        # Check whether there is a next page
        if 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info('Search for keyword {} finished'.format(keyword))
            return
Example 14
def search_keyword(keyword, keyword_id):
    crawler.info('We are searching keyword "{}"'.format(keyword))
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < LIMIT:
        cur_url = URL.format(encode_keyword, cur_page)
        # Currently login is required; later we may crawl page one without login.
        search_page = get_page(cur_url, auth_level=2)
        if not search_page:
            crawler.warning('No search result for keyword {}, the source page is {}'.format(keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)

        if cur_page == 1:
            cur_page += 1
        elif 'noresult_tit' not in search_page:
            cur_page += 1
        else:
            crawler.info('Keyword {} has been crawled in this turn'.format(keyword))
            return

        # Because the search results are sorted by time, if any result has been stored in mysql,
        # We don't need to crawl the same keyword in this turn
        for wb_data in search_list:
            rs = WbDataOper.get_wb_by_mid(wb_data.weibo_id)
            KeywordsDataOper.insert_keyword_wbid(keyword_id, wb_data.weibo_id)
            # todo incremental crawling using time
            if rs:
                crawler.info('Weibo {} has been crawled, skip it.'.format(wb_data.weibo_id))
                continue
            else:
                WbDataOper.add_one(wb_data)
                # todo: only add seed ids and remove this task
                app.send_task('tasks.user.crawl_person_infos', args=(wb_data.uid,), queue='user_crawler',
                              routing_key='for_user_info')
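Throughout these examples the crawler hands work off with app.send_task, which implies search_keyword itself is meant to run as a Celery task. A minimal, hypothetical sketch of how a caller might enqueue it for a batch of keywords is shown below; the broker URL, task path, queue name, routing key and the shape of the keywords argument are assumptions for illustration only.

from celery import Celery

# Hypothetical Celery app; in the project this would be the shared `app` used above.
app = Celery('weibo_crawler', broker='redis://localhost:6379/0')


def dispatch_search_tasks(keywords):
    # Enqueue one search task per (keyword_id, keyword) pair; the task path,
    # queue and routing key are illustrative names, not the project's configuration.
    for keyword_id, keyword in keywords:
        app.send_task('tasks.search.search_keyword',
                      args=(keyword, keyword_id),
                      queue='search_crawler',
                      routing_key='for_search_info')

# dispatch_search_tasks([(1, '人工智能'), (2, '数据挖掘')])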