コード例 #1
0
        def parse_static_img_url():
            """Extract the static cover-image url, normalizing it per site."""
            static_img_url = parse_field(
                parser=self.parser_obj['video_info']['static_img_url'],
                target_obj=body,
                logger=self.lg,
            )

            if short_name == 'n15':
                # Protocol-relative url -> absolute https url
                if static_img_url != '':
                    static_img_url = 'https:' + static_img_url
                else:
                    static_img_url = ''
            elif short_name == '8xs':
                if static_img_url != '':
                    # eg: 'db9b743f134910dbc697fc9f1513428c/index.m3u8' or '8e774a4bf7a697dd00935023a401eec8/v.m3u8'
                    video_id_sel = {
                        'method': 're',
                        'selector': '(\w+)/\w+\.m3u8',
                    }
                    # Derive the static image url from the m3u8 address
                    video_id = parse_field(
                        parser=video_id_sel,
                        target_obj=static_img_url,
                        logger=self.lg,
                    )
                    if video_id != '':
                        static_img_url = 'https://8xcha.com/p/{}.jpg'.format(video_id)
                    else:
                        static_img_url = ''

            return static_img_url
コード例 #2
0
        def parse_body() -> dict:
            """
            解析
            :return:
            """
            nonlocal body

            # 多规格的最低价
            # 会员价yx 部分显示错误, 改用big 价格加上优点, 两个一起用
            tb_price_sel = {
                'method': 'css',
                'selector': 'div.goodsPriceTips span:nth-child(2) ::text',
            }
            big_price_sel = {
                'method': 'css',
                'selector': 'div.goodsPrice big ::text',
            }
            yd_sel = {
                'method': 're',
                'selector': '<span class=\"yiudianPrice\">\+(\d+)优点</span>'
            }
            # 根据sharetitle的描述价, 但是由于没有cookie就会显示的是新人价(所以意义不大)
            normal_price_sel = {
                'method': 're',
                # 'selector': '我在.*?(\d+\.\d+)元抢到这个超值商品',
                'selector': 'var goodsmemberPrice = \'(.*?)\';',
            }
            tb_price0 = parse_field(
                parser=tb_price_sel,
                target_obj=body,
            )
            assert tb_price0 != ''
            big_price = parse_field(
                parser=big_price_sel,
                target_obj=body,
            )
            assert big_price != ''
            yd = parse_field(
                parser=yd_sel,
                target_obj=body,
            )
            assert yd != ''
            normal_price = parse_field(
                parser=normal_price_sel,
                target_obj=body,
            )
            assert normal_price != ''
            # 会员价
            tb_price0 = float(tb_price0).__round__(2)
            # 优点价
            tb_price1 = (float(big_price) + float(yd) / 100).__round__(2)
            # 常规价
            tb_price2 = float(normal_price).__round__(2)

            return {
                'goods_id': goods_id,
                'tb_price0': tb_price0,
                'tb_price1': tb_price1,
                'tb_price2': tb_price2,
            }
コード例 #3
0
        def parse(ok_video_id, body) -> dict:
            """
            Parse the m3u8 entry list out of the page body.
            :param ok_video_id: id being processed (also used for logging)
            :param body: html text
            :return: {'ok_video_id', 'm3u8_li'}
            """
            m3u8_li_sel = {
                'method': 'css',
                'selector': 'div[id="2"] ul li ::text',
            }
            before_info_sel = {
                'method': 're',
                'selector': '(.*)\$',
            }
            video_play_url_sel = {
                'method': 're',
                'selector': '\$(.*)',
            }
            raw_items = parse_field(
                parser=m3u8_li_sel,
                target_obj=body,
                is_print_error=False,
                is_first=False,
            )
            parsed = []
            for raw in raw_items:
                try:
                    prefix = parse_field(
                        parser=before_info_sel,
                        target_obj=raw,
                        is_print_error=False,
                    )
                    # prefix (before_info) may legitimately be empty
                    play_url = parse_field(
                        parser=video_play_url_sel,
                        target_obj=raw,
                        is_print_error=False,
                    )
                    assert play_url != ''
                except AssertionError:
                    continue

                parsed.append({
                    'before_info': prefix,
                    'video_play_url': play_url,
                })

            print('[{}] ok_video_id: {}'.format(
                '+' if parsed != [] else '-',
                ok_video_id,
            ))

            return {
                'ok_video_id': ok_video_id,
                'm3u8_li': parsed,
            }
コード例 #4
0
ファイル: proxy_tasks.py プロジェクト: guanbinbin/fz_ip_pool
        def _get_port(**kwargs) -> str:
            """Pull the (non-empty) port field out of a proxy-table row."""
            row = kwargs['tr']
            selector = kwargs['port_selector']

            port = parse_field(parser=selector, target_obj=row)
            assert port != '', 'port为空值!'
            return port
コード例 #5
0
        def parse_body() -> dict:
            """
            解析
            :return:
            """
            nonlocal body

            # 多规格的最低价
            # 会员价yx 部分显示错误, 改用big 价格加上优点, 两个一起用
            tb_price_sel = {
                'method': 'css',
                'selector': 'div.goodsPriceTips span:nth-child(2) ::text',
            }
            big_price_sel = {
                'method': 'css',
                'selector': 'div.goodsPrice big ::text',
            }
            yd_sel = {
                'method': 're',
                'selector': '<span class=\"yiudianPrice\">\+(\d+)优点</span>'
            }
            tb_price0 = parse_field(
                parser=tb_price_sel,
                target_obj=body,
            )
            assert tb_price0 != ''
            big_price = parse_field(
                parser=big_price_sel,
                target_obj=body,
            )
            assert big_price != ''
            yd = parse_field(
                parser=yd_sel,
                target_obj=body,
            )
            assert yd != ''
            # 会员价
            tb_price0 = float(tb_price0).__round__(2)
            # 优点价
            tb_price1 = (float(big_price) + float(yd) / 100).__round__(2)

            return {
                'goods_id': goods_id,
                'tb_price0': tb_price0,
                'tb_price1': tb_price1,
            }
コード例 #6
0
    async def _get_all_brand_name_and_brand_id(self) -> list:
        """
        Fetch every brand name together with its corresponding brand_id.
        :return: list of {'brand_name': ..., 'brand_id': ...}
        """
        headers = self.get_random_phone_headers()
        headers.update({
            'authority': 'car.m.autohome.com.cn',
            'referer': 'https://car.autohome.com.cn/',
        })
        url = 'https://car.m.autohome.com.cn/'
        body = await unblock_request(
            url=url,
            headers=headers,
            ip_pool_type=self.ip_pool_type,
            num_retries=self.num_retries,
        )
        name_sel = {
            'method': 'css',
            'selector': 'div#div_ListBrand ul li div span ::text',
        }
        id_sel = {
            'method': 'css',
            'selector': 'div#div_ListBrand ul li div ::attr("v")',
        }
        names = parse_field(
            parser=name_sel,
            target_obj=body,
            is_first=False,
        )
        ids = parse_field(
            parser=id_sel,
            target_obj=body,
            is_first=False,
        )

        # Names and ids come from parallel selectors; pair them positionally.
        return [{
            'brand_name': brand_name,
            'brand_id': brand_id,
        } for brand_name, brand_id in zip(names, ids)]
コード例 #7
0
def _get_ng_one_type_company_id_list_task(self,
                                          ip_pool_type,
                                          keyword,
                                          page_num,
                                          company_item_id_selector,
                                          num_retries=8,
                                          timeout=15) -> list:
    """
    Fetch every company_id for one ng keyword on one page num (m-site search).
    :param self:
    :param ip_pool_type: proxy pool to route the request through
    :param keyword: search keyword
    :param page_num: page number to load
    :param company_item_id_selector: parse_field selector for the company ids
    :param num_retries: request retry count
    :param timeout: request timeout in seconds
    :return: list of {'company_id': ...} dicts (order not guaranteed)
    """
    headers = get_random_headers(
        user_agent_type=1,
        connection_status_keep_alive=True,)
    headers.update({
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'accept': '*/*',
        # 'Referer': 'http://m.nanguo.cn/search/?q=%E6%88%91&l=zh-CN',
        'X-Requested-With': 'XMLHttpRequest',
    })
    params = (
        ('q', str(keyword)),
        ('l', 'zh-CN'),
        ('loadmore', 'true'),
        ('p', str(page_num)),
    )
    url = 'http://m.nanguo.cn/search/index/'
    body = Requests.get_url_body(
        url=url,
        headers=headers,
        params=params,
        ip_pool_type=ip_pool_type,
        num_retries=num_retries,
        timeout=timeout,)
    # lg.info(body)
    # set() dedupes ids; NOTE(review): the resulting order is nondeterministic
    company_item_id_list = list(set(parse_field(
        parser=company_item_id_selector,
        target_obj=body,
        is_first=False,
        logger=lg)))
    # pprint(company_item_list)
    company_item_list = [{
        'company_id': item,
    } for item in company_item_id_list]
    lg.info('[{}] keyword: {}, page_num: {}'.format(
        '+' if company_item_list != [] else '-',
        keyword,
        page_num,))
    # collect() — presumably gc.collect, freeing memory between tasks (TODO confirm)
    collect()

    return company_item_list
コード例 #8
0
        def parse_video_name():
            """Extract the video title; it must be non-empty."""
            name = parse_field(
                parser=self.parser_obj['video_info']['video_name'],
                target_obj=body,
                logger=self.lg,
            )
            assert name != ''
            return name
コード例 #9
0
ファイル: utils.py プロジェクト: Pointvar/python
def _get_ip(parser, target_obj) -> str:
    """
    Extract the ip address from target_obj.
    :param parser: selector dict for parse_field
    :param target_obj: object to parse
    :return: non-empty ip string
    """
    result = parse_field(parser=parser, target_obj=target_obj)
    assert result != '', '获取到的ip为空值!'
    return result
コード例 #10
0
ファイル: proxy_tasks.py プロジェクト: guanbinbin/fz_ip_pool
        def _get_ip_type(**kwargs) -> str:
            """Resolve the ip_type for a proxy-table row (currently always 'http')."""
            tr = kwargs['tr']
            ip_type_selector = kwargs['ip_type_selector']

            # Field is parsed but may be empty; its value is ignored below.
            parse_field(parser=ip_type_selector, target_obj=tr)
            # Previous mapping ('http' if value == 'HTTP' else 'https') is
            # disabled — every row is treated as 'http'.

            return 'http'
コード例 #11
0
ファイル: utils.py プロジェクト: Pointvar/python
def _get_port(parser, target_obj):
    """
    Extract the proxy port from target_obj.
    :param parser: selector dict for parse_field
    :param target_obj: object to parse
    :return: non-empty port string
    """
    result = parse_field(parser=parser, target_obj=target_obj)
    assert result != '', '获取到的port为空值!'
    return result
コード例 #12
0
        def parse_dislike_num():
            """Extract the dislike count; non-numeric/missing values fall back to 0."""
            dislike_num = parse_field(
                parser=self.parser_obj['video_info']['dislike_num'],
                target_obj=body,
                logger=self.lg,
            )

            # Bare except replaced: only int() conversion failures should
            # fall back, not e.g. KeyboardInterrupt.
            try:
                dislike_num = int(dislike_num)
            except (TypeError, ValueError):
                dislike_num = 0

            return dislike_num
コード例 #13
0
        def parse_collected_num():
            """Extract the favorites count; non-numeric/missing values fall back to 0."""
            collected_num = parse_field(
                parser=self.parser_obj['video_info']['collected_num'],
                target_obj=body,
                logger=self.lg,
            )

            # Bare except replaced: only int() conversion failures should
            # fall back, not e.g. KeyboardInterrupt.
            try:
                collected_num = int(collected_num)
            except (TypeError, ValueError):
                collected_num = 0

            return collected_num
コード例 #14
0
        def parse_like_num():
            """Extract the like count; non-numeric/missing values fall back to 0."""
            like_num = parse_field(
                parser=self.parser_obj['video_info']['like_num'],
                target_obj=body,
                logger=self.lg,
            )

            # Bare except replaced: only int() conversion failures should
            # fall back, not e.g. KeyboardInterrupt.
            try:
                like_num = int(like_num)
            except (TypeError, ValueError):
                like_num = 0

            return like_num
コード例 #15
0
ファイル: ealover_spider.py プロジェクト: yfeng2018/python-1
        def parse(body) -> list:
            """Parse title/url/desc triples out of the list-page body."""
            title_sel = {
                'method': 'css',
                'selector': 'p.list_tit a ::text',
            }
            url_sel = {
                'method': 'css',
                'selector': 'p.list_tit a ::attr("href")',
            }
            desc_sel = {
                'method': 'css',
                'selector': 'p.desc.left ::text',
            }
            titles = parse_field(
                parser=title_sel,
                target_obj=body,
                is_first=False,
            )
            urls = parse_field(
                parser=url_sel,
                target_obj=body,
                is_first=False,
            )
            descs = parse_field(
                parser=desc_sel,
                target_obj=body,
                is_first=False,
            )

            # The three selectors yield parallel lists; pair them positionally.
            return [{
                'title': title,
                'url': url,
                'desc': desc,
            } for title, url, desc in zip(titles, urls, descs)]
コード例 #16
0
ファイル: utils.py プロジェクト: yfeng2018/python-1
def unblock_judge_ip_is_anonymity(ip_address='',
                                  port=0,
                                  httpbin=True,
                                  use_proxy=True,
                                  timeout=10,
                                  logger=None,) -> str:
    """
    Blocking check that returns the ip address the remote side sees.
    :param ip_address: proxy ip to route through
    :param port: proxy port
    :param httpbin: True -> query httpbin.org/get, else whatismybrowser.com
    :param use_proxy: whether to actually route through the proxy
    :param timeout: request timeout in seconds
    :param logger: accepted but not used in this function body
    :return: detected ip string
    """
    def _get_proxies():
        # Build the requests-style proxies mapping.
        return {
            # An 'http' entry here would expose the original address
            # 'http': ip_address + ':' + str(port),
            'https': ip_address + ':' + str(port),
        }

    url = 'https://www.whatismybrowser.com/' if not httpbin else 'https://www.httpbin.org/get'
    headers = get_random_headers(user_agent_type=1,)
    proxies = _get_proxies() if use_proxy else {}
    body = Requests.get_url_body(
        url=url,
        headers=headers,
        use_proxy=use_proxy,
        proxies=proxies,
        timeout=timeout,
        verify=False,)
    # print(body)

    if not httpbin:
        # Scrape the detected ip out of the html page
        now_ip_selector = {
            'method': 'css',
            # 'selector': 'div#ip-address:nth-child(2) .detected-column a:nth-child(1) ::text',
            'selector': 'div#ip-address.detection-block .detected-column a:nth-child(1) ::text',
        }
        now_ip = parse_field(
            parser=now_ip_selector,
            target_obj=body,
            is_first=True,)

    else:
        # httpbin returns json; the caller's ip is in the 'origin' field
        now_ip = json_2_dict(
            json_str=body,
            default_res={},).get('origin', '')

    return now_ip
コード例 #17
0
ファイル: ali_1688_parse.py プロジェクト: devyru/python
        def add_div_desc(data1) -> dict:
            """Attach the detail-description url (div_desc) to data1."""
            div_desc_selector = {
                'method': 're',
                'selector': '\"detailUrl\": \"(.*?)\"}<',
            }
            div_desc_url = parse_field(
                parser=div_desc_selector,
                target_obj=body,
                logger=self.lg,
            )
            # self.lg.info(div_desc_url)
            assert div_desc_url != '', 'div_desc_url为空值!'
            data1.update({'detailUrl': div_desc_url})

            return data1
コード例 #18
0
ファイル: utils.py プロジェクト: Pointvar/python
def _get_ori_proxy_list(parser, target_obj) -> list:
    """
    Extract the origin proxy_list from target_obj.
    :param parser: selector dict for parse_field
    :param target_obj: object to parse
    :return: non-empty proxy list
    """
    # print(target_obj)
    result = parse_field(parser=parser, target_obj=target_obj)
    assert result != [], 'proxy_list为空list!'

    return result
コード例 #19
0
ファイル: proxy_tasks.py プロジェクト: guanbinbin/fz_ip_pool
        def _get_ip(**kwargs) -> str:
            """Extract and sanitize the ip address from a proxy-table row."""
            tr = kwargs['tr']
            ip_selector = kwargs['ip_selector']

            script_pat = re.compile(r'<script .*?</script>')
            digit_pat = re.compile(r'\d+')
            addr_pat = re.compile(r'\d+\.\d+\.\d+\.\d+')

            ip = parse_field(parser=ip_selector, target_obj=tr)
            assert ip != '', 'ip为空值!'
            # Some sites inject obfuscating <script> tags into the cell
            ip = script_pat.sub('', ip)
            if digit_pat.findall(ip) == []:
                # The cell does not contain an ip address at all
                raise NotIpException

            lg.info(str(ip))
            ip = addr_pat.findall(ip)[0]
            assert ip != '', 'ip为空值!'

            return ip
コード例 #20
0
    def _get_replenishment_status(self, goods_id, body) -> bool:
        """
        Report whether goods_id is out of stock (or delisted).
        :param goods_id: goods id (kept for interface compatibility; unused here)
        :param body: page html
        :return: True when out of stock / delisted
        """
        # Detect delisted or out-of-stock state.
        # 'bfd_stock' is '1' when in stock, '0' while restocking.
        is_replenishment_status_text_sel = {
            'method': 're',
            'selector': 'bfd_stock: (\d+) ,',
        }
        is_replenishment_status_text = parse_field(
            parser=is_replenishment_status_text_sel,
            target_obj=body,
            is_first=True,
            is_print_error=False,
        )
        is_replenishment_status_text_sel2 = {
            'method': 'css',
            'selector': 'div.btn_disabled ::text',
        }
        is_replenishment_status_text2 = parse_field(
            parser=is_replenishment_status_text_sel2,
            target_obj=body,
            is_first=True,
            is_print_error=False,
        )
        # Idiom fix: return the boolean expression directly instead of an
        # if/else that returns True/False.
        return (is_replenishment_status_text == '0'
                or is_replenishment_status_text2 == '商品已经下架了~')
コード例 #21
0
        def add_all_img_list(data1) -> dict:
            """Attach the sample-image list to data1."""
            all_img_list_selector = {
                'method': 'css',
                'selector': 'div#J_Detail_ImageSlides div.swipe-pane img ::attr("swipe-lazy-src")',
            }
            all_img_list = parse_field(
                parser=all_img_list_selector,
                target_obj=body,
                logger=self.lg,
                is_first=False,
            )
            # Message fixed: it previously read "is not an empty list" — the
            # opposite of the failure being reported.
            assert all_img_list != [], 'all_img_list为空list!'
            data1.update({
                'imageList': [{'originalImageURI': img_url} for img_url in all_img_list],
            })

            return data1
コード例 #22
0
    def judge_qyh_is_tb_by_goods_id(self, goods_id):
        """
        Decide from a goods id whether the item is a tb or tm product.
        :param goods_id:
        :return: 0 tb | 1 tm | -1 unknown
        """
        headers = get_random_headers(
            connection_status_keep_alive=False,
            cache_control='',
        )
        headers.update({
            'authority': 'www.quanyoubuy.com',
        })
        url = 'https://www.quanyoubuy.com/item/index/iid/{}.html'.format(goods_id)
        body = Requests.get_url_body(
            url=url,
            headers=headers,
            ip_pool_type=self.ip_pool_type,
            proxy_type=PROXY_TYPE_HTTPS,
            num_retries=7,)
        assert body != ''

        btn_text_sel = {
            'method': 'css',
            'selector': 'div.product-info a.go_btn span ::text',
        }
        btn_text = parse_field(
            parser=btn_text_sel,
            target_obj=body,
            is_print_error=False,
            logger=self.lg,
        )
        # self.lg.info(btn_text)
        assert btn_text != ''

        # The buy-button label names the destination platform.
        if '天猫' in btn_text:
            self.lg.info('goods_id: {}, tm good'.format(goods_id))
            return 1
        if '淘宝' in btn_text:
            self.lg.info('goods_id: {}, tb good'.format(goods_id))
            return 0

        self.lg.info('goods_id: {}, 未知 good'.format(goods_id))
        return -1
コード例 #23
0
    def get_pc_tb_sort_keywords_list(self) -> list:
        """
        Fetch the pc tb main-category keywords.
        :return: list of keyword strings
        """
        # Keep the stored amount small to avoid a large update load later on.
        headers = get_random_headers(
            connection_status_keep_alive=False,
            cache_control='',
        )
        body = Requests.get_url_body(
            url='https://www.taobao.com/',
            headers=headers,
            ip_pool_type=self.ip_pool_type,
            num_retries=self.req_num_retries,
            proxy_type=PROXY_TYPE_HTTPS,)
        assert body != ''
        # self.lg.info(body)

        # Only the main-category keywords are collected.
        main_sort_key_list_sel = {
            'method': 'css',
            'selector': 'ul.service-bd li a ::text',
        }
        main_sort_list_key = parse_field(
            parser=main_sort_key_list_sel,
            target_obj=body,
            is_first=False,
            logger=self.lg,
        )
        # pprint(main_sort_list_key)

        # Categories we do not want
        not_need_main_sort_key_tuple = (
            '卡券',
            '本地服务',
            'DIY',
            '二手车',
            '生鲜',
            '鲜花',
        )
        # Fixed: the comprehension already yields a list; the previous
        # list(tuple([...])) double conversion was redundant.
        return [item for item in main_sort_list_key
                if item not in not_need_main_sort_key_tuple]
コード例 #24
0
        def add_p_info(data1) -> dict:
            """Attach the product attribute (name/value) list to data1."""
            p_info_selector = {
                'method': 'css',
                'selector': 'span.detail-attribute-item ::text',
            }
            p_info = parse_field(
                parser=p_info_selector,
                target_obj=body,
                logger=self.lg,
                is_first=False,
            )
            assert p_info != [], 'p_info为空list!'
            # split(':', 1): only split on the first ':' so attribute values
            # that themselves contain ':' are not truncated.
            p_info = [{
                'name': i.split(':', 1)[0],
                'unit': None,
                'value': i.split(':', 1)[1],
            } for i in p_info]
            data1.update({
                'productFeatureList': p_info,
            })

            return data1
コード例 #25
0
ファイル: test_driver.py プロジェクト: yfeng2018/python-1
def test_driver_change_proxy():
    """
    Test dynamic proxy switching on the browser driver.
    :return:
    """
    d = BaseDriver(
        # These driver types work:
        type=PHANTOMJS,
        executable_path=PHANTOMJS_DRIVER_PATH,
        # type=FIREFOX,
        # executable_path=FIREFOX_DRIVER_PATH,

        # Ineffective (proxy switching does not take effect):
        # type=CHROME,
        # executable_path=CHROME_DRIVER_PATH,
        headless=True,
        driver_use_proxy=True,
        ip_pool_type=tri_ip_pool,
    )
    origin_ip_sel = {'method': 're', 'selector': '\"origin\": \"(.*?)\",'}
    url = 'https://httpbin.org/get'
    # url = 'https://www.baidu.com'

    # Fetch the page several times, switching proxy and user agent each round.
    for index in range(0, 5):
        body = d.get_url_body(
            url=url,
            timeout=20,
            change_proxy=True,
            change_user_agent=True,
        )
        if 'httpbin' in url:
            # httpbin echoes the caller's ip in the 'origin' json field
            origin_ip = parse_field(
                parser=origin_ip_sel,
                target_obj=body,
            )
            print('origin_ip: {}'.format(origin_ip))
        else:
            print(body)

    # Release the driver; deletion may raise if startup failed
    try:
        del d
    except:
        pass
コード例 #26
0
ファイル: kaola_parse.py プロジェクト: yfeng2018/python-1
    def _get_parent_dir(self, body) -> str:
        """
        Extract the parent_dir category path.
        :param body: page html
        :return: e.g. '面部清洁/洁面/丝芙兰'
        """
        parent_dir_selector = {
            'method': 're',
            'selector': 'category: \'(.*?)\',',
        }
        # self.lg.info(body)
        raw_category = parse_field(
            parser=parent_dir_selector,
            target_obj=body,
            is_first=True,
            logger=self.lg,
        )
        # The category string is '-'-joined; convert it to a '/'-joined path.
        return '/'.join(raw_category.split('-'))
コード例 #27
0
        def parse_video_url():
            """Extract the playable video url, normalizing it per site."""
            video_url = parse_field(
                parser=self.parser_obj['video_info']['video_url'],
                target_obj=body,
                logger=self.lg,
            )
            assert video_url != ''

            if short_name == 'n15':
                # Use http: some addresses fail to display over https
                if video_url != '':
                    video_url = 'http:' + video_url
                else:
                    video_url = ''
            elif short_name == '8xs':
                # eg: video_url: 'db9b743f134910dbc697fc9f1513428c/index.m3u8'
                # eg: https://8xche.com/v/da054a7415135d9c5d602baf027ff206/index.m3u8
                if video_url != '':
                    video_url = 'https://8xche.com/v/' + video_url
                else:
                    video_url = ''

            return video_url
コード例 #28
0
    def get_cpolar_url(self) -> str:
        """
        Fetch a fresh cpolar_url from the dashboard status page.
        :return: the cpolar url string
        """
        headers = get_random_headers(cache_control='', )
        headers.update({
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-User': '******',
            'accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            'Sec-Fetch-Site': 'same-origin',
            'referer': 'https://dashboard.cpolar.com/get-started',
        })
        body = Requests.get_url_body(
            url='https://dashboard.cpolar.com/status',
            headers=headers,
            ip_pool_type=self.ip_pool_type,
            num_retries=self.req_num_retries,
            proxy_type=PROXY_TYPE_HTTPS,
            _session=self._s,
            verify=False,
        )
        assert body != ''
        # print(body)

        # First link in the status table header is the tunnel url
        cpolar_url_sel = {
            'method': 'css',
            'selector': 'th a:nth-child(1) ::text',
        }
        cpolar_url = parse_field(
            parser=cpolar_url_sel,
            target_obj=body,
        )
        print('cpolar_url: {}'.format(cpolar_url))

        return cpolar_url
コード例 #29
0
    def get_wkb_search_res(self, k: str, default_sort_value: int=None) -> dict:
        """
        Search wangkebang (网课帮) for a question.
        :param k: question text / search keyword
        :param default_sort_value: passed through as 'page_num', used to sort single results
        :return: {'k', 'page_num', 'res'}
        """
        headers = get_random_headers(
            user_agent_type=1,
            connection_status_keep_alive=False,)
        headers.update({
            'Proxy-Connection': 'keep-alive',
            'Origin': 'http://wangkebang.cn',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Referer': 'http://wangkebang.cn/m/',
        })

        data = {
            'w': k,
        }
        body = Requests.get_url_body(
            method='post',
            url='http://wangkebang.cn/m/',
            headers=headers,
            # cookies=cookies,
            data=data,
            verify=False,
            ip_pool_type=self.ip_pool_type,
            proxy_type=PROXY_TYPE_HTTPS,
            num_retries=self.req_num_retries,
            timeout=self.req_timeout,)
        assert body != ''
        # self.lg.info(body)

        # The site returns only a single answer
        question_item_sel = {
            'method': 'css',
            'selector': 'div.layui-card-body span',
        }
        question_item = parse_field(
            parser=question_item_sel,
            target_obj=body,
            is_first=False,
            logger=self.lg,
        )
        assert question_item != []

        question_desc_div_sel = {
            'method': 'css',
            'selector': 'span strong',
        }
        answer_div_sel = {
            'method': 'css',
            'selector': 'span strong',
        }

        # Holds the single question/answer pair parsed below
        one_res = {}
        for index, item in enumerate(question_item):
            if index == 0:
                # First block: the question description
                try:
                    question_desc_div = parse_field(
                        parser=question_desc_div_sel,
                        target_obj=item,
                        logger=self.lg,
                    )
                    assert question_desc_div != ''
                    # Strip wrapper markup / prefixes from the raw html
                    question_desc = fix_text(wash_sensitive_info(
                        data=question_desc_div,
                        replace_str_list=[],
                        add_sensitive_str_list=[
                            '<span .*?>',
                            '</span>',
                            '<strong>',
                            '</strong>',
                            '题目\:',
                        ],
                        is_default_filter=False,
                        is_lower=False,
                    ))
                except Exception:
                    continue

                one_res['question_desc'] = question_desc

            elif index == 1:
                # Second block: the answer
                try:
                    answer_div = parse_field(
                        parser=answer_div_sel,
                        target_obj=item,
                        logger=self.lg,
                    )
                    assert answer_div != ''
                    # Strip wrapper markup / prefixes from the raw html
                    answer = fix_text(wash_sensitive_info(
                        data=answer_div,
                        replace_str_list=[],
                        add_sensitive_str_list=[
                            '<span .*?>',
                            '</span>',
                            '<strong>',
                            '</strong>',
                            '答案\:',
                        ],
                        is_default_filter=False,
                        is_lower=False,
                    ))
                except Exception:
                    continue

                one_res['answer'] = answer

            else:
                continue

        res = []
        ask_questions_result_item = AskQuestionsResultItem()
        # NOTE(review): if the parses above failed, one_res lacks these keys
        # and the lookups below raise KeyError — confirm this is intended.
        ask_questions_result_item['question_desc'] = one_res['question_desc']
        ask_questions_result_item['answer'] = one_res['answer']
        res.append(dict(ask_questions_result_item))

        self.lg.info('[{}] wkb, k: {}'.format(
            '+' if res != [] else '-',
            k,
        ))

        return {
            'k': k,
            'page_num': default_sort_value,           # used to sort single results
            'res': res,
        }
コード例 #30
0
    def get_finer_search_res(self, k: str, page_num: int) -> dict:
        """
        Search finerit (凡尔搜题) for questions.
        :param k: search keyword
        :param page_num: page number, starting at 0
        :return: {'k', 'page_num', 'res'}
        """
        headers = get_random_headers(cache_control='')
        headers.update({
            # 'Referer': 'https://www.finerit.com/tiku/search/?q=%E7%A4%BE%E4%BC%9A%E4%B8%BB%E4%B9%89&p=0',
            'Referer': 'https://www.finerit.com/',
        })
        params = (
            ('q', k),
            ('p', str(page_num)),
            # ('s_type', 'erya'),
        )
        # TODO their site may be in use by others too; it occasionally hangs
        body = Requests.get_url_body(
            url='https://www.finerit.com/tiku/search/',
            headers=headers,
            params=params,
            # cookies=cookies,
            ip_pool_type=self.ip_pool_type,
            proxy_type=PROXY_TYPE_HTTPS,
            num_retries=self.req_num_retries,
            timeout=self.req_timeout,         # testing showed ~10s is fast with an acceptable success rate
        )
        assert body != ''
        # self.lg.info(body)

        question_item_sel = {
            'method': 'css',
            'selector': 'div.resultItem',
        }
        question_desc_div_sel = {
            'method': 'css',
            'selector': 'div.itemHead a',
        }
        answer_div_sel = {
            'method': 'css',
            'selector': 'div.itemBody',
        }
        question_item = parse_field(
            parser=question_item_sel,
            target_obj=body,
            is_first=False,
            logger=self.lg,
        )
        assert question_item != []

        res = []
        for item in question_item:
            # Items arrive in page order
            try:
                question_desc_div = parse_field(
                    parser=question_desc_div_sel,
                    target_obj=item,
                    logger=self.lg,
                )
                assert question_desc_div != ''
                answer_div = parse_field(
                    parser=answer_div_sel,
                    target_obj=item,
                    logger=self.lg,
                )
                assert answer_div != ''
                # Strip wrapper markup / prefixes from the raw html
                question_desc = fix_text(wash_sensitive_info(
                    data=question_desc_div,
                    replace_str_list=[],
                    add_sensitive_str_list=[
                        '<div class=\"itemHead\">',
                        '</div>',
                        '<a .*?>',
                        '</a>',
                        '<span .*?>',
                        '</span>',
                    ],
                    is_default_filter=False,
                    is_lower=False,
                ))
                answer = fix_text(wash_sensitive_info(
                    data=answer_div,
                    replace_str_list=[],
                    add_sensitive_str_list=[
                        '<div class=\"itemBody\">',
                        '</div>',
                        '<p .*?>',
                        '</p>',
                        '答案:',
                    ],
                    is_default_filter=False,
                    is_lower=False,
                ))
            except Exception:
                continue

            ask_questions_result_item = AskQuestionsResultItem()
            ask_questions_result_item['question_desc'] = question_desc
            ask_questions_result_item['answer'] = answer
            res.append(dict(ask_questions_result_item))

        self.lg.info('[{}] k: {}, page_num: {}'.format(
            '+' if res != [] else '-',
            k,
            page_num,
        ))

        return {
            'k': k,
            'page_num': page_num,
            'res': res,
        }