def parse_static_img_url():
    """Resolve the video's static cover-image url for the current site.

    Reads ``self``, ``body`` and ``short_name`` from the enclosing scope.
    Returns '' when no cover image can be derived.
    """
    cover_url = parse_field(
        parser=self.parser_obj['video_info']['static_img_url'],
        target_obj=body,
        logger=self.lg,
    )
    if short_name == 'n15':
        # protocol-relative url -> absolute https url
        return 'https:' + cover_url if cover_url != '' else ''

    if short_name == '8xs' and cover_url != '':
        # eg: 'db9b743f134910dbc697fc9f1513428c/index.m3u8' or '8e774a4bf7a697dd00935023a401eec8/v.m3u8'
        # derive the static cover image from the m3u8 address
        video_id = parse_field(
            parser={
                'method': 're',
                'selector': '(\w+)/\w+\.m3u8',
            },
            target_obj=cover_url,
            logger=self.lg,
        )
        return 'https://8xcha.com/p/{}.jpg'.format(video_id) if video_id != '' else ''

    return cover_url
def parse_body() -> dict: """ 解析 :return: """ nonlocal body # 多规格的最低价 # 会员价yx 部分显示错误, 改用big 价格加上优点, 两个一起用 tb_price_sel = { 'method': 'css', 'selector': 'div.goodsPriceTips span:nth-child(2) ::text', } big_price_sel = { 'method': 'css', 'selector': 'div.goodsPrice big ::text', } yd_sel = { 'method': 're', 'selector': '<span class=\"yiudianPrice\">\+(\d+)优点</span>' } # 根据sharetitle的描述价, 但是由于没有cookie就会显示的是新人价(所以意义不大) normal_price_sel = { 'method': 're', # 'selector': '我在.*?(\d+\.\d+)元抢到这个超值商品', 'selector': 'var goodsmemberPrice = \'(.*?)\';', } tb_price0 = parse_field( parser=tb_price_sel, target_obj=body, ) assert tb_price0 != '' big_price = parse_field( parser=big_price_sel, target_obj=body, ) assert big_price != '' yd = parse_field( parser=yd_sel, target_obj=body, ) assert yd != '' normal_price = parse_field( parser=normal_price_sel, target_obj=body, ) assert normal_price != '' # 会员价 tb_price0 = float(tb_price0).__round__(2) # 优点价 tb_price1 = (float(big_price) + float(yd) / 100).__round__(2) # 常规价 tb_price2 = float(normal_price).__round__(2) return { 'goods_id': goods_id, 'tb_price0': tb_price0, 'tb_price1': tb_price1, 'tb_price2': tb_price2, }
def parse(ok_video_id, body) -> dict:
    """
    Parse the m3u8 playlist entries for one video.

    :param ok_video_id: video identifier, echoed back in the result
    :param body: html body to parse
    :return: {'ok_video_id': ..., 'm3u8_li': [{'before_info', 'video_play_url'}, ...]}
    """
    m3u8_li_sel = {
        'method': 'css',
        'selector': 'div[id="2"] ul li ::text',
    }
    prefix_sel = {
        'method': 're',
        'selector': '(.*)\$',
    }
    play_url_sel = {
        'method': 're',
        'selector': '\$(.*)',
    }
    raw_items = parse_field(
        parser=m3u8_li_sel,
        target_obj=body,
        is_print_error=False,
        is_first=False,
    )
    # pprint(raw_items)
    parsed_items = []
    for raw in raw_items:
        # text before the '$' separator (may legitimately be empty)
        prefix = parse_field(
            parser=prefix_sel,
            target_obj=raw,
            is_print_error=False,
        )
        # text after the '$' separator -- the playable url; skip entries without one
        play_url = parse_field(
            parser=play_url_sel,
            target_obj=raw,
            is_print_error=False,
        )
        if play_url == '':
            continue
        parsed_items.append({
            'before_info': prefix,
            'video_play_url': play_url,
        })
    print('[{}] ok_video_id: {}'.format(
        '+' if parsed_items != [] else '-',
        ok_video_id,
    ))
    return {
        'ok_video_id': ok_video_id,
        'm3u8_li': parsed_items,
    }
def _get_port(**kwargs) -> str:
    """Extract the proxy port from a table row.

    :param kwargs: expects ``tr`` (target node) and ``port_selector``
    :return: the parsed port string
    :raises AssertionError: when the parsed port is empty
    """
    parsed_port = parse_field(
        parser=kwargs['port_selector'],
        target_obj=kwargs['tr'],
    )
    assert parsed_port != '', 'port为空值!'
    return parsed_port
def parse_body() -> dict: """ 解析 :return: """ nonlocal body # 多规格的最低价 # 会员价yx 部分显示错误, 改用big 价格加上优点, 两个一起用 tb_price_sel = { 'method': 'css', 'selector': 'div.goodsPriceTips span:nth-child(2) ::text', } big_price_sel = { 'method': 'css', 'selector': 'div.goodsPrice big ::text', } yd_sel = { 'method': 're', 'selector': '<span class=\"yiudianPrice\">\+(\d+)优点</span>' } tb_price0 = parse_field( parser=tb_price_sel, target_obj=body, ) assert tb_price0 != '' big_price = parse_field( parser=big_price_sel, target_obj=body, ) assert big_price != '' yd = parse_field( parser=yd_sel, target_obj=body, ) assert yd != '' # 会员价 tb_price0 = float(tb_price0).__round__(2) # 优点价 tb_price1 = (float(big_price) + float(yd) / 100).__round__(2) return { 'goods_id': goods_id, 'tb_price0': tb_price0, 'tb_price1': tb_price1, }
async def _get_all_brand_name_and_brand_id(self) -> list:
    """
    Fetch every car brand name together with its brand_id.

    :return: list of {'brand_name': ..., 'brand_id': ...} dicts
    """
    headers = self.get_random_phone_headers()
    headers.update({
        'authority': 'car.m.autohome.com.cn',
        'referer': 'https://car.autohome.com.cn/',
    })
    url = 'https://car.m.autohome.com.cn/'
    page_html = await unblock_request(
        url=url,
        headers=headers,
        ip_pool_type=self.ip_pool_type,
        num_retries=self.num_retries,
    )
    name_sel = {
        'method': 'css',
        'selector': 'div#div_ListBrand ul li div span ::text',
    }
    id_sel = {
        'method': 'css',
        'selector': 'div#div_ListBrand ul li div ::attr("v")',
    }
    names = parse_field(
        parser=name_sel,
        target_obj=page_html,
        is_first=False,
    )
    ids = parse_field(
        parser=id_sel,
        target_obj=page_html,
        is_first=False,
    )
    # pair names with ids positionally
    return [
        {'brand_name': brand_name, 'brand_id': brand_id}
        for brand_name, brand_id in zip(names, ids)
    ]
def _get_ng_one_type_company_id_list_task(self,
                                          ip_pool_type,
                                          keyword,
                                          page_num,
                                          company_item_id_selector,
                                          num_retries=8,
                                          timeout=15) -> list:
    """
    Fetch every company_id on one result page of an ng keyword search (m-site).

    :param self:
    :param ip_pool_type: proxy pool to route the request through
    :param keyword: search keyword
    :param page_num: result page number
    :param num_retries: request retry count
    :param timeout: request timeout in seconds
    :return: list of {'company_id': ...} dicts (deduplicated, order not preserved)
    """
    headers = get_random_headers(
        user_agent_type=1,
        connection_status_keep_alive=True,)
    headers.update({
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'accept': '*/*',
        # 'Referer': 'http://m.nanguo.cn/search/?q=%E6%88%91&l=zh-CN',
        # mimic the site's ajax "load more" request
        'X-Requested-With': 'XMLHttpRequest',
    })
    params = (
        ('q', str(keyword)),
        ('l', 'zh-CN'),
        ('loadmore', 'true'),
        ('p', str(page_num)),
    )
    url = 'http://m.nanguo.cn/search/index/'
    body = Requests.get_url_body(
        url=url,
        headers=headers,
        params=params,
        ip_pool_type=ip_pool_type,
        num_retries=num_retries,
        timeout=timeout,)
    # lg.info(body)
    # set() dedupes the parsed ids before wrapping them into dicts
    company_item_id_list = list(set(parse_field(
        parser=company_item_id_selector,
        target_obj=body,
        is_first=False,
        logger=lg)))
    # pprint(company_item_list)
    company_item_list = [{
        'company_id': item,
    } for item in company_item_id_list]
    lg.info('[{}] keyword: {}, page_num: {}'.format(
        '+' if company_item_list != [] else '-',
        keyword,
        page_num,))
    # explicit gc pass between page tasks
    collect()
    return company_item_list
def parse_video_name():
    """Parse the video title from body (closure vars: self, body).

    :raises AssertionError: when no title could be parsed
    """
    name = parse_field(
        parser=self.parser_obj['video_info']['video_name'],
        target_obj=body,
        logger=self.lg,
    )
    assert name != ''
    return name
def _get_ip(parser, target_obj) -> str:
    """
    Parse the ip address out of target_obj.

    :param parser: selector dict understood by parse_field
    :param target_obj: object to parse
    :raises AssertionError: when the parsed ip is empty
    :return: the ip address string
    """
    ip_address = parse_field(parser=parser, target_obj=target_obj)
    assert ip_address != '', '获取到的ip为空值!'
    return ip_address
def _get_ip_type(**kwargs) -> str:
    """Parse a proxy row's scheme; currently hard-wired to 'http'.

    :param kwargs: expects ``tr`` (row node) and ``ip_type_selector``
    """
    ip_type = parse_field(
        parser=kwargs['ip_type_selector'],
        target_obj=kwargs['tr'],
    )
    # the parsed value may be empty and is deliberately ignored:
    # every proxy is treated as plain http
    # assert ip_type != '', 'ip_type为空值!'
    # return 'http' if ip_type == 'HTTP' else 'https'
    return 'http'
def _get_port(parser, target_obj):
    """
    Parse the proxy port out of target_obj.

    :param parser: selector dict understood by parse_field
    :param target_obj: object to parse
    :raises AssertionError: when the parsed port is empty
    :return: the port string
    """
    parsed = parse_field(parser=parser, target_obj=target_obj)
    assert parsed != '', '获取到的port为空值!'
    return parsed
def parse_dislike_num():
    """Parse the video's dislike count from body (closure vars: self, body).

    :return: dislike count as int; 0 when missing or not numeric
    """
    raw = parse_field(
        parser=self.parser_obj['video_info']['dislike_num'],
        target_obj=body,
        logger=self.lg,
    )
    try:
        return int(raw)
    except (TypeError, ValueError):
        # bug fix: was a bare `except:` which also swallowed
        # SystemExit/KeyboardInterrupt; only conversion failures default to 0
        return 0
def parse_collected_num():
    """Parse the video's favourite/collected count from body (closure vars: self, body).

    :return: collected count as int; 0 when missing or not numeric
    """
    raw = parse_field(
        parser=self.parser_obj['video_info']['collected_num'],
        target_obj=body,
        logger=self.lg,
    )
    try:
        return int(raw)
    except (TypeError, ValueError):
        # bug fix: was a bare `except:` which also swallowed
        # SystemExit/KeyboardInterrupt; only conversion failures default to 0
        return 0
def parse_like_num():
    """Parse the video's like count from body (closure vars: self, body).

    :return: like count as int; 0 when missing or not numeric
    """
    raw = parse_field(
        parser=self.parser_obj['video_info']['like_num'],
        target_obj=body,
        logger=self.lg,
    )
    try:
        return int(raw)
    except (TypeError, ValueError):
        # bug fix: was a bare `except:` which also swallowed
        # SystemExit/KeyboardInterrupt; only conversion failures default to 0
        return 0
def parse(body) -> list:
    """
    Parse the article listing (title / url / desc triples) from a page body.

    :param body: html body to parse
    :return: list of {'title', 'url', 'desc'} dicts, paired positionally
    """
    title_sel = {
        'method': 'css',
        'selector': 'p.list_tit a ::text',
    }
    url_sel = {
        'method': 'css',
        'selector': 'p.list_tit a ::attr("href")',
    }
    desc_sel = {
        'method': 'css',
        'selector': 'p.desc.left ::text',
    }
    titles = parse_field(
        parser=title_sel,
        target_obj=body,
        is_first=False,
    )
    urls = parse_field(
        parser=url_sel,
        target_obj=body,
        is_first=False,
    )
    descs = parse_field(
        parser=desc_sel,
        target_obj=body,
        is_first=False,
    )
    # zip truncates to the shortest list, matching entries positionally
    return [
        {'title': title, 'url': url, 'desc': desc}
        for title, url, desc in zip(titles, urls, descs)
    ]
def unblock_judge_ip_is_anonymity(ip_address='',
                                  port=0,
                                  httpbin=True,
                                  use_proxy=True,
                                  timeout=10,
                                  logger=None,) -> str:
    """
    Blocking call that returns the ip the outside world currently sees.

    :param ip_address: proxy ip to route through
    :param port: proxy port
    :param httpbin: True -> query httpbin.org/get (json); False -> whatismybrowser.com (html)
    :param use_proxy: whether to route the request through ip_address:port
    :param timeout: request timeout in seconds
    :param logger: unused in this body -- TODO confirm caller expectation
    :return: the detected external ip ('' when it could not be determined)
    """
    def _get_proxies():
        return {
            # deliberately no 'http' mapping: that would expose the origin address
            # 'http': ip_address + ':' + str(port),
            'https': ip_address + ':' + str(port),
        }

    url = 'https://www.whatismybrowser.com/' if not httpbin else 'https://www.httpbin.org/get'
    headers = get_random_headers(user_agent_type=1,)
    proxies = _get_proxies() if use_proxy else {}
    body = Requests.get_url_body(
        url=url,
        headers=headers,
        use_proxy=use_proxy,
        proxies=proxies,
        timeout=timeout,
        verify=False,)
    # print(body)
    if not httpbin:
        # html page: scrape the detected-ip element
        now_ip_selector = {
            'method': 'css',
            # 'selector': 'div#ip-address:nth-child(2) .detected-column a:nth-child(1) ::text',
            'selector': 'div#ip-address.detection-block .detected-column a:nth-child(1) ::text',
        }
        now_ip = parse_field(
            parser=now_ip_selector,
            target_obj=body,
            is_first=True,)
    else:
        # httpbin returns json -> read the 'origin' field
        now_ip = json_2_dict(
            json_str=body,
            default_res={},).get('origin', '')

    return now_ip
def add_div_desc(data1) -> dict:
    """Attach the detail-description url to data1 (closure vars: body, self).

    :param data1: dict to enrich in place
    :raises AssertionError: when no detail url could be parsed
    :return: the enriched data1
    """
    detail_url = parse_field(
        parser={
            'method': 're',
            'selector': '\"detailUrl\": \"(.*?)\"}<',
        },
        target_obj=body,
        logger=self.lg,
    )
    assert detail_url != '', 'div_desc_url为空值!'
    data1.update({'detailUrl': detail_url})
    return data1
def _get_ori_proxy_list(parser, target_obj) -> list:
    """
    Parse the raw proxy list out of target_obj.

    :param parser: selector dict understood by parse_field
    :param target_obj: object to parse
    :raises AssertionError: when nothing was parsed
    :return: the parsed proxy list
    """
    # print(target_obj)
    parsed = parse_field(
        parser=parser,
        target_obj=target_obj)
    assert parsed != [], 'proxy_list为空list!'
    return parsed
def _get_ip(**kwargs) -> str:
    """Extract a clean dotted-quad ip address from a proxy table row.

    :param kwargs: expects ``tr`` (row node) and ``ip_selector``
    :return: dotted-quad ip string
    :raises AssertionError: when no ip text could be parsed
    :raises NotIpException: when the parsed text contains no digits at all
    :raises IndexError: when the text has digits but no dotted-quad pattern
    """
    tr = kwargs['tr']
    ip_selector = kwargs['ip_selector']
    ip = parse_field(parser=ip_selector, target_obj=tr)
    assert ip != '', 'ip为空值!'
    # strip obfuscating inline <script> blocks some sites inject into the cell
    # (idiom fix: raw strings for all regex patterns — non-raw '\d' style
    # escapes are deprecated-invalid escapes in modern Python)
    ip = re.compile(r'<script .*?</script>').sub('', ip)
    if re.compile(r'\d+').findall(ip) == []:
        # the cell is not an ip address at all
        raise NotIpException
    lg.info(str(ip))
    # keep only the dotted-quad part (drops ports / surrounding markup)
    ip = re.compile(r'\d+\.\d+\.\d+\.\d+').findall(ip)[0]
    assert ip != '', 'ip为空值!'
    return ip
def _get_replenishment_status(self, goods_id, body) -> bool:
    """
    Report whether goods_id is out of stock or delisted.

    :param goods_id:
    :param body: goods page body
    :return: True when out of stock / delisted
    """
    # stock flag embedded in the page script: in stock == '1', restocking == '0'
    stock_flag_sel = {
        'method': 're',
        'selector': 'bfd_stock: (\d+) ,',
    }
    stock_flag = parse_field(
        parser=stock_flag_sel,
        target_obj=body,
        is_first=True,
        is_print_error=False,
    )
    # a disabled buy button carries the "delisted" text
    disabled_btn_sel = {
        'method': 'css',
        'selector': 'div.btn_disabled ::text',
    }
    disabled_btn_text = parse_field(
        parser=disabled_btn_sel,
        target_obj=body,
        is_first=True,
        is_print_error=False,
    )
    # print(stock_flag)
    # print(disabled_btn_text)
    return stock_flag == '0' or disabled_btn_text == '商品已经下架了~'
def add_all_img_list(data1) -> dict:
    """Attach the sample image list to data1 (closure vars: body, self).

    :param data1: dict to enrich in place
    :raises AssertionError: when no sample images were parsed
    :return: the enriched data1
    """
    all_img_list_selector = {
        'method': 'css',
        'selector': 'div#J_Detail_ImageSlides div.swipe-pane img ::attr("swipe-lazy-src")',
    }
    all_img_list = parse_field(
        parser=all_img_list_selector,
        target_obj=body,
        logger=self.lg,
        is_first=False)
    # bug fix: the old message read "不为空" ("is NOT empty") although the
    # assert fires exactly when the list IS empty
    assert all_img_list != [], 'all_img_list为空list!'
    all_img_list = [{
        'originalImageURI': i
    } for i in all_img_list]
    data1.update({
        'imageList': all_img_list
    })
    return data1
def judge_qyh_is_tb_by_goods_id(self, goods_id):
    """
    Judge from its goods_id whether an item is a taobao (tb) or tmall (tm) good.

    :param goods_id: item id on quanyoubuy
    :return: 0 tb | 1 tm | -1 unknown
    :raises AssertionError: when the page body or button text is empty
    """
    headers = get_random_headers(
        connection_status_keep_alive=False,
        cache_control='',
    )
    headers.update({
        'authority': 'www.quanyoubuy.com',
    })
    url = 'https://www.quanyoubuy.com/item/index/iid/{}.html'.format(goods_id)
    body = Requests.get_url_body(
        url=url,
        headers=headers,
        ip_pool_type=self.ip_pool_type,
        proxy_type=PROXY_TYPE_HTTPS,
        num_retries=7,)
    assert body != ''

    # the "go buy" button text names the target platform (天猫/淘宝)
    btn_text_sel = {
        'method': 'css',
        'selector': 'div.product-info a.go_btn span ::text',
    }
    btn_text = parse_field(
        parser=btn_text_sel,
        target_obj=body,
        is_print_error=False,
        logger=self.lg,
    )
    # self.lg.info(btn_text)
    assert btn_text != ''

    res = -1
    if '天猫' in btn_text:
        self.lg.info('goods_id: {}, tm good'.format(goods_id))
        res = 1
    elif '淘宝' in btn_text:
        self.lg.info('goods_id: {}, tb good'.format(goods_id))
        res = 0
    else:
        self.lg.info('goods_id: {}, 未知 good'.format(goods_id))
        pass

    return res
def get_pc_tb_sort_keywords_list(self) -> list:
    """
    Fetch the pc taobao main-category keywords.

    Only a small set is stored to avoid a large later update load from
    long-term incremental growth.

    :return: list of main-category keyword strings
    :raises AssertionError: when the page body is empty
    """
    headers = get_random_headers(
        connection_status_keep_alive=False,
        cache_control='',
    )
    body = Requests.get_url_body(
        url='https://www.taobao.com/',
        headers=headers,
        ip_pool_type=self.ip_pool_type,
        num_retries=self.req_num_retries,
        proxy_type=PROXY_TYPE_HTTPS,)
    assert body != ''
    # self.lg.info(body)
    # only grab the main-category keywords
    main_sort_key_list_sel = {
        'method': 'css',
        'selector': 'ul.service-bd li a ::text',
    }
    main_sort_list_key = parse_field(
        parser=main_sort_key_list_sel,
        target_obj=body,
        is_first=False,
        logger=self.lg,
    )
    # pprint(main_sort_list_key)
    # categories we don't need
    not_need_main_sort_key_tuple = (
        '卡券',
        '本地服务',
        'DIY',
        '二手车',
        '生鲜',
        '鲜花',
    )
    # idiom fix: the old `list(tuple([...]))` round-trip was a no-op wrapper
    # around the comprehension (it neither dedupes nor reorders)
    main_sort_list_key = [
        item for item in main_sort_list_key
        if item not in not_need_main_sort_key_tuple
    ]
    # pprint(main_sort_list_key)
    return main_sort_list_key
def add_p_info(data1) -> dict:
    """Attach the product attribute list to data1 (closure vars: body, self).

    :param data1: dict to enrich in place
    :raises AssertionError: when no attributes were parsed
    :return: the enriched data1
    """
    p_info_selector = {
        'method': 'css',
        'selector': 'span.detail-attribute-item ::text',
    }
    p_info = parse_field(
        parser=p_info_selector,
        target_obj=body,
        logger=self.lg,
        is_first=False)
    # bug fix: the old message read "不为空" ("is NOT empty") although the
    # assert fires exactly when the list IS empty
    assert p_info != [], 'p_info为空list!'
    # bug fix: split(':', 1) so only the FIRST colon separates name from
    # value — values that themselves contain ':' are no longer truncated
    p_info = [{
        'name': i.split(':', 1)[0],
        'unit': None,
        'value': i.split(':', 1)[1]
    } for i in p_info]
    data1.update({
        'productFeatureList': p_info,
    })
    return data1
def test_driver_change_proxy():
    """
    Test dynamically switching the proxy of a browser driver.
    :return:
    """
    d = BaseDriver(
        # works
        type=PHANTOMJS,
        executable_path=PHANTOMJS_DRIVER_PATH,
        # type=FIREFOX,
        # executable_path=FIREFOX_DRIVER_PATH,
        # does not work
        # type=CHROME,
        # executable_path=CHROME_DRIVER_PATH,
        headless=True,
        driver_use_proxy=True,
        ip_pool_type=tri_ip_pool,
    )
    origin_ip_sel = {'method': 're', 'selector': '\"origin\": \"(.*?)\",'}
    url = 'https://httpbin.org/get'
    # url = 'https://www.baidu.com'
    for index in range(0, 5):
        # change_proxy/change_user_agent on every request to exercise switching
        body = d.get_url_body(
            url=url,
            timeout=20,
            change_proxy=True,
            change_user_agent=True,
        )
        if 'httpbin' in url:
            # httpbin echoes the requesting ip in its 'origin' field
            origin_ip = parse_field(
                parser=origin_ip_sel,
                target_obj=body,
            )
            print('origin_ip: {}'.format(origin_ip))
        else:
            print(body)

    # NOTE(review): bare except only guards the best-effort driver teardown
    try:
        del d
    except:
        pass
def _get_parent_dir(self, body) -> str:
    """
    Parse the goods category path out of body.

    :param body: goods page body
    :return: e.g. '面部清洁/洁面/丝芙兰'
    """
    parent_dir_selector = {
        'method': 're',
        'selector': 'category: \'(.*?)\',',
    }
    # self.lg.info(body)
    raw_category = parse_field(
        parser=parent_dir_selector,
        target_obj=body,
        is_first=True,
        logger=self.lg)
    # page source is '-' separated -> convert to a '/' separated path
    return raw_category.replace('-', '/')
def parse_video_url():
    """Resolve the playable video url for the current site.

    Reads ``self``, ``body`` and ``short_name`` from the enclosing scope.

    :raises AssertionError: when no url could be parsed
    """
    play_url = parse_field(
        parser=self.parser_obj['video_info']['video_url'],
        target_obj=body,
        logger=self.lg,
    )
    assert play_url != ''
    if short_name == 'n15':
        # use http: some addresses fail to display over https
        return 'http:' + play_url if play_url != '' else ''
    if short_name == '8xs':
        # eg: video_url: 'db9b743f134910dbc697fc9f1513428c/index.m3u8'
        # eg: https://8xche.com/v/da054a7415135d9c5d602baf027ff206/index.m3u8
        return 'https://8xche.com/v/' + play_url if play_url != '' else ''
    return play_url
def get_cpolar_url(self) -> str:
    """
    Fetch the latest cpolar tunnel url from the dashboard status page.

    :return: the tunnel url text parsed from the status table
    :raises AssertionError: when the page body is empty
    """
    headers = get_random_headers(cache_control='', )
    headers.update({
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-User': '******',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Sec-Fetch-Site': 'same-origin',
        'referer': 'https://dashboard.cpolar.com/get-started',
    })
    url = 'https://dashboard.cpolar.com/status'
    # _session keeps the logged-in dashboard session cookies
    body = Requests.get_url_body(
        url=url,
        headers=headers,
        ip_pool_type=self.ip_pool_type,
        num_retries=self.req_num_retries,
        proxy_type=PROXY_TYPE_HTTPS,
        _session=self._s,
        verify=False,
    )
    assert body != ''
    # print(body)

    # first link in the status table header cell is the tunnel url
    cpolar_url_sel = {
        'method': 'css',
        'selector': 'th a:nth-child(1) ::text',
    }
    cpolar_url = parse_field(
        parser=cpolar_url_sel,
        target_obj=body,
    )
    print('cpolar_url: {}'.format(cpolar_url))

    return cpolar_url
def get_wkb_search_res(self, k: str, default_sort_value: int=None) -> dict:
    """
    Wangkebang question search.

    :param k: question keyword / text to search for
    :param default_sort_value: echoed back as 'page_num' to order the single result
    :return: {'k': ..., 'page_num': ..., 'res': [...]}
    :raises AssertionError: when the page body or result list is empty
    """
    headers = get_random_headers(
        user_agent_type=1,
        connection_status_keep_alive=False,)
    headers.update({
        'Proxy-Connection': 'keep-alive',
        'Origin': 'http://wangkebang.cn',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': 'http://wangkebang.cn/m/',
    })
    data = {
        'w': k,
    }
    body = Requests.get_url_body(
        method='post',
        url='http://wangkebang.cn/m/',
        headers=headers,
        # cookies=cookies,
        data=data,
        verify=False,
        ip_pool_type=self.ip_pool_type,
        proxy_type=PROXY_TYPE_HTTPS,
        num_retries=self.req_num_retries,
        timeout=self.req_timeout,)
    assert body != ''
    # self.lg.info(body)
    # the site only returns one answer
    question_item_sel = {
        'method': 'css',
        'selector': 'div.layui-card-body span',
    }
    question_item = parse_field(
        parser=question_item_sel,
        target_obj=body,
        is_first=False,
        logger=self.lg,
    )
    assert question_item != []
    question_desc_div_sel = {
        'method': 'css',
        'selector': 'span strong',
    }
    answer_div_sel = {
        'method': 'css',
        'selector': 'span strong',
    }
    # holds the question and answer of the single returned result:
    # item 0 is the question, item 1 the answer, the rest are ignored
    one_res = {}
    for index, item in enumerate(question_item):
        if index == 0:
            try:
                question_desc_div = parse_field(
                    parser=question_desc_div_sel,
                    target_obj=item,
                    logger=self.lg,
                )
                assert question_desc_div != ''
                # wash markup and labels out of the question text
                question_desc = fix_text(wash_sensitive_info(
                    data=question_desc_div,
                    replace_str_list=[],
                    add_sensitive_str_list=[
                        '<span .*?>',
                        '</span>',
                        '<strong>',
                        '</strong>',
                        '题目\:',
                    ],
                    is_default_filter=False,
                    is_lower=False,
                ))
            except Exception:
                continue
            one_res['question_desc'] = question_desc
        elif index == 1:
            try:
                answer_div = parse_field(
                    parser=answer_div_sel,
                    target_obj=item,
                    logger=self.lg,
                )
                assert answer_div != ''
                # wash markup and labels out of the answer text
                answer = fix_text(wash_sensitive_info(
                    data=answer_div,
                    replace_str_list=[],
                    add_sensitive_str_list=[
                        '<span .*?>',
                        '</span>',
                        '<strong>',
                        '</strong>',
                        '答案\:',
                    ],
                    is_default_filter=False,
                    is_lower=False,
                ))
            except Exception:
                continue
            one_res['answer'] = answer
        else:
            continue
    res = []
    # NOTE(review): if washing item 0 or 1 failed above, one_res lacks the
    # key and the lookups below raise KeyError — confirm whether callers
    # rely on that behaviour
    ask_questions_result_item = AskQuestionsResultItem()
    ask_questions_result_item['question_desc'] = one_res['question_desc']
    ask_questions_result_item['answer'] = one_res['answer']
    res.append(dict(ask_questions_result_item))
    self.lg.info('[{}] wkb, k: {}'.format(
        '+' if res != [] else '-',
        k,
    ))
    return {
        'k': k,
        'page_num': default_sort_value,  # used to order a single result
        'res': res,
    }
def get_finer_search_res(self, k: str, page_num: int) -> dict:
    """
    Finer question search.

    :param k: keyword
    :param page_num: result page number, 0-based
    :return: {'k': ..., 'page_num': ..., 'res': [...]}
    :raises AssertionError: when the page body or result list is empty
    """
    headers = get_random_headers(cache_control='')
    headers.update({
        # 'Referer': 'https://www.finerit.com/tiku/search/?q=%E7%A4%BE%E4%BC%9A%E4%B8%BB%E4%B9%89&p=0',
        'Referer': 'https://www.finerit.com/',
    })
    params = (
        ('q', k),
        ('p', str(page_num)),
        # ('s_type', 'erya'),
    )
    # todo their site may also be in use by others; occasionally no response
    body = Requests.get_url_body(
        url='https://www.finerit.com/tiku/search/',
        headers=headers,
        params=params,
        # cookies=cookies,
        ip_pool_type=self.ip_pool_type,
        proxy_type=PROXY_TYPE_HTTPS,
        num_retries=self.req_num_retries,
        timeout=self.req_timeout,  # testing showed 10s is fast with an acceptable success rate
    )
    assert body != ''
    # self.lg.info(body)
    question_item_sel = {
        'method': 'css',
        'selector': 'div.resultItem',
    }
    question_desc_div_sel = {
        'method': 'css',
        'selector': 'div.itemHead a',
    }
    answer_div_sel = {
        'method': 'css',
        'selector': 'div.itemBody',
    }
    question_item = parse_field(
        parser=question_item_sel,
        target_obj=body,
        is_first=False,
        logger=self.lg,
    )
    assert question_item != []
    res = []
    for item in question_item:
        # ordered
        try:
            question_desc_div = parse_field(
                parser=question_desc_div_sel,
                target_obj=item,
                logger=self.lg,
            )
            assert question_desc_div != ''
            answer_div = parse_field(
                parser=answer_div_sel,
                target_obj=item,
                logger=self.lg,
            )
            assert answer_div != ''
            # wash markup and labels out of the question text
            question_desc = fix_text(wash_sensitive_info(
                data=question_desc_div,
                replace_str_list=[],
                add_sensitive_str_list=[
                    '<div class=\"itemHead\">',
                    '</div>',
                    '<a .*?>',
                    '</a>',
                    '<span .*?>',
                    '</span>',
                ],
                is_default_filter=False,
                is_lower=False,
            ))
            # wash markup and labels out of the answer text
            answer = fix_text(wash_sensitive_info(
                data=answer_div,
                replace_str_list=[],
                add_sensitive_str_list=[
                    '<div class=\"itemBody\">',
                    '</div>',
                    '<p .*?>',
                    '</p>',
                    '答案:',
                ],
                is_default_filter=False,
                is_lower=False,
            ))
        except Exception:
            # skip result items that fail to parse/wash
            continue
        ask_questions_result_item = AskQuestionsResultItem()
        ask_questions_result_item['question_desc'] = question_desc
        ask_questions_result_item['answer'] = answer
        res.append(dict(ask_questions_result_item))
    self.lg.info('[{}] k: {}, page_num: {}'.format(
        '+' if res != [] else '-',
        k,
        page_num,
    ))
    return {
        'k': k,
        'page_num': page_num,
        'res': res,
    }