def get_p_info_list(self, p_info_url):
    '''
    Fetch the detail-properties list of a goods page.

    :param p_info_url: url of the properties json api
    :return: a list of {'p_name': ..., 'p_value': ...} dicts (possibly empty)
    '''
    # fetch via requests
    raw_body = MyRequests.get_url_body(url=p_info_url, headers=self.headers)
    if raw_body == '':
        print('获取到的p_info_body为空值, 此处跳过!')
        raw_body = '{}'

    try:
        properties = json.loads(raw_body).get('perportieslist', [])
    except Exception:
        # reset so a failed parse does not pollute the next crawl's stored data
        self.result_data = {}
        properties = []

    if properties == []:
        return properties

    return [
        {
            'p_name': prop.get('name', ''),
            'p_value': prop.get('value'),
        }
        for prop in properties
    ]
def get_jump_to_url_and_is_hk(self, body):
    '''
    Detect a redirect marker ("_sign_direct_url") in the page body and follow it.

    :param body: html body of the page to inspect
    :return: (body, sign_direct_url, is_hk) | types: str, str, bool
    '''
    # a redirect marker usually appears on group-buy (拼团) goods pages
    has_redirect = re.compile(r'_sign_direct_url = ').findall(body) != []
    if not has_redirect:
        return (body, '', False)

    try:
        sign_direct_url = re.compile(
            r"_sign_direct_url = '(.*?)';").findall(body)[0]
        print('*** 获取到跳转地址为: ', sign_direct_url)
    except IndexError:
        sign_direct_url = ''
        print('获取跳转的地址时出错!')

    # re-fetch the body from the redirect target
    body = MyRequests.get_url_body(url=sign_direct_url,
                                   headers=self.headers,
                                   had_referer=True)

    # a m.miyabaobei.hk host marks a global-shopping (全球购) goods page
    is_hk = re.compile(r'://m.miyabaobei.hk/').findall(sign_direct_url) != []
    if is_hk:
        print('*** 此商品为全球购商品!')

    return (body, sign_direct_url, is_hk)
async def _get_target_url_and_content_id_and_csid(self, taobao_short_url): ''' 根据给与的淘宝分享短链接, 得到target_url, content_id, csid :param taobao_short_url: :return: ''' if re.compile(r'contentId').findall(taobao_short_url) != []: # 先检查是否已为目标地址 target_url = taobao_short_url else: body = MyRequests.get_url_body(url=taobao_short_url, headers=self.headers) # self.my_lg.info(str(body)) if body == '': self.my_lg.error('获取到的body为空值, 出错短链接地址: {0}'.format( str(taobao_short_url))) return '', '', '' try: # 获取短连接的目标地址 target_url = re.compile('var url = \'(.*?)\';').findall( body)[0] # self.my_lg.info(str(target_url)) except IndexError: self.my_lg.error( '获取target_url的时候IndexError! 出错短链接地址: {0}'.format( str(taobao_short_url))) target_url = '' try: # 得到contentId content_id = re.compile('contentId=(\d+)').findall(target_url)[0] # self.my_lg.info(content_id) except IndexError: self.my_lg.error('获取content_id时IndexError! 出错短链接地址: {0}'.format( str(taobao_short_url))) content_id = '' try: # 得到csid csid = re.compile('csid%22%3A%22(.*?)%22%7D').findall( target_url)[0] # self.my_lg.info(csid) except IndexError: self.my_lg.info('此链接为无csid情况的链接...') # self.my_lg.error('获取csid时IndexError! 出错短链接地址: {0}'.format(str(taobao_short_url))) csid = '' try: tag_name = re.compile('tagName=(.*?)&').findall(target_url)[0] except IndexError: tag_name = '' try: tag = re.compile('tag=(.*?)&').findall(target_url)[0] except IndexError: tag = '' return target_url, content_id, csid, tag_name, tag
def get_all_img_url(self, goods_id, is_hk):
    '''
    Collect all goods images from the item detail page.

    :param goods_id:
    :param is_hk: True -> global-shopping (miyabaobei.hk) item
    :return: '' on fetch failure, otherwise a list of {'img_url': ...} dicts
    '''
    if is_hk is True:
        # 全球购 host
        page_url = 'https://www.miyabaobei.hk/item-' + str(goods_id) + '.html'
    else:
        page_url = 'https://www.mia.com/item-' + str(goods_id) + '.html'
    page_body = MyRequests.get_url_body(url=page_url,
                                        headers=self.headers,
                                        had_referer=True)
    # print(Selector(text=page_body).css('div.small').extract())
    if page_body == '':
        print('请求tmp_body_2为空值, 此处先跳过!')
        return ''
    return [
        {'img_url': Selector(text=img_html).css('img::attr("src")').extract_first()}
        for img_html in Selector(text=page_body).css('div.small img').extract()
    ]
def _deal_with_every_article(self):
    '''
    Iterate the xiaohongshu home article list and crawl/wash each article.

    Side effects: pretty-prints intermediate results and sleeps between
    articles; nothing is returned.
    '''
    home_articles_link_list = self._get_xiaohongshu_home_aritles_info()
    pprint(home_articles_link_list)
    for item in home_articles_link_list:
        # eg: [{'id': '5b311bfc910cf67e693d273e','share_link': 'https://www.xiaohongshu.com/discovery/item/5b311bfc910cf67e693d273e'},...]
        article_id = item.get('id', '')  # currently unused below
        # NOTE(review): the example above shows the key 'share_link', but the
        # code reads 'article_link' — if the helper really emits 'share_link'
        # every article is silently skipped; confirm against
        # _get_xiaohongshu_home_aritles_info().
        article_link = item.get('article_link', '')
        if article_link != '':
            body = MyRequests.get_url_body(url=article_link, headers=self.headers)
            try:
                # the article payload is embedded in the SSR state script tag
                article_info = re.compile(
                    'window.__INITIAL_SSR_STATE__=(.*?)</script>').findall(
                        body)[0]
            except IndexError:
                self.my_lg.error('获取article_info时IndexError!请检查!')
                sleep(self.CRAWL_ARTICLE_SLEEP_TIME)
                continue
            article_info = self._wash_article_info(
                self.json_2_dict(article_info))
            pprint(article_info)
            sleep(self.CRAWL_ARTICLE_SLEEP_TIME)
        else:
            pass
def get_aweme_api_videos_info():
    '''
    Fetch one page (20 items) of a fixed douyin user's posted videos from the
    aweme post api and pass the raw body to deal_with_data().

    NOTE(review): the user_id, session cookie and user-agent are hard-coded —
    the cookie is almost certainly expired; confirm before relying on this.
    '''
    headers = {
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'cache-control': 'max-age=0',
        'authority': 'www.douyin.com',
        'cookie': '_ba=BA0.2-20180330-5199e-OeUxtvwJvy5ElpWGFLId; _ga=GA1.2.390071767.1522391891; sso_login_status=1; tt_webid=6540458660484122126; __tea_sdk__user_unique_id=10_; __tea_sdk__ssid=e88eef4a-ec1f-497d-b2c7-301239bfdc67; login_flag=d6ee54ffebe3021c3fb67ff863970736; sessionid=7bdfd0e36df78f38c25abd13f0eff3cc; uid_tt=644e532b271dae498b62c659de17afdf; sid_tt=7bdfd0e36df78f38c25abd13f0eff3cc; sid_guard="7bdfd0e36df78f38c25abd13f0eff3cc|1522819290|2591999|Fri\\054 04-May-2018 05:21:29 GMT"',
    }
    params = (
        ('user_id', '94470216810'),
        ('max_cursor', '0'),
        ('count', '20'),
    )
    url = 'https://www.douyin.com/aweme/v1/aweme/post/'
    body = MyRequests.get_url_body(url=url, headers=headers, params=params)
    # print(body)
    deal_with_data(body=body)
def needIdenCode(self):
    '''
    First login attempt: post the login form and detect whether the response
    asks for a captcha.

    :return: the gbk-decoded response body (str) when a captcha is required,
             False when it is not. NOTE(review): when the HTTP status is not
             200 the function falls off the end and implicitly returns None —
             callers should treat None as failure; confirm this is intended.
    '''
    # Legacy urllib version kept for reference:
    # request = Request(self.loginURL, self.postData, self.loginHeaders)
    # response = self.opener.open(request)
    # content = response.read().decode('gbk')
    # status = response.getcode()  # fetch status code
    response = requests.post(url=self.loginURL,
                             headers=self.loginHeaders,
                             data=json.dumps(self.postData),
                             proxies=MyRequests._get_proxies())
    content = response.content.decode('gbk')
    status = response.status_code
    # status 200 -> request succeeded
    if status == 200:
        print("获取请求成功")
        # The six characters \u8bf7\u8f93\u5165\u9a8c\u8bc1\u7801 are the
        # escaped form of "请输入验证码" ("please enter the captcha").
        # NOTE(review): the literal below contains no backslashes, so it only
        # matches a body containing the bare text 'u8bf7u8f93...'; if the
        # server emits backslash escapes this search never matches — confirm.
        pattern = re.compile(u'u8bf7u8f93u5165u9a8cu8bc1u7801', re.S)
        result = re.search(pattern, content)
        # a hit means the captcha prompt is present
        if result:
            print("此次安全验证异常,您需要输入验证码")
            return content
        # otherwise no captcha is needed
        else:
            print("此次安全验证通过,您这次不需要输入验证码")
            return False
    else:
        print("获取请求失败")
def my_requests():
    '''
    Render the current user's open trade-requests page.

    Falls back to the "no trades" template when the user has no open
    requests or the lookup fails.
    '''
    bsdb = get_bsdb()
    requester = MyRequests(session['user_num'], bsdb)
    try:
        open_rows = requester.get_all_open_requests()
        requests_dicts = [dict(row) for row in open_rows]
        for trade in requests_dicts:
            print(trade['tradeAge'])
    except Exception:
        app.logger.error("Couldn't fill my-requests")
        requests_dicts = []

    if not requests_dicts:
        return render_template('user/no-trades.html', no_sent_requests=True)
    return render_template('user/my-requests.html', requests=requests_dicts)
def _get_taobao_goods_keywords_goods_id_list(self, keyword):
    '''
    Get the goods_id list for a search keyword from taobao search.

    :param keyword: (id, keyword) tuple — only keyword[1] is used
    :return: a list (empty on any failure)
    '''
    headers = {
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'user-agent': HEADERS[randint(0, len(HEADERS)-1)],
        'accept': '*/*',
        # 'referer': 'https://s.taobao.com/search?q=...',  # (full captured referer omitted)
        'authority': 's.taobao.com',
        # 'cookie': '...',  # (full captured session cookie omitted from comment)
    }
    # search results are ordered by sales volume
    params = (
        ('data-key', 'sort'),
        ('data-value', 'sale-desc'),
        ('ajax', 'true'),
        # ('_ksTS', '1528171408340_395'),
        ('callback', 'jsonp396'),
        ('q', keyword[1]),
        ('imgfile', ''),
        ('commend', 'all'),
        ('ssid', 's5-e'),
        ('search_type', 'item'),
        ('sourceId', 'tb.index'),
        # ('spm', 'a21bo.2017.201856-taobao-item.1'),
        ('ie', 'utf8'),
        # ('initiative_id', 'tbindexz_20170306'),
    )
    s_url = 'https://s.taobao.com/search'
    body = MyRequests.get_url_body(url=s_url, headers=headers, params=params)
    if body == '':
        return []
    else:
        try:
            # strip the jsonp wrapper to get the raw json payload
            data = re.compile('\((.*)\)').findall(body)[0]
        except IndexError:
            self.my_lg.error('re获取淘宝data时出错, 出错关键字为{0}'.format(keyword[1]))
            return []
        data = self.json_str_2_dict(json_str=data)
        if data == {}:
            self.my_lg.error('获取到的淘宝搜索data为空dict! 出错关键字为{0}'.format(keyword[1]))
            return []
        else:
            # the ranked goods ids live under mainInfo.traceInfo.traceData.allNids
            goods_id_list = data.get('mainInfo', {}).get(
                'traceInfo', {}).get('traceData', {}).get('allNids', [])
            if goods_id_list is None or goods_id_list == []:
                self.my_lg.error('获取淘宝搜索goods_id_list为空list! 出错关键字{0}'.format(keyword[1]))
                return []
            else:
                return goods_id_list
def get_true_sku_info(self, sku_info):
    '''
    Fetch the real price / spec / stock for every sku.

    :param sku_info: list of sku dicts, each with 'goods_id', 'color_name',
                     'img_url'
    :return: {} on error | (true_sku_info, i_s) where true_sku_info is a list
             of spec dicts and i_s is the last matched stock map
    '''
    # batch endpoint takes all goods ids joined with '-'
    goods_id_str = '-'.join([item.get('goods_id') for item in sku_info])
    # print(goods_id_str)
    tmp_url = 'https://p.mia.com/item/list/' + goods_id_str
    # print(tmp_url)
    tmp_body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True)
    # print(tmp_body)
    try:
        tmp_data = json.loads(tmp_body).get('data', [])
        # pprint(tmp_data)
    except Exception as e:
        print('json.loads转换tmp_body时出错!')
        tmp_data = []
        self.result_data = {}  # reset so the failure doesn't pollute later crawls
        return {}
    true_sku_info = []
    i_s = {}
    # match each requested sku against the api's per-goods stock map
    for item_1 in sku_info:
        for item_2 in tmp_data:
            if item_1.get('goods_id') == str(item_2.get('id', '')):
                i_s = item_2.get('i_s', {})  # spec -> remaining stock
                # print(i_s)
                for item_3 in i_s.keys():
                    tmp = {}
                    # 'SINGLE' means the goods has no secondary spec
                    if item_3 == 'SINGLE':
                        spec_value = item_1.get('color_name')
                    else:
                        spec_value = item_1.get(
                            'color_name') + '|' + item_3
                    normal_price = str(item_2.get('mp'))
                    detail_price = str(item_2.get('sp'))
                    img_url = item_1.get('img_url')
                    rest_number = i_s.get(item_3)
                    # skip sold-out specs
                    if rest_number == 0:
                        pass
                    else:
                        tmp['spec_value'] = spec_value
                        tmp['normal_price'] = normal_price
                        tmp['detail_price'] = detail_price
                        tmp['img_url'] = img_url
                        tmp['rest_number'] = rest_number
                        true_sku_info.append(tmp)
    return (true_sku_info, i_s)
def get_div_from_pc_div_url(self, url, goods_id):
    '''
    Simulate the pc desc api (mtop.taobao.detail.getdesc) request and return
    the description div.

    NOTE(review): the incoming ``url`` parameter is immediately overwritten
    below and never read — kept only for signature compatibility; confirm
    with callers before removing.

    :param url: unused (see note)
    :param goods_id: taobao goods id
    :return: cleaned div html (str), '' on failure
    '''
    # timestamp rounded to whole seconds + 3 random digits
    t = str(time.time().__round__()) + str(randint(100, 999))
    params_data_1 = {
        'id': goods_id,
        'type': '1',
    }
    tmp_url = 'https://api.m.taobao.com/h5/mtop.taobao.detail.getdesc/6.0/'
    _params = (
        ('appKey', '12574478'),
        ('t', t),
        ('api', 'mtop.taobao.detail.getdesc'),
        ('v', '6.0'),
        ('type', 'jsonp'),
        ('dataType', 'jsonp'),
        ('timeout', '20000'),
        ('callback', 'mtopjsonp1'),
        ('data', json.dumps(params_data_1)),
    )
    url = tmp_url + '?' + urlencode(_params)
    # strip '+' signs to obtain the correct request url
    last_url = re.compile(r'\+').sub('', url)
    # self.my_lg.info(last_url)
    data = MyRequests.get_url_body(url=last_url, headers=self.headers, params=None, timeout=14, num_retries=3)
    if data == '':
        self.my_lg.error('获取到的div_desc为空值!请检查! 出错goods_id: {0}'.format(goods_id))
        return ''
    try:
        # greedy match: grab the whole jsonp payload inside mtopjsonp1(...)
        data = re.compile('mtopjsonp1\((.*)\)').findall(data)[0]
        # self.my_lg.info(str(data))
    except IndexError as e:
        self.my_lg.error('获取data时, IndexError出错! 出错goods_id: {0}'.format(goods_id))
        self.my_lg.exception(e)
        return ''
    try:
        data = json.loads(data)
        # pprint(data)
    except JSONDecodeError:  # NOTE(review): assumes json.JSONDecodeError is imported at file level — confirm
        self.my_lg.error('json转换data时出错, 请检查!')
        data = {}
    div = data.get('data', {}).get('pcDescContent', '')
    # self.my_lg.info(str(div))
    div = self.deal_with_div(div)
    # self.my_lg.info(div)
    return div
def test_requests():
    '''Benchmark MyRequests.get_url_body over 200 fetches of a fixed url.'''
    target = 'https://superonesfazai.github.io/'
    begin = time.time()
    for _ in range(200):
        page = MyRequests.get_url_body(url=target, headers=headers)
        if page != '':
            print('success')
        else:
            print(page)
    finish = time.time()
    print('requests用时:', finish - begin)
def getRandomExternalLink(startingPage):
    '''
    Return a random external link found on startingPage; if the page has no
    external links, recurse into a random internal link and retry there.
    '''
    page_html = MyRequests.get_url_body(url=startingPage, headers=headers)
    soup = BeautifulSoup(page_html, "html.parser")
    externalLinks = getExternalLinks(soup, urlparse(startingPage).netloc)
    if externalLinks:
        return externalLinks[random.randint(0, len(externalLinks) - 1)]
    # no external links — walk the whole site via a random internal link
    print("没有外部链接,准备遍历整个网站")
    parts = urlparse(startingPage)
    domain = parts.scheme + "://" + parts.netloc
    internalLinks = getInternalLinks(soup, domain)
    return getRandomExternalLink(internalLinks[random.randint(
        0, len(internalLinks) - 1)])
def _get_comment_data(self, goods_id):
    '''
    Crawl the first two comment pages of a jd goods from the mobile api and
    pack them into a CommentItem.

    :param goods_id: jd goods id; '' short-circuits to {}
    :return: {} on error, otherwise the populated CommentItem
             (also stored in self.result_data)
    '''
    if goods_id == '':
        self.result_data = {}
        return {}
    self.my_lg.info('------>>>| 待处理的goods_id为: %s' % str(goods_id))
    self.goods_id = goods_id
    # the mobile api requires a referer pointing at the goods page
    self.headers.update({
        'referer': 'https://item.m.jd.com/ware/view.action?wareId=' + str(goods_id),
    })
    # collect comments from the jd mobile comment endpoint (pages 1..2)
    _tmp_comment_list = []
    for current_page in range(1, 3):
        _url = 'https://item.m.jd.com/newComments/newCommentsDetail.json'
        params = self._set_params(goods_id=goods_id, current_page=current_page)
        body = MyRequests.get_url_body(url=_url, headers=self.headers, params=params)
        # self.my_lg.info(str(body))
        _data = self._json_2_dict(body).get('wareDetailComment', {}).get('commentInfoList', [])
        _tmp_comment_list += _data
        sleep(self.comment_page_switch_sleep_time)  # throttle page switches
    # pprint(_tmp_comment_list)
    try:
        _comment_list = self._get_comment_list(
            _tmp_comment_list=_tmp_comment_list)
    except Exception as e:
        self.my_lg.error('出错goods_id:{0}'.format(goods_id))
        self.my_lg.exception(e)
        self.result_data = {}
        return {}
    _t = datetime.datetime.now()
    _r = CommentItem()
    _r['goods_id'] = str(goods_id)
    _r['create_time'] = _t
    _r['modify_time'] = _t
    _r['_comment_list'] = _comment_list
    self.result_data = _r
    # pprint(self.result_data)
    return self.result_data
def _get_tmall_goods_keywords_goods_id_list(self, keyword):
    '''
    Get top-selling tmall goods for a keyword via the tmall m-site search.

    :param keyword: (id, keyword) tuple — only keyword[1] is used
    :return: list eg: ['//detail.tmall.com/item.htm?id=566978017832&skuId=3606684772412', ...]
             NOTE: these are item urls, NOT goods_ids
    '''
    '''方案: tmall m站的搜索'''
    headers = {
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'user-agent': HEADERS[randint(0, len(HEADERS)-1)],
        'accept': '*/*',
        # 'referer': 'https://list.tmall.com/search_product.htm?...',  # (full captured referer omitted)
        'authority': 'list.tmall.com',
        # 'cookie': '...',  # (full captured session cookie omitted from comment)
    }
    params = {
        'page_size': '20',
        'page_no': '1',
        'q': str(keyword[1]),
        'type': 'p',
        'spm': 'a220m.6910245.a2227oh.d100',
        'from': 'mallfp..m_1_suggest',
        'sort': 'd',
    }
    s_url = 'https://list.tmall.com/m/search_items.htm'
    body = MyRequests.get_url_body(url=s_url, headers=headers, params=params)
    # self.my_lg.info(str(body))
    if body == '':
        return []
    else:
        data = self.json_str_2_dict(json_str=body)
        if data == {}:
            self.my_lg.error('获取到的天猫搜索data为空dict! 出错关键字为{0}'.format(keyword[1]))
            return []
        else:
            _ = data.get('item', [])
            if _ is None or _ == []:
                self.my_lg.error('获取天猫搜索goods_id_list为空list! 出错关键字{0}'.format(keyword[1]))
                return []
            try:
                goods_id_list = [str(item.get('url', '')) for item in _]
            except Exception as e:
                self.my_lg.exception(e)
                self.my_lg.error('获取天猫搜索goods_id_list为空list! 出错关键字{0}'.format(keyword[1]))
                return []
            return goods_id_list
def _get_aweme_api_videos_info(self, user_id):
    '''
    Fetch one page (20 items) of user_id's posted videos from the aweme post
    api and hand the raw body to self.deal_with_data.
    '''
    self.user_id = user_id
    api_url = 'https://www.douyin.com/aweme/v1/aweme/post/'
    query = (
        ('user_id', self.user_id),
        ('max_cursor', '0'),
        ('count', '20'),
    )
    body = MyRequests.get_url_body(url=api_url, headers=self.headers, params=query)
    # print(body)
    self.deal_with_data(body=body)
def _get_pintuan_goods_info(self):
    '''
    Build the data url and collect all recent flash group-buy (拼团) goods.

    :return: list of dicts with keys goods_id / begin_time / end_time /
             all_sell_count / page
    '''
    pintuan_goods_id_list = []
    # PERF FIX: the old dedup rebuilt a list of all collected ids for every
    # candidate item (O(n^2)); a set gives O(1) membership with the same result.
    seen_goods_ids = set()
    for page in range(0, 100):
        tmp_url = 'https://tuan.juanpi.com/pintuan/get_goods_list?page={0}&pageSize=20&cid=pinhaohuo_sx&show_type=wap'.format(
            str(page)
        )
        print('正在抓取的页面地址为: ', tmp_url)
        body = MyRequests.get_url_body(url=tmp_url, headers=self.headers)
        if body == '':
            body = '{}'
        try:
            tmp_data = json.loads(body).get('data', {}).get('goods', [])
        except Exception:  # narrowed from bare except: don't swallow KeyboardInterrupt/SystemExit
            print('json.loads转换tmp_data时出错!')
            tmp_data = []
        # print(tmp_data)
        sleep(.5)  # throttle page requests
        if tmp_data == []:
            print('该tmp_url得到的goods为空list, 此处跳过!')
            break
        for item in tmp_data:
            goods_id = item.get('goods_id', '')
            if goods_id in seen_goods_ids:
                continue
            seen_goods_ids.add(goods_id)
            pintuan_goods_id_list.append({
                'goods_id': goods_id,
                # NOTE(review): int() raises if 'start_time'/'end_time' is
                # missing — original behavior kept; confirm upstream always
                # supplies these fields.
                'begin_time': timestamp_to_regulartime(int(item.get('start_time', ''))),
                'end_time': timestamp_to_regulartime(int(item.get('end_time', ''))),
                'all_sell_count': str(item.get('join_number_int', '')),
                'page': page,
            })
    print('该pintuan_goods_id_list的总个数为: ', len(pintuan_goods_id_list))
    print(pintuan_goods_id_list)
    return pintuan_goods_id_list
def get_pintuan_goods_info(self):
    '''
    Build the data url and collect all recent flash group-buy goods, then
    hand the collected list to self.deal_with_data.

    :return: None
    '''
    goods_list = []
    # index 0 and 1 return the same page, so start from 1
    for index in range(1, 1000):
        tmp_url = 'https://m.mia.com/instant/groupon/common_list/' + str(index) + '/0/'
        print('正在抓取: ', tmp_url)
        body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True)
        # print(body)
        if body == '':
            print('获取到的body为空值! 此处跳过')
        else:
            try:
                tmp_data = json.loads(body)
            except:
                tmp_data = {}
                print('json.loads转换body时出错, 此处跳过!')
            # an empty data_list marks the end of the listing
            if tmp_data.get('data_list', []) == []:
                print('得到的data_list为[], 此处跳过!')
                break
            else:
                # print(tmp_data)
                data_list = [{
                    'goods_id': item.get('sku', ''),
                    'sub_title': item.get('intro', ''),
                    'pid': index,
                } for item in tmp_data.get('data_list', [])]
                # pprint(data_list)
                for item in data_list:
                    goods_list.append(item)
        sleep(.5)  # throttle page requests
    pprint(goods_list)
    self.deal_with_data(goods_list=goods_list)
    sleep(8)
    return None
def get_one_page_goods_info(self, *params):
    '''
    Fetch one page of goods data from the chuchujie api.

    :param params: (gender, page)
    :return: '{}' on empty response, otherwise the raw body str
    '''
    gender, page = params
    api_url = 'https://api.chuchujie.com/api/'
    client = {
        "ageGroup": "AG_0to24",
        "channel": "QD_web_webkit",
        "deviceId": "0",
        "gender": gender,  # '0' -> female | '1' -> male
        "imei": "0",
        "packageName": "com.culiu.purchase",
        "platform": "wap",
        "sessionId": "0",
        "shopToken": "0",
        "userId": "0",
        "version": "1.0",
        "xingeToken": ""
    }
    query = {
        "group": 4,
        "module": "99",
        "page": page,
        "tab": "all"
    }
    # NOTE: these are sent as Query String Parameters, url-encoded as-is;
    # data meant for a POST body would use the post method instead
    payload = {
        'client': json.dumps(client),
        'query': json.dumps(query),
        'page': page,
    }
    body = MyRequests.get_url_body(url=api_url, headers=self.headers, params=payload)
    return '{}' if body == '' else body
def get_div_desc_body(self, div_desc_url): ''' 得到div_desc的html页面 :param div_desc_url: :return: str类型的data, 出错的情况下返回{} ''' # 使用requests div_desc_body = MyRequests.get_url_body(url=div_desc_url, headers=self.headers) if div_desc_body == '': div_desc_body = '{}' # 使用phantomjs # div_desc_body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=div_desc_url) # # print(div_desc_body) # if div_desc_body == '': # div_desc_body = '{}' # else: # try: # div_desc_body = re.compile(r'<body><pre .*?>(.*)</pre></body>').findall(div_desc_body)[0] # div_desc_body = re.compile(r'>').sub('>', div_desc_body) # div_desc_body = re.compile(r'<').sub('<', div_desc_body) # except: # div_desc_body = '{}' try: div_desc_data = json.loads(div_desc_body) tmp_body = div_desc_data.get('data', '') except Exception: self.result_data = {} # 重置下,避免存入时影响下面爬取的赋值 tmp_body = '' # 清洗 tmp_body = re.compile( r'<div class=\"by_deliver\">.*?</div></div>').sub('', tmp_body) tmp_body = re.compile(r'src=.*? />').sub('/>', tmp_body) tmp_body = re.compile(r'data-url=').sub('src=\"', tmp_body) tmp_body = re.compile(r' />').sub( '\" style="height:auto;width:100%;"/>', tmp_body) if tmp_body != '': tmp_body = '<div>' + tmp_body + '</div>' return tmp_body
class TestMthod(unittest.TestCase): def setUp(self): self.run = MyRequests() @unittest.skip("test_01") def test_01(self): url = "http://localhost:8000/login/" data = {"username": "******", "password": "******"} #res=my_mock(self.run.run_main,data,url,"POST",data) header = '' res = self.run.run_main(url, "POST", data, header) def test_02(self): count = 0 for i in range(1, 1002): if i % 2 == 0 or i % 3 == 0 or i % 5 == 0: count = count + 1 print(count)
def get_spike_hour_goods_info(self):
    '''
    Build the data url and collect all recent flash-sale (秒杀) goods,
    dispatching each promotion's items to self.deal_with_data.

    :return: None (implicit)
    '''
    # promotion ids are sequential integers; scan the configured range
    mia_base_number = MIA_BASE_NUMBER
    while mia_base_number < MIA_MAX_NUMBER:
        tmp_url = 'https://m.mia.com/instant/seckill/seckillPromotionItem/' + str(
            mia_base_number)
        body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True)
        # print(body)
        if body == '' or body == '[]':
            print('mia_base_number为: ', mia_base_number)
            print('获取到的body为空值! 此处跳过')
        else:
            try:
                tmp_data = json.loads(body)
            except:
                tmp_data = {}
                print('json.loads转换body时出错, 此处跳过!')
            # hour of the promotion start, e.g. '22' from 'yyyy-mm-dd HH:...'
            tmp_hour = tmp_data.get('p_info', {}).get('start_time', '')[11:13]
            if tmp_hour == '22':  # skip promotions starting at 22:00
                print('--- 销售时间为22点,不抓取!')
                pass
            else:
                print(tmp_data)
                print('mia_base_number为: ', mia_base_number)
                pid = mia_base_number
                begin_time = tmp_data.get('p_info', {}).get('start_time', '')
                end_time = tmp_data.get('p_info', {}).get('end_time', '')
                item_list = tmp_data.get('item_list', [])
                self.deal_with_data(pid, begin_time, end_time, item_list)
        sleep(.35)  # throttle requests
        mia_base_number += 1
def traversal_hour_timestamp(self, item): ''' 遍历每个需求的整点时间戳 :param item: :return: ''' # 先遍历today的需求的整点时间戳 tmp_url = 'https://qiang.mogujie.com//jsonp/fastBuyListActionLet/1?eventTime={0}&bizKey=rush_main'.format( str(item)) body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True) # print(body) if body == '': print('item为: ', item) print('获取到的body为空值! 此处跳过') else: try: body = re.compile('null\((.*)\)').findall(body)[0] except Exception: print('re匹配body中的数据时出错!') body = '{}' try: tmp_data = json.loads(body) except: print('json.loads转换body时出错, 此处跳过!') tmp_data = {} if tmp_data == {}: print('tmp_data为空{}!') pass else: # pprint(tmp_data) # print(tmp_data) event_time = item item_list = tmp_data.get('data', {}).get('list', []) self.deal_with_data(event_time, item_list) sleep(MOGUJIE_SLEEP_TIME)
def get_stock_info_dict(self, stock_info_url):
    '''
    Fetch the real-time stock info.

    :param stock_info_url:
    :return: dict ({} on empty body or parse failure)
    '''
    raw_body = MyRequests.get_url_body(url=stock_info_url, headers=self.headers)
    if raw_body == '':
        print('获取到的stock_info_body为空值!')
        raw_body = '{}'
    try:
        return json.loads(raw_body).get('data', {})
    except Exception:
        # reset so a failed parse does not pollute the next crawl's stored data
        self.result_data = {}
        return {}
def get_item_list(self, event_time):
    '''
    Get all goods belonging to event_time.

    :param event_time:
    :return: item_list, a list — NOTE(review): '' (str) is returned when the
             fetch body is empty, which contradicts the list contract stated
             here; callers must handle both. Confirm before changing.
    '''
    tmp_url = 'https://qiang.mogujie.com//jsonp/fastBuyListActionLet/1?eventTime={0}&bizKey=rush_main'.format(
        str(event_time))
    body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True)
    # print(body)
    if body == '':
        print('获取到的body为空值! 此处跳过')
        item_list = ''
    else:
        try:
            # the jsonp payload is wrapped in null(...)
            body = re.compile('null\((.*)\)').findall(body)[0]
        except Exception:
            print('re匹配body中的数据时出错!')
            body = '{}'
        try:
            tmp_data = json.loads(body)
        except:
            tmp_data = {}
            print('json.loads转换body时出错, 此处跳过!')
        if tmp_data == {}:
            print('tmp_data为空{}!')
            item_list = []
        else:
            # pprint(tmp_data)
            # print(tmp_data)
            item_list = tmp_data.get('data', {}).get('list', [])
    sleep(.5)  # throttle requests
    return item_list
def _judge_is_taobao_head_img(self, url):
    '''
    Judge whether url resolves to taobao's default avatar image.

    :param url:
    :return: bool (False on any network error)
    '''
    default_head_img = 'https://gw.alicdn.com/tps/i3/TB1yeWeIFXXXXX5XFXXuAZJYXXX-210-210.png_40x40.jpg'
    tmp_proxies = MyRequests._get_proxies()
    try:
        _res = requests.get(url=url, headers=self.headers, proxies=tmp_proxies)
        self.my_lg.info(str(_res.url))
        # the default avatar redirects to a fixed cdn address
        return _res.url == default_head_img
    except:  # bare except kept: any failure (network or otherwise) means "not default"
        self.my_lg.info('检测图片地址时网络错误! 跳过!')
        return False
def get_one_page_all_goods_list(self, *params):
    '''
    Collect the goods list of one listing page.

    :param params: (page,)
    :return: str (error message) | list of {'goods_id', 'type', 'page'} dicts
    '''
    page = params[0]
    tmp_url = 'https://h5.jumei.com/index/ajaxDealactList?card_id=4057&page={0}&platform=wap&type=formal&page_key=1521336720'.format(
        str(page))
    # print('正在抓取的page为:', page, ', 接口地址为: ', tmp_url)
    body = MyRequests.get_url_body(url=tmp_url, headers=self.headers)
    # print(body)
    try:
        json_body = json.loads(body)
        # print(json_body)
    except:
        print('json.loads转换body时出错!请检查')
        return '网络错误!'

    this_page_item_list = json_body.get('item_list', [])
    if this_page_item_list == []:
        return []

    deduped = []
    for item in this_page_item_list:
        collected_ids = [prev.get('item_id', '') for prev in deduped]
        if item.get('item_id', '') not in collected_ids:
            item['page'] = page
            deduped.append(item)
    # sleep(.5)
    return [{
        'goods_id': str(item.get('item_id', '')),
        'type': item.get('type', ''),
        'page': item.get('page')
    } for item in deduped if item.get('item_id') is not None]
def _get_pintuan_goods_info(self):
    '''
    Build the data url and collect zids of all recent flash group-buy goods.

    :return: deduplicated list of (zid, page) tuples (order not guaranteed,
             since dedup goes through set())
    '''
    zid_list = []
    for page in range(0, 100):
        tmp_url = 'https://pina.m.zhe800.com/nnc/list/deals.json?page={0}&size=500'.format(
            str(page))
        print('正在抓取的页面地址为: ', tmp_url)
        tmp_body = MyRequests.get_url_body(url=tmp_url, headers=self.headers)
        if tmp_body == '':
            tmp_body = '{}'
        try:
            tmp_data = json.loads(tmp_body).get('objects', [])
        except Exception:  # narrowed from bare except
            print('json.loads转换tmp_data时出错!')
            tmp_data = []
        # print(tmp_data)
        if tmp_data == []:
            print('该tmp_url得到的object为空list, 此处跳过!')
            break
        tmp_zid_list = [(item.get('product', {}).get('zid', ''), page)
                        for item in tmp_data]
        # print(tmp_zid_list)
        for zid_item in tmp_zid_list:
            # BUG FIX: the old code compared the whole (zid, page) tuple to ''
            # — always unequal, so empty zids were never filtered out.
            # Test the zid component itself.
            if zid_item[0] != '':
                zid_list.append(zid_item)
    zid_list = list(set(zid_list))
    print('该zid_list的总个数为: ', len(zid_list))
    print(zid_list)
    return zid_list
def _get_1688_goods_keywords_goods_id_list(self, keyword):
    '''
    Get 1688 goods ids for a keyword, ranked by sales.

    :param keyword: (id, keyword) tuple — only keyword[1] is used
    :return: a list eg: ['11111', ...] (empty on any failure)
    '''
    '''方案1: 从m.1688.com搜索页面进行抓取, 只取第一页的销量排名靠前的商品'''
    headers = {
        'authority': 'm.1688.com',
        'cache-control': 'max-age=0',
        'upgrade-insecure-requests': '1',
        'user-agent': HEADERS[randint(0, len(HEADERS)-1)],
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        # 'cookie': '...',  # (full captured session cookie omitted from comment)
    }
    # sortType=booked + descendOrder=true -> ranked by sales, descending
    params = (
        ('sortType', 'booked'),
        ('filtId', ''),
        ('keywords', keyword[1]),
        ('descendOrder', 'true'),
    )
    url = 'https://m.1688.com/offer_search/-6161.html'
    body = MyRequests.get_url_body(url=url, headers=headers, params=params)
    # self.my_lg.info(str(body))
    if body == '':
        return []
    else:
        try:
            # each result row carries its offer id in data-offer-id
            goods_id_list = Selector(text=body).css('div.list_group-item::attr("data-offer-id")').extract()
            # pprint(goods_id_list)
        except Exception as e:
            self.my_lg.exception(e)
            self.my_lg.error('获取1688搜索goods_id_list为空list! 出错关键字{0}'.format(keyword[1]))
            goods_id_list = []
        return goods_id_list
def getAllExternalLinks(siteUrl):
    '''
    Recursively walk siteUrl, collecting every external link into the global
    allExtLinks set and every internal link into allIntLinks, appending each
    newly-seen link to result.txt.

    :param siteUrl: page to crawl
    '''
    domain = urlparse(siteUrl).scheme + "://" + urlparse(siteUrl).netloc
    html = MyRequests.get_url_body(url=siteUrl, headers=headers)
    bsObj = BeautifulSoup(html, 'lxml')
    internalLinks = getInternalLinks(bsObj, domain)
    externalLinks = getExternalLinks(bsObj, domain)
    # BUG FIX: the file used to be re-opened with mode 'w' on every recursive
    # call — truncating everything written so far — and was never closed.
    # Append inside a context manager instead.
    newInternalLinks = []
    with open('result.txt', 'a') as f:
        # collect external links
        for link in externalLinks:
            if link not in allExtLinks:
                allExtLinks.add(link)
                # print(link)
                f.write(link + '\n')
                print("即将获取的外部链接的URL是:" + link)
        # collect internal links (recursion deferred until the file is closed)
        for link in internalLinks:
            if link not in allIntLinks:
                print("即将获取内部链接的URL是:" + link)
                allIntLinks.add(link)
                f.write(link + '\n')
                newInternalLinks.append(link)
    for link in newInternalLinks:
        getAllExternalLinks(link)