async def get_pintuan_goods_info(self):
    '''
    Build the data urls and collect all recent time-limited group-buy
    (pintuan) goods info across every category tab.

    Iterates self.tab_dict (category name -> tab value), fetching pages
    1..19 per category via get_one_page_goods_list, and deduplicates the
    accumulated items by 'goods_id'.
    :return: list of goods dicts (each at least carries a 'goods_id' key)
    '''
    s_time = time.time()
    goods_list = []
    # One shared phantomjs session for the whole crawl.
    my_phantomjs = MyPhantomjs()
    for key in self.tab_dict:
        self.msg = '正在抓取的分类为: ' + key
        self.my_lg.info(self.msg)
        for index in range(1, 20):
            item_list = await self.get_one_page_goods_list(my_phantomjs=my_phantomjs, key=key, tab=self.tab_dict[key], index=index)
            # Rebuilt each page: ids already collected, used to skip duplicates.
            all_goods_id = list(set([s.get('goods_id', '') for s in goods_list]))
            for item in item_list:
                if item.get('goods_id', '') not in all_goods_id:
                    goods_list.append(item)
            # await asyncio.sleep(.5)
    # Best-effort release of the phantomjs session.
    try:
        del my_phantomjs
    except:
        pass
    self.my_lg.info(str(goods_list))
    self.my_lg.info('本次抓到所有拼团商品个数为: ' + str(len(goods_list)))
    e_time = time.time()
    self.my_lg.info('总用时:' + str(e_time-s_time))
    await asyncio.sleep(3)
    return goods_list
def __init__(self, logger=None):
    '''
    Initialize the 1688 comment parser.

    :param logger: optional external logger; when None a file/console
                   logger is created under MY_SPIDER_LOGS_PATH.
    '''
    super().__init__()
    self.result_data = {}
    self.msg = ''
    if logger is None:
        self.my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH + '/阿里1688/comment/' + str(get_shanghai_time())[0:10] + '.txt', console_log_level=INFO, file_log_level=ERROR)
    else:
        self.my_lg = logger
    self.my_phantomjs = MyPhantomjs()
    # Dynamically executable code (runtime string, presumably run via exec
    # inside the phantomjs driver helper).
    # NOTE(review): original line layout of this string was lost in the
    # source formatting — confirm the executor tolerates this indentation.
    self._exec_code = '''
self.driver.find_element_by_css_selector('div.tab-item.filter:nth-child(2)').click()
sleep(1.5)
# 向下滚动10000像素
js = 'document.body.scrollTop=10000'
self.driver.execute_script(js)
sleep(3)
'''
    # Request headers; UA is drawn randomly from the HEADERS pool.
    self.headers = {
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'user-agent': HEADERS[randint(0, len(HEADERS) - 1)],
        'accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
        'referer': 'https://detail.1688.com/offer/45579899125.html',
    }
    # Comments requested per page (string, as the API expects).
    self.page_size = '30'
def __init__(self, logger=None):
    """Set up parser state: containers, pacing, logger, headers, phantomjs.

    :param logger: optional external logger forwarded to _set_logger.
    """
    self.result_data, self.msg = {}, ''
    # Delay between comment-page switches, in seconds.
    self.comment_page_switch_sleep_time = 1.2
    self._set_logger(logger)
    self._set_headers()
    self.my_phantomjs = MyPhantomjs()
    self._add_headers_cookies()
def __init__(self, logger=None):
    """Initialize parser state, logging, headers and the phantomjs driver.

    :param logger: optional external logger forwarded to _set_logger.
    """
    # Plain state first.
    self.result_data = {}
    self.msg = ''
    self.page_size = '10'
    self.comment_page_switch_sleep_time = 1.5  # sleep between comment pages (s)
    self.g_data = {}                  # scratch data
    self.random_sku_info_list = []    # scratch: every sku spec of the current goods
    # Then collaborators.
    self._set_logger(logger)
    self._set_headers()
    self.my_phantomjs = MyPhantomjs()
def __init__(self):
    """Initialize request headers and the phantomjs wrapper for zhe800."""
    self.headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        # 'Accept-Encoding:': 'gzip',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'zhe800.com',
        # Random UA from the pool; index by len(HEADERS) - 1 instead of a
        # hard-coded 34 so changing the pool size cannot raise IndexError
        # (matches the convention used elsewhere in this file).
        'User-Agent': HEADERS[randint(0, len(HEADERS) - 1)],
    }
    self.my_phantomjs = MyPhantomjs()
def __init__(self):
    """Initialize request headers, result container and phantomjs for pinduoduo."""
    self.headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        # 'Accept-Encoding:': 'gzip',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'mobile.yangkeduo.com',
        # Random UA from the pool; index by len(HEADERS) - 1 instead of a
        # hard-coded 34 so a resized pool cannot raise IndexError.
        'User-Agent': HEADERS[randint(0, len(HEADERS) - 1)],
        # 'Cookie': 'api_uid=rBQh+FoXerAjQWaAEOcpAg==;',  # 分析发现需要这个cookie值
    }
    self.result_data = {}
    # self.set_cookies_key_api_uid()  # 设置cookie中的api_uid的值
    self.my_phantomjs = MyPhantomjs()
def __init__(self):
    """Initialize headers, result container, activity flag and phantomjs for 1688."""
    super().__init__()
    self.headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        # 'Accept-Encoding:': 'gzip',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': '1688.com',
        # Random UA from the pool; index by len(HEADERS) - 1 instead of a
        # hard-coded 34 so a resized pool cannot raise IndexError.
        'User-Agent': HEADERS[randint(0, len(HEADERS) - 1)],
    }
    self.result_data = {}
    self.is_activity_goods = False
    self.my_phantomjs = MyPhantomjs()
def __init__(self, logger=None):
    '''
    Initialize the parser: state containers, headers, logger, phantomjs,
    and the dynamically executed driver script.

    :param logger: optional external logger forwarded to _set_logger.
    '''
    super().__init__()
    self.result_data = {}
    self.msg = ''
    self._set_headers()
    self._set_logger(logger)
    self.my_phantomjs = MyPhantomjs()
    # Dynamically executable code (runtime string, presumably run via exec
    # inside the phantomjs driver helper). The assert aborts execution when
    # the "4-5 star" tab reports zero reviews.
    # NOTE(review): original line layout of this string was lost in the
    # source formatting — confirm the executor tolerates this indentation.
    self._exec_code = '''
self.driver.find_element_by_css_selector('div.tab-item.filter:nth-child(2)').click()
_text = str(self.driver.find_element_by_css_selector('div.tab-item.filter:nth-child(2)').text)
print(_text)
# if _text == '四五星(0)':
assert _text != '四五星(0)', 'my assert error!'  # 通过断言来跳过执行下面的代码
sleep(2.5)
# 向下滚动10000像素
js = 'document.body.scrollTop=10000'
self.driver.execute_script(js)
sleep(4)
'''
    # Pause between result pages, in seconds.
    self._page_sleep_time = 1.2
def get_spike_hour_goods_info(self):
    '''
    Build the data urls and collect all recent flash-sale goods info.

    Obtains session cookies via phantomjs, crawls both the on-sale
    ("formal") and pre-sale ("pre") deal lists with identical pagination
    logic, then normalizes the items and hands them to deal_with_data().
    :return: True on success, False when cookies could not be obtained
    '''
    all_goods_list = []
    # A phantomjs session is only needed long enough to obtain cookies.
    self.my_phantomjs = MyPhantomjs()
    cookies = self.my_phantomjs.get_url_cookies_from_phantomjs_session(url='https://h5.jumei.com/')
    try:
        del self.my_phantomjs
    except:
        pass
    if cookies == '':
        print('!!! 获取cookies失败 !!!')
        return False
    print('获取cookies成功!')
    self.headers.update(Cookie=cookies)

    # The two deal lists share crawl logic; only type/page_key differ.
    print('开始抓取在售商品...')
    self._crawl_deal_list('formal', '1521336720', all_goods_list)
    print('开始抓取预售商品...')
    self._crawl_deal_list('pre', '1521858480', all_goods_list)

    # Normalize: keep only items that actually carry an item_id.
    all_goods_list = [{
        'goods_id': str(item.get('item_id', '')),
        'type': item.get('type', ''),
        'page': item.get('page'),
    } for item in all_goods_list if item.get('item_id') is not None]
    print(all_goods_list)
    print('本次抓取到共有限时商品个数为: ', len(all_goods_list))
    self.deal_with_data(all_goods_list)
    return True

def _crawl_deal_list(self, sale_type, page_key, all_goods_list):
    '''
    Crawl pages 1..49 of one ajaxDealactList endpoint, appending unseen
    items (deduplicated by item_id) to all_goods_list in place.

    :param sale_type: 'formal' (on sale) or 'pre' (pre-sale)
    :param page_key: endpoint-specific page_key query value
    :param all_goods_list: accumulator list, mutated in place
    :return: None
    '''
    for page in range(1, 50):
        tmp_url = ('https://h5.jumei.com/index/ajaxDealactList?card_id=4057'
                   '&page={0}&platform=wap&type={1}&page_key={2}').format(str(page), sale_type, page_key)
        print('正在抓取的page为:', page, ', 接口地址为: ', tmp_url)
        body = MyRequests.get_url_body(url=tmp_url, headers=self.headers)
        try:
            json_body = json.loads(body)
        except:
            print('json.loads转换body时出错!请检查')
            json_body = {}
        this_page_item_list = json_body.get('item_list', [])
        if this_page_item_list == []:
            print('@@@@@@ 所有接口数据抓取完毕 !')
            break
        # O(1) membership test instead of rescanning the whole list per item.
        seen_ids = set(item_1.get('item_id', '') for item_1 in all_goods_list)
        for item in this_page_item_list:
            if item.get('item_id', '') not in seen_ids:
                item['page'] = page
                all_goods_list.append(item)
                seen_ids.add(item.get('item_id', ''))
        sleep(.5)
async def run_forever(self):
    '''
    Continuously refresh stored group-buy (pintuan) goods data.

    Pulls all candidate goods ids from the DB, deletes expired rows,
    and updates rows inside the refresh window via update_data_1/2.
    :return: None
    '''
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    try:
        result = await tmp_sql_server.select_jumeiyoupin_pintuan_all_goods_id(logger=self.my_lg)
    except TypeError:
        self.my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None
    if result is None:
        pass
    else:
        self.my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        self.my_lg.info(result)
        self.my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
        index = 1
        for item in result:
            # item[1] is a JSON blob holding the pintuan schedule; convert
            # its end_time to a 10-digit unix timestamp.
            pintuan_end_time = json.loads(item[1]).get('end_time')
            pintuan_end_time = int(str(time.mktime(time.strptime(pintuan_end_time, '%Y-%m-%d %H:%M:%S')))[0:10])
            data = {}
            if index % 50 == 0:  # Reconnect every 50 rows to avoid a stale long-lived connection.
                self.my_lg.info('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                self.my_lg.info('与数据库的新连接成功建立...')
            if tmp_sql_server.is_connect_success:
                # 0 = expired, 2 = outside window, anything else = update now.
                time_number = await self.is_recent_time(pintuan_end_time)
                if time_number == 0:
                    await tmp_sql_server.delete_jumeiyoupin_pintuan_expired_goods_id(goods_id=item[0], logger=self.my_lg)
                    # NOTE(review): message says "结束时间" but formats begin_time — confirm intended.
                    self.msg = '过期的goods_id为(%s)' % item[0] + ', 拼团结束时间为(%s), 删除成功!' % str(json.loads(item[1]).get('begin_time'))
                    self.my_lg.info(self.msg)
                elif time_number == 2:
                    pass  # 此处应该是pass,而不是break,因为数据库传回的goods_id不都是按照顺序的
                else:  # 返回1,表示在待更新区间内
                    self.msg = '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (item[0], str(index))
                    self.my_lg.info(self.msg)
                    data['goods_id'] = item[0]
                    jumeiyoupin_2 = JuMeiYouPinPinTuan(logger=self.my_lg)
                    _ = item[2] + '-' + str(item[3])  # 格式: 'coutuan_baby-1'
                    # Per-run cache keyed by tab-index, so each listing page
                    # is fetched at most once.
                    item_list = self.api_all_goods_id.get(_, [])
                    if item_list == []:
                        my_phantomjs = MyPhantomjs()
                        item_list = await jumeiyoupin_2.get_one_page_goods_list(my_phantomjs=my_phantomjs, tab=item[2], index=item[3])
                        try:
                            del my_phantomjs
                        except:
                            pass
                    if item_list == []:
                        self.my_lg.info('获取到的body为空str, 网络原因, 此处先跳过!')
                        pass
                    else:
                        if self.api_all_goods_id.get(_) is None:
                            self.api_all_goods_id[_] = item_list
                        pintuan_goods_all_goods_id = [item_1.get('goods_id', '') for item_1 in item_list]
                        jumeiyoupin_pintuan = JuMeiYouPinPinTuanParse(logger=self.my_lg)
                        # 内部已经下架的(测试发现官方不会提前下架活动商品)
                        if item[0] not in pintuan_goods_all_goods_id:
                            await self.update_data_2(jumeiyoupin_pintuan=jumeiyoupin_pintuan, jumei_pintuan_url=item[4], goods_id=item[0], pipeline=tmp_sql_server)
                        else:  # 未内部下架
                            await self.update_data_1(jumeiyoupin_pintuan=jumeiyoupin_pintuan, jumeiyoupin_2=jumeiyoupin_2, jumei_pintuan_url=item[4], goods_id=item[0], item_list=item_list, pipeline=tmp_sql_server)
            else:
                self.my_lg.error('数据库连接失败,此处跳过!')
                pass
            index += 1
            gc.collect()
        self.my_lg.info('全部数据更新完毕'.center(100, '#'))
    # sleep(60*60)
    if get_shanghai_time().hour == 0:  # 0点以后不更新
        sleep(60 * 60 * 5.5)
    else:
        sleep(5)
    gc.collect()
    return None
def get_goods_data(self, goods_id):
    '''
    Build the data url for one chuchujie goods and parse its PC page.

    :param goods_id: goods id string; empty string short-circuits to {}
    :return: dict of parsed fields (also stored on self.result_data),
             or {} on any failure
    '''
    if goods_id == '':
        self.result_data = {}
        return {}
    print('------>>>| 对应的手机端地址为: ', 'https://m.chuchujie.com/details/detail.html?id=' + goods_id)
    # History: method 1 (POSTing the mobile api at
    # api-product.chuchujie.com with a signed params blob) kept returning
    # "request parameter error" and was abandoned; the dead experimental
    # code has been removed. Method 2 below parses the PC page instead.
    tmp_url = 'http://wx.chuchujie.com/index.php?s=/WebProduct/product_detail/product_id/' + str(goods_id)
    # Plain requests worked at first, later returned nothing — use phantomjs.
    # body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True)
    my_phantomjs = MyPhantomjs()
    body = my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url)
    try:
        del my_phantomjs
    except:
        pass
    if body == '':
        print('获取到的body为空str!')
        self.result_data = {}
        return {}
    data = {}
    try:
        data['title'] = Selector(text=body).css('div.zy_info_rt h3::text').extract_first()
        if data['title'] == '':
            print('title为空!')
            raise Exception
        data['sub_title'] = ''
        data['shop_name'] = Selector(text=body).css('div.other.ft14.clearfix label b::text').extract_first()
        # All sample images of the goods.
        all_img_url = [{'img_url': item} for item in list(Selector(text=body).css('p.s_img label img::attr("src")').extract())]
        data['all_img_url'] = all_img_url
        # The PC page has no p_info equivalent.
        data['p_info'] = []
        # Goods description html block.
        div_desc = Selector(text=body).css('div.s_two').extract_first()
        if div_desc == '':
            print('div_desc为空!请检查!')
            raise Exception
        data['div_desc'] = div_desc
        # Spec names; the final <dt> is dropped (presumably not a spec — confirm).
        detail_name_list = Selector(text=body).css('div.info-wd.bd-red dl.detail dt::text').extract()
        if len(detail_name_list) <= 1:
            detail_name_list = []
        else:
            detail_name_list = [{'spec_name': item} for item in detail_name_list[:-1]]
        data['detail_name_list'] = detail_name_list
        # Original price vs discounted ("taobao") price.
        taobao_price = Selector(text=body).css('dl.detail p.price b::text').extract_first()
        price = Selector(text=body).css('dl.detail dd em.yjprice::text').extract_first()
        try:
            # Pattern also matches integer prices, not only floats.
            taobao_price = re.compile(r'(\d+\.{0,1}\d*)').findall(taobao_price)[0]
            price = re.compile(r'(\d+\.{0,1}\d*)').findall(price)[0]
        except IndexError:
            print('获取price失败,请检查!')
            raise IndexError
        if taobao_price == '' or price == '':
            print('获取到的taobao_price或者price为空值出错, 请检查!')
            raise Exception
        taobao_price = Decimal(taobao_price).__round__(2)
        price = Decimal(price).__round__(2)
        data['price'] = price
        data['taobao_price'] = taobao_price
        # Per-spec price/stock table.
        price_info_list = self.get_price_info_list(detail_name_list, body, price, taobao_price)
        if price_info_list == '':
            raise Exception
        else:
            data['price_info_list'] = price_info_list
        # Sold out? stock 0 -> flag the row deleted.
        all_stock = int(Selector(text=body).css('dl.detail dd label em::text').extract_first())
        if all_stock == 0:
            is_delete = 1
        else:
            is_delete = 0
        data['is_delete'] = is_delete
    except Exception as e:
        print('遇到错误: ', e)
        self.result_data = {}
        return {}
    if data != {}:
        self.result_data = data
        return data
    else:
        print('data为空!')
        self.result_data = {}  # Reset so stale data cannot leak into the next crawl.
        return {}
def get_pintuan_goods_info(self):
    '''
    Build the data urls and collect all recent mogujie group-buy goods.

    Crawls pages 1..99 of every fcid category via the PC search endpoint,
    normalizes items and hands them to deal_with_data().
    :return: None
    '''
    goods_list = []
    # History (method 1): the mobile list api's mw-sign signature could not
    # be reproduced, so the mobile approach was abandoned; its dead
    # experimental code has been removed. Method 2 below uses the PC list.
    self.my_phantomjs = MyPhantomjs()
    for key in self.fcid_dict:
        print('正在抓取的分类为: ', key)
        for index in range(1, 100):
            # Recycle the phantomjs session every 5 pages to curb memory growth.
            if index % 5 == 0:
                try:
                    del self.my_phantomjs
                except:
                    pass
                gc.collect()
                self.my_phantomjs = MyPhantomjs()
            fcid = self.fcid_dict[key]
            tmp_url = 'http://list.mogujie.com/search?page={0}&fcid={1}&algoKey=pc_tuan_book_pop&cKey=pc-tuan'.format(str(index), fcid)
            # requests worked initially but is now filtered — use phantomjs.
            # body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True)
            body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url)
            try:
                # The JSON payload is wrapped in a <pre> tag by the browser.
                body = re.compile(r'<pre.*?>(.*?)</pre>').findall(body)[0]
                tmp_data = json.loads(body)
            except:
                print('json.loads转换body时出错, 请检查')
                continue
            # Empty docs -> no more pages for this category.
            if tmp_data.get('result', {}).get('wall', {}).get('docs', []) == []:
                break
            tmp_item_list = tmp_data.get('result', {}).get('wall', {}).get('docs', [])
            begin_time_timestamp = int(time.time())  # Group-buy start timestamp (crawl time).
            item_list = [{
                'goods_id': item.get('tradeItemId', ''),
                'pintuan_time': {
                    'begin_time': self.timestamp_to_regulartime(timestamp=begin_time_timestamp),
                    'end_time': self.timestamp_to_regulartime(self.get_pintuan_end_time(begin_time_timestamp, item.get('leftTimeOrg', ''))),
                },
                'all_sell_count': str(item.get('salesVolume', 0)),
                'fcid': fcid,
                'page': index,
                'sort': key,
            } for item in tmp_item_list]
            print(item_list)
            for item_1 in item_list:
                goods_list.append(item_1)
            sleep(MOGUJIE_SLEEP_TIME)
    # Persist the collected items.
    print(goods_list)
    self.deal_with_data(goods_list)
    sleep(5)
def __init__(self):
    """Initialize the juanpi parser: headers, result container, phantomjs."""
    super(JuanPiParse, self).__init__()
    self.result_data = {}
    self._set_headers()
    self.my_phantomjs = MyPhantomjs()
def __init__(self):
    '''Prepare request headers and a phantomjs session.'''
    self._set_headers()
    self.my_phantomjs = MyPhantomjs()
async def get_goods_data(self, jumei_pintuan_url):
    '''
    Asynchronously fetch and parse raw data for one jumei group-buy goods.

    :param jumei_pintuan_url: goods page url; the (item_id, type) pair is
                              extracted from it
    :return: dict of parsed fields (also stored on self.result_data),
             or {} on any failure
    '''
    goods_id = await self.get_goods_id_from_url(jumei_pintuan_url)
    if goods_id == []:
        self.result_data = {}
        return {}
    # History: requests was filtered and aiohttp too slow — phantomjs is used.
    # Mobile group-buy page of the goods.
    goods_url = 'https://s.h5.jumei.com/yiqituan/detail?item_id={0}&type={1}'.format(goods_id[0], goods_id[1])
    self.msg = '------>>>| 对应手机端地址为: ' + goods_url
    self.my_lg.info(self.msg)
    # ajaxDetail endpoint carrying the structured goods data.
    tmp_url = 'https://s.h5.jumei.com/yiqituan/ajaxDetail?item_id={0}&type={1}'.format(str(goods_id[0]), [goods_id[1]][0])
    my_phantomjs = MyPhantomjs()
    body = my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url)
    try:
        # The JSON payload is rendered inside a <pre> tag.
        body = re.compile('<pre .*?>(.*)</pre>').findall(body)[0]
    except IndexError:
        body = ''
    tmp_body = my_phantomjs.use_phantomjs_to_get_url_body(url=goods_url)
    try:
        del my_phantomjs
    except:
        pass
    if body == '' or tmp_body == '':
        self.msg = '获取到的body为空str!' + ' 出错地址: ' + goods_url
        self.my_lg.error(self.msg)
        self.result_data = {}
        return {}
    data = await self.json_2_dict(json_str=body)
    if data == {}:
        self.msg = '出错地址: ' + goods_url
        self.my_lg.error(self.msg)
        self.result_data = {}
        return {}
    data = await self.wash_data(data=data)
    data = data.get('data', {})
    try:
        # Title comes from the second share_info entry; strip the brand word.
        data['title'] = data.get('share_info', [])[1].get('text', '')
        data['title'] = re.compile(r'聚美').sub('', data['title'])
        if len(data.get('buy_alone', {})) == 1:
            data['sub_title'] = ''
        else:
            data['sub_title'] = data.get('buy_alone', {}).get('name', '')
            data['sub_title'] = re.compile(r'聚美').sub('', data['sub_title'])
        if data['title'] == '':
            self.my_lg.error('获取到的title为空值, 请检查!')
            raise Exception
        # shop_name
        if data.get('shop_info') == []:
            data['shop_name'] = ''
        else:
            data['shop_name'] = data.get('shop_info', {}).get('store_title', '')
        # All sample images.
        all_img_url = await self.get_all_img_url(data=data)
        data['all_img_url'] = all_img_url
        # p_info
        p_info = await self.get_p_info(body=tmp_body)
        data['p_info'] = p_info
        # div_desc
        div_desc = await self.get_div_desc(body=tmp_body)
        div_desc = await MyAiohttp.wash_html(div_desc)
        data['div_desc'] = div_desc
        # On/off-shelf times are in the listing api, not fetched here.
        # detail_name_list
        detail_name_list = await self.get_detail_name_list(size_attr=data.get('buy_alone', {}).get('size_attr', []))
        data['detail_name_list'] = detail_name_list
        # Per-spec price and stock.
        true_sku_info = await self.get_true_sku_info(buy_alone_size=data.get('buy_alone', {}).get('size', []), size=data.get('size', []), group_single_price=data.get('group_single_price', ''))
        data['price_info_list'] = true_sku_info
        # is_delete
        product_status = data.get('product_status', '')
        is_delete = await self.get_is_delete(product_status=product_status, true_sku_info=true_sku_info)
        data['is_delete'] = is_delete
        # all_sell_count
        all_sell_count = data.get('buyer_number_text', '')
        if all_sell_count != '':
            all_sell_count = re.compile(r'(\d+\.?\d*)').findall(all_sell_count)[0]
            # NOTE(review): '万' is searched AFTER the digits-only extraction
            # above, so it can never match here — the *10000 scaling looks
            # unreachable; confirm against real buyer_number_text values.
            is_W = re.compile(r'万').findall(all_sell_count)
            if is_W != []:
                all_sell_count = str(int(float(all_sell_count) * 10000))
        else:
            all_sell_count = '0'
        data['all_sell_count'] = all_sell_count
        data['goods_url'] = goods_url
    except Exception as e:
        self.msg = '遇到错误如下: ' + str(e) + ' 出错地址: ' + goods_url
        self.my_lg.error(self.msg)
        self.my_lg.exception(e)
        self.result_data = {}  # Reset so stale data cannot leak into the next crawl.
        return {}
    if data != {}:
        self.result_data = data
        return data
    else:
        self.msg = 'data为空!' + ' 出错地址: ' + goods_url
        self.my_lg.error(self.msg)
        self.result_data = {}  # Reset so stale data cannot leak into the next crawl.
        return {}
def run_forever(self):
    '''
    Continuously refresh stored mogujie group-buy goods data.

    Pulls all candidate goods ids from the DB, deletes expired or
    delisted rows, and re-crawls/updates rows in the refresh window.
    :return: None
    '''
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    try:
        result = list(tmp_sql_server.select_mogujie_pintuan_all_goods_id())
    except TypeError:
        print('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None
    if result is None:
        pass
    else:
        print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        print(result)
        print('--------------------------------------------------------')
        print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
        index = 1
        self.my_phantomjs = MyPhantomjs()
        for item in result:  # 实时更新数据
            # item[1] is a JSON schedule blob; convert end_time to a
            # 10-digit unix timestamp.
            pintuan_end_time = json.loads(item[1]).get('end_time')
            pintuan_end_time = int(str(time.mktime(time.strptime(pintuan_end_time, '%Y-%m-%d %H:%M:%S')))[0:10])
            data = {}
            mogujie_pintuan = MoGuJieParse()
            # Recycle phantomjs every 8 rows to curb memory growth.
            if index % 8 == 0:
                try:
                    del self.my_phantomjs
                except:
                    pass
                gc.collect()
                self.my_phantomjs = MyPhantomjs()
            if index % 50 == 0:  # Reconnect every 50 rows to avoid a stale long-lived connection.
                print('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')
            if tmp_sql_server.is_connect_success:
                # is_recent_time: 0 = expired, 2 = outside window, 1 = update.
                if self.is_recent_time(pintuan_end_time) == 0:
                    tmp_sql_server.delete_mogujie_pintuan_expired_goods_id(goods_id=item[0])
                    print('过期的goods_id为(%s)' % item[0], ', 拼团开始时间为(%s), 删除成功!' % json.loads(item[1]).get('begin_time'))
                elif self.is_recent_time(pintuan_end_time) == 2:
                    # break  # 跳出循环
                    pass  # 此处应该是pass,而不是break,因为数据库传回的goods_id不都是按照顺序的
                else:  # 返回1,表示在待更新区间内
                    print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (item[0], index))
                    data['goods_id'] = item[0]
                    tmp_url = 'http://list.mogujie.com/search?page={0}&fcid={1}&algoKey=pc_tuan_book_pop&cKey=pc-tuan'.format(item[3], item[2])
                    # requests fails on cert validation here — phantomjs instead.
                    # body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True)
                    body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url)
                    if body == '':
                        print('获取到的body为空值! 此处跳过')
                    else:
                        try:
                            body = re.compile(r'<pre.*?>(.*?)</pre>').findall(body)[0]
                            tmp_data = json.loads(body)
                        except:
                            print('json.loads转换body时出错, 请检查')
                            tmp_data = {}
                        if tmp_data.get('result', {}).get('wall', {}).get('docs', []) == []:
                            # Empty docs -> the whole listing is gone; drop the row.
                            print('得到的docs为[]!')
                            print('该商品已被下架限时秒杀活动,此处将其删除')
                            tmp_sql_server.delete_mogujie_pintuan_expired_goods_id(goods_id=item[0])
                            print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                            pass
                        else:
                            tmp_item_list = tmp_data.get('result', {}).get('wall', {}).get('docs', [])
                            begin_time_timestamp = int(time.time())  # 开始拼团的时间戳
                            # NOTE(review): the comprehension variable `item`
                            # shadows the outer DB row `item`; safe only
                            # because the row is re-read after the loop ends.
                            item_list = [{
                                'goods_id': item.get('tradeItemId', ''),
                                'pintuan_time': {
                                    'begin_time': self.timestamp_to_regulartime(timestamp=begin_time_timestamp),
                                    'end_time': self.timestamp_to_regulartime(self.get_pintuan_end_time(begin_time_timestamp, item.get('leftTimeOrg', ''))),
                                },
                                'all_sell_count': str(item.get('salesVolume', 0)),
                            } for item in tmp_item_list]
                            pintuan_goods_all_goods_id = [item_1.get('goods_id', '') for item_1 in item_list]
                            # 内部已经下架的(内部下架的其实并未真实下架,还在卖的,所以我就更新其商品信息数据,不更新上下架时间)
                            if item[0] not in pintuan_goods_all_goods_id:
                                mogujie_pintuan.get_goods_data(goods_id=item[0])
                                goods_data = mogujie_pintuan.deal_with_data()
                                if goods_data == {}:
                                    pass
                                else:  # Normalize and persist without touching schedule times.
                                    print('+++ 内部下架,其实还在售卖的商品更新')
                                    tmp_price_info_list = goods_data['price_info_list']
                                    price_info_list = [{
                                        'spec_value': item_4.get('spec_value'),
                                        'pintuan_price': item_4.get('detail_price'),
                                        'normal_price': item_4.get('normal_price'),
                                        'img_url': item_4.get('img_url'),
                                        'rest_number': item_4.get('rest_number'),
                                    } for item_4 in tmp_price_info_list]
                                    goods_data['goods_id'] = item[0]
                                    goods_data['price_info_list'] = price_info_list
                                    mogujie_pintuan.update_mogujie_pintuan_table_2(data=goods_data, pipeline=tmp_sql_server)
                                    sleep(MOGUJIE_SLEEP_TIME)  # 放慢速度
                            else:  # 未下架的
                                for item_2 in item_list:
                                    if item_2.get('goods_id', '') == item[0]:
                                        mogujie_pintuan.get_goods_data(goods_id=item[0])
                                        goods_data = mogujie_pintuan.deal_with_data()
                                        if goods_data == {}:
                                            pass
                                        else:  # Normalize, including fresh schedule/sales data.
                                            tmp_price_info_list = goods_data['price_info_list']
                                            price_info_list = [{
                                                'spec_value': item_4.get('spec_value'),
                                                'pintuan_price': item_4.get('detail_price'),
                                                'normal_price': item_4.get('normal_price'),
                                                'img_url': item_4.get('img_url'),
                                                'rest_number': item_4.get('rest_number'),
                                            } for item_4 in tmp_price_info_list]
                                            goods_data['goods_id'] = item[0]
                                            goods_data['price_info_list'] = price_info_list
                                            goods_data['pintuan_time'] = item_2.get('pintuan_time', {})
                                            goods_data['pintuan_begin_time'], goods_data['pintuan_end_time'] = self.get_pintuan_begin_time_and_pintuan_end_time(pintuan_time=goods_data['pintuan_time'])
                                            goods_data['all_sell_count'] = item_2.get('all_sell_count', '')
                                            mogujie_pintuan.update_mogujie_pintuan_table(data=goods_data, pipeline=tmp_sql_server)
                                            sleep(MOGUJIE_SLEEP_TIME)  # 放慢速度
                                    else:
                                        pass
            else:
                print('数据库连接失败,此处跳过!')
                pass
            index += 1
            gc.collect()
        print('全部数据更新完毕'.center(100, '#'))
    # sleep(60*60)
    if get_shanghai_time().hour == 0:  # 0点以后不更新
        sleep(60 * 60 * 5.5)
    else:
        sleep(5)
    gc.collect()
def deal_with_data(self, *params):
    '''
    Process and store the given flash-sale goods items.

    :param params: params[0] is the item list (dicts with 'goods_id',
                   'sub_title', 'gender', 'page', ...)
    :return: None
    '''
    item_list = params[0]
    chuchujie = ChuChuJie_9_9_Parse()
    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
    if my_pipeline.is_connect_success:
        # Ids already stored, to skip duplicates.
        db_goods_id_list = [item[0] for item in list(my_pipeline.select_chuchujie_xianshimiaosha_all_goods_id())]
        for item in item_list:
            if item.get('goods_id', '') in db_goods_id_list:
                print('该goods_id已经存在于数据库中, 此处跳过')
                pass
            else:
                goods_id = item.get('goods_id', '')
                tmp_url = 'https://m.chuchujie.com/details/detail.html?id=' + str(goods_id)
                chuchujie.get_goods_data(goods_id=goods_id)
                goods_data = chuchujie.deal_with_data()
                if goods_data == {}:  # Empty parse result — skip.
                    pass
                elif goods_data.get('is_delete', 0) == 1:  # is_delete=1 (zero stock) — skip.
                    print('------>>>| 该商品库存为0,已被抢光!')
                    pass
                else:  # Parse remaining-time and insert.
                    # Fresh phantomjs per goods, only to read the countdown.
                    my_phantomjs = MyPhantomjs()
                    my_phantomjs.init_phantomjs()
                    tmp_body = my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url, css_selector='p#activityTime span')
                    try:
                        del my_phantomjs
                    except:
                        pass
                    gc.collect()
                    if tmp_body == '':  # Failed to fetch the mobile page html.
                        sleep(.4)
                        pass
                    else:
                        # Countdown text from p#activityTime span, '剩余' prefix stripped.
                        _t = Selector(text=tmp_body).css('p#activityTime span::text').extract_first()
                        _t = re.compile(r'剩余').sub('', _t)
                        if _t == '' or _t is None:
                            print('获取到的_t为空值, 严重错误! 请检查!')
                        # NOTE(review): execution falls through even when _t
                        # is empty/None — get_miaosha_end_time then receives
                        # a bad value; a continue/skip looks intended here.
                        miaosha_end_time = self.get_miaosha_end_time(_t)
                        goods_data['goods_url'] = tmp_url
                        goods_data['goods_id'] = str(goods_id)
                        goods_data['sub_title'] = item.get('sub_title', '')
                        goods_data['miaosha_time'] = {
                            'miaosha_begin_time': self.timestamp_to_regulartime(int(time.time())),
                            'miaosha_end_time': self.timestamp_to_regulartime(int(miaosha_end_time)),
                        }
                        goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = self.get_miaosha_begin_time_and_miaosha_end_time(miaosha_time=goods_data['miaosha_time'])
                        goods_data['gender'] = str(item.get('gender', '0'))
                        goods_data['page'] = item.get('page')
                        chuchujie.insert_into_chuchujie_xianshimiaosha_table(data=goods_data, pipeline=my_pipeline)
                        # sleep(CHUCHUJIE_SLEEP_TIME)  # phantomjs init is slow enough already
    else:
        print('数据库连接失败,此处跳过!')
        pass
    try:
        del chuchujie
    except:
        pass
    gc.collect()
def __init__(self):
    """Initialize result container, request headers and the phantomjs driver."""
    self.result_data = {}
    self._set_headers()
    # self.set_cookies_key_api_uid()  # set the api_uid cookie value (disabled)
    self.my_phantomjs = MyPhantomjs()
def run_forever(self):
    '''
    Continuously refresh stored jumei flash-sale goods data.

    Pulls all candidate goods ids from the DB, deletes expired/delisted
    rows, and re-crawls rows inside the refresh window.
    :return: None (False when session cookies cannot be obtained)
    '''
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    try:
        result = list(tmp_sql_server.select_jumeiyoupin_xianshimiaosha_all_goods_id())
    except TypeError:
        print('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None
    if result is None:
        pass
    else:
        print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        print(result)
        print('--------------------------------------------------------')
        print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
        index = 1
        jumeiyoupin_spike = JuMeiYouPinSpike()
        # phantomjs is only needed long enough to obtain session cookies.
        my_phantomjs = MyPhantomjs()
        cookies = my_phantomjs.get_url_cookies_from_phantomjs_session(url='https://h5.jumei.com/')
        try:
            del my_phantomjs
        except:
            pass
        if cookies == '':
            print('!!! 获取cookies失败 !!!')
            return False
        print('获取cookies成功!')
        self.headers.update(Cookie=cookies)
        for item in result:  # 实时更新数据
            # item[1] is a JSON schedule blob; convert miaosha_end_time to
            # a 10-digit unix timestamp.
            miaosha_end_time = json.loads(item[1]).get('miaosha_end_time')
            miaosha_end_time = int(str(time.mktime(time.strptime(miaosha_end_time, '%Y-%m-%d %H:%M:%S')))[0:10])
            data = {}
            # Declared inside the loop (then dropped each pass) to release memory.
            jumeiyoupin_miaosha = JuMeiYouPinParse()
            if index % 50 == 0:  # Reconnect every 50 rows to avoid a stale long-lived connection.
                print('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')
            if tmp_sql_server.is_connect_success:
                # is_recent_time: 0 = expired, 2 = outside window, 1 = update.
                if self.is_recent_time(miaosha_end_time) == 0:
                    tmp_sql_server.delete_jumeiyoupin_miaosha_expired_goods_id(goods_id=item[0])
                    print('过期的goods_id为(%s)' % item[0], ', 限时秒杀结束时间为(%s), 删除成功!' % json.loads(item[1]).get('miaosha_end_time'))
                elif self.is_recent_time(miaosha_end_time) == 2:
                    # break  # 跳出循环
                    pass  # 此处应该是pass,而不是break,因为数据库传回的goods_id不都是按照顺序的
                else:  # 返回1,表示在待更新区间内
                    print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (item[0], index))
                    data['goods_id'] = item[0]
                    this_page_all_goods_list = self.get_one_page_all_goods_list(item[2])
                    if this_page_all_goods_list == '网络错误!':
                        print('网络错误!先跳过')
                        continue
                    elif this_page_all_goods_list == []:
                        # Whole page empty -> goods removed from the sale; drop it.
                        print('#### 该page对应得到的this_page_all_goods_list为空[]!')
                        print('** 该商品已被下架限时秒杀活动, 此处将其删除')
                        tmp_sql_server.delete_jumeiyoupin_miaosha_expired_goods_id(item[0])
                        print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                        pass
                    else:
                        # 由于不会内部提前下架,所以在售卖时间内的全部进行相关更新
                        tmp_r = jumeiyoupin_miaosha.get_goods_id_from_url(item[3])
                        jumeiyoupin_miaosha.get_goods_data(goods_id=tmp_r)
                        goods_data = jumeiyoupin_miaosha.deal_with_data()
                        if goods_data == {}:  # Empty parse result — skip.
                            pass
                        else:
                            goods_data['goods_id'] = str(item[0])
                            goods_data['miaosha_time'] = {
                                'miaosha_begin_time': goods_data['schedule'].get('begin_time', ''),
                                'miaosha_end_time': goods_data['schedule'].get('end_time', ''),
                            }
                            goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = jumeiyoupin_spike.get_miaosha_begin_time_and_miaosha_end_time(miaosha_time=goods_data['miaosha_time'])
                            jumeiyoupin_miaosha.update_jumeiyoupin_xianshimiaosha_table(data=goods_data, pipeline=tmp_sql_server)
                            sleep(JUMEIYOUPIN_SLEEP_TIME)
            else:
                print('数据库连接失败,数据库可能关闭或者维护中')
                pass
            index += 1
            gc.collect()
        print('全部数据更新完毕'.center(100, '#'))
    # sleep(60*60)
    if get_shanghai_time().hour == 0:  # 0点以后不更新
        sleep(60 * 60 * 5.5)
    else:
        sleep(5)
    gc.collect()
def __init__(self):
    """Initialize parser: headers, activity flag, result container, phantomjs."""
    super().__init__()
    self._set_headers()
    self.is_activity_goods = False
    self.result_data = {}
    self.my_phantomjs = MyPhantomjs()