def __init__(self):
    """Set up request headers, the IP pool type and the phantomjs driver."""
    self._set_headers()
    self.ip_pool_type = IP_POOL_TYPE
    self.driver = BaseDriver(
        executable_path=PHANTOMJS_DRIVER_PATH,
        ip_pool_type=self.ip_pool_type,
    )
def _get_one_sort_type_name_page_info(self, sort_type_name):
    '''
    Crawl every page of one category and collect the article info.
    :param sort_type_name: category slug used to build the list url
    :return: list of parsed article results
    '''
    base_url = 'http://m.gx8899.com/{0}/'.format(sort_type_name)
    headers = {
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': get_random_pc_ua(),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        # 'Referer': 'http://m.gx8899.com/weixin/',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
    }
    index = 0
    res = []
    while True:
        if index == 0:
            url = base_url
            index += 1
        else:
            # pagination starts from index_2 for the second page
            url = base_url + 'index_{0}.html'.format(index)
        self.lg.info('正在抓取{0}'.format(url))
        # plain requests were too slow here, phantomjs is used instead
        # body = self._get_loop_run_result(url=url, headers=headers)
        if index % 15 == 0:
            # periodically recycle the phantomjs process to avoid leaks
            try:
                del self.driver
            except Exception:
                pass
            gc.collect()
            self.driver = BaseDriver(
                executable_path=PHANTOMJS_DRIVER_PATH,
                logger=self.lg,
                ip_pool_type=self.ip_pool_type)
            self.lg.info('[+] phantomjs已重置!')
        body = self.driver.get_url_body(url=url)
        # self.lg.info(str(body))
        if re.compile(r'<title>404 - 找不到文件或目录。</title>').findall(body) != []:
            # a 404 page marks the end of the pagination
            break
        need = Selector(text=body).css(
            'div#con_tabone_1 li.last a:last-child ::attr(href)').extract()
        pprint(need)
        if need == []:
            self.lg.error('获取到的need为空list!出错地址:{0}'.format(url))
            # BUGFIX: advance to the next page before continuing; the
            # original continued without incrementing index, so the same
            # failing url was fetched forever (infinite loop).
            index += 1
            continue
        for article_url in need:
            _ = self._get_one_article_page_info(article_url)
            if _ != []:
                res += _
        self.lg.info('#### 已更新{0}个id !'.format(self.update_index))
        index += 1
    return res
async def _get_cookies(self) -> str:
    '''
    Fetch the cookies required by subsequent requests via phantomjs.
    :return: cookie string ('' when fetching failed)
    '''
    my_phantomjs = BaseDriver(
        executable_path=PHANTOMJS_DRIVER_PATH,
        ip_pool_type=self.ip_pool_type)
    cookies = my_phantomjs.get_url_cookies_from_phantomjs_session(
        url='https://h5.jumei.com/')
    try:
        del my_phantomjs
    except Exception:
        pass
    if cookies == '':
        self.lg.error('!!! 获取cookies失败 !!!')
    else:
        # BUGFIX: the success message used to be logged even when the
        # fetch failed; it is now only logged for a non-empty result.
        self.lg.info('获取cookies成功!')
    return cookies
async def get_pintuan_goods_info(self):
    '''
    Build the data urls and fetch all recent group-buy (拼团) goods info.
    :return: list of goods dicts, deduplicated by goods_id
    '''
    s_time = time.time()
    goods_list = []
    # Keep one persistent set of seen ids instead of rebuilding
    # list(set(...)) for every page (the original was O(pages * goods)).
    seen_goods_ids = set()
    driver = BaseDriver(
        executable_path=PHANTOMJS_DRIVER_PATH,
        logger=self.my_lg,
        ip_pool_type=self.ip_pool_type)
    try:
        for key in self.tab_dict:
            self.msg = '正在抓取的分类为: ' + key
            self.my_lg.info(self.msg)
            for index in range(1, 20):
                item_list = await self.get_one_page_goods_list(
                    driver=driver,
                    key=key,
                    tab=self.tab_dict[key],
                    index=index)
                for item in item_list:
                    goods_id = item.get('goods_id', '')
                    if goods_id not in seen_goods_ids:
                        seen_goods_ids.add(goods_id)
                        goods_list.append(item)
                # await asyncio.sleep(.5)
    finally:
        # BUGFIX: always release the phantomjs driver, even when the
        # crawl loop raises (the original leaked it on exception).
        try:
            del driver
        except Exception:
            pass
    self.my_lg.info(str(goods_list))
    self.my_lg.info('本次抓到所有拼团商品个数为: ' + str(len(goods_list)))
    e_time = time.time()
    self.my_lg.info('总用时:' + str(e_time - s_time))
    await asyncio.sleep(3)
    return goods_list
def __init__(self):
    """Initialize headers, the ip pool type and the underlying driver."""
    self._set_headers()
    self.ip_pool_type = IP_POOL_TYPE
    self.driver = BaseDriver(
        executable_path=EXECUTABLE_PATH,
        ip_pool_type=self.ip_pool_type)
class PinduoduoSpike(object):
    """Spider for Pinduoduo flash-sale (限时秒杀) goods: crawls the
    today/tomorrow/future spike APIs and stores new goods in the DB."""

    def __init__(self):
        self._set_headers()
        self.ip_pool_type = IP_POOL_TYPE
        self.driver = BaseDriver(
            executable_path=EXECUTABLE_PATH,
            ip_pool_type=self.ip_pool_type)

    def _set_headers(self):
        # Static request headers; the User-Agent is randomized per instance.
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            # 'Accept-Encoding:': 'gzip',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'm.juanpi.com',
            'User-Agent': get_random_pc_ua(),  # random user agent
        }

    def _get_db_goods_id_list(self) -> list:
        # Return the goods_id values already stored in the database.
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        _ = my_pipeline._select_table(sql_str=pd_select_str_3)
        assert _ is not None, 'db_goods_id_list为None!'
        db_goods_id_list = [item[0] for item in list(_)]
        try:
            del my_pipeline
        except:
            pass
        return db_goods_id_list

    def get_spike_hour_goods_info(self):
        '''
        Build the data urls and fetch all recent flash-sale goods info,
        inserting goods not yet present in the database.
        :return: None
        '''
        all_miaosha_goods_list = self.get_all_miaosha_goods_list()
        try:
            del self.driver
        except:
            pass
        gc.collect()
        pinduoduo = PinduoduoParse()
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        if my_pipeline.is_connect_success:
            self.db_goods_id_list = self._get_db_goods_id_list()
            for item in all_miaosha_goods_list:
                # NOTE: tomorrow-8:30 pages may still be loading and
                # return empty values at crawl time.
                if item.get('goods_id') != 'None':  # skip goods_id == 'None'
                    if item.get('goods_id', '') in self.db_goods_id_list:
                        print('该goods_id已经存在于数据库中, 此处跳过')
                        pass
                    else:
                        tmp_url = 'http://mobile.yangkeduo.com/goods.html?goods_id=' + item.get('goods_id')
                        pinduoduo.get_goods_data(goods_id=item.get('goods_id'))
                        goods_data = pinduoduo.deal_with_data()
                        # print(goods_data)
                        if goods_data == {}:  # skip when the returned data is empty
                            print('得到的goods_data为空值,此处先跳过,下次遍历再进行处理')
                            # sleep(3)
                            pass
                        else:  # otherwise fill in the parsed data and insert
                            goods_data['stock_info'] = item.get('stock_info')
                            goods_data['goods_id'] = item.get('goods_id')
                            goods_data['spider_url'] = tmp_url
                            goods_data['username'] = '******'
                            goods_data['price'] = item.get('price')  # original special price before the flash sale
                            goods_data['taobao_price'] = item.get('taobao_price')  # flash-sale price
                            goods_data['sub_title'] = item.get('sub_title', '')
                            goods_data['miaosha_time'] = item.get('miaosha_time')
                            goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(miaosha_time=item.get('miaosha_time'))
                            if item.get('stock_info', {}).get('activity_stock', 0) <= 2:
                                # mark as sold out when live stock is <= 2
                                print('该秒杀商品已售罄...')
                                goods_data['is_delete'] = 1
                            pinduoduo.insert_into_pinduoduo_xianshimiaosha_table(data=goods_data, pipeline=my_pipeline)
                            sleep(PINDUODUO_SLEEP_TIME)
                else:
                    print('该goods_id为"None", 此处跳过')
                    pass
            sleep(5)
        else:
            pass
        try:
            del pinduoduo
        except:
            pass
        gc.collect()

    def get_all_miaosha_goods_list(self):
        '''Fetch and merge today / tomorrow / future flash-sale lists.'''
        def get_data(body):
            '''Extract the payload from the returned body.'''
            _ = '{}'
            try:
                _ = re.compile(r'<body>(.*)</body>').findall(body)[0]
            except IndexError:
                print('获取all_miaosha_goods_list出现索引异常!')
            return _

        # today's flash sales
        tmp_url = 'http://apiv4.yangkeduo.com/api/spike/v2/list/today?page=0&size=2000'
        print('待爬取的今日限时秒杀数据的地址为: ', tmp_url)
        today_data = get_data(body=self.driver.use_phantomjs_to_get_url_body(url=tmp_url))
        today_data = self.json_to_dict(tmp_data=today_data)
        sleep(PINDUODUO_SLEEP_TIME)
        # tomorrow's flash sales
        tmp_url_2 = 'http://apiv4.yangkeduo.com/api/spike/v2/list/tomorrow?page=0&size=2000'
        print('待爬取的明日限时秒杀数据的地址为: ', tmp_url_2)
        tomorrow_data = get_data(body=self.driver.use_phantomjs_to_get_url_body(url=tmp_url_2))
        tomorrow_data = self.json_to_dict(tmp_data=tomorrow_data)
        sleep(PINDUODUO_SLEEP_TIME)
        # future flash sales
        tmp_url_3 = 'http://apiv4.yangkeduo.com/api/spike/v2/list/all_after?page=0&size=2000'
        print('待爬取的未来限时秒杀数据的地址为: ', tmp_url_3)
        all_after_data = get_data(body=self.driver.use_phantomjs_to_get_url_body(url=tmp_url_3))
        all_after_data = self.json_to_dict(tmp_data=all_after_data)
        sleep(PINDUODUO_SLEEP_TIME)
        if today_data != []:
            today_miaosha_goods_list = self.get_miaoshao_goods_info_list(data=today_data)
            # print('今日限时秒杀的商品list为: ', today_miaosha_goods_list)
        else:
            today_miaosha_goods_list = []
            print('今日秒杀的items为[]')
        if tomorrow_data != []:
            tomorrow_miaosha_goods_list = self.get_miaoshao_goods_info_list(data=tomorrow_data)
            # print('明日限时秒杀的商品list为: ', tomorrow_miaosha_goods_list)
        else:
            tomorrow_miaosha_goods_list = []
            print('明日秒杀的items为[]')
        if all_after_data != []:
            all_after_miaosha_goods_list = self.get_miaoshao_goods_info_list(data=all_after_data)
            # print('未来限时秒杀的商品list为: ', all_after_miaosha_goods_list)
        else:
            all_after_miaosha_goods_list = []
            print('未来秒杀的items为[]')
        all_miaosha_goods_list = today_miaosha_goods_list
        for item in tomorrow_miaosha_goods_list:
            all_miaosha_goods_list.append(item)
        for item in all_after_miaosha_goods_list:
            all_miaosha_goods_list.append(item)
        print('当前所有限时秒杀商品list为: ', all_miaosha_goods_list)
        return all_miaosha_goods_list

    def json_to_dict(self, tmp_data):
        '''Parse the json payload; return the 'items' list ([] on failure).'''
        try:
            data = json.loads(tmp_data)
            # pprint(data)
            times = [str(timestamp_to_regulartime(int(item))) for item in data.get('times', [])]
            data = data.get('items', [])
            # print(data)
            # print(times)
        except:
            print('json.loads转换data的时候出错,data为空')
            data = []
        return data

    def get_miaoshao_goods_info_list(self, data):
        '''
        Extract the useful flash-sale goods info.
        :param data: raw item list to parse
        :return: list of useful-info dicts
        '''
        miaosha_goods_list = []
        for item in data:
            tmp = {}
            miaosha_begin_time = str(timestamp_to_regulartime(int(item.get('data', {}).get('start_time'))))
            tmp_hour = miaosha_begin_time[-8:-6]
            if tmp_hour in PINDUODUO_MIAOSHA_SPIDER_HOUR_LIST:
                if tmp_hour in PINDUODUO_MIAOSHA_BEGIN_HOUR_LIST:
                    # flash sales starting at these hours only run 30 minutes
                    miaosha_end_time = str(timestamp_to_regulartime(int(item.get('data', {}).get('start_time')) + 60*30))
                else:
                    miaosha_end_time = str(timestamp_to_regulartime(int(item.get('data', {}).get('start_time')) + 60*60))
                tmp['miaosha_time'] = {
                    'miaosha_begin_time': miaosha_begin_time,
                    'miaosha_end_time': miaosha_end_time,
                }
                # goods_id of the item (original comment said "卷皮" —
                # presumably a copy-paste leftover; this is Pinduoduo data)
                tmp['goods_id'] = str(item.get('data', {}).get('goods_id'))
                # flash-sale stock info
                tmp['stock_info'] = {
                    'activity_stock': int(item.get('data', {}).get('all_quantity', 0) - item.get('data', {}).get('sold_quantity', 0)),
                    'stock': item.get('data', {}).get('all_quantity', 0),
                }
                # original price (api values are in cents, hence /100)
                tmp['price'] = round(float(item.get('data', {}).get('normal_price', '0'))/100, 2)
                tmp['taobao_price'] = round(float(item.get('data', {}).get('price', '0'))/100, 2)
                miaosha_goods_list.append(tmp)
            else:
                pass
        return miaosha_goods_list

    def __del__(self):
        try:
            del self.driver
        except:
            pass
        gc.collect()
def get_spike_hour_goods_info(self):
    '''
    Build the data urls and fetch all recent flash-sale goods info.
    :return: True on success, False when cookies could not be fetched
    '''
    all_goods_list = []
    self.my_phantomjs = BaseDriver(
        executable_path=PHANTOMJS_DRIVER_PATH,
        ip_pool_type=self.ip_pool_type)
    cookies = self.my_phantomjs.get_url_cookies_from_phantomjs_session(
        url='https://h5.jumei.com/')
    try:
        del self.my_phantomjs
    except Exception:
        pass
    if cookies == '':
        print('!!! 获取cookies失败 !!!')
        return False
    print('获取cookies成功!')
    self.headers.update(Cookie=cookies)

    # Persistent set for O(1) dedup; the original rebuilt a list of all
    # seen item_ids for every single item (O(n^2) overall).
    seen_item_ids = set()

    def _crawl_pages(sale_type, page_key):
        # One pass over the paged api; stops at the first empty page.
        # The two original loops were identical except for type/page_key.
        for page in range(1, 50):  # pages start at 1
            tmp_url = 'https://h5.jumei.com/index/ajaxDealactList?card_id=4057&page={0}&platform=wap&type={1}&page_key={2}'.format(
                str(page), sale_type, page_key)
            print('正在抓取的page为:', page, ', 接口地址为: ', tmp_url)
            json_body = json_2_dict(Requests.get_url_body(
                url=tmp_url, headers=self.headers,
                ip_pool_type=self.ip_pool_type), default_res={})
            # print(json_body)
            this_page_item_list = json_body.get('item_list', [])
            if this_page_item_list == []:
                print('@@@@@@ 所有接口数据抓取完毕 !')
                break
            for item in this_page_item_list:
                item_id = item.get('item_id', '')
                if item_id not in seen_item_ids:
                    seen_item_ids.add(item_id)
                    item['page'] = page
                    all_goods_list.append(item)
            sleep(.5)

    print('开始抓取在售商品...')
    _crawl_pages('formal', '1521336720')
    print('开始抓取预售商品...')
    _crawl_pages('pre', '1521858480')

    all_goods_list = [{
        'goods_id': str(item.get('item_id', '')),
        'type': item.get('type', ''),
        'page': item.get('page')
    } for item in all_goods_list if item.get('item_id') is not None]
    print(all_goods_list)
    print('本次抓取到共有限时商品个数为: ', all_goods_list.__len__())
    self.deal_with_data(all_goods_list)
    return True
class JuMeiYouPinSpike(object):
    """Spider for JuMeiYouPin flash-sale goods: crawls the on-sale and
    pre-sale listing APIs and stores new goods via the SQL pipeline."""

    def __init__(self):
        self._set_headers()
        self.ip_pool_type = IP_POOL_TYPE

    def _set_headers(self):
        # Static request headers; Cookie is injected later at crawl time.
        self.headers = {
            'Accept': 'application/json,text/javascript,text/plain,*/*;q=0.01',
            # 'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            # 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Host': 'h5.jumei.com',
            'Referer': 'https://h5.jumei.com/',
            'Cache-Control': 'max-age=0',
            'X-Requested-With': 'XMLHttpRequest',
            'User-Agent': get_random_pc_ua(),  # random user agent
        }

    def get_spike_hour_goods_info(self):
        '''
        Build the data urls and fetch all recent flash-sale goods info.
        :return: True on success, False when cookies could not be fetched
        '''
        all_goods_list = []
        self.my_phantomjs = BaseDriver(executable_path=PHANTOMJS_DRIVER_PATH, ip_pool_type=self.ip_pool_type)
        cookies = self.my_phantomjs.get_url_cookies_from_phantomjs_session(
            url='https://h5.jumei.com/')
        try:
            del self.my_phantomjs
        except:
            pass
        if cookies == '':
            print('!!! 获取cookies失败 !!!')
            return False
        print('获取cookies成功!')
        self.headers.update(Cookie=cookies)
        print('开始抓取在售商品...')
        for page in range(1, 50):  # pages start at 1
            tmp_url = 'https://h5.jumei.com/index/ajaxDealactList?card_id=4057&page={0}&platform=wap&type=formal&page_key=1521336720'.format(
                str(page))
            print('正在抓取的page为:', page, ', 接口地址为: ', tmp_url)
            json_body = json_2_dict(Requests.get_url_body(
                url=tmp_url, headers=self.headers,
                ip_pool_type=self.ip_pool_type), default_res={})
            # print(json_body)
            this_page_item_list = json_body.get('item_list', [])
            if this_page_item_list == []:
                print('@@@@@@ 所有接口数据抓取完毕 !')
                break
            for item in this_page_item_list:
                # dedup by item_id across both loops
                if item.get('item_id', '') not in [
                        item_1.get('item_id', '')
                        for item_1 in all_goods_list
                ]:
                    item['page'] = page
                    all_goods_list.append(item)
            sleep(.5)
        print('开始抓取预售商品...')
        for page in range(1, 50):  # pages start at 1
            tmp_url = 'https://h5.jumei.com/index/ajaxDealactList?card_id=4057&page={0}&platform=wap&type=pre&page_key=1521858480'.format(
                str(page))
            print('正在抓取的page为:', page, ', 接口地址为: ', tmp_url)
            json_body = json_2_dict(Requests.get_url_body(
                url=tmp_url, headers=self.headers,
                ip_pool_type=self.ip_pool_type), default_res={})
            # print(json_body)
            this_page_item_list = json_body.get('item_list', [])
            if this_page_item_list == []:
                print('@@@@@@ 所有接口数据抓取完毕 !')
                break
            for item in this_page_item_list:
                if item.get('item_id', '') not in [
                        item_1.get('item_id', '')
                        for item_1 in all_goods_list
                ]:
                    item['page'] = page
                    all_goods_list.append(item)
            sleep(.5)
        all_goods_list = [{
            'goods_id': str(item.get('item_id', '')),
            'type': item.get('type', ''),
            'page': item.get('page')
        } for item in all_goods_list if item.get('item_id') is not None]
        print(all_goods_list)
        print('本次抓取到共有限时商品个数为: ', all_goods_list.__len__())
        self.deal_with_data(all_goods_list)
        return True

    def deal_with_data(self, *params):
        '''
        Parse and store the flash-sale goods data.
        :param params: params[0] is the goods item list
        :return: None
        '''
        item_list = params[0]
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        if my_pipeline.is_connect_success:
            _ = list(my_pipeline._select_table(sql_str=jm_select_str_2))
            db_goods_id_list = [item[0] for item in _]
            # print(db_goods_id_list)
            for item in item_list:
                if item.get('goods_id', '') in db_goods_id_list:
                    print('该goods_id已经存在于数据库中, 此处跳过')
                    pass
                else:
                    jumei = JuMeiYouPinParse()
                    goods_id = item.get('goods_id', '')
                    # NOTE(review): `type` shadows the builtin here
                    type = item.get('type', '')
                    tmp_url = 'https://h5.jumei.com/product/detail?item_id={0}&type={1}'.format(
                        goods_id, type)
                    jumei.get_goods_data(goods_id=[goods_id, type])
                    goods_data = jumei.deal_with_data()
                    if goods_data == {}:
                        pass
                    elif goods_data.get('is_delete', 0) == 1:
                        print('------>>>| 该商品库存为0,已被抢光!')
                        pass
                    else:  # otherwise parse it and insert
                        goods_data['goods_url'] = tmp_url
                        goods_data['goods_id'] = str(goods_id)
                        goods_data['miaosha_time'] = {
                            'miaosha_begin_time': goods_data['schedule'].get('begin_time', ''),
                            'miaosha_end_time': goods_data['schedule'].get('end_time', ''),
                        }
                        goods_data['miaosha_begin_time'], goods_data[
                            'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                miaosha_time=goods_data['miaosha_time'])
                        goods_data['page'] = item.get('page')
                        # pprint(goods_data)
                        res = jumei.insert_into_jumeiyoupin_xianshimiaosha_table(
                            data=goods_data, pipeline=my_pipeline)
                        if res:
                            if goods_id not in db_goods_id_list:
                                db_goods_id_list.append(goods_id)
                    # slow down (phantomjs init is already slow, so no extra sleep elsewhere)
                    sleep(JUMEIYOUPIN_SLEEP_TIME)
                    try:
                        del jumei
                    except:
                        pass
        else:
            print('数据库连接失败,此处跳过!')
            pass
        gc.collect()

    def __del__(self):
        gc.collect()
async def run_forever(self):
    '''
    Continuously refresh the stored group-buy goods data.
    :return: None
    '''
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    try:
        result = await tmp_sql_server.select_jumeiyoupin_pintuan_all_goods_id(
            logger=self.my_lg)
    except TypeError:
        self.my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None
    if result is None:
        pass
    else:
        self.my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        self.my_lg.info(result)
        self.my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
        index = 1
        for item in result:
            # item layout (from usage below): [0]=goods_id, [1]=time json,
            # [2]=tab, [3]=tab index, [4]=pintuan url — TODO confirm schema
            pintuan_end_time = json.loads(item[1]).get('end_time')
            pintuan_end_time = int(
                str(
                    time.mktime(
                        time.strptime(pintuan_end_time,
                                      '%Y-%m-%d %H:%M:%S')))[0:10])
            # print(miaosha_end_time)
            data = {}
            if index % 50 == 0:
                # reconnect every 50 iterations to avoid a stale
                # long-lived connection erroring out
                self.my_lg.info('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                self.my_lg.info('与数据库的新连接成功建立...')
            if tmp_sql_server.is_connect_success:
                time_number = await self.is_recent_time(pintuan_end_time)
                if time_number == 0:  # expired -> delete
                    await tmp_sql_server.delete_jumeiyoupin_pintuan_expired_goods_id(
                        goods_id=item[0], logger=self.my_lg)
                    # NOTE(review): the message says end time but formats
                    # begin_time — looks like a copy-paste slip; confirm
                    self.msg = '过期的goods_id为(%s)' % item[
                        0] + ', 拼团结束时间为(%s), 删除成功!' % str(
                            json.loads(item[1]).get('begin_time'))
                    self.my_lg.info(self.msg)
                elif time_number == 2:
                    # must be pass (not break): the goods_id rows returned
                    # by the db are not in time order
                    pass
                else:  # 1 means within the update window
                    self.msg = '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (
                        item[0], str(index))
                    self.my_lg.info(self.msg)
                    data['goods_id'] = item[0]
                    jumeiyoupin_2 = JuMeiYouPinPinTuan(logger=self.my_lg)
                    _ = item[2] + '-' + str(
                        item[3])  # format: 'coutuan_baby-1'
                    # cache lookup: has this tab+index page been fetched?
                    item_list = self.api_all_goods_id.get(_, [])
                    if item_list == []:
                        driver = BaseDriver(
                            executable_path=PHANTOMJS_DRIVER_PATH,
                            ip_pool_type=IP_POOL_TYPE)
                        item_list = await jumeiyoupin_2.get_one_page_goods_list(
                            driver=driver, tab=item[2], index=item[3])
                        try:
                            del driver
                        except:
                            pass
                    if item_list == []:
                        self.my_lg.info('获取到的body为空str, 网络原因, 此处先跳过!')
                        pass
                    else:
                        if self.api_all_goods_id.get(_) is None:
                            self.api_all_goods_id[_] = item_list
                        pintuan_goods_all_goods_id = [
                            item_1.get('goods_id', '')
                            for item_1 in item_list
                        ]
                        jumeiyoupin_pintuan = JuMeiYouPinPinTuanParse(
                            logger=self.my_lg)
                        # delisted internally (testing showed the site does
                        # not delist active goods ahead of schedule)
                        if item[0] not in pintuan_goods_all_goods_id:
                            await self.update_data_2(
                                jumeiyoupin_pintuan=jumeiyoupin_pintuan,
                                jumei_pintuan_url=item[4],
                                goods_id=item[0],
                                pipeline=tmp_sql_server)
                        else:  # still listed
                            await self.update_data_1(
                                jumeiyoupin_pintuan=jumeiyoupin_pintuan,
                                jumeiyoupin_2=jumeiyoupin_2,
                                jumei_pintuan_url=item[4],
                                goods_id=item[0],
                                item_list=item_list,
                                pipeline=tmp_sql_server)
            else:
                self.my_lg.error('数据库连接失败,此处跳过!')
                pass
            index += 1
            gc.collect()
        self.my_lg.info('全部数据更新完毕'.center(100, '#'))
    # sleep(60*60)
    if get_shanghai_time().hour == 0:  # no updates after midnight
        sleep(60 * 60 * 5.5)
    else:
        sleep(5)
    gc.collect()
    return None
async def run_forever(self):
    '''
    Continuously refresh the stored group-buy goods data
    (sql_cli / async_sleep variant).
    :return: None
    '''
    sql_cli = SqlServerMyPageInfoSaveItemPipeline()
    try:
        # purge first, then select the remaining candidates
        sql_cli._delete_table(sql_str=jm_delete_str_3, )
        await async_sleep(5)
        result = sql_cli._select_table(sql_str=jm_select_str_3, logger=self.lg)
    except TypeError:
        self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None
    if result is None:
        pass
    else:
        await _print_db_old_data(result=result, logger=self.lg)
        index = 1
        for item in result:
            # item layout (from usage below): [0]=goods_id, [1]=time json,
            # [2]=tab, [3]=tab index, [4]=pintuan url — TODO confirm schema
            pintuan_end_time = json.loads(item[1]).get('end_time')
            pintuan_end_time = int(
                str(
                    time.mktime(
                        time.strptime(pintuan_end_time,
                                      '%Y-%m-%d %H:%M:%S')))[0:10])
            # print(miaosha_end_time)
            data = {}
            # helper re-establishes the db connection every `remainder` items
            sql_cli = await _get_new_db_conn(db_obj=sql_cli, index=index,
                                             logger=self.lg, remainder=50)
            if sql_cli.is_connect_success:
                time_number = await self.is_recent_time(pintuan_end_time)
                if time_number == 0:  # expired -> mark via update sql
                    await sql_cli._update_table_3(
                        sql_str=jm_update_str_5,
                        params=(str(get_shanghai_time()), item[0]),
                        logger=self.lg)
                    await async_sleep(.5)
                    # NOTE(review): the message says end time but formats
                    # begin_time — looks like a copy-paste slip; confirm
                    self.msg = '过期的goods_id为(%s)' % item[
                        0] + ', 拼团结束时间为(%s), 删除成功!' % str(
                            json.loads(item[1]).get('begin_time'))
                    self.lg.info(self.msg)
                elif time_number == 2:
                    # must be pass (not break): the goods_id rows returned
                    # by the db are not in time order
                    pass
                else:  # 1 means within the update window
                    self.msg = '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (
                        item[0], str(index))
                    self.lg.info(self.msg)
                    data['goods_id'] = item[0]
                    jumeiyoupin_2 = JuMeiYouPinPinTuan(logger=self.lg)
                    _ = item[2] + '-' + str(
                        item[3])  # format: 'coutuan_baby-1'
                    # cache lookup: has this tab+index page been fetched?
                    item_list = self.api_all_goods_id.get(_, [])
                    if item_list == []:
                        driver = BaseDriver(
                            executable_path=PHANTOMJS_DRIVER_PATH,
                            ip_pool_type=self.ip_pool_type)
                        item_list = await jumeiyoupin_2.get_one_page_goods_list(
                            driver=driver, tab=item[2], index=item[3])
                        try:
                            del driver
                        except:
                            pass
                    if item_list == []:
                        self.lg.info('获取到的body为空str, 网络原因, 此处先跳过!')
                        pass
                    else:
                        if self.api_all_goods_id.get(_) is None:
                            self.api_all_goods_id[_] = item_list
                        pintuan_goods_all_goods_id = [
                            item_1.get('goods_id', '')
                            for item_1 in item_list
                        ]
                        jumeiyoupin_pintuan = JuMeiYouPinPinTuanParse(
                            logger=self.lg)
                        # delisted internally (testing showed the site does
                        # not delist active goods ahead of schedule)
                        if item[0] not in pintuan_goods_all_goods_id:
                            await self.update_data_2(
                                jumeiyoupin_pintuan=jumeiyoupin_pintuan,
                                jumei_pintuan_url=item[4],
                                goods_id=item[0],
                                pipeline=sql_cli)
                        else:  # still listed
                            await self.update_data_1(
                                jumeiyoupin_pintuan=jumeiyoupin_pintuan,
                                jumeiyoupin_2=jumeiyoupin_2,
                                jumei_pintuan_url=item[4],
                                goods_id=item[0],
                                item_list=item_list,
                                pipeline=sql_cli)
            else:
                self.lg.error('数据库连接失败,此处跳过!')
                pass
            index += 1
            gc.collect()
        self.lg.info('全部数据更新完毕'.center(100, '#'))
    if get_shanghai_time().hour == 0:  # no updates after midnight
        await async_sleep(60 * 60 * 5.5)
    else:
        await async_sleep(10 * 60)
    gc.collect()
    return None
def __init__(self):
    """Build the PC headers, pick the ip pool type, start phantomjs."""
    self.headers = self._get_pc_headers()
    self.ip_pool_type = IP_POOL_TYPE
    self.my_phantomjs = BaseDriver(
        executable_path=PHANTOMJS_DRIVER_PATH,
        ip_pool_type=self.ip_pool_type)
class Zhe800Spike(object):
    """Spider for zhe800.com flash-sale (限时秒杀) sessions: iterates
    session ids, filters by date window, and stores new goods."""

    def __init__(self):
        self.headers = self._get_pc_headers()
        self.ip_pool_type = IP_POOL_TYPE
        self.my_phantomjs = BaseDriver(executable_path=PHANTOMJS_DRIVER_PATH, ip_pool_type=self.ip_pool_type)

    @staticmethod
    def _get_pc_headers():
        # Static PC request headers with a randomized User-Agent.
        return {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            # 'Accept-Encoding:': 'gzip',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'zhe800.com',
            'User-Agent': get_random_pc_ua(),  # random user agent
        }

    @staticmethod
    def _get_begin_times_timestamp(data) -> int:
        # Extract the session begin time; raises when neither the first
        # block's deal nor (for future sessions) the second block has one.
        _ = str(data.get('data', {}).get('blocks', [])[0].get('deal', {}).get('begin_time', ''))[:10]
        if _ != '':
            pass
        elif data.get('data', {}).get('blocks', [])[0].get('showcase', {}) != {}:
            # a showcase block means a future session; the deal sits in blocks[1]
            print('*** 未来时间 ***')
            # pprint(data.get('data', {}))
            _ = str(data.get('data', {}).get('blocks', [])[1].get('deal', {}).get('begin_time', ''))[:10]
        else:
            raise Exception
        # truncate to a whole-second unix timestamp
        begin_times_timestamp = int(_)
        return begin_times_timestamp

    def _get_db_goods_id_list(self, my_pipeline) -> list:
        # Return the goods_id values already stored in the database.
        _ = list(my_pipeline._select_table(sql_str=z8_select_str_5))
        db_goods_id_list = [item[0] for item in _]
        return db_goods_id_list

    def get_spike_hour_goods_info(self):
        '''
        Build the data urls and fetch all recent flash-sale goods info,
        stepping through session ids two at a time.
        :return: None
        '''
        base_session_id = BASE_SESSION_ID
        while base_session_id < MAX_SESSION_ID:
            print('待抓取的session_id为: ', base_session_id)
            data = self._get_one_session_id_data(base_session_id=base_session_id)
            sleep(.5)
            if data.get('data', {}).get('blocks', []) == []:
                # this session_id does not exist
                base_session_id += 2
                continue
            try:
                begin_times_timestamp = self._get_begin_times_timestamp(data)
            except Exception as e:
                print('遇到严重错误: ', e)
                base_session_id += 2
                continue
            print('秒杀时间为: ', timestamp_to_regulartime(begin_times_timestamp))
            is_recent_time = self.is_recent_time(timestamp=begin_times_timestamp)
            if not is_recent_time:
                # date not within the accepted window
                base_session_id += 2
                continue
            try:
                data = [item_s.get('deal', {}) for item_s in data.get('data', {}).get('blocks', [])]
            except Exception as e:
                print('遇到严重错误: ', e)
                base_session_id += 2
                continue
            # pprint(data)
            if data != []:  # the session has goods data
                miaosha_goods_list = self.get_miaoshao_goods_info_list(data=data)
                # pprint(miaosha_goods_list)
                zhe_800 = Zhe800Parse()
                my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                if my_pipeline.is_connect_success:
                    db_goods_id_list = self._get_db_goods_id_list(my_pipeline)
                    for item in miaosha_goods_list:
                        if item.get('zid', '') in db_goods_id_list:
                            print('该goods_id已经存在于数据库中, 此处跳过')
                            pass
                        else:
                            tmp_url = 'https://shop.zhe800.com/products/' + str(item.get('zid', ''))
                            goods_id = zhe_800.get_goods_id_from_url(tmp_url)
                            zhe_800.get_goods_data(goods_id=goods_id)
                            goods_data = zhe_800.deal_with_data()
                            if goods_data == {}:  # skip when returned data is empty
                                pass
                            else:  # otherwise parse it and insert
                                goods_data['stock_info'] = item.get('stock_info')
                                goods_data['goods_id'] = str(item.get('zid'))
                                goods_data['spider_url'] = tmp_url
                                goods_data['username'] = '******'
                                goods_data['price'] = item.get('price')
                                goods_data['taobao_price'] = item.get('taobao_price')
                                goods_data['sub_title'] = item.get('sub_title')
                                # goods_data['is_baoyou'] = item.get('is_baoyou')
                                goods_data['miaosha_time'] = item.get('miaosha_time')
                                goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(miaosha_time=item.get('miaosha_time'))
                                goods_data['session_id'] = str(base_session_id)
                                # print(goods_data)
                                res = zhe_800.insert_into_zhe_800_xianshimiaosha_table(data=goods_data, pipeline=my_pipeline)
                                if res:
                                    if goods_id not in db_goods_id_list:
                                        db_goods_id_list.append(goods_id)
                                sleep(ZHE_800_SPIKE_SLEEP_TIME)  # slow down
                    sleep(4)
                else:
                    pass
                try:
                    del zhe_800
                except:
                    pass
                gc.collect()
            else:
                # the session id carries no 'jsons' data
                print('该sessionid没有相关key为jsons的数据')
                pass
            base_session_id += 2

    def _get_one_session_id_data(self, base_session_id) -> dict:
        '''
        Fetch all pages of data for one session_id.
        :param base_session_id: session id to crawl
        :return: dict shaped like {'data': {'blocks': [...]}}
        '''
        _data = []
        for _page in range(1, 20):
            # per_page is fixed at 20; other values return no data
            tmp_url = 'https://zapi.zhe800.com/zhe800_n_api/xsq/m/session_deals?session_id={0}&page={1}&per_page=20'.format(
                str(base_session_id), _page, )
            body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url,)
            # print(body)
            try:
                data = json_2_dict(re.compile(r'<pre.*?>(.*)</pre>').findall(body)[0], default_res={})
                # pprint(data)
            except (IndexError, Exception):
                sleep(.3)
                continue
            # print(type(data.get('data', {}).get('has_next')))
            if data.get('msg', '') == '无效场次':
                # invalid session -> stop paging
                print('该session_id不存在,此处跳过')
                break
            if not data.get('data', {}).get('has_next', True):
                print('该session_id没有下页了!!')
                break
            else:
                print('正在抓取该session_id的第 {0} 页...'.format(_page))
                for _i in data.get('data', {}).get('blocks', []):
                    _data.append(_i)
            sleep(.3)
        return {
            'data': {
                'blocks': _data,
            }
        }

    def get_miaoshao_goods_info_list(self, data):
        '''
        Extract the useful flash-sale goods info.
        :param data: raw deal list to parse
        :return: list of useful-info dicts
        '''
        miaosha_goods_list = []
        for item in data:
            # pprint(item)
            tmp = {}
            # flash-sale begin and end times
            try:
                tmp['miaosha_time'] = {
                    'miaosha_begin_time': timestamp_to_regulartime(int(str(item.get('begin_time'))[:10])),
                    'miaosha_end_time': timestamp_to_regulartime(int(str(item.get('end_time'))[:10])),
                }
            except ValueError:
                continue
            # zhe800 goods id
            tmp['zid'] = item.get('zid')
            # flash-sale stock info
            tmp['stock_info'] = {
                'activity_stock': item.get('activity_stock', 0),  # remaining flash-sale quantity
                'stock': item.get('stock', 0),  # total flash-sale stock
            }
            tmp['price'] = float(item.get('list_price'))
            tmp['taobao_price'] = float(item.get('price'))
            tmp['sub_title'] = item.get('description', '')
            miaosha_goods_list.append(tmp)
        # pprint(miaosha_goods_list)
        return miaosha_goods_list

    def is_recent_time(self, timestamp):
        '''
        Check whether the timestamp falls within the accepted date window.
        :param timestamp: unix timestamp
        :return: True or False
        '''
        time_1 = int(timestamp)
        time_2 = time.time()  # current timestamp
        time_1 = time.localtime(time_1)
        time_2 = time.localtime(time_2)
        if time_1.tm_year > time_2.tm_year:
            print('** 该年份为未来时间年份 **')
            if time_1.tm_hour >= SPIDER_START_HOUR and time_1.tm_hour <= SPIDER_END_HOUR:
                # only accept goods between SPIDER_START_HOUR and SPIDER_END_HOUR
                print('合法时间')
                # diff_days = abs(time_1.tm_mday - time_2.tm_mday)
                return True
            else:
                print('该小时在{0}点到{1}点以外,此处不处理跳过'.format(SPIDER_START_HOUR, SPIDER_END_HOUR))
                return False
        if time_1.tm_year == time_2.tm_year:
            if time_1.tm_mon > time_2.tm_mon:
                # handle future months first
                print('** 该月份为未来时间月份 **')
                if time_1.tm_hour >= SPIDER_START_HOUR and time_1.tm_hour <= SPIDER_END_HOUR:
                    print('合法时间')
                    # diff_days = abs(time_1.tm_mday - time_2.tm_mday)
                    return True
                else:
                    print('该小时在{0}点到{1}点以外,此处不处理跳过'.format(SPIDER_START_HOUR, SPIDER_END_HOUR))
                    return False
            if time_1.tm_mon >= time_2.tm_mon:
                # month is legal; after the branch above this can only be
                # the current month (>= reads as == here)
                if time_1.tm_mday >= time_2.tm_mday-2:  # also catch the previous two days
                    if time_1.tm_hour >= SPIDER_START_HOUR and time_1.tm_hour <= SPIDER_END_HOUR:
                        print('合法时间')
                        # diff_days = abs(time_1.tm_mday - time_2.tm_mday)
                        return True
                    else:
                        print('该小时在{0}点到{1}点以外,此处不处理跳过'.format(SPIDER_START_HOUR, SPIDER_END_HOUR))
                        return False
                else:
                    print('该日时间已过期, 此处跳过')
                    return False
            else:
                # month already passed
                print('该月份时间已过期,此处跳过')
                return False
        else:
            print('非本年度的限时秒杀时间,此处跳过')
            return False

    def __del__(self):
        try:
            del self.my_phantomjs
        except:
            pass
        gc.collect()
class Pinduoduo_Miaosha_Real_Time_Update(object):
    """Real-time updater for Pinduoduo flash-sale (miaosha) goods already stored in the DB."""

    def __init__(self):
        self._set_headers()
        self.delete_sql_str = pd_delete_str_1
        self.ip_pool_type = IP_POOL_TYPE
        self.driver = BaseDriver(
            executable_path=EXECUTABLE_PATH,
            ip_pool_type=self.ip_pool_type)

    def _set_headers(self):
        # Static request headers with a randomized User-Agent.
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            # 'Accept-Encoding:': 'gzip',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'm.juanpi.com',
            'User-Agent': get_random_pc_ua(),  # random UA per instance
        }

    def run_forever(self):
        '''
        Only update goods whose sale window falls within the next 2 hours;
        farther-future records (all still at original price) are skipped.
        :return:
        '''
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(sql_cli._select_table(sql_str=pd_select_str_2))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is None:
            pass
        else:
            _block_print_db_old_data(result=result)
            index = 1
            # Create the parser once and reuse it to keep memory low.
            pinduoduo_miaosha = PinduoduoParse()
            all_miaosha_goods_list = self.get_all_miaosha_goods_list()
            # All goods_id currently listed on the site.
            miaosha_goods_all_goods_id = [
                i.get('goods_id') for i in all_miaosha_goods_list
            ]
            for item in result:  # refresh each stored record
                # Convert the stored end time to a seconds-precision timestamp.
                miaosha_end_time = json.loads(item[1]).get('miaosha_end_time')
                miaosha_end_time = int(str(time.mktime(time.strptime(
                    miaosha_end_time, '%Y-%m-%d %H:%M:%S')))[0:10])
                sql_cli = _block_get_new_db_conn(
                    db_obj=sql_cli, index=index, remainder=50)
                if sql_cli.is_connect_success:
                    _state = self.is_recent_time(miaosha_end_time)
                    if _state == 0:
                        # Expired more than 24h ago: hard-delete the row.
                        sql_cli._delete_table(sql_str=self.delete_sql_str,
                                              params=(item[0]))
                        print('过期的goods_id为(%s)' % item[0],
                              ', 限时秒杀结束时间为(%s), 删除成功!'
                              % json.loads(item[1]).get('miaosha_end_time'))
                        sleep(.3)
                    elif _state == 2:
                        # Future records: pass (NOT break) because the rows
                        # coming back from the DB are not ordered by time.
                        pass
                    else:  # _state == 1: inside the update window
                        print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                              % (item[0], index))
                        if item[0] not in miaosha_goods_all_goods_id:
                            # Goods no longer listed on the site: delete it.
                            sql_cli._delete_table(sql_str=self.delete_sql_str,
                                                  params=(item[0]))
                            print('该商品[goods_id为(%s)]已被下架限时秒杀活动,此处将其删除' % item[0])
                            sleep(.3)
                        else:  # still listed
                            for item_1 in all_miaosha_goods_list:
                                if item_1.get('goods_id', '') != item[0]:
                                    continue
                                pinduoduo_miaosha.get_goods_data(goods_id=item[0])
                                goods_data = pinduoduo_miaosha.deal_with_data()
                                if goods_data == {}:
                                    # Empty parse result: skip this row.
                                    pass
                                else:
                                    stock_info = item_1.get('stock_info')
                                    goods_data['stock_info'] = stock_info
                                    goods_data['goods_id'] = item_1.get('goods_id')
                                    if stock_info.get('activity_stock') > 0:
                                        goods_data['price'] = item_1.get('price')  # pre-sale price
                                        goods_data['taobao_price'] = item_1.get('taobao_price')  # sale price
                                    goods_data['sub_title'] = item_1.get('sub_title', '')
                                    goods_data['miaosha_time'] = item_1.get('miaosha_time')
                                    goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = \
                                        get_miaosha_begin_time_and_miaosha_end_time(
                                            miaosha_time=item_1.get('miaosha_time'))
                                    if stock_info.get('activity_stock') <= 1:
                                        # Live stock exhausted -> mark as sold out.
                                        print('该秒杀商品已售罄...')
                                        goods_data['is_delete'] = 1
                                    pinduoduo_miaosha.to_update_pinduoduo_xianshimiaosha_table(
                                        data=goods_data, pipeline=sql_cli)
                                sleep(PINDUODUO_SLEEP_TIME)
                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')
                index += 1
                gc.collect()
            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # After midnight: pause for 5.5h instead of the usual 3 minutes.
            sleep(60 * 60 * 5.5)
        else:
            sleep(3 * 60)
        gc.collect()

    def get_all_miaosha_goods_list(self):
        """Fetch today's / tomorrow's / later flash-sale lists and merge them."""
        def get_data(body):
            # Extract the JSON payload wrapped in <body>...</body>.
            _ = '{}'
            try:
                _ = re.compile(r'<body>(.*)</body>').findall(body)[0]
            except IndexError:
                print('获取all_miaosha_goods_list出现索引异常!')
            return _

        # Today's flash sales.
        tmp_url = 'http://apiv4.yangkeduo.com/api/spike/v2/list/today?page=0&size=2000'
        today_data = get_data(body=self.driver.use_phantomjs_to_get_url_body(url=tmp_url))
        today_data = self.json_to_dict(tmp_data=today_data)
        # Tomorrow's flash sales.
        tmp_url_2 = 'http://apiv4.yangkeduo.com/api/spike/v2/list/tomorrow?page=0&size=2000'
        tomorrow_data = get_data(body=self.driver.use_phantomjs_to_get_url_body(url=tmp_url_2))
        tomorrow_data = self.json_to_dict(tmp_data=tomorrow_data)
        # Later ("all_after") flash sales.
        tmp_url_3 = 'http://apiv4.yangkeduo.com/api/spike/v2/list/all_after?page=0&size=2000'
        all_after_data = get_data(body=self.driver.use_phantomjs_to_get_url_body(url=tmp_url_3))
        all_after_data = self.json_to_dict(tmp_data=all_after_data)

        if today_data:
            today_miaosha_goods_list = self.get_miaoshao_goods_info_list(data=today_data)
        else:
            today_miaosha_goods_list = []
            print('今日秒杀的items为[]')
        if tomorrow_data:
            tomorrow_miaosha_goods_list = self.get_miaoshao_goods_info_list(data=tomorrow_data)
        else:
            tomorrow_miaosha_goods_list = []
            print('明日秒杀的items为[]')
        if all_after_data:
            all_after_miaosha_goods_list = self.get_miaoshao_goods_info_list(data=all_after_data)
        else:
            all_after_miaosha_goods_list = []
            print('未来秒杀的items为[]')

        # Merge the three lists into one.
        all_miaosha_goods_list = today_miaosha_goods_list
        all_miaosha_goods_list.extend(tomorrow_miaosha_goods_list)
        all_miaosha_goods_list.extend(all_after_miaosha_goods_list)
        return all_miaosha_goods_list

    def get_miaoshao_goods_info_list(self, data):
        '''
        Extract the useful flash-sale fields from the raw item list.
        :param data: raw item list to parse
        :return: list of dicts with the useful info
        '''
        miaosha_goods_list = []
        for item in data:
            _d = item.get('data', {})  # hoisted: used many times below
            tmp = {}
            miaosha_begin_time = str(timestamp_to_regulartime(int(_d.get('start_time'))))
            tmp_hour = miaosha_begin_time[-8:-6]  # 'HH' of the begin time
            if tmp_hour in PINDUODUO_MIAOSHA_SPIDER_HOUR_LIST:
                if tmp_hour in PINDUODUO_MIAOSHA_BEGIN_HOUR_LIST:
                    # Sales starting at these hours only last 30 minutes.
                    miaosha_end_time = str(timestamp_to_regulartime(
                        int(_d.get('start_time')) + 60 * 30))
                else:
                    miaosha_end_time = str(timestamp_to_regulartime(
                        int(_d.get('start_time')) + 60 * 60))
                tmp['miaosha_time'] = {
                    'miaosha_begin_time': miaosha_begin_time,
                    'miaosha_end_time': miaosha_end_time,
                }
                # Pinduoduo goods_id (original comment wrongly said "juanpi").
                tmp['goods_id'] = str(_d.get('goods_id'))
                # Flash-sale stock: remaining and total.
                tmp['stock_info'] = {
                    'activity_stock': int(
                        _d.get('all_quantity', 0) - _d.get('sold_quantity', 0)),
                    'stock': _d.get('all_quantity', 0),
                }
                # Prices come back in cents -> convert to yuan.
                tmp['price'] = round(float(_d.get('normal_price', '0')) / 100, 2)
                tmp['taobao_price'] = round(float(_d.get('price', '0')) / 100, 2)
                miaosha_goods_list.append(tmp)
        return miaosha_goods_list

    def json_to_dict(self, tmp_data):
        """Parse the JSON body and return its 'items' list, or [] on any failure."""
        try:
            # NOTE: the original also built an unused `times` list here; removed.
            data = json.loads(tmp_data).get('items', [])
        except Exception:
            print('json.loads转换data的时候出错,data为空')
            data = []
        return data

    def is_recent_time(self, timestamp):
        '''
        Classify a sale-end timestamp relative to now.
        :param timestamp: unix timestamp (seconds)
        :return: 0 expired >24h (delete) | 1 within next 2h (update) | 2 otherwise (skip)
        '''
        diff_time = int(timestamp) - int(time.time())
        if diff_time <= -86400:
            # Keep rows up to 24h past expiry so the backend can sync take-downs.
            return 0
        elif 0 < diff_time <= 7200:
            return 1  # ends within the next 2 hours: update now
        else:
            return 2  # farther future (or just expired): leave for later

    def __del__(self):
        try:
            del self.driver
        except AttributeError:
            pass
        gc.collect()
def run_forever(self):
    '''
    Real-time update of stored Mogujie group-buy (pintuan) goods.
    :return:
    '''
    sql_cli = SqlServerMyPageInfoSaveItemPipeline()
    try:
        sql_cli._delete_table(sql_str=mg_delete_str_2)
        result = list(sql_cli._select_table(sql_str=mg_select_str_2))
    except TypeError:
        print('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None

    if result is None:
        pass
    else:
        _block_print_db_old_data(result=result)
        index = 1
        self.my_phantomjs = BaseDriver(
            executable_path=PHANTOMJS_DRIVER_PATH,
            ip_pool_type=self.ip_pool_type)
        for item in result:  # refresh each stored record
            goods_id = item[0]
            # Convert stored 'end_time' into a seconds-precision timestamp.
            pintuan_end_time = json.loads(item[1]).get('end_time')
            pintuan_end_time = int(str(time.mktime(time.strptime(
                pintuan_end_time, '%Y-%m-%d %H:%M:%S')))[0:10])
            # NOTE: the original built an unused local dict `data` here; removed.
            mogujie_pintuan = MoGuJieParse()
            if index % 8 == 0:
                # Periodically recreate phantomjs to avoid a stale session.
                try:
                    del self.my_phantomjs
                except AttributeError:
                    pass
                gc.collect()
                self.my_phantomjs = BaseDriver(
                    executable_path=PHANTOMJS_DRIVER_PATH,
                    ip_pool_type=self.ip_pool_type)
            sql_cli = _block_get_new_db_conn(db_obj=sql_cli, index=index, remainder=50)
            if sql_cli.is_connect_success:
                _state = self.is_recent_time(pintuan_end_time)
                if _state == 0:
                    # Expired: logical delete via shelf-state update.
                    _handle_goods_shelves_in_auto_goods_table(
                        goods_id=goods_id,
                        update_sql_str=mg_update_str_5,
                        sql_cli=sql_cli)
                    print('过期的goods_id为(%s)' % goods_id,
                          ', 拼团开始时间为(%s), 逻辑删除成功!'
                          % json.loads(item[1]).get('begin_time'))
                    sleep(.3)
                elif _state == 2:
                    # pass (NOT break): rows are not ordered by time.
                    pass
                else:  # _state == 1: inside the update window
                    print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                          % (goods_id, index))
                    tmp_url = 'http://list.mogujie.com/search?page={0}&fcid={1}&algoKey=pc_tuan_book_pop&cKey=pc-tuan'.format(
                        item[3], item[2])
                    # requests gets filtered (cert pinning); phantomjs works.
                    body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url)
                    if body == '':
                        print('获取到的body为空值! 此处跳过')
                    else:
                        try:
                            body = re.compile(r'<pre.*?>(.*?)</pre>').findall(body)[0]
                            tmp_data = json.loads(body)
                        except Exception:
                            print('json.loads转换body时出错, 请检查')
                            tmp_data = {}
                        docs = tmp_data.get('result', {}).get('wall', {}).get('docs', [])
                        if docs == []:
                            print('得到的docs为[]!')
                            # Empty listing -> logically delete the record.
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                update_sql_str=mg_update_str_5,
                                sql_cli=sql_cli)
                            sleep(.3)
                        else:
                            begin_time_timestamp = int(time.time())  # group-buy start
                            item_list = [{
                                'goods_id': doc.get('tradeItemId', ''),
                                'pintuan_time': {
                                    'begin_time': timestamp_to_regulartime(
                                        timestamp=begin_time_timestamp),
                                    'end_time': timestamp_to_regulartime(
                                        self.get_pintuan_end_time(
                                            begin_time_timestamp,
                                            doc.get('leftTimeOrg', ''))),
                                },
                                'all_sell_count': str(doc.get('salesVolume', 0)),
                            } for doc in docs]
                            pintuan_goods_all_goods_id = [
                                i.get('goods_id', '') for i in item_list]
                            if goods_id not in pintuan_goods_all_goods_id:
                                # "Internally delisted" goods are in fact still
                                # on sale: refresh data, keep shelf times as-is.
                                mogujie_pintuan.get_goods_data(goods_id=goods_id)
                                goods_data = mogujie_pintuan.deal_with_data()
                                if goods_data == {}:
                                    pass
                                else:
                                    print('+++ 内部下架,其实还在售卖的商品更新')
                                    goods_data['goods_id'] = goods_id
                                    goods_data['price_info_list'] = _get_mogujie_pintuan_price_info_list(
                                        goods_data['price_info_list'])
                                    mogujie_pintuan.update_mogujie_pintuan_table_2(
                                        data=goods_data, pipeline=sql_cli)
                                sleep(MOGUJIE_SLEEP_TIME)  # slow down
                            else:  # still listed
                                for item_2 in item_list:
                                    if item_2.get('goods_id', '') != goods_id:
                                        continue
                                    mogujie_pintuan.get_goods_data(goods_id=goods_id)
                                    goods_data = mogujie_pintuan.deal_with_data()
                                    if goods_data == {}:
                                        pass
                                    else:
                                        goods_data['goods_id'] = goods_id
                                        goods_data['price_info_list'] = _get_mogujie_pintuan_price_info_list(
                                            goods_data['price_info_list'])
                                        goods_data['pintuan_time'] = item_2.get('pintuan_time', {})
                                        goods_data['pintuan_begin_time'], goods_data['pintuan_end_time'] = \
                                            get_miaosha_begin_time_and_miaosha_end_time(
                                                miaosha_time=goods_data['pintuan_time'])
                                        goods_data['all_sell_count'] = item_2.get('all_sell_count', '')
                                        mogujie_pintuan.update_mogujie_pintuan_table(
                                            data=goods_data, pipeline=sql_cli)
                                    sleep(MOGUJIE_SLEEP_TIME)  # slow down
            else:
                print('数据库连接失败,此处跳过!')
            index += 1
            gc.collect()
        print('全部数据更新完毕'.center(100, '#'))

    if get_shanghai_time().hour == 0:
        # After midnight: pause 5.5h instead of the usual 10 minutes.
        sleep(60 * 60 * 5.5)
    else:
        sleep(10 * 60)
    gc.collect()
class MoGuJiePinTuanRealTimesUpdate(object):
    """Real-time updater for Mogujie group-buy (pintuan) goods already stored in the DB."""

    def __init__(self):
        self._set_headers()
        self.ip_pool_type = IP_POOL_TYPE

    def _set_headers(self):
        # Static request headers with a randomized User-Agent.
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            # 'Accept-Encoding:': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'list.mogujie.com',
            # 'Referer': 'https://pintuan.mogujie.com/ptpt/app/pd?acm=3.mce.1_10_1fvsk.51827.0.mUTadqIzS9Pbg.m_370494-pos_2-mf_4537_796033&ptp=m1._mf1_1239_4537._keyword_51827.0.xLt0G92',
            'User-Agent': get_random_pc_ua(),  # random UA per instance
        }

    def run_forever(self):
        '''
        Real-time update of stored Mogujie group-buy (pintuan) goods.
        :return:
        '''
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            sql_cli._delete_table(sql_str=mg_delete_str_2)
            result = list(sql_cli._select_table(sql_str=mg_select_str_2))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is None:
            pass
        else:
            _block_print_db_old_data(result=result)
            index = 1
            self.my_phantomjs = BaseDriver(
                executable_path=PHANTOMJS_DRIVER_PATH,
                ip_pool_type=self.ip_pool_type)
            for item in result:  # refresh each stored record
                goods_id = item[0]
                # Convert stored 'end_time' into a seconds-precision timestamp.
                pintuan_end_time = json.loads(item[1]).get('end_time')
                pintuan_end_time = int(str(time.mktime(time.strptime(
                    pintuan_end_time, '%Y-%m-%d %H:%M:%S')))[0:10])
                # NOTE: the original built an unused local dict `data`; removed.
                mogujie_pintuan = MoGuJieParse()
                if index % 8 == 0:
                    # Periodically recreate phantomjs to avoid a stale session.
                    try:
                        del self.my_phantomjs
                    except AttributeError:
                        pass
                    gc.collect()
                    self.my_phantomjs = BaseDriver(
                        executable_path=PHANTOMJS_DRIVER_PATH,
                        ip_pool_type=self.ip_pool_type)
                sql_cli = _block_get_new_db_conn(db_obj=sql_cli, index=index, remainder=50)
                if sql_cli.is_connect_success:
                    _state = self.is_recent_time(pintuan_end_time)
                    if _state == 0:
                        # Expired: logical delete via shelf-state update.
                        _handle_goods_shelves_in_auto_goods_table(
                            goods_id=goods_id,
                            update_sql_str=mg_update_str_5,
                            sql_cli=sql_cli)
                        print('过期的goods_id为(%s)' % goods_id,
                              ', 拼团开始时间为(%s), 逻辑删除成功!'
                              % json.loads(item[1]).get('begin_time'))
                        sleep(.3)
                    elif _state == 2:
                        # pass (NOT break): rows are not ordered by time.
                        pass
                    else:  # _state == 1: inside the update window
                        print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                              % (goods_id, index))
                        tmp_url = 'http://list.mogujie.com/search?page={0}&fcid={1}&algoKey=pc_tuan_book_pop&cKey=pc-tuan'.format(
                            item[3], item[2])
                        # requests gets filtered (cert pinning); phantomjs works.
                        body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url)
                        if body == '':
                            print('获取到的body为空值! 此处跳过')
                        else:
                            try:
                                body = re.compile(r'<pre.*?>(.*?)</pre>').findall(body)[0]
                                tmp_data = json.loads(body)
                            except Exception:
                                print('json.loads转换body时出错, 请检查')
                                tmp_data = {}
                            docs = tmp_data.get('result', {}).get('wall', {}).get('docs', [])
                            if docs == []:
                                print('得到的docs为[]!')
                                _handle_goods_shelves_in_auto_goods_table(
                                    goods_id=goods_id,
                                    update_sql_str=mg_update_str_5,
                                    sql_cli=sql_cli)
                                sleep(.3)
                            else:
                                begin_time_timestamp = int(time.time())  # group-buy start
                                item_list = [{
                                    'goods_id': doc.get('tradeItemId', ''),
                                    'pintuan_time': {
                                        'begin_time': timestamp_to_regulartime(
                                            timestamp=begin_time_timestamp),
                                        'end_time': timestamp_to_regulartime(
                                            self.get_pintuan_end_time(
                                                begin_time_timestamp,
                                                doc.get('leftTimeOrg', ''))),
                                    },
                                    'all_sell_count': str(doc.get('salesVolume', 0)),
                                } for doc in docs]
                                pintuan_goods_all_goods_id = [
                                    i.get('goods_id', '') for i in item_list]
                                if goods_id not in pintuan_goods_all_goods_id:
                                    # "Internally delisted" goods are in fact still
                                    # on sale: refresh data, keep shelf times as-is.
                                    mogujie_pintuan.get_goods_data(goods_id=goods_id)
                                    goods_data = mogujie_pintuan.deal_with_data()
                                    if goods_data == {}:
                                        pass
                                    else:
                                        print('+++ 内部下架,其实还在售卖的商品更新')
                                        goods_data['goods_id'] = goods_id
                                        goods_data['price_info_list'] = _get_mogujie_pintuan_price_info_list(
                                            goods_data['price_info_list'])
                                        mogujie_pintuan.update_mogujie_pintuan_table_2(
                                            data=goods_data, pipeline=sql_cli)
                                    sleep(MOGUJIE_SLEEP_TIME)  # slow down
                                else:  # still listed
                                    for item_2 in item_list:
                                        if item_2.get('goods_id', '') != goods_id:
                                            continue
                                        mogujie_pintuan.get_goods_data(goods_id=goods_id)
                                        goods_data = mogujie_pintuan.deal_with_data()
                                        if goods_data == {}:
                                            pass
                                        else:
                                            goods_data['goods_id'] = goods_id
                                            goods_data['price_info_list'] = _get_mogujie_pintuan_price_info_list(
                                                goods_data['price_info_list'])
                                            goods_data['pintuan_time'] = item_2.get('pintuan_time', {})
                                            goods_data['pintuan_begin_time'], goods_data['pintuan_end_time'] = \
                                                get_miaosha_begin_time_and_miaosha_end_time(
                                                    miaosha_time=goods_data['pintuan_time'])
                                            goods_data['all_sell_count'] = item_2.get('all_sell_count', '')
                                            mogujie_pintuan.update_mogujie_pintuan_table(
                                                data=goods_data, pipeline=sql_cli)
                                        sleep(MOGUJIE_SLEEP_TIME)  # slow down
                else:
                    print('数据库连接失败,此处跳过!')
                index += 1
                gc.collect()
            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # After midnight: pause 5.5h instead of the usual 10 minutes.
            sleep(60 * 60 * 5.5)
        else:
            sleep(10 * 60)
        gc.collect()

    def get_pintuan_end_time(self, begin_time, left_time):
        '''
        Compute the group-buy end timestamp from its remaining-time string.
        :param begin_time: start timestamp (seconds)
        :param left_time: remaining-time string, e.g. '6天13小时', '13小时57分', '36分'
        :return: end timestamp (int)
        '''
        had_day = re.compile(r'天').findall(left_time)
        had_hour = re.compile(r'小时').findall(left_time)
        tmp = re.compile(r'\d+').findall(left_time)
        if had_day != [] and had_hour != []:
            # '6天13小时'
            day, hour, minute = int(tmp[0]), int(tmp[1]), 0
        elif had_day == [] and had_hour != []:
            # '13小时57分' (guard: plain '13小时' used to raise IndexError)
            day, hour = 0, int(tmp[0])
            minute = int(tmp[1]) if len(tmp) > 1 else 0
        elif had_day == [] and had_hour == [] and tmp != []:
            # '36分' (guard on tmp: empty string used to raise IndexError)
            print('left_time = ', left_time)
            day, hour, minute = 0, 0, int(tmp[0])
        else:
            # No usable day/hour/minute info (e.g. '' or '6天' with no hours).
            print('day, hour, min = 0, 0, 0', 'left_time = ', left_time)
            day, hour, minute = 0, 0, 0
        left_end_time_timestamp = \
            day * 24 * 60 * 60 + \
            hour * 60 * 60 + \
            minute * 60
        return begin_time + left_end_time_timestamp

    def is_recent_time(self, timestamp):
        '''
        Classify a group-buy end timestamp relative to now (Shanghai time).
        :param timestamp: unix timestamp (seconds)
        :return: 0 expired >24h (delete) | 1 still running (update) | 2 expired <24h (hold)
        '''
        diff_time = int(timestamp) - int(datetime_to_timestamp(get_shanghai_time()))
        if diff_time < -86400:
            # >24h past the end: logically delete (grace so backend can sync).
            return 0
        elif diff_time > 0:
            return 1  # still inside its sale window: update
        else:
            # Expired less than 24h ago: hold off until the grace period passes.
            return 2

    def __del__(self):
        try:
            del self.my_phantomjs
        except AttributeError:
            pass
        gc.collect()
def run_forever(self):
    '''
    Real-time update of stored Jumei flash-sale goods.
    :return:
    '''
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    try:
        tmp_sql_server._delete_table(sql_str=jm_delete_str_2)
        result = list(tmp_sql_server._select_table(sql_str=jm_select_str_1))
    except TypeError:
        print('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None

    if result is None:
        pass
    else:
        print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        print(result)
        print('--------------------------------------------------------')
        print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
        index = 1
        # Fetch the session cookies once via phantomjs.
        my_phantomjs = BaseDriver(executable_path=PHANTOMJS_DRIVER_PATH,
                                  ip_pool_type=self.ip_pool_type)
        cookies = my_phantomjs.get_url_cookies_from_phantomjs_session(url='https://h5.jumei.com/')
        try:
            del my_phantomjs
        except NameError:
            pass
        if cookies == '':
            print('!!! 获取cookies失败 !!!')
            return False
        print('获取cookies成功!')
        self.headers.update(Cookie=cookies)

        for item in result:  # refresh each stored record
            # Convert stored end time to a seconds-precision timestamp.
            miaosha_end_time = json.loads(item[1]).get('miaosha_end_time')
            miaosha_end_time = int(str(time.mktime(time.strptime(
                miaosha_end_time, '%Y-%m-%d %H:%M:%S')))[0:10])
            # NOTE: the original built an unused local dict `data`; removed.
            # Recreate the parser per row to keep memory low.
            jumeiyoupin_miaosha = JuMeiYouPinParse()
            if index % 50 == 0:
                # Reconnect every 50 rows to avoid a stale long-lived connection.
                print('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')
            if tmp_sql_server.is_connect_success:
                _state = self.is_recent_time(miaosha_end_time)
                if _state == 0:
                    # Expired: hard-delete the row.
                    tmp_sql_server._delete_table(sql_str=self.delete_sql_str,
                                                 params=(item[0]))
                    print('过期的goods_id为(%s)' % item[0],
                          ', 限时秒杀结束时间为(%s), 删除成功!'
                          % json.loads(item[1]).get('miaosha_end_time'))
                elif _state == 2:
                    # pass (NOT break): rows are not ordered by time.
                    pass
                else:  # _state == 1: inside the update window
                    print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                          % (item[0], index))
                    this_page_all_goods_list = self.get_one_page_all_goods_list(item[2])
                    if this_page_all_goods_list == '网络错误!':
                        print('网络错误!先跳过')
                        continue
                    elif this_page_all_goods_list == []:
                        print('#### 该page对应得到的this_page_all_goods_list为空[]!')
                        print('** 该商品已被下架限时秒杀活动, 此处将其删除')
                        tmp_sql_server._delete_table(sql_str=self.delete_sql_str,
                                                     params=(item[0]))
                        print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                    else:
                        # Goods are never internally delisted mid-sale here, so
                        # every in-window record simply gets refreshed.
                        tmp_r = jumeiyoupin_miaosha.get_goods_id_from_url(item[3])
                        jumeiyoupin_miaosha.get_goods_data(goods_id=tmp_r)
                        goods_data = jumeiyoupin_miaosha.deal_with_data()
                        if goods_data == {}:
                            # Empty parse result: skip this row.
                            pass
                        else:
                            goods_data['goods_id'] = str(item[0])
                            goods_data['miaosha_time'] = {
                                'miaosha_begin_time': goods_data['schedule'].get('begin_time', ''),
                                'miaosha_end_time': goods_data['schedule'].get('end_time', ''),
                            }
                            goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = \
                                get_miaosha_begin_time_and_miaosha_end_time(
                                    miaosha_time=goods_data['miaosha_time'])
                            jumeiyoupin_miaosha.update_jumeiyoupin_xianshimiaosha_table(
                                data=goods_data, pipeline=tmp_sql_server)
                        sleep(JUMEIYOUPIN_SLEEP_TIME)
            else:
                print('数据库连接失败,数据库可能关闭或者维护中')
            index += 1
            gc.collect()
        print('全部数据更新完毕'.center(100, '#'))

    if get_shanghai_time().hour == 0:
        # After midnight: pause 5.5h instead of the usual 5 seconds.
        sleep(60 * 60 * 5.5)
    else:
        sleep(5)
    gc.collect()
def deal_with_data(self, *params):
    '''
    Parse and store flash-sale goods data.
    :param params: params[0] is the item list to process
    :return:
    '''
    item_list = params[0]
    chuchujie = ChuChuJie_9_9_Parse()
    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
    if my_pipeline.is_connect_success:
        _ = list(my_pipeline._select_table(sql_str=cc_select_str_2))
        db_goods_id_list = [item[0] for item in _]
        for item in item_list:
            goods_id = item.get('goods_id', '')
            if goods_id in db_goods_id_list:
                print('该goods_id已经存在于数据库中, 此处跳过')
                continue
            tmp_url = 'https://m.chuchujie.com/details/detail.html?id=' + str(goods_id)
            chuchujie.get_goods_data(goods_id=goods_id)
            goods_data = chuchujie.deal_with_data()
            if goods_data == {}:
                # Empty parse result: skip.
                sleep(.5)
            elif goods_data.get('is_delete', 0) == 1:
                # is_delete == 1 means zero stock: sold out, skip.
                print('------>>>| 该商品库存为0,已被抢光!')
                sleep(.5)
            else:
                # Fetch the remaining-time fragment from the mobile page.
                my_phantomjs = BaseDriver(executable_path=PHANTOMJS_DRIVER_PATH,
                                          ip_pool_type=self.ip_pool_type)
                tmp_body = my_phantomjs.use_phantomjs_to_get_url_body(
                    url=tmp_url, css_selector='p#activityTime span')
                try:
                    del my_phantomjs
                except NameError:
                    pass
                gc.collect()
                if tmp_body == '':
                    # Failed to fetch the full mobile page html.
                    sleep(.5)
                    continue
                _t = Selector(text=tmp_body).css('p#activityTime span::text').extract_first()
                if _t:
                    # Strip the '剩余' prefix only when something was extracted
                    # (the original called re.sub on None and crashed).
                    _t = re.compile(r'剩余').sub('', _t)
                if _t == '' or _t is None:
                    print('获取到的_t为空值, 严重错误! 请检查!')
                    # BUGFIX: previously fell through and parsed the bad value.
                    continue
                miaosha_end_time = self.get_miaosha_end_time(_t)
                goods_data['goods_url'] = tmp_url
                goods_data['goods_id'] = str(goods_id)
                goods_data['sub_title'] = item.get('sub_title', '')
                goods_data['miaosha_time'] = {
                    'miaosha_begin_time': timestamp_to_regulartime(int(time.time())),
                    'miaosha_end_time': timestamp_to_regulartime(int(miaosha_end_time)),
                }
                goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = \
                    get_miaosha_begin_time_and_miaosha_end_time(
                        miaosha_time=goods_data['miaosha_time'])
                goods_data['gender'] = str(item.get('gender', '0'))
                goods_data['page'] = item.get('page')
                res = chuchujie.insert_into_chuchujie_xianshimiaosha_table(
                    data=goods_data, pipeline=my_pipeline)
                if res:
                    if goods_id not in db_goods_id_list:
                        db_goods_id_list.append(goods_id)
                # No sleep here: phantomjs start-up is slow enough already.
    else:
        print('数据库连接失败,此处跳过!')
    try:
        del chuchujie
    except NameError:
        pass
    gc.collect()
class MoGuJiePinTuan(object):
    """Crawler that collects Mogujie group-buy (pintuan) goods per category."""

    def __init__(self):
        self._set_headers()
        self._set_fcid_dict()
        self.ip_pool_type = IP_POOL_TYPE

    def _set_headers(self):
        # Static request headers with a randomized User-Agent.
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            # 'Accept-Encoding:': 'gzip',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'api.mogujie.com',
            'Referer': 'https://pintuan.mogujie.com/ptpt/app/pd?acm=3.mce.1_10_1fvsk.51827.0.mUTadqIzS9Pbg.m_370494-pos_2-mf_4537_796033&ptp=m1._mf1_1239_4537._keyword_51827.0.xLt0G92',
            'User-Agent': get_random_pc_ua(),  # random UA per instance
        }

    def _set_fcid_dict(self):
        # Category name -> fcid query parameter of the list endpoint.
        self.fcid_dict = {
            '女装': 10053171,
            # '精选': 10053172,
            '男友': 10053173,
            '内衣': 10053174,
            '女鞋': 10053175,
            '包包': 10053176,
            '美妆': 10053177,
            '生活': 10053178,
            '配饰': 10053179,
            '母婴': 10053180,
            '食品': 10053181,
        }

    def get_pintuan_goods_info(self):
        '''
        Collect all recent group-buy goods across every category.

        NOTE: the mobile API (api.mogujie.com mwp.darwin.get) needs an mw-sign
        we cannot forge, so the PC list endpoint is scraped through phantomjs
        instead (the dead commented-out mobile-API attempt was removed).
        :return: None
        '''
        goods_list = []
        self.my_phantomjs = BaseDriver(executable_path=PHANTOMJS_DRIVER_PATH,
                                       ip_pool_type=self.ip_pool_type)
        for key in self.fcid_dict:
            print('正在抓取的分类为: ', key)
            for index in range(1, 100):
                if index % 5 == 0:
                    # Periodically recreate phantomjs to avoid a stale session.
                    try:
                        del self.my_phantomjs
                    except AttributeError:
                        pass
                    gc.collect()
                    self.my_phantomjs = BaseDriver(
                        executable_path=PHANTOMJS_DRIVER_PATH,
                        ip_pool_type=self.ip_pool_type)
                fcid = self.fcid_dict[key]
                tmp_url = 'http://list.mogujie.com/search?page={0}&fcid={1}&algoKey=pc_tuan_book_pop&cKey=pc-tuan'.format(
                    str(index), fcid)
                # requests worked at first but got filtered; phantomjs works.
                body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url)
                try:
                    body = re.compile(r'<pre.*?>(.*?)</pre>').findall(body)[0]
                    tmp_data = json.loads(body)
                except Exception:
                    print('json.loads转换body时出错, 请检查')
                    continue
                tmp_item_list = tmp_data.get('result', {}).get('wall', {}).get('docs', [])
                if tmp_item_list == []:
                    # No more group-buy data for this category.
                    break
                begin_time_timestamp = int(time.time())  # group-buy start time
                item_list = [{
                    'goods_id': item.get('tradeItemId', ''),
                    'pintuan_time': {
                        'begin_time': timestamp_to_regulartime(
                            timestamp=begin_time_timestamp),
                        'end_time': timestamp_to_regulartime(
                            self.get_pintuan_end_time(
                                begin_time_timestamp,
                                item.get('leftTimeOrg', ''))),
                    },
                    'all_sell_count': str(item.get('salesVolume', 0)),
                    'fcid': fcid,
                    'page': index,
                    'sort': key,
                } for item in tmp_item_list]
                print(item_list)
                goods_list.extend(item_list)
                sleep(MOGUJIE_SLEEP_TIME)
        # Hand everything over for parsing + storage.
        print(goods_list)
        self.deal_with_data(goods_list)
        sleep(5)

    def deal_with_data(self, *params):
        '''
        Parse and store the collected group-buy goods.
        :param params: params[0] is the goods list
        :return:
        '''
        goods_list = params[0]
        mogujie = MoGuJieParse()
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        if my_pipeline.is_connect_success:
            _ = list(my_pipeline._select_table(sql_str=mg_select_str_1))
            db_goods_id_list = [item[0] for item in _]
            print(db_goods_id_list)
            for item in goods_list:
                if item.get('goods_id', '') in db_goods_id_list:
                    print('该goods_id已经存在于数据库中, 此处跳过')
                    continue
                goods_id = str(item.get('goods_id', ''))
                tmp_url = 'https://shop.mogujie.com/detail/' + str(goods_id)
                mogujie.get_goods_data(goods_id=str(goods_id))
                goods_data = mogujie.deal_with_data()
                if goods_data == {}:
                    # Empty parse result: skip.
                    continue
                # Normalize and enrich before insert.
                goods_data['price_info_list'] = _get_mogujie_pintuan_price_info_list(
                    goods_data['price_info_list'])
                goods_data['goods_url'] = tmp_url
                goods_data['goods_id'] = str(goods_id)
                goods_data['pintuan_time'] = item.get('pintuan_time', {})
                goods_data['pintuan_begin_time'], goods_data['pintuan_end_time'] = \
                    get_miaosha_begin_time_and_miaosha_end_time(
                        miaosha_time=item.get('pintuan_time', {}))
                goods_data['all_sell_count'] = item.get('all_sell_count', '')
                goods_data['fcid'] = str(item.get('fcid'))
                goods_data['page'] = str(item.get('page'))
                goods_data['sort'] = str(item.get('sort', ''))
                _r = mogujie.insert_into_mogujie_pintuan_table(
                    data=goods_data, pipeline=my_pipeline)
                if _r and goods_id not in db_goods_id_list:
                    db_goods_id_list.append(goods_id)
                sleep(MOGUJIE_SLEEP_TIME)  # slow down
        else:
            print('数据库连接失败,此处跳过!')
        try:
            del mogujie
        except NameError:
            pass
        gc.collect()

    def get_pintuan_end_time(self, begin_time, left_time):
        '''
        Compute the group-buy end timestamp from its remaining-time string.
        :param begin_time: start timestamp (seconds)
        :param left_time: remaining-time string, e.g. '6天13小时', '13小时57分', '36分'
        :return: end timestamp (int)
        '''
        had_day = re.compile(r'天').findall(left_time)
        had_hour = re.compile(r'小时').findall(left_time)
        tmp = re.compile(r'\d+').findall(left_time)
        if had_day != [] and had_hour != []:
            # '6天13小时'
            day, hour, minute = int(tmp[0]), int(tmp[1]), 0
        elif had_day == [] and had_hour != []:
            # '13小时57分' (guard: plain '13小时' used to raise IndexError)
            day, hour = 0, int(tmp[0])
            minute = int(tmp[1]) if len(tmp) > 1 else 0
        elif had_day == [] and had_hour == [] and tmp != []:
            # '36分' (guard on tmp: empty string used to raise IndexError)
            print('left_time = ', left_time)
            day, hour, minute = 0, 0, int(tmp[0])
        else:
            # No usable day/hour/minute info (e.g. '' or '6天' with no hours).
            print('day, hour, min = 0, 0, 0', 'left_time = ', left_time)
            day, hour, minute = 0, 0, 0
        left_end_time_timestamp = \
            day * 24 * 60 * 60 + \
            hour * 60 * 60 + \
            minute * 60
        return begin_time + left_end_time_timestamp

    def __del__(self):
        try:
            del self.my_phantomjs
        except AttributeError:
            pass
        gc.collect()
class GX8899Spider(Crawler):
    '''
    gx8899.com avatar spider: crawls avatar image urls per category and uses
    them to fill missing head_img_url values in dbo.sina_weibo.
    '''
    def __init__(self, logger=None):
        '''
        :param logger: optional external logger, forwarded to Crawler
        '''
        super(GX8899Spider, self).__init__(
            ip_pool_type=IP_POOL_TYPE,
            log_print=True,
            logger=logger,
            log_save_path=MY_SPIDER_LOGS_PATH + '/gx8899/_/',
            is_use_driver=True,
            driver_executable_path=PHANTOMJS_DRIVER_PATH)
        self._set_sort_type_name()
        self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        # parameterized update: (head_img_url, modify_time, id)
        self.update_sql = 'update dbo.sina_weibo set head_img_url=%s, modify_time=%s where id=%s'
        self.id_list = []       # cached db ids still waiting for an img url
        self.update_index = 0   # number of rows updated so far

    def _set_sort_type_name(self):
        '''
        Set the category slugs to crawl (commented entries are disabled).
        :return: None
        '''
        self.sort_type_name_list = [
            # 'weixin',
            # 'nansheng',
            # 'nvsheng',
            'fengjing',
            'jingxuan',
            'wupin',
            'oumei',
            'weimei',
            'heibai',
            'baqi',
            'xiaoqingxin',
            'yijing',
            'beiying',
            'chouyan',
            'sumiao',
            'gexing',
            'xiaohai',
            'qiche',
            'zhiwu',
            'shouhui',
            'weshen',
            'mingxing',
            'jianzhu',
            'renwu',
        ]

    def _get_gx8899_all_img_url(self):
        '''
        Crawl every configured category and collect all avatar img urls.
        :return: list of img urls (also stored on self.fz)
        '''
        self.lg.info('即将开始采集gx8899...')
        fz = []
        for sort_type_name in self.sort_type_name_list:
            tmp = self._get_one_sort_type_name_page_info(sort_type_name)
            if tmp != []:
                fz += tmp
        self.lg.info('@@@ 全部头像抓取完毕!')
        self.fz = fz
        return fz

    def _get_new_wait_2_handle_id_list(self):
        '''
        Fetch (and cache on self.id_list) up to 1000 row ids that still have
        no modify_time, i.e. still need a head_img_url.
        :return: the cached id list ([] on db error)
        '''
        sql_str = '''
        select top 1000 id
        from dbo.sina_weibo
        where sina_type = 'bilibili'
        and modify_time is null
        '''
        if self.id_list == []:
            self.lg.info('@@@ 重新获取id_list...')
            self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
            try:
                wait = self.my_pipeline._select_table(sql_str=sql_str)
                self.id_list = [i[0] for i in wait]
            except (TypeError, IndexError):
                # BUG FIX: the original `except TypeError or IndexError:`
                # evaluated `TypeError or IndexError` -> TypeError, so
                # IndexError was never caught; a tuple catches both.
                sleep(8)
                return []
        return self.id_list

    @fz_set_timeout(6)
    def oo(self, id, img_url):
        '''
        Write one head_img_url update; aborted after 6s by fz_set_timeout.
        :param id: row id in dbo.sina_weibo (parameter name kept for
                   keyword callers, although it shadows the builtin `id`)
        :param img_url: new head image url
        :return: True on success, False on any db error
        '''
        try:
            self.my_pipeline._update_table_2(
                sql_str=self.update_sql,
                params=(img_url, get_shanghai_time(), id),
                logger=self.lg)
        except Exception:
            return False
        return True

    def _get_one_sort_type_name_page_info(self, sort_type_name):
        '''
        Crawl every listing page of one category until a 404 page is seen.
        :param sort_type_name: category slug, e.g. 'fengjing'
        :return: list of all img urls found in this category (may be [])
        '''
        base_url = 'http://m.gx8899.com/{0}/'.format(sort_type_name)
        # headers are only used by the disabled aiohttp path below
        headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': get_random_pc_ua(),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            # 'Referer': 'http://m.gx8899.com/weixin/',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
        }
        index = 0
        res = []
        while True:
            if index == 0:
                url = base_url
                index += 1      # second page starts at index_2
            else:
                url = base_url + 'index_{0}.html'.format(index)
            self.lg.info('正在抓取{0}'.format(url))
            # aiohttp was too slow here, use phantomjs instead
            # body = self._get_loop_run_result(url=url, headers=headers)
            if index % 15 == 0:
                # recycle phantomjs periodically to limit memory growth
                try:
                    del self.driver
                except Exception:
                    pass
                gc.collect()
                self.driver = BaseDriver(executable_path=PHANTOMJS_DRIVER_PATH, logger=self.lg, ip_pool_type=self.ip_pool_type)
                self.lg.info('[+] phantomjs已重置!')
            body = self.driver.get_url_body(url=url)
            # self.lg.info(str(body))
            if re.compile(r'<title>404 - 找不到文件或目录。</title>').findall(body) != []:
                # ran past the last page of this category
                break
            need = Selector(text=body).css('div#con_tabone_1 li.last a:last-child ::attr(href)').extract()
            pprint(need)
            if need == []:
                self.lg.error('获取到的need为空list!出错地址:{0}'.format(url))
                # BUG FIX: the original `continue`d without advancing `index`,
                # which retried the same url forever; move to the next page.
                index += 1
                continue
            for article_url in need:
                _ = self._get_one_article_page_info(article_url)
                if _ != []:
                    res += _
            self.lg.info('#### 已更新{0}个id !'.format(self.update_index))
            index += 1
        return res

    def _get_one_article_page_info(self, url):
        '''
        Collect all img urls inside one article page and assign each to a
        randomly chosen pending db id via self.oo().
        :param url: article page url
        :return: list of img urls found ([] on fetch failure)
        '''
        headers = {
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
        }
        # body = self._get_loop_run_result(url=url, headers=headers)
        body = self.driver.get_url_body(url=url)
        if body == '':
            self.lg.info('获取到img list为空list!出错地址:{}'.format(url))
            return []
        need = Selector(text=body).css('div.content p img ::attr(src)').extract()
        # pprint(need)
        # self.lg.info(str(need))
        if need != []:
            self.lg.info('[+] crawl子地址success')
        else:
            self.lg.info('[-] crawl子地址fail')
        # db update: pair each img url with a random pending id
        for img_url in need:
            try:
                # raises ValueError via randint(0, -1) when no ids are pending
                random_id_index = randint(0, len(self._get_new_wait_2_handle_id_list()) - 1)
            except Exception:
                sleep(5)
                continue
            res = self.oo(
                id=self.id_list[random_id_index],
                img_url=img_url)
            if res:
                # consumed: drop the id so it is not updated twice
                self.id_list.pop(random_id_index)
                self.update_index += 1
        return need

    async def _get_one_page_body(self, url, headers):
        '''
        Fetch one page body asynchronously (currently unused fallback path).
        :param url: target url
        :param headers: request headers
        :return: response body string
        '''
        body = await AioHttp.aio_get_url_body(url=url, headers=headers, ip_pool_type=self.ip_pool_type)
        return body

    def _get_loop_run_result(self, **kwargs):
        '''
        Run the async fetch to completion on the current event loop.
        :param kwargs: url, headers
        :return: response body string
        '''
        loop = get_event_loop()
        result = loop.run_until_complete(self._get_one_page_body(
            url=kwargs.get('url', ''),
            headers=kwargs.get('headers', {})))
        return result

    def __del__(self):
        # best-effort teardown; attributes may already be gone at interpreter exit
        try:
            del self.driver
            del self.lg
        except Exception:
            pass
        gc.collect()
def get_pintuan_goods_info(self):
    '''
    Collect all recent time-limited group-buy goods by paging through the
    PC list endpoint, then hand the result to self.deal_with_data().

    NOTE: the mobile API (api.mogujie.com/h5/mwp.darwin.get) needs a signed
    'mw-sign' parameter that we could not reproduce, so the PC endpoint
    (list.mogujie.com/search) is scraped via phantomjs instead; plain
    requests gets filtered by the site.

    :return: None
    '''
    goods_list = []
    self.my_phantomjs = BaseDriver(executable_path=PHANTOMJS_DRIVER_PATH, ip_pool_type=self.ip_pool_type)
    for key in self.fcid_dict:
        print('正在抓取的分类为: ', key)
        for index in range(1, 100):
            if index % 5 == 0:
                # recycle phantomjs periodically to limit memory growth
                try:
                    del self.my_phantomjs
                except Exception:
                    pass
                gc.collect()
                self.my_phantomjs = BaseDriver(executable_path=PHANTOMJS_DRIVER_PATH, ip_pool_type=self.ip_pool_type)
            fcid = self.fcid_dict[key]
            tmp_url = 'http://list.mogujie.com/search?page={0}&fcid={1}&algoKey=pc_tuan_book_pop&cKey=pc-tuan'.format(
                str(index), fcid)
            body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url)
            # print(body)
            try:
                # the JSON payload is wrapped in a <pre> tag by phantomjs
                body = re.compile(r'<pre.*?>(.*?)</pre>').findall(body)[0]
                tmp_data = json.loads(body)
            except (IndexError, ValueError):
                # IndexError: no <pre> found; ValueError: invalid JSON
                # (narrowed from the original bare `except:`)
                print('json.loads转换body时出错, 请检查')
                continue
            tmp_item_list = tmp_data.get('result', {}).get('wall', {}).get('docs', [])
            if tmp_item_list == []:
                # no more group-buy items in this category: stop paging
                break
            # pprint(tmp_item_list)
            begin_time_timestamp = int(time.time())    # group-buy begin time
            item_list = [{
                'goods_id': item.get('tradeItemId', ''),
                'pintuan_time': {
                    'begin_time': timestamp_to_regulartime(timestamp=begin_time_timestamp),
                    'end_time': timestamp_to_regulartime(self.get_pintuan_end_time(begin_time_timestamp, item.get('leftTimeOrg', ''))),
                },
                'all_sell_count': str(item.get('salesVolume', 0)),
                'fcid': fcid,
                'page': index,
                'sort': key,
            } for item in tmp_item_list]
            print(item_list)
            goods_list.extend(item_list)
            sleep(MOGUJIE_SLEEP_TIME)

    # process the collected goods_list
    print(goods_list)
    self.deal_with_data(goods_list)
    sleep(5)