def get_miaoshao_goods_info_list(self, data):
    '''
    Extract the useful fields of every flash-sale (miaosha) item.

    :param data: iterable of raw item dicts to parse
    :return: list of dicts with the keys miaosha_time, goods_id, stock_info,
             price and taobao_price
    '''
    miaosha_goods_list = []
    for item in data:
        begin_time = timestamp_to_regulartime(int(item.get('start_time')))
        end_time = timestamp_to_regulartime(int(item.get('end_time')))
        # Total stock is read once and reused for both stock fields.
        stock = item.get('stock', 0)
        miaosha_goods_list.append({
            'miaosha_time': {
                'miaosha_begin_time': begin_time,
                'miaosha_end_time': end_time,
            },
            # goods_id of the item
            'goods_id': item.get('goods_id'),
            # Flash-sale stock: 'rate' is applied to the stock as a percentage.
            'stock_info': {
                'activity_stock': int(stock * (item.get('rate', 0) / 100)),
                'stock': stock,
            },
            # Original price ('oprice') and sale price ('cprice'), 2 decimals.
            'price': round(float(item.get('oprice', '0')), 2),
            'taobao_price': round(float(item.get('cprice', '0')), 2),
        })
    return miaosha_goods_list
async def get_tiantiantejia_goods_list(self, data):
    '''
    Convert the raw item list into the list of dicts this spider needs.

    :param data: list of raw item dicts; the fields are read from each
                 item's 'baseinfo' dict
    :return: a list of {'goods_id', 'start_time', 'end_time'} dicts; [] when
             data is empty or when converting any item raises
    '''
    if data == []:
        return []

    tejia_goods_list = []
    try:
        for item in data:
            # Only the first 10 characters of 'ostime'/'oetime' are used.
            goods_id = item.get('baseinfo', {}).get('itemId', '')
            start_time = timestamp_to_regulartime(
                int(item.get('baseinfo', {}).get('ostime', '')[0:10]))
            end_time = timestamp_to_regulartime(
                int(item.get('baseinfo', {}).get('oetime', '')[0:10]))
            tejia_goods_list.append({
                'goods_id': goods_id,
                'start_time': start_time,
                'end_time': end_time,
            })
    except Exception as e:
        # One bad item invalidates the whole batch.
        self.my_lg.exception(e)
        tejia_goods_list = []

    return tejia_goods_list
def _get_pintuan_goods_info(self):
    '''
    Build the paged list URL and collect every current limited-time
    group-buy (pintuan) item.

    Pages 0..99 are requested in order; crawling stops at the first page
    that yields no goods.

    :return: list of dicts (goods_id, begin_time, end_time, all_sell_count,
             page), de-duplicated by goods_id (first occurrence wins)
    '''
    pintuan_goods_id_list = []
    # Ids already collected: O(1) duplicate check instead of rebuilding a
    # list of every collected id for each new item.
    seen_goods_ids = set()

    for page in range(0, 100):
        tmp_url = 'https://tuan.juanpi.com/pintuan/get_goods_list?page={0}&pageSize=20&cid=pinhaohuo_sx&show_type=wap'.format(
            str(page)
        )
        print('正在抓取的页面地址为: ', tmp_url)

        body = MyRequests.get_url_body(url=tmp_url, headers=self.headers)
        if body == '':
            body = '{}'

        try:
            tmp_data = json.loads(body).get('data', {}).get('goods', [])
        except Exception:
            print('json.loads转换tmp_data时出错!')
            tmp_data = []

        sleep(.5)

        # An empty (or missing) goods list marks the end of the listing.
        if not tmp_data:
            print('该tmp_url得到的goods为空list, 此处跳过!')
            break

        tmp_pintuan_goods_id_list = [{
            'goods_id': item.get('goods_id', ''),
            'begin_time': timestamp_to_regulartime(int(item.get('start_time', ''))),
            'end_time': timestamp_to_regulartime(int(item.get('end_time', ''))),
            'all_sell_count': str(item.get('join_number_int', '')),
            'page': page,
        } for item in tmp_data]

        for goods in tmp_pintuan_goods_id_list:
            goods_id = goods.get('goods_id', '')
            if goods_id not in seen_goods_ids:
                seen_goods_ids.add(goods_id)
                pintuan_goods_id_list.append(goods)

    print('该pintuan_goods_id_list的总个数为: ', len(pintuan_goods_id_list))
    print(pintuan_goods_id_list)

    return pintuan_goods_id_list
def _get_goods_schedule(self, data): ''' 获取商品销售时间段 :param data: :return: ''' # print(data.get('skudata', {}).get('info', {})) # print(data.get('skudata', {})) begin_time = data.get('skudata', {}).get('info', {}).get( 'start_time') # 取这个时间段才是正确的销售时间, 之前baseInfo是虚假的 end_time = data.get('skudata', {}).get('info', {}).get('end_time') if begin_time is None or end_time is None: schedule = [] else: schedule = [{ 'begin_time': timestamp_to_regulartime(begin_time), 'end_time': timestamp_to_regulartime(end_time), }] return schedule
def json_to_dict(self, tmp_data):
    '''
    Parse a JSON string and return the value stored under its 'items' key.

    The 'times' entry of the payload is intentionally not touched: it was
    previously converted into an unused local, and a malformed entry there
    made the method throw away otherwise valid items.

    :param tmp_data: JSON text
    :return: the 'items' value ([] when the key is absent or when the text
             cannot be parsed)
    '''
    try:
        data = json.loads(tmp_data).get('items', [])
    except Exception:
        print('json.loads转换data的时候出错,data为空')
        data = []

    return data
def get_miaoshao_goods_info_list(self, data):
    '''
    Extract the useful fields of every flash-sale (miaosha) item.

    :param data: iterable of raw item dicts to parse
    :return: list of dicts with the keys miaosha_time, zid, stock_info,
             price, taobao_price and sub_title
    '''
    miaosha_goods_list = []

    for item in data:
        goods = {
            # Flash-sale begin/end time; only the first 10 characters of
            # each raw timestamp are used.
            'miaosha_time': {
                'miaosha_begin_time': timestamp_to_regulartime(
                    int(str(item.get('begin_time'))[0:10])),
                'miaosha_end_time': timestamp_to_regulartime(
                    int(str(item.get('end_time'))[0:10])),
            },
            # zhe800 goods id
            'zid': item.get('zid'),
            # activity_stock: quantity still available in the sale;
            # stock: total stock of the sale (per the original author).
            'stock_info': {
                'activity_stock': item.get('activity_stock', 0),
                'stock': item.get('stock', 0),
            },
            # Original price
            'price': float(item.get('list_price')),
            # Flash-sale price
            'taobao_price': float(item.get('price')),
            # Sub title
            'sub_title': item.get('description', ''),
        }
        miaosha_goods_list.append(goods)

    return miaosha_goods_list
def get_miaoshao_goods_info_list(self, data):
    '''
    Extract the useful fields of every flash-sale (miaosha) item.

    Items whose begin hour is not in PINDUODUO_MIAOSHA_SPIDER_HOUR_LIST are
    dropped.

    :param data: iterable of raw items; the fields are read from item['data']
    :return: list of dicts with the keys miaosha_time, goods_id, stock_info,
             price and taobao_price
    '''
    miaosha_goods_list = []

    for item in data:
        goods = item.get('data', {})
        miaosha_begin_time = str(timestamp_to_regulartime(int(goods.get('start_time'))))
        # Characters [-8:-6] of the formatted begin time are the hour.
        tmp_hour = miaosha_begin_time[-8:-6]
        if tmp_hour not in PINDUODUO_MIAOSHA_SPIDER_HOUR_LIST:
            continue

        # Sales starting at one of the "begin" hours last 30 minutes,
        # every other one lasts 60 minutes.
        if tmp_hour in PINDUODUO_MIAOSHA_BEGIN_HOUR_LIST:
            duration = 60*30
        else:
            duration = 60*60
        miaosha_end_time = str(timestamp_to_regulartime(int(goods.get('start_time')) + duration))

        miaosha_goods_list.append({
            'miaosha_time': {
                'miaosha_begin_time': miaosha_begin_time,
                'miaosha_end_time': miaosha_end_time,
            },
            'goods_id': str(goods.get('goods_id')),
            # Remaining quantity = all_quantity - sold_quantity.
            'stock_info': {
                'activity_stock': int(goods.get('all_quantity', 0) - goods.get('sold_quantity', 0)),
                'stock': goods.get('all_quantity', 0),
            },
            # Raw prices are divided by 100 and rounded to 2 decimals.
            'price': round(float(goods.get('normal_price', '0'))/100, 2),
            'taobao_price': round(float(goods.get('price', '0'))/100, 2),
        })

    return miaosha_goods_list
def get_pintuan_goods_info(self):
    '''
    Build the list URLs and collect every current limited-time group-buy
    (pintuan) item, then hand the result to self.deal_with_data().

    The PC search endpoint is used through phantomjs. Per the original
    author, the mobile API could not be used because its request signature
    could not be reproduced, and plain requests started to be filtered.

    :return: None
    '''
    goods_list = []
    pre_pattern = re.compile(r'<pre.*?>(.*?)</pre>')

    self.my_phantomjs = MyPhantomjs()
    for key in self.fcid_dict:
        print('正在抓取的分类为: ', key)
        fcid = self.fcid_dict[key]

        for index in range(1, 100):
            # Replace the phantomjs instance every 5 pages.
            if index % 5 == 0:
                try:
                    del self.my_phantomjs
                except AttributeError:
                    pass
                gc.collect()
                self.my_phantomjs = MyPhantomjs()

            tmp_url = 'http://list.mogujie.com/search?page={0}&fcid={1}&algoKey=pc_tuan_book_pop&cKey=pc-tuan'.format(
                str(index), fcid
            )
            body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url)

            try:
                # The JSON payload is taken from the page's <pre> element.
                tmp_data = json.loads(pre_pattern.findall(body)[0])
            except Exception:
                print('json.loads转换body时出错, 请检查')
                continue

            tmp_item_list = tmp_data.get('result', {}).get('wall', {}).get('docs', [])
            if tmp_item_list == []:
                # No group-buy data on this page: this category is done.
                break

            # The crawl time is recorded as the group-buy begin time.
            begin_time_timestamp = int(time.time())
            item_list = [{
                'goods_id': item.get('tradeItemId', ''),
                'pintuan_time': {
                    'begin_time': timestamp_to_regulartime(timestamp=begin_time_timestamp),
                    'end_time': timestamp_to_regulartime(
                        self.get_pintuan_end_time(begin_time_timestamp, item.get('leftTimeOrg', ''))),
                },
                'all_sell_count': str(item.get('salesVolume', 0)),
                'fcid': fcid,
                'page': index,
                'sort': key,
            } for item in tmp_item_list]
            print(item_list)

            goods_list.extend(item_list)
            sleep(MOGUJIE_SLEEP_TIME)

    # Process and store the collected goods.
    print(goods_list)
    self.deal_with_data(goods_list)
    sleep(5)
def run_forever(self):
    '''
    Run one refresh pass over every stored mogujie flash-sale record.

    For each row: it is deleted when is_recent_time() returns 0 (expired),
    left untouched when it returns 2, and otherwise re-crawled and updated.
    Rows whose goods no longer appear in the event's item list are deleted
    too. Afterwards the method sleeps 5.5 hours when the Shanghai hour is 0,
    otherwise 5 seconds.

    :return: None
    '''
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    sql_str = r'select goods_id, miaosha_time, event_time, goods_url from dbo.mogujie_xianshimiaosha where site_id=22'
    try:
        result = list(tmp_sql_server._select_table(sql_str=sql_str))
    except TypeError:
        print('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None

    if result is not None:
        print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        print(result)
        print('--------------------------------------------------------')

        print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
        for index, item in enumerate(result, 1):
            goods_id = item[0]
            miaosha_time = json.loads(item[1])
            # int() truncates the float returned by mktime; no string slicing.
            miaosha_end_time = int(time.mktime(
                time.strptime(miaosha_time.get('miaosha_end_time'), '%Y-%m-%d %H:%M:%S')))

            # A fresh parser per row keeps memory usage low (original author's note).
            mogujie_miaosha = MoGuJieMiaoShaParse()
            if index % 50 == 0:
                # Reconnect every 50 rows to avoid a stale long-lived connection.
                print('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')

            if not tmp_sql_server.is_connect_success:
                print('数据库连接失败,数据库可能关闭或者维护中')
            else:
                recent_state = self.is_recent_time(miaosha_end_time)
                if recent_state == 0:
                    # NOTE(review): params is the bare goods_id, not a 1-tuple,
                    # exactly as before; confirm what _delete_table expects.
                    tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=goods_id)
                    print('过期的goods_id为(%s)' % goods_id,
                          ', 限时秒杀开始时间为(%s), 删除成功!' % miaosha_time.get('miaosha_begin_time'))

                elif recent_state == 2:
                    # Deliberately not a break: the rows are not ordered by time.
                    pass

                else:
                    # recent_state == 1: inside the window that must be refreshed.
                    print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (goods_id, index))

                    item_list = self.get_item_list(event_time=str(item[2]))
                    if item_list == '':
                        # Probably a network problem; skip for now.
                        pass
                    elif goods_id not in [item_1.get('iid', '') for item_1 in item_list]:
                        # Covers both an empty list and a list without this goods:
                        # the goods was removed from the flash-sale event.
                        print('该商品已被下架限时秒杀活动,此处将其删除')
                        tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=goods_id)
                        print('下架的goods_id为(%s)' % goods_id, ', 删除成功!')
                    else:
                        for item_2 in item_list:
                            if item_2.get('iid', '') != goods_id:
                                continue

                            # The stored goods_url is what the parser expects here.
                            mogujie_miaosha.get_goods_data(goods_id=item[3])
                            goods_data = mogujie_miaosha.deal_with_data()
                            if goods_data == {}:
                                continue

                            goods_data['goods_id'] = str(goods_id)

                            # 'price' is set to the highest normal (original) price.
                            try:
                                normal_prices = [
                                    round(float(item_4.get('normal_price', '')), 2)
                                    for item_4 in goods_data['price_info_list']
                                ]
                                goods_data['price'] = round(Decimal(max(normal_prices)), 2)
                            except Exception:
                                print('设置price为原价时出错!请检查')
                                continue

                            goods_data['miaosha_time'] = {
                                'miaosha_begin_time': timestamp_to_regulartime(int(item_2.get('startTime', 0))),
                                'miaosha_end_time': timestamp_to_regulartime(int(item_2.get('endTime', 0))),
                            }
                            goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = \
                                self.get_miaosha_begin_time_and_miaosha_end_time(
                                    miaosha_time=goods_data['miaosha_time'])

                            mogujie_miaosha.update_mogujie_xianshimiaosha_table(
                                data=goods_data, pipeline=tmp_sql_server)
                            sleep(MOGUJIE_SLEEP_TIME)  # throttle

            gc.collect()
        print('全部数据更新完毕'.center(100, '#'))

    if get_shanghai_time().hour == 0:
        # No updates after midnight.
        sleep(60 * 60 * 5.5)
    else:
        sleep(5)
    gc.collect()
def run_forever(self):
    '''
    Run one refresh pass over every stored mogujie group-buy (pintuan) record.

    For each row: it is deleted when is_recent_time() returns 0 (expired),
    left untouched when it returns 2, and otherwise its list page is
    re-fetched. If the goods is still on that page its data and group-buy
    times are updated; if not, only its goods data is updated (per the
    original author such goods are still on sale). Afterwards the method
    sleeps 5.5 hours when the Shanghai hour is 0, otherwise 5 seconds.

    :return: None
    '''
    def _as_pintuan_price_info(price_info_list):
        # Normalize: the parsed detail_price becomes pintuan_price and
        # detail_price itself is blanked.
        return [{
            'spec_value': item_4.get('spec_value'),
            'pintuan_price': item_4.get('detail_price'),
            'detail_price': '',
            'normal_price': item_4.get('normal_price'),
            'img_url': item_4.get('img_url'),
            'rest_number': item_4.get('rest_number'),
        } for item_4 in price_info_list]

    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    sql_str = r'select goods_id, miaosha_time, fcid, page from dbo.mogujie_pintuan where site_id=23'
    try:
        result = list(tmp_sql_server._select_table(sql_str=sql_str))
    except TypeError:
        print('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None

    if result is not None:
        print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        print(result)
        print('--------------------------------------------------------')

        print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
        pre_pattern = re.compile(r'<pre.*?>(.*?)</pre>')
        self.my_phantomjs = MyPhantomjs()
        for index, item in enumerate(result, 1):
            goods_id = item[0]
            pintuan_time = json.loads(item[1])
            # int() truncates the float returned by mktime; no string slicing.
            pintuan_end_time = int(time.mktime(
                time.strptime(pintuan_time.get('end_time'), '%Y-%m-%d %H:%M:%S')))

            mogujie_pintuan = MoGuJieParse()
            if index % 8 == 0:
                # Replace the phantomjs instance every 8 rows.
                try:
                    del self.my_phantomjs
                except AttributeError:
                    pass
                gc.collect()
                self.my_phantomjs = MyPhantomjs()

            if index % 50 == 0:
                # Reconnect every 50 rows to avoid a stale long-lived connection.
                print('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')

            if not tmp_sql_server.is_connect_success:
                print('数据库连接失败,此处跳过!')
            else:
                recent_state = self.is_recent_time(pintuan_end_time)
                if recent_state == 0:
                    # NOTE(review): params is the bare goods_id, not a 1-tuple,
                    # exactly as before; confirm what _delete_table expects.
                    tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=goods_id)
                    print('过期的goods_id为(%s)' % goods_id,
                          ', 拼团开始时间为(%s), 删除成功!' % pintuan_time.get('begin_time'))

                elif recent_state == 2:
                    # Deliberately not a break: the rows are not ordered by time.
                    pass

                else:
                    # recent_state == 1: inside the window that must be refreshed.
                    print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (goods_id, index))

                    tmp_url = 'http://list.mogujie.com/search?page={0}&fcid={1}&algoKey=pc_tuan_book_pop&cKey=pc-tuan'.format(
                        item[3], item[2])
                    # Plain requests get no data here, so phantomjs is used.
                    body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url)

                    if body == '':
                        print('获取到的body为空值! 此处跳过')
                    else:
                        try:
                            tmp_data = json.loads(pre_pattern.findall(body)[0])
                        except Exception:
                            print('json.loads转换body时出错, 请检查')
                            # NOTE(review): as before, a parse failure is treated like
                            # an empty page, so the record is deleted below.
                            tmp_data = {}

                        tmp_item_list = tmp_data.get('result', {}).get('wall', {}).get('docs', [])
                        if tmp_item_list == []:
                            print('得到的docs为[]!')
                            print('该商品已被下架限时秒杀活动,此处将其删除')
                            tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=goods_id)
                            print('下架的goods_id为(%s)' % goods_id, ', 删除成功!')
                        else:
                            # The crawl time is recorded as the group-buy begin time.
                            begin_time_timestamp = int(time.time())
                            item_list = [{
                                'goods_id': doc.get('tradeItemId', ''),
                                'pintuan_time': {
                                    'begin_time': timestamp_to_regulartime(timestamp=begin_time_timestamp),
                                    'end_time': timestamp_to_regulartime(
                                        self.get_pintuan_end_time(
                                            begin_time_timestamp, doc.get('leftTimeOrg', ''))),
                                },
                                'all_sell_count': str(doc.get('salesVolume', 0)),
                            } for doc in tmp_item_list]

                            pintuan_goods_all_goods_id = [item_1.get('goods_id', '') for item_1 in item_list]

                            if goods_id not in pintuan_goods_all_goods_id:
                                # Gone from the list page but, per the original author,
                                # still on sale: refresh goods data, keep the times.
                                mogujie_pintuan.get_goods_data(goods_id=goods_id)
                                goods_data = mogujie_pintuan.deal_with_data()
                                if goods_data != {}:
                                    print('+++ 内部下架,其实还在售卖的商品更新')
                                    price_info_list = _as_pintuan_price_info(goods_data['price_info_list'])
                                    goods_data['goods_id'] = goods_id
                                    goods_data['price_info_list'] = price_info_list

                                    mogujie_pintuan.update_mogujie_pintuan_table_2(
                                        data=goods_data, pipeline=tmp_sql_server)
                                    sleep(MOGUJIE_SLEEP_TIME)  # throttle
                            else:
                                # Still listed: refresh goods data and group-buy times.
                                for item_2 in item_list:
                                    if item_2.get('goods_id', '') != goods_id:
                                        continue

                                    mogujie_pintuan.get_goods_data(goods_id=goods_id)
                                    goods_data = mogujie_pintuan.deal_with_data()
                                    if goods_data == {}:
                                        continue

                                    price_info_list = _as_pintuan_price_info(goods_data['price_info_list'])
                                    goods_data['goods_id'] = goods_id
                                    goods_data['price_info_list'] = price_info_list
                                    goods_data['pintuan_time'] = item_2.get('pintuan_time', {})
                                    goods_data['pintuan_begin_time'], goods_data['pintuan_end_time'] = \
                                        self.get_pintuan_begin_time_and_pintuan_end_time(
                                            pintuan_time=goods_data['pintuan_time'])
                                    goods_data['all_sell_count'] = item_2.get('all_sell_count', '')

                                    mogujie_pintuan.update_mogujie_pintuan_table(
                                        data=goods_data, pipeline=tmp_sql_server)
                                    sleep(MOGUJIE_SLEEP_TIME)  # throttle

            gc.collect()
        print('全部数据更新完毕'.center(100, '#'))

    if get_shanghai_time().hour == 0:
        # No updates after midnight.
        sleep(60 * 60 * 5.5)
    else:
        sleep(5)
    gc.collect()
def deal_with_data(self, *param):
    '''
    Parse and store the flash-sale goods of one promotion.

    :param param: positional values, in order:
                  param[0] pid of the promotion,
                  param[1] begin time as a '%Y/%m/%d %H:%M:%S' string,
                  param[2] end time as a '%Y/%m/%d %H:%M:%S' string,
                  param[3] list of raw item dicts
    :return: None
    '''
    pid = param[0]
    time_format = '%Y/%m/%d %H:%M:%S'
    # Convert the time strings to integer timestamps.
    begin_time = int(time.mktime(time.strptime(param[1], time_format)))
    end_time = int(time.mktime(time.strptime(param[2], time_format)))
    item_list = param[3]

    mia = MiaParse()
    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()

    if not my_pipeline.is_connect_success:
        print('数据库连接失败,此处跳过!')
    else:
        sql_str = r'select goods_id, miaosha_time, pid from dbo.mia_xianshimiaosha where site_id=20'
        # A set gives O(1) membership tests for the loop below.
        db_goods_ids = {row[0] for row in my_pipeline._select_table(sql_str=sql_str)}

        for item in item_list:
            if item.get('item_id', '') in db_goods_ids:
                print('该goods_id已经存在于数据库中, 此处跳过')
                continue

            goods_id = str(item.get('item_id', ''))
            mia.get_goods_data(goods_id=goods_id)
            goods_data = mia.deal_with_data()
            if goods_data == {}:
                # Nothing could be parsed for this goods.
                continue

            # Goods served from the .hk mobile site get the .hk PC URL, all
            # others the mia.com one. The previous test compared the list
            # returned by findall() with '' and therefore was always true.
            if re.search(r'://m\.miyabaobei\.hk/', goods_data['goods_url']):
                goods_url = 'https://www.miyabaobei.hk/item-' + goods_id + '.html'
            else:
                goods_url = 'https://www.mia.com/item-' + goods_id + '.html'

            goods_data['goods_url'] = goods_url
            goods_data['goods_id'] = goods_id
            goods_data['price'] = item.get('active_price')
            goods_data['taobao_price'] = item.get('active_price')  # lowest flash-sale price
            goods_data['sub_title'] = item.get('short_info', '')
            goods_data['miaosha_time'] = {
                'miaosha_begin_time': timestamp_to_regulartime(begin_time),
                'miaosha_end_time': timestamp_to_regulartime(end_time),
            }
            goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = \
                self.get_miaosha_begin_time_and_miaosha_end_time(miaosha_time=goods_data['miaosha_time'])
            goods_data['pid'] = str(pid)

            mia.insert_into_mia_xianshimiaosha_table(data=goods_data, pipeline=my_pipeline)
            sleep(MIA_SPIKE_SLEEP_TIME)  # throttle

    # Drop the parser before forcing a collection.
    del mia
    gc.collect()
def deal_with_data(self):
    '''
    Turn self.result_data into the normalized goods dict.

    self.result_data is reset to {} before returning in every case.

    :return: dict with the normalized goods fields, or {} when
             self.result_data is empty
    '''
    data = self.result_data
    if data == {}:
        print('待处理的data为空的dict, 该商品可能已经转移或者下架')
        self.result_data = {}
        return {}

    shop_name = data['shop_name']                # shop name
    account = ''                                 # shop owner (not available)
    title = data['title']                        # goods title
    sub_title = data['sub_title']                # sub title
    detail_name_list = data['detail_name_list']  # names of the spec attributes
    price_info_list = data['price_info_list']    # price and stock of every spec
    all_img_url = data['all_img_url']            # all sample image urls
    p_info = data['p_info']                      # detail attribute name/value pairs
    div_desc = data['div_desc']                  # description html

    # The goods counts as deleted when no spec has stock left ...
    is_delete = 0
    all_rest_number = sum(item.get('rest_number', 0) for item in price_info_list)
    if all_rest_number == 0:
        is_delete = 1

    # ... or when the official end time already lies in the past.
    if int(data.get('sell_time', {}).get('end_time', '')) < int(time.time()):
        print('该商品已经过期下架...! 进行逻辑删除 is_delete=1')
        is_delete = 1

    # On-sale period
    schedule = [{
        'begin_time': timestamp_to_regulartime(int(data.get('sell_time', {}).get('begin_time', ''))),
        'end_time': timestamp_to_regulartime(int(data.get('sell_time', {}).get('end_time', ''))),
    }]

    # Total sales (not available)
    all_sell_count = ''

    # price is the highest and taobao_price the lowest detail_price.
    try:
        tmp_price_list = sorted([
            round(float(item.get('detail_price', '')), 2)
            for item in data['price_info_list']
        ])
        price = tmp_price_list[-1]
        taobao_price = tmp_price_list[0]
    except (IndexError, ValueError, TypeError):
        # IndexError: no specs at all. ValueError/TypeError: detail_price is
        # '' or None, which is what a delisted goods carries; float('')
        # raises ValueError, which the previous IndexError-only handler
        # let escape.
        print('获取price和taobao_price时出错, 请检查!')
        print('@@@@@@ 此处对该商品进行逻辑删除! @@@@@@')
        price = 0.
        taobao_price = 0.
        is_delete = 1

    result = {
        'shop_name': shop_name,                # shop name
        'account': account,                    # shop owner
        'title': title,                        # goods title
        'sub_title': sub_title,                # sub title
        'price': price,                        # goods price
        'taobao_price': taobao_price,          # lowest price
        'detail_name_list': detail_name_list,  # names of the spec attributes
        'price_info_list': price_info_list,    # price and stock of every spec
        'all_img_url': all_img_url,            # all sample image urls
        'p_info': p_info,                      # detail attribute name/value pairs
        'div_desc': div_desc,                  # description html
        'schedule': schedule,                  # on-sale period
        'all_sell_count': all_sell_count,      # total sales
        'is_delete': is_delete                 # 1 when the goods counts as delisted
    }

    self.result_data = {}
    return result
def deal_with_data(self, *params):
    '''
    Parse and store the flash-sale goods of one crawled list.

    Goods already present in the database, goods that cannot be parsed and
    goods that are sold out are skipped. A goods whose remaining time cannot
    be read from its detail page is skipped as well.

    :param params: params[0] is the list of raw item dicts
    :return: None
    '''
    item_list = params[0]
    chuchujie = ChuChuJie_9_9_Parse()
    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()

    if not my_pipeline.is_connect_success:
        print('数据库连接失败,此处跳过!')
    else:
        sql_str = r'select goods_id, miaosha_time, gender, page, goods_url from dbo.chuchujie_xianshimiaosha where site_id=24'
        # A set gives O(1) membership tests for the loop below.
        db_goods_ids = {row[0] for row in my_pipeline._select_table(sql_str=sql_str)}

        for item in item_list:
            if item.get('goods_id', '') in db_goods_ids:
                print('该goods_id已经存在于数据库中, 此处跳过')
                continue

            goods_id = item.get('goods_id', '')
            tmp_url = 'https://m.chuchujie.com/details/detail.html?id=' + str(goods_id)
            chuchujie.get_goods_data(goods_id=goods_id)
            goods_data = chuchujie.deal_with_data()

            if goods_data == {}:
                # Nothing could be parsed for this goods.
                continue
            if goods_data.get('is_delete', 0) == 1:
                # is_delete == 1 means the stock is 0.
                print('------>>>| 该商品库存为0,已被抢光!')
                continue

            # The remaining time is only available on the rendered mobile
            # page, so a short-lived phantomjs instance fetches it.
            my_phantomjs = MyPhantomjs()
            my_phantomjs.init_phantomjs()
            tmp_body = my_phantomjs.use_phantomjs_to_get_url_body(
                url=tmp_url, css_selector='p#activityTime span')
            del my_phantomjs
            gc.collect()

            if tmp_body == '':
                # Fetching the mobile page failed.
                sleep(.4)
                continue

            _t = Selector(text=tmp_body).css('p#activityTime span::text').extract_first()
            # extract_first() returns None when nothing matches, so the check
            # must come before re.sub (which raises TypeError on None).
            if _t is not None:
                _t = re.compile(r'剩余').sub('', _t)
            if not _t:
                print('获取到的_t为空值, 严重错误! 请检查!')
                # Without a remaining time no end time can be computed.
                continue

            miaosha_end_time = self.get_miaosha_end_time(_t)

            goods_data['goods_url'] = tmp_url
            goods_data['goods_id'] = str(goods_id)
            goods_data['sub_title'] = item.get('sub_title', '')
            goods_data['miaosha_time'] = {
                # The crawl time is recorded as the begin time.
                'miaosha_begin_time': timestamp_to_regulartime(int(time.time())),
                'miaosha_end_time': timestamp_to_regulartime(int(miaosha_end_time)),
            }
            goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = \
                self.get_miaosha_begin_time_and_miaosha_end_time(miaosha_time=goods_data['miaosha_time'])
            goods_data['gender'] = str(item.get('gender', '0'))
            goods_data['page'] = item.get('page')

            # No extra sleep: starting phantomjs already slows each round.
            chuchujie.insert_into_chuchujie_xianshimiaosha_table(data=goods_data, pipeline=my_pipeline)

    # Drop the parser before forcing a collection.
    del chuchujie
    gc.collect()
def run_forever(self):
    '''
    Run one refresh pass over every stored mia flash-sale record.

    For each row: it is deleted when is_recent_time() returns 0 (expired),
    left untouched when it returns 2, and otherwise the promotion (pid) is
    re-fetched. A goods missing from the promotion's item list is deleted,
    a goods still in it is re-crawled and updated. A response that cannot
    be parsed, or that carries no valid start/end time, is skipped.
    Afterwards the method sleeps 5.5 hours when the Shanghai hour is 0,
    otherwise 5 seconds.

    :return: None
    '''
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    sql_str = r'select goods_id, miaosha_time, pid from dbo.mia_xianshimiaosha where site_id=20'
    try:
        result = list(tmp_sql_server._select_table(sql_str=sql_str))
    except TypeError:
        print('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None

    if result is not None:
        print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        print(result)
        print('--------------------------------------------------------')

        print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
        for index, item in enumerate(result, 1):
            goods_id = item[0]
            miaosha_time = json.loads(item[1])
            # int() truncates the float returned by mktime; no string slicing.
            miaosha_end_time = int(time.mktime(
                time.strptime(miaosha_time.get('miaosha_end_time'), '%Y-%m-%d %H:%M:%S')))

            # A fresh parser per row keeps memory usage low (original author's note).
            mia_miaosha = MiaParse()
            if index % 50 == 0:
                # Reconnect every 50 rows to avoid a stale long-lived connection.
                print('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')

            if not tmp_sql_server.is_connect_success:
                print('数据库连接失败,数据库可能关闭或者维护中')
            else:
                recent_state = self.is_recent_time(miaosha_end_time)
                if recent_state == 0:
                    # NOTE(review): params is the bare goods_id, not a 1-tuple,
                    # exactly as before; confirm what _delete_table expects.
                    tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=goods_id)
                    print('过期的goods_id为(%s)' % goods_id,
                          ', 限时秒杀开始时间为(%s), 删除成功!' % miaosha_time.get('miaosha_begin_time'))

                elif recent_state == 2:
                    # Deliberately not a break: the rows are not ordered by time.
                    pass

                else:
                    # recent_state == 1: inside the window that must be refreshed.
                    print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (goods_id, index))

                    tmp_url = 'https://m.mia.com/instant/seckill/seckillPromotionItem/' + str(item[2])
                    body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True)

                    tmp_data = None
                    if body == '' or body == '[]':
                        print('获取到的body为空值! 此处跳过')
                    else:
                        try:
                            tmp_data = json.loads(body)
                        except Exception:
                            # Really skip: continuing with an empty dict used to
                            # crash in strptime('') a few lines later.
                            print('json.loads转换body时出错, 此处跳过!')

                    if tmp_data is not None:
                        time_format = '%Y/%m/%d %H:%M:%S'
                        try:
                            # Convert the time strings to integer timestamps.
                            begin_time = int(time.mktime(time.strptime(
                                tmp_data.get('p_info', {}).get('start_time', ''), time_format)))
                            end_time = int(time.mktime(time.strptime(
                                tmp_data.get('p_info', {}).get('end_time', ''), time_format)))
                        except (ValueError, TypeError):
                            print('p_info中的起止时间无效, 此处跳过!')
                            tmp_data = None

                    if tmp_data is not None:
                        item_list = tmp_data.get('item_list', [])
                        # All goods ids currently in this promotion.
                        miaosha_goods_all_goods_id = [item_1.get('item_id', '') for item_1 in item_list]

                        if goods_id not in miaosha_goods_all_goods_id:
                            # Removed from the flash-sale promotion.
                            print('该商品已被下架限时秒杀活动,此处将其删除')
                            tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=goods_id)
                            print('下架的goods_id为(%s)' % goods_id, ', 删除成功!')
                        else:
                            for item_2 in item_list:
                                if item_2.get('item_id', '') != goods_id:
                                    continue

                                mia_miaosha.get_goods_data(goods_id=goods_id)
                                goods_data = mia_miaosha.deal_with_data()
                                if goods_data == {}:
                                    continue

                                goods_data['goods_id'] = str(goods_id)
                                goods_data['price'] = item_2.get('active_price')
                                goods_data['taobao_price'] = item_2.get('active_price')
                                goods_data['sub_title'] = item_2.get('short_info', '')
                                goods_data['miaosha_time'] = {
                                    'miaosha_begin_time': timestamp_to_regulartime(begin_time),
                                    'miaosha_end_time': timestamp_to_regulartime(end_time),
                                }
                                goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = \
                                    self.get_miaosha_begin_time_and_miaosha_end_time(
                                        miaosha_time=goods_data['miaosha_time'])

                                mia_miaosha.update_mia_xianshimiaosha_table(
                                    data=goods_data, pipeline=tmp_sql_server)
                                sleep(MIA_SPIKE_SLEEP_TIME)  # throttle

            gc.collect()
        print('全部数据更新完毕'.center(100, '#'))

    if get_shanghai_time().hour == 0:
        # No updates after midnight.
        sleep(60 * 60 * 5.5)
    else:
        sleep(5)
    gc.collect()
def deal_with_data(self, *param):
    '''
    Parse and store the flash-sale goods of one event.

    Goods already present in the database are skipped, as are items whose
    link carries no objectId and goods that cannot be parsed or priced.

    :param param: param[0] is the event timestamp, param[1] the list of raw
                  item dicts
    :return: None
    '''
    print(60 * '*')
    event_time = param[0]
    print('秒杀开始时间:', timestamp_to_regulartime(event_time), '\t', '对应时间戳为: ', event_time)
    print(60 * '*')

    item_list = param[1]

    mogujie = MoGuJieMiaoShaParse()
    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()

    if not my_pipeline.is_connect_success:
        print('数据库连接失败,此处跳过!')
    else:
        sql_str = r'select goods_id, miaosha_time, event_time, goods_url from dbo.mogujie_xianshimiaosha where site_id=22'
        # A set gives O(1) membership tests for the loop below.
        db_goods_ids = {row[0] for row in my_pipeline._select_table(sql_str=sql_str)}
        object_id_pattern = re.compile(r'objectId=(.*?)&')

        for item in item_list:
            if item.get('iid', '') in db_goods_ids:
                print('该goods_id已经存在于数据库中, 此处跳过')
                continue

            goods_id = str(item.get('iid', ''))
            tmp_url = item.get('link', '')

            try:
                object_id = object_id_pattern.findall(tmp_url)[0]
            except IndexError:
                # The link is not a flash-sale goods address.
                print('+++++++ 这个url不是秒杀的url: ', tmp_url)
                continue

            tmp_url = 'https://shop.mogujie.com/rushdetail/{0}?objectId={1}&type=rush'.format(goods_id, object_id)
            tmp_ = mogujie.get_goods_id_from_url(tmp_url)
            mogujie.get_goods_data(goods_id=tmp_)
            goods_data = mogujie.deal_with_data()

            if goods_data == {}:
                # Nothing could be parsed for this goods.
                continue

            goods_data['goods_url'] = tmp_url
            goods_data['goods_id'] = str(goods_id)

            # 'price' is set to the highest normal (original) price.
            try:
                normal_prices = [
                    round(float(item_4.get('normal_price', '')), 2)
                    for item_4 in goods_data['price_info_list']
                ]
                goods_data['price'] = round(Decimal(max(normal_prices)), 2)
            except Exception:
                print('设置price为原价时出错!请检查')
                continue

            goods_data['miaosha_time'] = {
                'miaosha_begin_time': timestamp_to_regulartime(int(item.get('startTime', 0))),
                'miaosha_end_time': timestamp_to_regulartime(int(item.get('endTime', 0))),
            }
            goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = \
                self.get_miaosha_begin_time_and_miaosha_end_time(miaosha_time=goods_data['miaosha_time'])
            goods_data['event_time'] = str(event_time)

            mogujie.insert_into_mogujie_xianshimiaosha_table(data=goods_data, pipeline=my_pipeline)
            sleep(MOGUJIE_SLEEP_TIME)  # throttle

    # Drop the parser before forcing a collection.
    del mogujie
    gc.collect()
async def get_one_page_goods_list(self, **kwargs):
    '''
    Fetch one page of a tab and return its goods that are not sold out.

    :param kwargs: my_phantomjs (object used to fetch the page), key (stored
                   as 'sort'), tab (tab name used in the URL), index (page
                   number)
    :return: item_list, a list of dicts; [] when the page is empty or
             cannot be parsed
    '''
    my_phantomjs = kwargs.get('my_phantomjs')
    key = kwargs.get('key', '')
    tab = kwargs.get('tab', '')
    index = kwargs.get('index')

    i_time = time.time()
    tmp_url = 'http://s.h5.jumei.com/yiqituan/tab_list?tab={0}&page={1}&per_page=20'.format(tab, str(index))

    # Per the original author: plain requests are filtered and aiohttp was
    # too slow, so phantomjs is used.
    body = my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url)
    try:
        body = re.compile('<pre .*?>(.*)</pre>').findall(body)[0]
    except Exception:
        # Best effort: without a <pre> element the body is used unchanged.
        pass
    await asyncio.sleep(1)

    self.msg = '正在抓取第' + str(index) + '页...' + ' ☭ 用时: ' + str(time.time() - i_time)
    self.my_lg.info(self.msg)

    if body == '':
        self.msg = '获取到的body为空str!' + ' 出错地址: ' + tmp_url
        self.my_lg.error(self.msg)
        return []

    one_data = await self.json_2_dict(json_str=body)
    if one_data == {}:
        self.msg = '出错地址: ' + tmp_url
        self.my_lg.error(self.msg)
        return []

    # No await happens per item, so a comprehension is fine here.
    return [{
        'goods_id': item.get('item_id', ''),
        'pintuan_time': {
            'begin_time': timestamp_to_regulartime(item.get('start_time', '0')),
            'end_time': timestamp_to_regulartime(item.get('end_time', '0')),
        },
        'type': item.get('type', ''),
        'sort': key,
        'page': index,
        'tab': tab,
    } for item in one_data.get('data', []) if item.get('status', '') != 'soldout']
def get_spike_hour_goods_info(self):
    '''
    Walk the session ids from BASE_SESSION_ID up to (excluding)
    MAX_SESSION_ID in steps of 2 and store the flash-sale goods of every
    session whose begin time passes is_recent_time().

    :return: None
    '''
    next_session_id = BASE_SESSION_ID
    while next_session_id < MAX_SESSION_ID:
        base_session_id = next_session_id
        # Advance first: every `continue` below must move on to the next
        # session. Incrementing at the end of the body made each error path
        # retry the same session id forever.
        next_session_id += 2

        print('待抓取的session_id为: ', base_session_id)
        data = self._get_one_session_id_data(base_session_id=base_session_id)
        sleep(.2)

        blocks = data.get('data', {}).get('blocks', [])
        if blocks == []:
            # The session id does not exist.
            continue

        try:
            _ = str(blocks[0].get('deal', {}).get('begin_time', ''))[:10]
            if _ == '':
                if blocks[0].get('showcase', {}) == {}:
                    raise Exception
                # A session in the future starts with a showcase block; the
                # begin time is then read from the second block.
                print('*** 未来时间 ***')
                _ = str(blocks[1].get('deal', {}).get('begin_time', ''))[:10]
            # Only the first 10 characters of begin_time are used.
            begin_times_timestamp = int(_)
        except Exception as e:
            print('遇到严重错误: ', e)
            continue

        print('秒杀时间为: ', timestamp_to_regulartime(begin_times_timestamp))

        if not self.is_recent_time(timestamp=begin_times_timestamp):
            continue

        try:
            # Blocks without a deal (e.g. a showcase block) yield {} and are
            # dropped; such an empty deal has none of the fields the parser
            # reads.
            deals = [
                deal for deal in (item_s.get('deal', {}) for item_s in blocks)
                if deal
            ]
        except Exception as e:
            print('遇到严重错误: ', e)
            continue

        if deals == []:
            # The session carries no goods data.
            print('该sessionid没有相关key为jsons的数据')
            continue

        miaosha_goods_list = self.get_miaoshao_goods_info_list(data=deals)

        zhe_800 = Zhe800Parse()
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        if my_pipeline.is_connect_success:
            sql_str = r'select goods_id, miaosha_time, session_id from dbo.zhe_800_xianshimiaosha where site_id=14'
            # A set gives O(1) membership tests for the loop below.
            db_goods_ids = {row[0] for row in my_pipeline._select_table(sql_str=sql_str)}

            for item in miaosha_goods_list:
                if item.get('zid', '') in db_goods_ids:
                    print('该goods_id已经存在于数据库中, 此处跳过')
                    continue

                tmp_url = 'https://shop.zhe800.com/products/' + str(item.get('zid', ''))
                goods_id = zhe_800.get_goods_id_from_url(tmp_url)
                zhe_800.get_goods_data(goods_id=goods_id)
                goods_data = zhe_800.deal_with_data()

                if goods_data == {}:
                    # Nothing could be parsed for this goods.
                    continue

                goods_data['stock_info'] = item.get('stock_info')
                goods_data['goods_id'] = str(item.get('zid'))
                goods_data['spider_url'] = tmp_url
                goods_data['username'] = '******'
                goods_data['price'] = item.get('price')
                goods_data['taobao_price'] = item.get('taobao_price')
                goods_data['sub_title'] = item.get('sub_title')
                goods_data['miaosha_time'] = item.get('miaosha_time')
                goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = \
                    get_miaosha_begin_time_and_miaosha_end_time(miaosha_time=item.get('miaosha_time'))
                goods_data['session_id'] = str(base_session_id)

                zhe_800.insert_into_zhe_800_xianshimiaosha_table(data=goods_data, pipeline=my_pipeline)
                sleep(ZHE_800_SPIKE_SLEEP_TIME)  # throttle

        # Drop the parser before forcing a collection.
        del zhe_800
        gc.collect()