def test1(old_sku_info, new_sku_info):
    """
    Test helper: exercise the spec / price / stock change detectors in one go.

    Computes the three change flags against a fresh Shanghai timestamp,
    prints them, and pretty-prints the price / stock change records.
    :param old_sku_info: previously stored sku info
    :param new_sku_info: freshly crawled sku info
    :return: None
    """
    is_spec_change = _get_spec_trans_record(
        old_sku_info=old_sku_info,
        new_sku_info=new_sku_info,
        is_spec_change=0,
        old_spec_trans_time=get_shanghai_time())[0]
    is_price_change, _, price_change_info = _get_sku_price_trans_record(
        old_sku_info=old_sku_info,
        new_sku_info=new_sku_info,
        is_price_change=0,
        db_price_change_info=[],
        old_price_trans_time=get_shanghai_time())
    is_stock_change, _, stock_change_info = _get_stock_trans_record(
        old_sku_info=old_sku_info,
        new_sku_info=new_sku_info,
        is_stock_change=0,
        db_stock_change_info=[],
        old_stock_trans_time=get_shanghai_time())
    print('规格变动: {}\n价格变动: {}\n库存变动: {}'.format(is_spec_change, is_price_change, is_stock_change))
    pprint(price_change_info)
    pprint(stock_change_info)

    return
def just_fuck_run():
    '''
    Runs under tmux (the daemonized form would not run); restricts the crawl
    to fixed hour slots to avoid collisions with other jobs.
    '''
    # hour strings (Shanghai time, positions [11:13] of the datetime string)
    _spider_run_time = ['00', '01', '02', '03',]
    while True:
        if str(get_shanghai_time())[11:13] in _spider_run_time:
            while True:
                if str(get_shanghai_time())[11:13] not in _spider_run_time:
                    # left the run window -> back off for 5 minutes, then re-check
                    print('冲突时间点, 不抓取数据..., 上海时间%s' % str(get_shanghai_time()))
                    sleep(60*5)
                    break

                print('一次大抓取即将开始'.center(30, '-'))
                taobao_qianggou = TaoBaoQiangGou()
                loop = asyncio.get_event_loop()
                loop.run_until_complete(taobao_qianggou._deal_with_all_goods_id())
                try:
                    del taobao_qianggou
                    loop.close()
                except:
                    # best-effort cleanup; failures here are deliberately ignored
                    pass
                gc.collect()
                print('一次大抓取完毕, 即将重新开始'.center(30, '-'))
                restart_program()  # restart the process to reset the env and avoid duplicated log output
                sleep(60*30)
        else:
            print('未在脚本运行时间点...休眠中, 上海时间%s' % str(get_shanghai_time()))
            sleep(60*2)
def turn_one_time() -> dict:
    """
    Spin the lottery wheel once via the m.riyiwk.com api.

    :return: parsed json response as a dict
    """
    cookies = {
        # current Shanghai time as a unix timestamp string
        'Hm_lpvt_fa0ddec29ac177a2d127cebe209832e3': str(datetime_to_timestamp(get_shanghai_time())),
        'Hm_lvt_fa0ddec29ac177a2d127cebe209832e3': '1537161510,1537228200,1537353114,1537411854',  # fixed value
        'wk_': '9umq63s8g6leobk2p285frmp583nhm9t',  # fixed value
    }
    headers = {
        'Host': 'm.riyiwk.com',
        'accept': 'application/json, text/javascript, */*; q=0.01',
        'origin': 'https://m.riyiwk.com',
        'referer': 'https://m.riyiwk.com/lottery.html?check_login=1',
        'accept-language': 'zh-cn',
        'x-requested-with': 'XMLHttpRequest',
        'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Mobile/15A5341f/RIYIWK 2.6.0/USER_ID 203793/TOKEN 3a3988e07be98db064a70fc635c0b590',
    }
    url = 'https://m.riyiwk.com/lottery/start.html'
    res = json_2_dict(
        Requests.get_url_body(method='post', use_proxy=False, url=url, headers=headers, cookies=cookies))
    # pprint(res)

    return res
async def _run_forever(self):
    '''
    Continuously refresh all stored goods data in real time.
    :return: None
    '''
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    try:
        result = list(tmp_sql_server._select_table(sql_str=tb_select_str_4))
    except TypeError:
        # a failed db connection makes _select_table return a non-iterable
        self.my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None

    if result is not None:
        self.my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        self.my_lg.info(str(result))
        self.my_lg.info('--------------------------------------------------------')
        self.my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
        await self._update_old_goods_info(tmp_sql_server=tmp_sql_server, result=result)
    else:
        pass

    if get_shanghai_time().hour == 0:  # no updates after midnight
        sleep(60 * 60 * 5.5)
    else:
        self.my_lg.info('休眠60s...')
        sleep(60)

    return
def _get_is_delete(self, price_info_list, data, other):
    '''
    Compute the logical-delete flag for a goods record.
    :param price_info_list: per-sku price/stock entries (each with 'rest_number')
    :param data: goods data; may carry a 'sell_time' dict with 'end_time'
    :param other: extra info; sku_info.goodsStoreStatus False means off shelf
    :return: 0 = on shelf, 1 = logically deleted
    '''
    is_delete = 0
    all_rest_number = 0
    if price_info_list != []:
        for item in price_info_list:
            all_rest_number += item.get('rest_number', 0)
        # every sku out of stock -> treat as off shelf
        if all_rest_number == 0:
            is_delete = 1

    # official off-shelf time < current timestamp -> goods already off shelf, is_delete = 1
    if data['sell_time'] != {}:
        end_time = datetime_to_timestamp(
            string_to_datetime(
                data.get('sell_time', {}).get('end_time', '')))
        if end_time < datetime_to_timestamp(get_shanghai_time()):
            self.lg.info('该商品已经过期下架...! 进行逻辑删除 is_delete=1')
            is_delete = 1
    # print(is_delete)

    # store-side status flag: goodsStoreStatus False -> off shelf
    if not other.get('sku_info', {}).get('goodsStoreStatus', True):
        is_delete = 1

    return is_delete
def _get_is_delete(self, price_info_list, data, other):
    # Compute the logical-delete flag: 0 = on shelf, 1 = logically deleted.
    is_delete = 0
    all_rest_number = 0
    if price_info_list != []:
        for item in price_info_list:
            all_rest_number += item.get('rest_number', 0)
        # every sku out of stock -> treat as off shelf
        if all_rest_number == 0:
            is_delete = 1
    else:
        # no sku info at all -> treat as off shelf
        is_delete = 1

    # official off-shelf time < current timestamp -> goods already off shelf, is_delete = 1
    if data['sell_time'] != {}:
        end_time = datetime_to_timestamp(
            string_to_datetime(
                data.get('sell_time', {}).get('end_time', '')))
        if end_time < datetime_to_timestamp(get_shanghai_time()):
            self.my_lg.info('该商品已经过期下架...! 进行逻辑删除 is_delete=1')
            is_delete = 1
    # print(is_delete)

    if other.get('soldOut'):  # True or False
        is_delete = 1

    return is_delete
def run_forever(self): ''' 这个实时更新的想法是只更新当天前天未来两小时的上架商品的信息,再未来信息价格(全为原价)暂不更新 :return: ''' #### 实时更新数据 tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline() try: result = list(tmp_sql_server._select_table(sql_str=z8_select_str_4)) tmp_sql_server._delete_table(sql_str=z8_delete_str_4, params=None) except TypeError: self.my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)') result = None if result is None: pass else: print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------') print(str(result)) print('--------------------------------------------------------') self.my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#')) self._update_old_goods_info(tmp_sql_server=tmp_sql_server, result=result) if get_shanghai_time().hour == 0: # 0点以后不更新 sleep(60*60*5.5) else: sleep(10*60) return
async def _run_forever(self):
    '''
    Continuously refresh all taobao qianggou flash-sale data in real time.
    :return: None
    '''
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    sql_str = 'select goods_id, miaosha_time, goods_url, page, spider_time from dbo.tao_qianggou_xianshimiaosha where site_id=28'
    try:
        result = list(tmp_sql_server._select_table(sql_str=sql_str))
    except TypeError as e:
        # a failed db connection makes _select_table return a non-iterable
        self.my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None

    if result is not None:
        self.my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        self.my_lg.info(str(result))
        self.my_lg.info('--------------------------------------------------------')
        self.my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
        await self._update_old_goods_info(tmp_sql_server=tmp_sql_server, result=result)
    else:
        pass

    if get_shanghai_time().hour == 0:  # no updates after midnight
        sleep(60 * 60 * 5.5)
    else:
        sleep(60)

    return
def run_forever(self): ''' 这个实时更新的想法是只更新当天前天未来两小时的上架商品的信息,再未来信息价格(全为原价)暂不更新 :return: ''' #### 实时更新数据 tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline() sql_str = 'select goods_id, miaosha_time, session_id from dbo.zhe_800_xianshimiaosha where site_id=14' try: result = list(tmp_sql_server._select_table(sql_str=sql_str)) except TypeError: print('TypeError错误, 原因数据库连接失败...(可能维护中)') result = None if result is None: pass else: print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------') print(result) print('--------------------------------------------------------') print('即将开始实时更新数据, 请耐心等待...'.center(100, '#')) self._update_old_goods_info(tmp_sql_server=tmp_sql_server, result=result) if get_shanghai_time().hour == 0: # 0点以后不更新 sleep(60 * 60 * 5.5) else: sleep(5) # del ali_1688 return
def to_right_and_update_data(self, data, pipeline):
    '''
    Real-time update of one goods record (non-price columns only).
    :param data: crawled goods dict
    :param pipeline: db pipeline exposing _update_table_2
    :return: None
    '''
    data_list = data
    tmp = GoodsItem()
    tmp['goods_id'] = data_list['goods_id']  # official goods id
    now_time = get_shanghai_time()
    tmp['modify_time'] = now_time  # modify time
    tmp['shop_name'] = data_list['shop_name']  # shop name
    tmp['title'] = data_list['title']  # goods name
    tmp['sub_title'] = data_list['sub_title']  # sub title
    tmp['link_name'] = ''  # seller name
    tmp['account'] = data_list['account']  # shopkeeper name
    tmp['all_sell_count'] = data_list['sell_count']  # monthly sales
    # price = highest price, taobao_price = lowest price
    tmp['price'] = Decimal(data_list['price']).__round__(2)
    tmp['taobao_price'] = Decimal(data_list['taobao_price']).__round__(2)
    tmp['price_info'] = []  # price info
    tmp['detail_name_list'] = data_list['detail_name_list']  # spec label names
    """ sku_map """
    tmp['price_info_list'] = data_list.get('price_info_list')  # price & stock per spec
    tmp['all_img_url'] = data_list.get('all_img_url')  # all sample image urls
    tmp['p_info'] = data_list.get('p_info')  # property info
    tmp['div_desc'] = data_list.get('div_desc')  # lower detail div
    tmp['is_delete'] = data_list.get('is_delete')  # logical delete: 0 kept, 1 deleted
    tmp['shelf_time'] = data_list.get('shelf_time', '')
    tmp['delete_time'] = data_list.get('delete_time', '')
    tmp['is_price_change'] = data_list.get('_is_price_change')
    tmp['price_change_info'] = data_list.get('_price_change_info')
    params = self._get_db_update_params(item=tmp)
    # price-updating sql (kept for reference):
    # sql_str = r'update dbo.GoodsInfoAutoGet set ModfiyTime = %s, ShopName=%s, Account=%s, GoodsName=%s, SubTitle=%s, LinkName=%s, Price=%s, TaoBaoPrice=%s, PriceInfo=%s, SKUName=%s, SKUInfo=%s, ImageUrl=%s, PropertyInfo=%s, DetailInfo=%s, SellCount=%s, MyShelfAndDownTime=%s, delete_time=%s, IsDelete=%s, IsPriceChange=%s, PriceChangeInfo=%s where GoodsID = %s'
    # non-price sql; variant chosen by which timestamp column is empty
    if tmp['delete_time'] == '':
        sql_str = 'update dbo.GoodsInfoAutoGet set ModfiyTime = %s, ShopName=%s, Account=%s, GoodsName=%s, SubTitle=%s, LinkName=%s, PriceInfo=%s, SKUName=%s, SKUInfo=%s, ImageUrl=%s, PropertyInfo=%s, DetailInfo=%s, SellCount=%s, IsDelete=%s, IsPriceChange=%s, PriceChangeInfo=%s, shelf_time=%s where GoodsID = %s'
    elif tmp['shelf_time'] == '':
        sql_str = 'update dbo.GoodsInfoAutoGet set ModfiyTime = %s, ShopName=%s, Account=%s, GoodsName=%s, SubTitle=%s, LinkName=%s, PriceInfo=%s, SKUName=%s, SKUInfo=%s, ImageUrl=%s, PropertyInfo=%s, DetailInfo=%s, SellCount=%s, IsDelete=%s, IsPriceChange=%s, PriceChangeInfo=%s, delete_time=%s where GoodsID = %s'
    else:
        sql_str = 'update dbo.GoodsInfoAutoGet set ModfiyTime = %s, ShopName=%s, Account=%s, GoodsName=%s, SubTitle=%s, LinkName=%s, PriceInfo=%s, SKUName=%s, SKUInfo=%s, ImageUrl=%s, PropertyInfo=%s, DetailInfo=%s, SellCount=%s, IsDelete=%s, IsPriceChange=%s, PriceChangeInfo=%s, shelf_time=%s, delete_time=%s where GoodsID = %s'
    pipeline._update_table_2(sql_str=sql_str, params=params, logger=self.my_lg)
def _get_one_fund_info(self, fund_code):
    '''
    Fetch one fund's pingzhongdata js file from eastmoney and process it.
    :param fund_code: fund code string, e.g. '001092'
    :return: True when the fetch+process cycle completed
    '''
    cookies = {
        'st_pvi': '11586003301354',
        'st_si': '46806950936799',
        'ASP.NET_SessionId': 'fhllwae2zicg00o0x4ub1fxs',
        'EMFUND1': 'null',
        'EMFUND0': 'null',
        # 'EMFUND2': '07-10%2018%3A01%3A38@%23%24%u534E%u6DA6%u5143%u5927%u73B0%u91D1%u901A%u8D27%u5E01B@%23%24002884',
        'EMFUND2': '07-10 18:01:38@#$华润元大现金通货币B@#$002884',
        # 'EMFUND3': '07-10%2018%3A01%3A48@%23%24%u5929%u5F18%u73B0%u91D1%u7BA1%u5BB6%u8D27%u5E01B@%23%24420106',
        'EMFUND3': '07-10 18:01:48@#$天弘现金管家货币B@#$420106',
        # 'EMFUND4': '07-10%2018%3A11%3A53@%23%24%u65B9%u6B63%u5BCC%u90A6%u4FDD%u9669%u4E3B%u9898%u6307%u6570%u5206%u7EA7@%23%24167301',
        'EMFUND4': '07-10 18:11:53@#$方正富邦保险主题指数分级@#$167301',
        # 'EMFUND5': '07-10%2018%3A04%3A32@%23%24%u62DB%u5546%u4E2D%u8BC1%u94F6%u884C%u6307%u6570%u5206%u7EA7@%23%24161723',
        'EMFUND5': '07-10 18:04:32@#$招商中证银行指数分级@#$161723',
        # 'EMFUND6': '07-10%2018%3A05%3A13@%23%24%u5929%u5F18%u4E2D%u8BC1%u94F6%u884C%u6307%u6570C@%23%24001595',
        'EMFUND6': '07-10 18:05:13@#$天弘中证银行指数C@#$001595',
        # 'EMFUND7': '07-10%2018%3A06%3A13@%23%24%u5929%u5F18%u4E2D%u8BC1%u94F6%u884C%u6307%u6570A@%23%24001594',
        'EMFUND7': '07-10 18:06:13@#$天弘中证银行指数A@#$001594',
        # 'EMFUND8': '07-10%2018%3A11%3A22@%23%24%u7533%u4E07%u83F1%u4FE1%u591A%u7B56%u7565%u7075%u6D3B%u914D%u7F6E%u6DF7%u5408A@%23%24001148',
        'EMFUND8': '07-10 18:11:22@#$申万菱信多策略灵活配置混合A@#$001148',
        # 'EMFUND9': '07-10 18:12:26@#$%u5E7F%u53D1%u751F%u7269%u79D1%u6280%u6307%u6570%28QDII%29@%23%24001092',
        'EMFUND9': '07-10 18:12:26@#$广发生物科技指数(QDII)@#$001092',
    }
    cookies = unquote_cookies(cookies)
    # pprint(cookies)
    headers = {
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        # 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        'Accept': '*/*',
        # 'Referer': 'http://fund.eastmoney.com/001092.html',
        'Proxy-Connection': 'keep-alive',
    }
    # cache-buster: 2018-07-10 18:30:46 -> 20180710183046
    v = re.compile(r'-| |:').sub('', str(get_shanghai_time()))
    # print(v)
    params = (
        # ('v', '20180710175951'),  # time
        ('v', v),  # time
    )
    fund_url = 'http://fund.eastmoney.com/pingzhongdata/{0}.js'.format(fund_code)
    # response = requests.get(fund_url, headers=headers, params=params, cookies=None)
    # body = response.text
    # print(body)
    body = MyRequests.get_url_body(url=fund_url, headers=headers, params=params, cookies=None)
    # print(body)
    self._get_this_fund_info(body=body)

    return True
def insert_into_mogujie_pintuan_table(self, data, pipeline):
    """
    Insert one mogujie pintuan (group-buy) goods record into the db.
    :param data: crawled goods dict
    :param pipeline: db pipeline exposing _insert_into_table
    :return: insert result, or None when the item turned out to be a coupon
    """
    data_list = data
    tmp = {}
    tmp['goods_id'] = data_list['goods_id']  # official goods id
    tmp['spider_url'] = data_list['goods_url']  # goods url
    now_time = get_shanghai_time()
    tmp['deal_with_time'] = now_time  # create time
    tmp['modfiy_time'] = now_time  # modify time
    tmp['shop_name'] = data_list['shop_name']  # shop name
    tmp['title'] = data_list['title']  # goods name
    tmp['sub_title'] = data_list['sub_title']
    # price = highest price, taobao_price = lowest price
    try:
        tmp['price'] = Decimal(data_list['price']).__round__(2)
        tmp['taobao_price'] = Decimal(data_list['taobao_price']).__round__(2)
    except:
        # non-numeric price -> most likely a miya pintuan coupon, skip it
        print('此处抓到的可能是蜜芽拼团券所以跳过')
        return None

    tmp['detail_name_list'] = data_list['detail_name_list']  # spec label names
    """ sku_map """
    tmp['price_info_list'] = data_list.get('price_info_list')  # price & stock per spec
    tmp['all_img_url'] = data_list.get('all_img_url')  # all sample image urls
    tmp['p_info'] = data_list.get('p_info')  # property info
    tmp['div_desc'] = data_list.get('div_desc')  # lower detail div
    tmp['pintuan_time'] = data_list.get('pintuan_time')
    tmp['fcid'] = data_list.get('fcid')
    tmp['page'] = data_list.get('page')
    tmp['sort'] = data_list.get('sort')
    # crawl source
    tmp['site_id'] = 23  # source site (mogujie pintuan goods)
    tmp['pintuan_begin_time'] = data_list.get('pintuan_begin_time')
    tmp['pintuan_end_time'] = data_list.get('pintuan_end_time')
    tmp['all_sell_count'] = data_list.get('all_sell_count')
    tmp['is_delete'] = data_list.get('is_delete')  # logical delete: 0 kept, 1 deleted
    # print('is_delete=', tmp['is_delete'])
    # print('------>>> | 待存储的数据信息为: |', tmp)
    print('------>>>| 待存储的数据信息为: |', tmp.get('goods_id'))
    params = self._get_db_insert_pintuan_params(item=tmp)
    sql_str = r'insert into dbo.mogujie_pintuan(goods_id, goods_url, create_time, modfiy_time, shop_name, goods_name, sub_title, price, taobao_price, sku_name, sku_Info, all_image_url, property_info, detail_info, miaosha_time, miaosha_begin_time, miaosha_end_time, all_sell_count, fcid, page, sort, site_id, is_delete) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
    _r = pipeline._insert_into_table(sql_str=sql_str, params=params)

    return _r
def _get_comment_data(self, goods_id):
    """
    Fetch and assemble comment data for one goods_id.
    :param goods_id:
    :return: CommentItem stored in self.result_data on success,
             otherwise the error-init sentinel
    """
    if goods_id == '':
        return self._data_error_init()

    self.lg.info('------>>>| 待抓取的goods_id: {}'.format(goods_id))
    try:
        # buyer_name and comment_date list already stored in db
        db_top_n_buyer_name_and_comment_date_list = get_top_n_buyer_name_and_comment_date_by_goods_id(
            goods_id=goods_id,
            logger=self.lg, )
    except SqlServerConnectionException:
        self.lg.error('db 连接异常! 此处抓取跳过!')
        return self._data_error_init()

    try:
        db_sku_info_list = _get_sku_info_from_db_by_goods_id(
            goods_id=goods_id,
            logger=self.lg, )
    except DBGetGoodsSkuInfoErrorException:
        self.lg.error('获取db goods_id: {} 的sku_info失败! 此处跳过!'.format(goods_id))
        return self._data_error_init()

    # synchronous variant (kept for reference)
    # all_comment_list = self._get_all_comment_info(goods_id=goods_id)
    # celery
    all_comment_list = self._get_all_comment_info_by_celery(goods_id=goods_id)
    # pprint(all_comment_list)

    try:
        _comment_list = self._get_comment_list(
            all_comment_list=all_comment_list,
            db_top_n_buyer_name_and_comment_date_list=
            db_top_n_buyer_name_and_comment_date_list,
            db_sku_info_list=db_sku_info_list)
    except Exception as e:
        self.lg.error('出错goods_id: ' + goods_id)
        self.lg.exception(e)
        return self._data_error_init()

    _t = get_shanghai_time()
    _r = CommentItem()
    _r['goods_id'] = str(goods_id)
    _r['create_time'] = _t
    _r['modify_time'] = _t
    _r['_comment_list'] = _comment_list
    self.result_data = _r
    # pprint(self.result_data)

    return self.result_data
def main():
    """Spin the lottery wheel during the 09:00 service window, forever.

    Outside the window: sleep. Inside: spin once, then share to wx to
    unlock (and perform) a second spin.
    """
    sleep_time = 60 * 20
    while True:
        _hour = int(get_shanghai_time().hour)
        if _hour not in (9, ):
            print('不在服务时间, 休眠中...')
            sleep(sleep_time)
            continue

        print('--->>> {} 转动转盘一次'.format(str(get_shanghai_time())))
        pprint(turn_one_time())
        res = share_2_wx()
        if res:
            # sharing succeeded -> extra spin
            pprint(turn_one_time())
        else:
            print('模拟第二次转动转盘失败!')
        sleep(sleep_time)
def oo(self, id, img_url):
    """Persist img_url (stamped with the current Shanghai time) for row `id`.

    :return: True when the update succeeded, False on any exception.
    """
    ok = True
    try:
        self.my_pipeline._update_table_2(
            sql_str=self.update_sql,
            params=(img_url, get_shanghai_time(), id),
            logger=self.my_lg)
    except Exception:
        ok = False

    return ok
def _set_logger(self, logger):
    """Attach a logger: reuse the given one, else create a date-stamped file logger."""
    if logger is not None:
        self.my_lg = logger
        return

    log_date = str(get_shanghai_time())[0:10]
    self.my_lg = set_logger(
        log_file_name=MY_SPIDER_LOGS_PATH + '/网易严选/_/' + log_date + '.txt',
        console_log_level=INFO,
        file_log_level=ERROR)
async def get_now_time_from_pytz():
    '''
    Build the date used for log-file names.
    :return: today's Shanghai date at midnight, e.g. 2016-03-25, as a datetime
    '''
    year, month, day = (int(part) for part in str(get_shanghai_time())[0:10].split('-'))

    return datetime.datetime(year=year, month=month, day=day)
def insert_into_zhe_800_xianshimiaosha_table(self, data, pipeline):
    """
    Insert one zhe800 flash-sale goods record into the db.
    :param data: crawled goods dict
    :param pipeline: db pipeline exposing _insert_into_table
    :return: None (early None return when the item turned out to be a coupon)
    """
    data_list = data
    tmp = {}
    tmp['goods_id'] = data_list['goods_id']  # official goods id
    tmp['spider_url'] = data_list['spider_url']  # goods url
    tmp['username'] = data_list['username']  # operator username
    now_time = get_shanghai_time()
    tmp['deal_with_time'] = now_time  # create time
    tmp['modfiy_time'] = now_time  # modify time
    tmp['shop_name'] = data_list['shop_name']  # shop name
    tmp['title'] = data_list['title']  # goods name
    tmp['sub_title'] = data_list['sub_title']
    # price = highest price, taobao_price = lowest price
    try:
        tmp['price'] = Decimal(data_list['price']).__round__(2)
        tmp['taobao_price'] = Decimal(data_list['taobao_price']).__round__(2)
    except:
        # non-numeric price -> most likely a zhe800 seckill coupon, skip it
        print('此处抓到的可能是折800秒杀券所以跳过')
        return None

    tmp['detail_name_list'] = data_list['detail_name_list']  # spec label names
    """ sku_map """
    tmp['price_info_list'] = data_list.get('price_info_list')  # price & stock per spec
    tmp['all_img_url'] = data_list.get('all_img_url')  # all sample image urls
    tmp['p_info'] = data_list.get('p_info')  # property info
    tmp['div_desc'] = data_list.get('div_desc')  # lower detail div
    tmp['schedule'] = data_list.get('schedule')
    tmp['stock_info'] = data_list.get('stock_info')
    tmp['miaosha_time'] = data_list.get('miaosha_time')
    tmp['session_id'] = data_list.get('session_id')
    # crawl source
    tmp['site_id'] = 14  # source site (zhe800 seckill goods)
    tmp['miaosha_begin_time'] = data_list.get('miaosha_begin_time')
    tmp['miaosha_end_time'] = data_list.get('miaosha_end_time')
    tmp['is_delete'] = data_list.get('is_delete')  # logical delete: 0 kept, 1 deleted
    # print('is_delete=', tmp['is_delete'])
    # print('------>>> | 待存储的数据信息为: |', tmp)
    print('------>>> | 待存储的数据信息为: |', tmp.get('goods_id'))
    params = self._get_db_insert_miaosha_params(item=tmp)
    sql_str = r'insert into dbo.zhe_800_xianshimiaosha(goods_id, goods_url, username, create_time, modfiy_time, shop_name, goods_name, sub_title, price, taobao_price, sku_name, sku_Info, all_image_url, property_info, detail_info, schedule, stock_info, miaosha_time, miaosha_begin_time, miaosha_end_time, session_id, site_id, is_delete) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
    pipeline._insert_into_table(sql_str=sql_str, params=params)
def parse_body(body):
    '''
    Parse a proxy-list page body into ProxyItem objects.

    Uses the globally selected parser config (parser_list /
    random_parser_list_item_index) to locate the row / ip / port / ip_type
    css selectors, then extracts one ProxyItem per table row.
    :param body: html text of the proxy-list page
    :return: list of ProxyItem (empty list on any selector-config error)
    '''
    def _get_ip_type(ip_type):
        '''Map the site's ip_type label to a url scheme.'''
        # return 'http' if ip_type == 'HTTP' else 'https'
        return 'http'  # always return 'http' for now

    _ = []
    parser_obj = parser_list[random_parser_list_item_index]
    try:
        part_selector = parser_obj.get('part', '')
        assert part_selector != '', '获取到part为空值!'
        position = parser_obj.get('position', {})
        assert position != {}, '获取到position为空dict!'
        ip_selector = position.get('ip', '')
        assert ip_selector != '', '获取到ip_selector为空值!'
        port_selector = position.get('port', '')
        assert port_selector != '', '获取到port_selector为空值!'
        ip_type_selector = position.get('ip_type', '')
        assert ip_type_selector != '', '获取到ip_type_selector为空值!'
    except AssertionError:
        return []

    for tr in Selector(text=body).css(part_selector).extract():
        o = ProxyItem()
        try:
            ip = Selector(text=tr).css(
                '{} ::text'.format(ip_selector)).extract_first()
            # BUG FIX: extract_first() may return None; the old code fed it to
            # re.findall first and would raise TypeError. Validate presence first.
            assert ip is not None and ip != '', 'ip为空值!'
            if re.compile(r'\d+').findall(ip) == []:  # skip cells that hold no digits (not an ip)
                continue
            port = Selector(text=tr).css(
                '{} ::text'.format(port_selector)).extract_first()
            assert port != '', 'port为空值!'
            ip_type = Selector(text=tr).css(
                '{} ::text'.format(ip_type_selector)).extract_first()
            assert ip_type != '', 'ip_type为空值!'
            ip_type = _get_ip_type(ip_type)
        except Exception:
            # BUG FIX: the original `except AssertionError or Exception:` evaluates
            # to `except AssertionError:` only; catch all row-local failures so one
            # bad row cannot abort the whole page.
            lg.error('遇到错误:', exc_info=True)
            continue

        o['ip'] = ip
        try:
            o['port'] = int(port)
        except Exception:
            lg.error('int转换port时出错!跳过!')
            continue
        o['ip_type'] = ip_type
        o['anonymity'] = 1
        o['score'] = 100
        o['last_check_time'] = str(get_shanghai_time())
        # lg.info('[+] {}:{}'.format(ip, port))
        _.append(o)

    return _
def old_ali_1688_goods_insert_into_new_table(self, data, pipeline):
    """
    Insert an old ali-1688 goods record into the GoodsInfoAutoGet table.
    :param data: crawled goods dict
    :param pipeline: db pipeline exposing _insert_into_table
    :return: insert result
    """
    data_list = data
    tmp = GoodsItem()
    tmp['main_goods_id'] = data_list.get('main_goods_id')
    tmp['goods_id'] = data_list['goods_id']  # official goods id
    tmp['goods_url'] = data_list['goods_url']
    tmp['username'] = data_list['username']
    now_time = get_shanghai_time()
    tmp['create_time'] = now_time  # create time
    tmp['modify_time'] = now_time  # modify time
    tmp['shop_name'] = data_list['company_name']  # company name
    tmp['title'] = data_list['title']  # goods name
    tmp['link_name'] = data_list['link_name']  # seller name
    # price = highest price, taobao_price = lowest price
    tmp['price'] = Decimal(data_list['price']).__round__(2)
    tmp['taobao_price'] = Decimal(data_list['taobao_price']).__round__(2)
    tmp['price_info'] = data_list['price_info']  # price info
    # print(tmp['price'], print(tmp['taobao_price']))
    # print(tmp['price_info'])
    spec_name = []
    for item in data_list['sku_props']:
        tmp_dic = {}
        tmp_dic['spec_name'] = item.get('prop')
        spec_name.append(tmp_dic)
    tmp['detail_name_list'] = spec_name  # spec label names
    """ sku_map """
    tmp['price_info_list'] = data_list.get('sku_map')  # price & stock per spec
    tmp['all_img_url'] = data_list.get('all_img_url')  # all sample image urls
    tmp['p_info'] = data_list.get('property_info')  # property info
    tmp['div_desc'] = data_list.get('detail_info')  # lower detail div
    tmp['site_id'] = 2  # ali 1688
    tmp['is_delete'] = data_list['is_delete']
    # tmp['my_shelf_and_down_time'] = data_list.get('my_shelf_and_down_time')
    # tmp['delete_time'] = data_list.get('delete_time')
    # print('------>>> | 待存储的数据信息为: |', tmp)
    params = self._get_db_insert_params(item=tmp)
    # the MainGoodsID column is only written when a main_goods_id is present
    if tmp.get('main_goods_id') is not None:
        sql_str = r'insert into dbo.GoodsInfoAutoGet(GoodsID, GoodsUrl, UserName, CreateTime, ModfiyTime, ShopName, GoodsName, LinkName, Price, TaoBaoPrice, PriceInfo, SKUName, SKUInfo, ImageUrl, DetailInfo, PropertyInfo, SiteID, IsDelete, MainGoodsID) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
    else:
        sql_str = r'insert into dbo.GoodsInfoAutoGet(GoodsID, GoodsUrl, UserName, CreateTime, ModfiyTime, ShopName, GoodsName, LinkName, Price, TaoBaoPrice, PriceInfo, SKUName, SKUInfo, ImageUrl, DetailInfo, PropertyInfo, SiteID, IsDelete) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
    result = pipeline._insert_into_table(sql_str=sql_str, params=params)

    return result
def to_right_and_update_data(self, data, pipeline):
    '''
    Real-time update of one tmall goods record.
    :param data: crawled goods dict
    :param pipeline: db pipeline exposing update_tmall_table
    :return: None
    '''
    data_list = data
    tmp = {}
    tmp['goods_id'] = data_list['goods_id']  # official goods id
    now_time = get_shanghai_time()
    tmp['modfiy_time'] = now_time  # modify time
    tmp['shop_name'] = data_list['shop_name']  # shop name
    tmp['title'] = data_list['title']  # goods name
    tmp['sub_title'] = data_list['sub_title']  # sub title
    tmp['link_name'] = ''  # seller name
    tmp['account'] = data_list['account']  # shopkeeper name
    tmp['month_sell_count'] = data_list['sell_count']  # monthly sales
    # price = highest price, taobao_price = lowest price
    tmp['price'] = Decimal(data_list['price']).__round__(2)
    tmp['taobao_price'] = Decimal(data_list['taobao_price']).__round__(2)
    tmp['price_info'] = []  # price info
    tmp['detail_name_list'] = data_list['detail_name_list']  # spec label names
    """ sku_map """
    tmp['price_info_list'] = data_list.get('price_info_list')  # price & stock per spec
    tmp['all_img_url'] = data_list.get('all_img_url')  # all sample image urls
    tmp['p_info'] = data_list.get('p_info')  # property info
    tmp['div_desc'] = data_list.get('div_desc')  # lower detail div
    # # crawl source (kept for reference)
    # if data_list.get('type') == 0:
    #     tmp['site_id'] = 3   # source site (tmall)
    # elif data_list.get('type') == 1:
    #     tmp['site_id'] = 4   # source site (tmall supermarket)
    # elif data_list.get('type') == 2:
    #     tmp['site_id'] = 6   # source site (tmall global)
    tmp['is_delete'] = data_list.get('is_delete')  # logical delete: 0 kept, 1 deleted
    tmp['my_shelf_and_down_time'] = data_list.get('my_shelf_and_down_time')
    tmp['delete_time'] = data_list.get('delete_time')
    tmp['_is_price_change'] = data_list.get('_is_price_change')
    tmp['_price_change_info'] = data_list.get('_price_change_info')
    pipeline.update_tmall_table(tmp)
def update_mogujie_pintuan_table(self, data, pipeline):
    """
    Update one mogujie pintuan (group-buy) goods record in the db.
    :param data: crawled goods dict
    :param pipeline: db pipeline exposing _update_table
    :return: None (early None return when the item turned out to be a coupon)
    """
    data_list = data
    tmp = {}
    tmp['goods_id'] = data_list['goods_id']  # official goods id
    now_time = get_shanghai_time()
    tmp['modfiy_time'] = now_time  # modify time
    tmp['shop_name'] = data_list['shop_name']  # shop name
    tmp['title'] = data_list['title']  # goods name
    tmp['sub_title'] = data_list['sub_title']
    # price = highest price, taobao_price = lowest price
    try:
        tmp['price'] = Decimal(data_list['price']).__round__(2)
        tmp['taobao_price'] = Decimal(data_list['taobao_price']).__round__(2)
    except:
        # non-numeric price -> most likely a miya pintuan coupon, skip it
        print('此处抓到的可能是蜜芽拼团券所以跳过')
        return None

    tmp['detail_name_list'] = data_list['detail_name_list']  # spec label names
    """ sku_map """
    tmp['price_info_list'] = data_list.get('price_info_list')  # price & stock per spec
    tmp['all_img_url'] = data_list.get('all_img_url')  # all sample image urls
    tmp['p_info'] = data_list.get('p_info')  # property info
    tmp['div_desc'] = data_list.get('div_desc')  # lower detail div
    tmp['pintuan_time'] = data_list.get('pintuan_time')
    # crawl source (unchanged on update)
    # tmp['site_id'] = 23  # source site (mogujie pintuan goods)
    tmp['pintuan_begin_time'] = data_list.get('pintuan_begin_time')
    tmp['pintuan_end_time'] = data_list.get('pintuan_end_time')
    tmp['all_sell_count'] = data_list.get('all_sell_count')
    tmp['is_delete'] = data_list.get('is_delete')  # logical delete: 0 kept, 1 deleted
    # print('is_delete=', tmp['is_delete'])
    # print('------>>> | 待存储的数据信息为: |', tmp)
    print('------>>>| 待存储的数据信息为: |', tmp.get('goods_id'))
    params = self._get_db_update_pintuan_params(item=tmp)
    sql_str = r'update dbo.mogujie_pintuan set modfiy_time = %s, shop_name=%s, goods_name=%s, sub_title=%s, price=%s, taobao_price=%s, sku_name=%s, sku_Info=%s, all_image_url=%s, property_info=%s, detail_info=%s, is_delete=%s, miaosha_time=%s, miaosha_begin_time=%s, miaosha_end_time=%s, all_sell_count=%s where goods_id = %s'
    pipeline._update_table(sql_str=sql_str, params=params)
def _get_comment_data(self, goods_id):
    """
    Fetch and assemble jd comment data for one goods_id (mobile pages 1-3).
    :param goods_id:
    :return: CommentItem stored in self.result_data on success,
             otherwise the error sentinel
    """
    if goods_id == '':
        return self._data_error()

    self.lg.info('------>>>| 待处理的goods_id为: %s' % str(goods_id))
    try:
        # buyer_name and comment_date list already stored in db
        db_top_n_buyer_name_and_comment_date_list = get_top_n_buyer_name_and_comment_date_by_goods_id(
            goods_id=goods_id,
            logger=self.lg, )
    except SqlServerConnectionException:
        self.lg.error('db 连接异常! 此处抓取跳过!')
        return self._data_error()

    # collect from the jd mobile goods-comment pages
    _tmp_comment_list = []
    for current_page in range(1, 4):
        try:
            _data = self._get_one_page_comment_info(
                goods_id=goods_id,
                page_num=current_page, )
        except (AssertionError, Exception):
            self.lg.error('遇到错误:', exc_info=True)
            continue

        _tmp_comment_list += _data
        sleep(self.comment_page_switch_sleep_time)
    # pprint(_tmp_comment_list)

    try:
        _comment_list = self._get_comment_list(
            _tmp_comment_list=_tmp_comment_list,
            db_top_n_buyer_name_and_comment_date_list=
            db_top_n_buyer_name_and_comment_date_list, )
    except Exception:
        self.lg.error('出错goods_id:{0}'.format(goods_id), exc_info=True)
        return self._data_error()

    _t = get_shanghai_time()
    _r = CommentItem()
    _r['goods_id'] = str(goods_id)
    _r['create_time'] = _t
    _r['modify_time'] = _t
    _r['_comment_list'] = _comment_list
    self.result_data = _r
    # pprint(self.result_data)

    return self.result_data
def _get_origin_comment_list(self, **kwargs) -> list:
    '''
    Fetch the raw comment-list pages from the m.1688.com remark api.
    :param kwargs: csrf, goods_id, cookies
    :return: accumulated raw comment entries from all fetched pages
    '''
    csrf = kwargs.get('csrf', '')
    goods_id = kwargs.get('goods_id', '')
    cookies = kwargs.get('cookies', '')
    url = 'https://m.1688.com/page/offerRemark.htm'
    headers = {
        'cookie': cookies,
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'user-agent': get_random_pc_ua(),
        'accept': 'application/json, text/javascript, */*; q=0.01',
        'referer': 'https://m.1688.com/page/offerRemark.htm?offerId={}'.format(goods_id),
        'authority': 'm.1688.com',
        'x-requested-with': 'XMLHttpRequest',
    }
    origin_comment_list = []
    for i in range(1, self.max_page):
        __wing_navigate_options = {
            'data': {
                'bizType': 'trade',
                'itemId': int(goods_id),
                'offerId': str(goods_id),
                'page': i,
                'pageSize': 5,
                # 'receiveUserId': 989036456,
                'starLevel': 7
            }
        }
        params = (
            ('_csrf', csrf),
            ('__wing_navigate_type', 'view'),
            ('__wing_navigate_url', 'detail:modules/offerRemarkList/view'),
            ('__wing_navigate_options', dumps(__wing_navigate_options)),
            # cache buster: unix timestamp + 3 random digits
            ('_', str(datetime_to_timestamp(get_shanghai_time())) + str(get_random_int_number(start_num=100, end_num=999))),
        )
        body = Requests.get_url_body(url=url, headers=headers, params=params, ip_pool_type=self.ip_pool_type)
        data = json_2_dict(body, encoding='ascii').get('data', {})
        # pprint(data)
        one = data.get('model', [])
        pprint(one)
        origin_comment_list += one
        sleep(.25)  # polite delay between pages

    return origin_comment_list
def run_one_file_name_list(path, file_name_list):
    """Start every script in `file_name_list` under `path` via `python3 <name>.py`,
    skipping night-only scripts outside their time window and scripts already running."""
    for script in file_name_list:
        night_only_and_out_of_window = (
            script in night_run_file_name_list
            and str(get_shanghai_time())[11:13] not in night_run_time)
        if night_only_and_out_of_window:
            print('{0}.py不在运行时间点...此处跳过!'.format(script))
            continue

        process_name = script + '.py'
        if process_exit(process_name) == 0:
            # script not running yet -> launch it
            os.system('cd {0} && python3 {1}.py'.format(path, script))
            sleep(2.5)  # stagger launches so scripts don't all start at once
        else:
            print(process_name + '脚本已存在!')
def auto_run(*params):
    """Run every script group (paths given positionally in params[0..5]),
    then kill night-only processes when outside the night window."""
    print('开始执行秒杀脚本'.center(60, '*'))
    groups = (
        (params[0], spike_file_name_list),
        (params[1], pintuan_file_name_list),
        (params[2], real_file_name_list),
        (params[3], other_file_name_list),
        (params[4], logs_file_name_list),
        (params[5], zwm_file_name_list),
    )
    for group_path, group_file_name_list in groups:
        run_one_file_name_list(path=group_path, file_name_list=group_file_name_list)

    if str(get_shanghai_time())[11:13] not in night_run_time:
        # outside the night window -> kill conflicting night-only processes
        for process_name in night_run_file_name_list:
            kill_process_by_name(process_name)

    print('脚本执行完毕'.center(60, '*'))
def _get_is_delete(self, data, price_info_list): is_delete = 0 all_rest_number = 0 for item in price_info_list: all_rest_number += item.get('rest_number', 0) if all_rest_number == 0: is_delete = 1 # 当官方下架时间< int(time.time()) 则商品已下架 is_delete = 1 if int(data.get('sell_time', {}).get('end_time', '')) < int( datetime_to_timestamp(get_shanghai_time())): print('该商品已经过期下架...! 进行逻辑删除 is_delete=1') is_delete = 1 return is_delete
def _check_req_timestamp(self, req_timestamp):
    """
    Validate a 10-digit request timestamp against the current Shanghai time.

    :param req_timestamp: str or int, 10-digit unix timestamp from the request
    :return: True if the timestamp is within the expiration window, else False
    """
    # anything that is not exactly 10 digits falls through to False
    if len(str(req_timestamp)) == 10:
        req_timestamp = int(req_timestamp)
        self.now_timestamp = datetime_to_timestamp(get_shanghai_time())
        # Special-case requests lagging by ~8 hours (Canada server issue).
        # NOTE(review): 8 hours is 28800s, not 28805 — the extra 5s looks like an
        # observed constant, and exact-equality matching is fragile; confirm intent.
        if self.now_timestamp - req_timestamp == 28805:
            req_timestamp += 28805
        # accept timestamps in [now - expiration, now]
        if req_timestamp <= self.now_timestamp and req_timestamp + self._timestamp_expiration >= self.now_timestamp:
            return True

    return False
def _set_logger(self, logger):
    '''
    Attach a logger to self.my_lg.
    :param logger: reuse this logger when given; otherwise create a
                   date-stamped file logger for the taobao comment spider.
    :return: None
    '''
    if logger is not None:
        self.my_lg = logger
        return

    log_date = str(get_shanghai_time())[0:10]
    self.my_lg = set_logger(
        log_file_name=MY_SPIDER_LOGS_PATH + '/淘宝/comment/' + log_date + '.txt',
        console_log_level=INFO,
        file_log_level=ERROR)
def insert_into_jd_table(self, data, pipeline):
    """
    Insert one jd goods record into the db.
    :param data: crawled goods dict
    :param pipeline: db pipeline exposing insert_into_jd_table
    :return: True on success, False when site_id cannot be resolved
    """
    data_list = data
    tmp = {}
    tmp['goods_id'] = data_list['goods_id']  # official goods id
    tmp['spider_url'] = data_list['spider_url']  # goods url
    tmp['username'] = data_list['username']  # operator username
    now_time = get_shanghai_time()
    tmp['deal_with_time'] = now_time  # create time
    tmp['modfiy_time'] = now_time  # modify time
    tmp['shop_name'] = data_list['shop_name']  # shop name
    tmp['title'] = data_list['title']  # goods name
    tmp['sub_title'] = data_list['sub_title']  # sub title
    tmp['link_name'] = ''  # seller name
    tmp['account'] = data_list['account']  # shopkeeper name
    tmp['all_sell_count'] = data_list['all_sell_count']  # total sales
    # price = highest price, taobao_price = lowest price
    tmp['price'] = Decimal(data_list['price']).__round__(2)
    tmp['taobao_price'] = Decimal(data_list['taobao_price']).__round__(2)
    tmp['price_info'] = []  # price info
    tmp['detail_name_list'] = data_list['detail_name_list']  # spec label names
    """ sku_map """
    tmp['price_info_list'] = data_list.get('price_info_list')  # price & stock per spec
    tmp['all_img_url'] = data_list.get('all_img_url')  # all sample image urls
    tmp['p_info'] = data_list.get('p_info')  # property info
    tmp['div_desc'] = data_list.get('div_desc')  # lower detail div
    tmp['site_id'] = self._from_jd_type_get_site_id_value(jd_type=data_list.get('jd_type'))
    if tmp.get('site_id', 0) == 0:
        # unresolved site_id -> refuse to store
        print('site_id获取异常, 请检查!')
        return False

    tmp['is_delete'] = data_list.get('is_delete')  # logical delete: 0 kept, 1 deleted
    # print('is_delete=', tmp['is_delete'])
    # print('------>>>| 待存储的数据信息为: |', tmp)
    print('------>>>| 待存储的数据信息为: ', tmp.get('goods_id'))
    pipeline.insert_into_jd_table(item=tmp)

    return True