def _get_is_delete(self, price_info_list, data, other): is_delete = 0 all_rest_number = 0 if price_info_list != []: for item in price_info_list: all_rest_number += item.get('rest_number', 0) if all_rest_number == 0: is_delete = 1 else: is_delete = 1 # 当官方下架时间< 当前时间戳 则商品已下架 is_delete = 1 if data['sell_time'] != {}: end_time = datetime_to_timestamp( string_to_datetime( data.get('sell_time', {}).get('end_time', ''))) if end_time < datetime_to_timestamp(get_shanghai_time()): self.my_lg.info('该商品已经过期下架...! 进行逻辑删除 is_delete=1') is_delete = 1 # print(is_delete) if other.get('soldOut'): # True or False is_delete = 1 return is_delete
def _get_is_delete(self, price_info_list, data, other): ''' 获取is_delete :param price_info_list: :param data: :return: ''' is_delete = 0 all_rest_number = 0 if price_info_list != []: for item in price_info_list: all_rest_number += item.get('rest_number', 0) if all_rest_number == 0: is_delete = 1 # 当官方下架时间< 当前时间戳 则商品已下架 is_delete = 1 if data['sell_time'] != {}: end_time = datetime_to_timestamp( string_to_datetime( data.get('sell_time', {}).get('end_time', ''))) if end_time < datetime_to_timestamp(get_shanghai_time()): self.lg.info('该商品已经过期下架...! 进行逻辑删除 is_delete=1') is_delete = 1 # print(is_delete) if not other.get('sku_info', {}).get('goodsStoreStatus', True): is_delete = 1 return is_delete
def _get_x_axis_label(self, x): ''' 得到x轴的刻度list :param x: :return: list ''' now_time = datetime.datetime.now() x_axis_label = [] for _x in x: if _x is not None and month_differ(now_time, string_to_datetime(_x)) % 6 == 0: if str(_x)[:7] in x_axis_label: # 如果已存在append('') x_axis_label.append('') else: x_axis_label.append(str(_x)[:7]) else: x_axis_label.append('') return x_axis_label
def _get_append_comment_date(self, append_comment_date, comment_date) -> str: """ 获取并处理追评时间点 :return: """ # 处理append_comment_date值为n天后追评, or 当天追评 if re.compile('追评').findall(append_comment_date) != []: add_days = re.compile('(\d+)天后').findall(append_comment_date) if re.compile('当天').findall(append_comment_date) != []: append_comment_date = comment_date elif add_days != []: append_comment_date = str(string_to_datetime(comment_date) + timedelta(days=int(add_days[0]))) else: raise ValueError('未知append_comment_date: {}'.format(append_comment_date)) else: pass return str(append_comment_date)
def run_forever(self):
    '''
    Real-time update loop body: refresh every pintuan (group-buy) goods row
    pulled from the db, then sleep before the caller loops again.
    :return: None
    '''
    result = self._get_db_old_data()
    if result is None:
        # db fetch failed -> back off and let the caller retry.
        sleep_time = 20
        print('获取db数据失败, 休眠{}s ...'.format(sleep_time))
        sleep(sleep_time)
        return None

    index = 1
    for item in result:
        # Real-time update, one goods row per iteration.
        # Row layout (from SOURCE usage): item[0]=goods_id,
        # item[1]=pintuan info json string, item[2]=pid (api page num).
        goods_id = item[0]
        pid = item[2]
        # end_time format e.g. 2020-04-12 00:00:00
        pintuan_end_time = json_2_dict(item[1]).get('end_time')
        pintuan_end_time = datetime_to_timestamp(
            string_to_datetime(pintuan_end_time))
        # print(pintuan_end_time)
        data = {}
        # Refresh the db connection periodically (every 50 rows).
        self.sql_cli = _block_get_new_db_conn(
            db_obj=self.sql_cli,
            index=index,
            remainder=50)
        if self.sql_cli.is_connect_success:
            is_recent_time = self.is_recent_time(pintuan_end_time)
            if is_recent_time == 0:
                # Price already restored to normal -> take goods off the
                # shelves in the auto-goods table.
                _handle_goods_shelves_in_auto_goods_table(
                    goods_id=goods_id,
                    update_sql_str=mia_update_str_7,
                    sql_cli=self.sql_cli)
                print('该goods拼团开始时间为({})'.format(
                    json.loads(item[1]).get('begin_time')))
                sleep(.4)
            elif is_recent_time == 2:
                # Expired but still in the waiting window: do NOT delete yet
                # (only deleted once <= 24 hours remain).
                pass
            else:
                # is_recent_time == 1: inside the to-update window.
                print(
                    '------>>>| 正在更新的goods_id为({}) | --------->>>@ 索引值为({})'
                    .format(goods_id, index))
                data['goods_id'] = goods_id
                try:
                    data_list = get_mia_pintuan_one_page_api_goods_info(
                        page_num=pid)
                except ResponseBodyIsNullStrException:
                    # Empty api body: skip this goods, keep the index moving.
                    index += 1
                    sleep(.4)
                    continue

                # TODO 会导致在售商品被异常下架, 不进行判断, 一律进行更新
                # (the check below was disabled: it wrongly off-shelved
                # goods that were still on sale, so everything is updated)
                # try:
                #     assert data_list != [], 'data_list不为空list!'
                # except AssertionError as e:
                #     print(e)
                #     _handle_goods_shelves_in_auto_goods_table(
                #         goods_id=goods_id,
                #         update_sql_str=mia_update_str_7,
                #         sql_cli=self.sql_cli)
                #     sleep(.4)
                #     index += 1
                #     continue

                pintuan_goods_all_goods_id = [
                    item_1.get('goods_id', '')
                    for item_1 in data_list
                ]
                # print(pintuan_goods_all_goods_id)

                '''
                蜜芽拼团不对内部下架的进行操作,一律都更新未过期商品
                (根据pid来进行更新多次研究发现出现商品还在拼团,误删的情况很普遍)
                '''
                # NOTE: per the note above, internally off-shelved goods are
                # never deleted here — every non-expired goods is re-crawled
                # and updated, because deleting by pid proved error-prone.
                mia_pt = MiaPintuanParse(is_real_times_update_call=True)
                if goods_id not in pintuan_goods_all_goods_id:
                    # Internally off-shelved: still update unconditionally.
                    try:
                        goods_data = self._get_mia_pt_one_goods_info(
                            mia_pt_obj=mia_pt,
                            goods_id=goods_id,)
                    except AssertionError:
                        # Returned data was empty -> skip this goods.
                        index += 1
                        continue

                    # pprint(goods_data)
                    mia_pt.update_mia_pintuan_table(
                        data=goods_data,
                        pipeline=self.sql_cli)
                    sleep(MIA_SPIKE_SLEEP_TIME)  # slow down the crawl
                else:
                    # Still on the shelves: update with its sub_title from
                    # the matching api entry.
                    for item_2 in data_list:
                        if item_2.get('goods_id', '') == goods_id:
                            sub_title = item_2.get('sub_title', '')
                            try:
                                goods_data = self._get_mia_pt_one_goods_info(
                                    mia_pt_obj=mia_pt,
                                    goods_id=goods_id,
                                    sub_title=sub_title,)
                            except AssertionError:
                                # Returned data was empty -> skip.
                                continue

                            # pprint(goods_data)
                            mia_pt.update_mia_pintuan_table(
                                data=goods_data,
                                pipeline=self.sql_cli)
                            sleep(MIA_SPIKE_SLEEP_TIME)  # slow down the crawl
                        else:
                            pass

                try:
                    del mia_pt
                except:
                    pass
        else:
            # db connection failed (db may be down or under maintenance).
            print('数据库连接失败,数据库可能关闭或者维护中')
            pass
        index += 1
        collect()

    print('全部数据更新完毕'.center(100, '#'))
    # sleep(60*60)
    if get_shanghai_time().hour == 0:
        # After midnight: long pause (no updates).
        sleep(60 * 60 * 5.5)
    else:
        sleep(10 * 60)
    collect()
def _parse_page_from_wx(self, **kwargs):
    '''
    Parse a single wx article's info.
    :param kwargs: article_link, article_info (raw dict with a 'data' key),
                   article_likes (defaults to a random int)
    :return: a WellRecommendArticle object, or {} on any parsing failure
    '''
    article_link = kwargs.get('article_link', '')
    article_info = kwargs.get('article_info', {}).get('data', {})
    article_likes = kwargs.get('article_likes', get_random_int_number())
    error_msg = '出错article_url: {0}'.format(article_link)
    try:
        nick_name = article_info.get('user', {}).get('nickname', '')
        assert nick_name != '', '获取到的nick_name为空值!请检查!' + error_msg

        head_url = article_info.get('user', {}).get('images', '')
        assert head_url != '', '获取到的head_url为空值!请检查!' + error_msg

        # Personal profile / signature (left blank)
        profile = ''
        share_id = article_info.get('id', '')
        assert share_id != '', '获取到的share_id为空值!请检查!' + error_msg

        # title defaults to blank
        title = self.wash_sensitive_info(article_info.get('title', ''))
        comment_content = self.wash_sensitive_info(
            article_info.get('desc', ''))
        assert comment_content != '', '获取到的comment_content为空!请检查!' + error_msg

        # For a video the first image is the video's first frame.
        share_img_url_list = [{
            'img_url': item.get('original', ''),
            'height': item.get('height'),       # image height / width
            'width': item.get('width'),
        } for item in article_info.get('images_list', [])]
        assert share_img_url_list != [], '获取到的share_img_url_list为空list!请检查!' + error_msg

        div_body = ''  # left blank by design
        gather_url = article_link

        # Original creation date of the source article.
        tmp_create_time = article_info.get('time', '')
        # FIX: append error_msg for consistency with every other assert here.
        assert tmp_create_time != '', '获取到的create_time为空值!请检查!' + error_msg
        create_time = string_to_datetime(tmp_create_time + ':00')

        site_id = 3  # Xiaohongshu (小红书)
        goods_url_list = []       # goods urls of this article still to crawl
        share_goods_base_info = []

        # The wx endpoint returns no tags; derive them instead.
        tags = self._get_tags_from_wx(article_info=article_info)

        # Video playback url.
        # FIX: raw strings — '\?' was an invalid escape sequence in a
        # non-raw pattern (DeprecationWarning since Python 3.6).
        tmp_video_url = article_info.get('video', '')
        tmp_video_url = re.compile(r'\?.*').sub('', tmp_video_url)
        video_url = re.compile(r'//sa.').sub(r'//v.', tmp_video_url)

        likes = article_likes
        collects = article_info.get('fav_count', None)
        assert collects is not None, '获取到的collects为None!请检查!' + error_msg
    except Exception:
        # Any missing field aborts this article; back off, log, return {}.
        sleep(self.CRAWL_ARTICLE_SLEEP_TIME)
        self.lg.error('遇到错误:', exc_info=True)
        return {}

    _ = WellRecommendArticle()
    _['nick_name'] = nick_name
    _['head_url'] = head_url
    _['profile'] = profile
    _['share_id'] = share_id
    _['title'] = title
    _['comment_content'] = comment_content
    _['share_img_url_list'] = share_img_url_list
    _['div_body'] = div_body
    _['gather_url'] = gather_url
    _['create_time'] = create_time
    _['site_id'] = site_id
    _['goods_url_list'] = goods_url_list
    _['tags'] = tags
    _['share_goods_base_info'] = share_goods_base_info
    _['video_url'] = video_url
    _['likes'] = likes
    _['collects'] = collects

    return _