def __init__(self, user_id: str):
    self.image_height = config.THUMBNAIL_SIZE.HEIGHT
    self.image_width = config.THUMBNAIL_SIZE.WIDTH
    # process label
    self.label_set = '01'
    self.label_set_len = len(self.label_set)
    self.label_len = 1
    self.label_size = self.label_set_len * self.label_len
    self.X = tf.compat.v1.placeholder(tf.float32, [None, HEIGHT * WIDTH])  # feature vector
    self.Y = tf.compat.v1.placeholder(tf.float32, [None, self.label_size])  # label
    self.keep_prob = tf.compat.v1.placeholder(tf.float32)  # dropout keep probability
    self.user_id = user_id
    self.stash = Stash('thumbnail_process')
    self.model_save_dir = self.pre_path(Path('model_save_dir', self.user_id))
    self.model_name = self.model_save_dir + '/atv'
    self.model_log_dir = self.pre_path('model_logs', build=False)
    self.train_images_list = self.stash['train_data']
    self.verify_images_list = self.stash['test_data']
    self.log = AutoLog.file_log('model')
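# --- usage sketch (assumption, not from the source): driving these placeholders ---
# The 4-pixel / 2-label sizes and the toy graph below are made up; the point is only
# to show how X / Y / keep_prob would be fed through feed_dict in TF1-style code.
import numpy as np
import tensorflow as tf

tf.compat.v1.disable_eager_execution()

X = tf.compat.v1.placeholder(tf.float32, [None, 4])   # stands in for HEIGHT * WIDTH
Y = tf.compat.v1.placeholder(tf.float32, [None, 2])   # stands in for label_size
keep_prob = tf.compat.v1.placeholder(tf.float32)      # dropout keep probability

hidden = tf.nn.dropout(X, rate=1 - keep_prob)          # keep_prob -> dropout rate
logits = tf.matmul(hidden, tf.ones([4, 2]))            # toy "network", nothing to train
loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=Y, logits=logits))

with tf.compat.v1.Session() as sess:
    batch_x = np.random.rand(8, 4).astype('float32')                   # fake flattened thumbnails
    batch_y = np.eye(2)[np.random.randint(0, 2, 8)].astype('float32')  # fake one-hot labels
    print(sess.run(loss, feed_dict={X: batch_x, Y: batch_y, keep_prob: 0.75}))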
def test_vod():
    stash = Stash('afreecatv_rlrlvkvk123')
    # vods = stash['rlrlvkvk123:vod']
    station = 60946442
    key = f'{station}:vodparam'
    vodparam = stash[key]
    pprint(get_url_params(vodparam))
def __init__(self, user_id: str):
    super().__init__(user_id)
    self.TRAIN_PATH = self.pre_path('train_data')
    self.TEST_PATH = self.pre_path('test_data')
    self.VALID_PATH = self.pre_path('valid_data')
    self.log = AutoLog.file_log('thumbnail_process')
    self.resize_width = config.THUMBNAIL_SIZE.WIDTH
    self.resize_height = config.THUMBNAIL_SIZE.HEIGHT
    self.stash = Stash('thumbnail_process')
    self.label_info = None
def __init__(self, bj_id: str):
    super().__init__()
    self.bj_id = bj_id
    self.stash = Stash(f'afreecatv_{self.bj_id}')
    # public urls
    self.INFO_URL = 'http://afbbs.afreecatv.com:8080/api/video/get_video_info.php'
    self.THUMBNAIL_URL = 'http://videoimg.afreecatv.com/php/SnapshotLoad.php'
    self.VOD_URL_FORMAT = 'http://bjapi.afreecatv.com/api/%s/vods?page={page}' \
                          '&per_page=20&orderby=reg_date' % (self.bj_id,)
    self.USER_VOD_FORMAT = 'http://bjapi.afreecatv.com/api/%s/vods/user?page={page}' \
                           '&orderby=reg_date' % (self.bj_id,)
    self.STATION_URL = 'http://vod.afreecatv.com/PLAYER/STATION/{station_num}'
    # paths
    self.STATION_PATH = str(
        Path(config.PROJECT_PATH, f'afreecatv_vod_thumbnail/{self.bj_id}'))
    # thumbnail parameters
    self.thumbnailDuration = config.THUMBNAIL_SIZE.DURATION_SEC
    self.rowCount = config.THUMBNAIL_SIZE.ROW_COUNT
    self.columnCount = config.THUMBNAIL_SIZE.COLUMN_COUNT
    self.log = AutoLog.file_log('spider_thumbnail')
def run(self):
    address_list = []
    with Stash(strategy_id) as stash:
        for account_info in conf.account_list:
            address = self._address(account_info, stash)
            log.info('{} [{}]'.format(account_info['account'], address))
            driver.delete_all_cookies()
            if address:
                address_list.append(address)
    with open('{}.txt'.format(strategy_id), 'w') as f:
        f.write(','.join(address_list))
    driver.close()
class SnippetMerge:
    def __init__(self, bj_id: str):
        self.bj_id = bj_id
        self.stash = Stash(f'afreecatv_{self.bj_id}')
        self.log = AutoLog.file_log('m3u8_merge')
        self.VOD_PATH = self.pre_path('vod')

    def pre_path(self, dir_name: str) -> str:
        data_path = Path(config.DATA.DATA_PATH, self.bj_id, dir_name)
        if not data_path.exists():
            data_path.mkdir(parents=True)
        return str(data_path)

    def video_key(self, station_num: str):
        return f'{station_num}:video_info'

    def retry(times=3):
        """
        times == -1 retries forever
        :return:
        """
        def deco(func):
            def new_handler(self, *args, **kwargs):
                retry_time = 1
                if times == -1:
                    while True:
                        try:
                            is_ok = func(self, *args, **kwargs)
                            if is_ok:
                                return is_ok
                            self.log.error(
                                f'retry[{retry_time}:{self.bj_id}:{func.__name__}]:{args, kwargs}')
                            retry_time += 1
                        except Exception:
                            self.log.error(
                                f'retry[{retry_time}:{self.bj_id}:{func.__name__}]:{args, kwargs} \n'
                                + util.error_msg())
                elif times > 0:
                    for i in range(times):
                        try:
                            is_ok = func(self, *args, **kwargs)
                            if is_ok:
                                return is_ok
                            self.log.error(
                                f'retry[{retry_time}:{self.bj_id}:{func.__name__}]:{args, kwargs}')
                            retry_time += 1
                        except Exception:
                            self.log.error(
                                f'retry[{retry_time}:{self.bj_id}:{func.__name__}]:{args} \n'
                                + util.error_msg())
                    self.log.error(
                        f'Fail retry[{retry_time}:{self.bj_id}:{func.__name__}]:{args, kwargs}')
            return new_handler
        return deco

    @retry(times=7)
    def down(self, url: str, path: Path, param: typing.Dict = None,
             chunk_size: int = 1024 * 9, timeout: int = 8):
        path_dir = os.path.dirname(path)
        os.makedirs(path_dir, exist_ok=True)
        try:
            resp = requests.get(url, params=param, stream=True, timeout=timeout)
            if resp.status_code == 200:
                with open(str(path), 'wb') as f:
                    for r in resp.iter_content(chunk_size=chunk_size):
                        f.write(r)
                self.log.info(
                    f'[{self.bj_id}:{str(path.parent).split("/")[-1]}] download {path.name} success')
                return True
        except (Timeout, ConnectionError):
            self.log.error(f'[TIMEOUT get]:{url}:{param}')
            return False
        except Exception:
            self.log.error(
                f'[{self.bj_id}:{str(path.parent).split("/")[-1]}] : {path.name}\n'
                + util.error_msg())
            return False

    def _prepare_video(self, vod: typing.Dict) -> typing.Dict:
        video_info = vod['video']
        result = {v['cum_duration']: v['url'] for v in video_info}
        return result

    def _parse_m3u8(self, vod: typing.Dict) -> typing.Dict:
        video_info = self._prepare_video(vod)

        def _m3u8(args: typing.Tuple[int, str]) -> typing.Dict:
            cum, url = args
            variant_m3u8 = m3u8.load(url)
            tmp = cum
            tar_video = {}
            if variant_m3u8.is_variant:
                bandwidth_uri = {
                    p.stream_info.bandwidth: p.uri
                    for p in variant_m3u8.playlists
                }
                # best_bandwidth = 2000000 if 2000000 in bandwidth_uri else min(list(bandwidth_uri.keys()))
                best_bandwidth = max(list(bandwidth_uri.keys()))
                bandwidth_uri = bandwidth_uri[best_bandwidth]
                tar_m3u8 = m3u8.load(bandwidth_uri)
                for s in tar_m3u8.segments:
                    tmp += Decimal(str(s.duration)).quantize(
                        Decimal('0'), rounding=ROUND_HALF_UP)
                    tar_video[int(tmp)] = s.absolute_uri
            return tar_video

        group = Group()
        result = {}
        for tar_video in group.imap_unordered(_m3u8, video_info.items()):
            result.update(tar_video)
        return result

    def _ts2mp4(self, dirname: str, output_name: str = 'output'):
        """
        https://ffmpeg.org/ffmpeg.html
        https://moejj.com/ffmpeghe-bing-shi-pin-wen-jian-guan-fang-wen-dang/
        Collect all files with the matching suffix from the directory and sort them by index.

        ffmpeg -i "concat:input1.ts|input2.ts|input3.ts" -c copy output.ts

        # this is a comment
        file '/path/to/file1'
        file '/path/to/file2'
        file '/path/to/file3'

        ffmpeg -f concat -safe 0 -i mylist.txt -c copy output
        """
        output_mp4_name = f'{output_name}.mp4'
        compress_mp4_name = f'{output_name}_tmp.mp4'
        file_list = list(Path(dirname).glob('*.ts'))
        ordered_files = sorted(
            file_list,
            key=lambda x: (int(re.search(r'([0-9]+)(?=\.ts)', str(x))[0]), x))
        if ordered_files:
            merge_path = os.path.abspath(Path(dirname, 'merge.txt'))
            output_path = os.path.abspath(Path(dirname, output_mp4_name))
            compress_path = os.path.abspath(Path(dirname, compress_mp4_name))
            with open(merge_path, 'w') as f:
                for i in ordered_files:
                    if os.path.isfile(str(i)):
                        f.write(f'file {i.name}\n')
            ffmpeg_cmd = (f'ffmpeg -y -f concat -safe 0 -i {merge_path} '
                          f'-codec copy -bsf:a aac_adtstoasc {compress_path}')
            compress_cmd = f'ffmpeg -y -i {compress_path} -c:v libx264 -crf 28 {output_path}'
            try:
                self.log.info(f'start to merge {output_path}')
                subprocess.check_call(ffmpeg_cmd, shell=True)
                os.remove(merge_path)
                for i in file_list:
                    i.unlink()
                # subprocess.check_call(compress_cmd, shell=True)
                # os.remove(compress_path)
            except subprocess.CalledProcessError as e:
                self.log.error(util.error_msg())
                return
            self.log.info(f'merge {output_path} success')

    def _merge_m3u8_by_tar_time(self, station_num: str, vod: typing.Dict,
                                tar_time_range: typing.List):
        path = Path(self.VOD_PATH, str(station_num))
        os.makedirs(path, exist_ok=True)
        self.log.info(f'[{self.bj_id}:{station_num}] get vod m3u8 info')
        tar_video = self._parse_m3u8(vod)
        self.log.info(f'[{self.bj_id}:{station_num}] get vod m3u8 info success')
        pool = Pool(20)
        for t in tar_time_range:
            min_range, max_range = t
            min_d, max_d = Duration.set_time(min_range).to_duration(), \
                           Duration.set_time(max_range).to_duration()
            for i in range(min_d, max_d + 1):
                if i in tar_video:
                    ts_path = path.joinpath(f'{i}.ts')
                    if os.path.isfile(ts_path) and ts_path.stat().st_size > 1024 * 500:
                        continue
                    pool.add(gevent.spawn(self.down, url=tar_video[i], path=ts_path))
        pool.join()
        self.log.info(f'[{self.bj_id}:{station_num}] download ts success')
        self._ts2mp4(path, output_name=station_num)

    def _trans_set2result(self, vod_set: dict) -> typing.Dict:
        result = {}
        for station_num, vod_range in vod_set.items():
            result[station_num] = []
            for start, s_range in vod_range.items():
                s = start.split('_')[-1]
                for min_r, max_r in s_range:
                    min_r_str = (Duration.set_time(s) + duration_delta(
                        s=min_r * config.THUMBNAIL_SIZE.DURATION_SEC)).to_str()
                    max_r_str = (Duration.set_time(s) + duration_delta(
                        s=max_r * config.THUMBNAIL_SIZE.DURATION_SEC)).to_str()
                    result[station_num].append((min_r_str, max_r_str))
        return result

    def _prepare_vod_cum(self, vod: typing.Dict):
        video_info = vod['video']
        tmp = 0
        for v in video_info:
            v.setdefault('cum_duration', tmp)
            tmp += v['duration']

    def run(self, station_num: str, tar_time_range: typing.List, ignore=False):
        vod = self.stash.get(self.video_key(station_num))
        # check the vod before touching it, otherwise a missing key would crash below
        if not vod or vod['type'] != VOD_TYPE.VOD:
            return
        self._prepare_vod_cum(vod)
        if ignore and Path.exists(Path(self.VOD_PATH, station_num)):
            return
        os.makedirs(Path(self.VOD_PATH, station_num), exist_ok=True)
        self._merge_m3u8_by_tar_time(station_num, vod, tar_time_range)
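# --- usage sketch (assumption, not from the source) ---
# How SnippetMerge.run is presumably driven: the station's video info must already be
# in the stash (written by the thumbnail spider), and tar_time_range is assumed to be a
# list of (start, end) pairs in the 'H:M:S' string form that Duration.set_time parses.
# The bj_id and station number are taken from the test snippet elsewhere in this file set.
if __name__ == '__main__':
    merger = SnippetMerge('rlrlvkvk123')
    # download the ts segments between 0:05:00 and 0:10:00 of vod 60946442
    # and concat them into 60946442.mp4 with ffmpeg
    merger.run('60946442', [('0:05:00', '0:10:00')])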
class ThumbnailSpider(AfreecaTV):
    """Collect thumbnails."""

    def __init__(self, user_id: str):
        super().__init__()
        self.user_id = user_id
        self.stash = Stash(f'afreecatv_{self.user_id}')
        # public urls
        self.INFO_URL = 'http://afbbs.afreecatv.com:8080/api/video/get_video_info.php'
        self.THUMBNAIL_URL = 'http://videoimg.afreecatv.com/php/SnapshotLoad.php'
        self.VOD_URL_FORMAT = 'http://bjapi.afreecatv.com/api/%s/vods?page={page}' \
                              '&per_page=20&orderby=reg_date' % (self.user_id,)
        self.USER_VOD_FORMAT = 'http://bjapi.afreecatv.com/api/%s/vods/user?page={page}' \
                               '&orderby=reg_date' % (self.user_id,)
        self.STATION_URL = 'http://vod.afreecatv.com/PLAYER/STATION/{station_num}'
        # paths
        self.STATION_PATH = str(
            Path(config.PROJECT_PATH, f'afreecatv_vod_thumbnail/{self.user_id}'))
        # thumbnail parameters
        self.thumbnailDuration = config.THUMBNAIL_SIZE.DURATION_SEC
        self.rowCount = config.THUMBNAIL_SIZE.ROW_COUNT
        self.columnCount = config.THUMBNAIL_SIZE.COLUMN_COUNT
        self.log = AutoLog.file_log('spider_thumbnail')

    def video_key(self, station_num):
        return f'{station_num}:video_info'

    def station_key(self, station_num):
        return f'{station_num}:vodparam'

    def add_bad_vod(self, station_num: int):
        """Mark a broken station."""
        self.log.error(f'[add_bad_vod]:{station_num}')
        bad_vod = self.stash.setdefault(VOD_TYPE.BAD, set())
        bad_vod.add(station_num)
        self.stash[VOD_TYPE.BAD] = bad_vod

    def check_bad_vod(self, station_num: int) -> bool:
        """Check whether a station was marked as broken."""
        bad_vod = self.stash.get(VOD_TYPE.BAD, set())
        return station_num in bad_vod

    def _get_thumbnail_param(self, station_num: int, pos_time: float) -> dict:
        """
        Thumbnails are cut into fixed-length slices; find which slice (row) pos_time
        falls into and which snapshot sheet (column) holds it.
        self.thumbnailDuration: seconds covered by one thumbnail
        self.rowCount * self.columnCount: thumbnails per sheet (column)
        :param station_num:
        :param pos_time:
        :return:
        """
        video_info = self.stash.get(self.video_key(station_num))['video']
        if video_info:
            tmp = 0
            for v in video_info:
                tmp += v['duration']
                if tmp >= pos_time:
                    row_key = v['key'] + '_t'
                    # offset of pos_time inside this row (source file)
                    row_time = pos_time - (tmp - v['duration'])
                    # which thumbnail the offset falls into
                    thumbnail_time = row_time // self.thumbnailDuration
                    # which sheet (column) that thumbnail belongs to
                    column = thumbnail_time // (self.rowCount * self.columnCount) + 1
                    param = {
                        'rowKey': row_key,
                        'column': int(column),
                    }
                    return param

    def download_img(self, station_num: int, time: float, rewrite=False) -> bool:
        path = str(
            Path(self.STATION_PATH, str(station_num),
                 f'{station_num}_{Duration.set_duration(time).to_str()}.jpg'))
        if not rewrite and os.path.exists(path):
            return True
        param = self._get_thumbnail_param(station_num, time)
        is_ok = self.down_img(self.THUMBNAIL_URL, param, path)
        if is_ok:
            self.log.info(f'[{station_num}:{time}] success')
        return is_ok

    def _get_video_info(self, station_num: int) -> dict:
        result = {}
        if self.check_bad_vod(station_num):
            return result
        if self.video_key(station_num) in self.stash and self.stash[
                self.video_key(station_num)]['total'] > 0:
            return self.stash[self.video_key(station_num)]

        def _parse_vod_info(html):
            files = html.xpath('//file')
            duration = 0
            video_info = []
            for f in files:
                try:
                    int(f.get('duration'))
                except:
                    continue
                try:
                    video_info.append({
                        'key': f.get('key'),
                        'duration': int(f.get('duration')),
                        'url': f.text,
                        'cum_duration': duration,
                    })
                    duration += int(f.get('duration'))
                except:
                    self.log.error(
                        f'[parse_video_info]:{self.video_key(station_num)}:{url} \n'
                        + util.error_msg())
            result = {
                'video': video_info,
                'total': duration,
                'type': VOD_TYPE.VOD,
            }
            self.stash[self.video_key(station_num)] = result
            return result

        vod_param = self.station(station_num)
        if vod_param:
            url = self.INFO_URL + '?' + vod_param + '&szSysType=html5'
            resp = self.get(url)
            # TODO: user_vod
            if resp.status_code == 200:
                self.log.info(f'[get_video_info]:{station_num}')
                html = etree.HTML(resp.text)
                try:
                    category = html.xpath('//video[@category]')[0].get('category')
                    if category == '00210000':
                        result = _parse_vod_info(html)
                except Exception:
                    self.log.error(
                        f'[get_video_info]:{self.video_key(station_num)}:{url} \n'
                        + util.error_msg())
                    self.add_bad_vod(station_num)
                    return result
        return result

    def station(self, station_num: int) -> str:
        station_key = self.station_key(station_num)
        if station_key in self.stash:
            return self.stash[station_key]
        resp = self.get(self.STATION_URL.format(station_num=station_num))
        result = re.search(r'document.VodParameter = (.*?);', resp.text, re.S)
        if result:
            self.log.info(f'[station]:{station_num}')
            x = result.group(1).replace('\'', '')
            self.stash[station_key] = x
            return x

    def _get_vod(self, page: int, url: str, stash_key: str, append=True,
                 is_check=False) -> typing.Optional[typing.Tuple[bool, dict]]:

        def parse_raw(raw: dict):
            self.log.info(f'[get_vod]:{url.format(page=page)}')
            if append:
                data_set = self.stash.get(stash_key, set())
            else:
                data_set = set()
            for d in raw['data']:
                data_set.add(d['title_no'])
            self.stash[stash_key] = data_set
            return raw['meta']

        resp = self.get(url.format(page=page))
        try:
            raw = resp.json()
        except:
            return None
        if not is_check:
            return False, parse_raw(raw)
        else:
            vod_data = self.stash.get(stash_key, set())
            if raw['meta']['total'] != len(vod_data):
                return False, parse_raw(raw)
            return True, vod_data

    def vod(self, url: str, stash_key: str) -> set:
        """
        Collect vod ids.
        :param url:
        :param stash_key:
        :return:
        """
        is_ok, meta = self._get_vod(1, url, stash_key, append=False, is_check=True)
        if not is_ok:
            pool = Pool(10)
            for i in range(2, meta['last_page'] + 1):
                pool.add(gevent.spawn(self._get_vod, i, url, stash_key))
            pool.join()
            return self.stash.get(stash_key, set())
        return meta

    def download_vod(self, station_num: int, rewrite=False):
        video_info = self._get_video_info(station_num)
        if not video_info:
            self.log.error(f'BAD VOD {station_num}')
            return
        total = video_info['total']
        step = self.thumbnailDuration * self.rowCount * self.columnCount
        pool = Pool(10)
        for i in range(0, total, step):
            pool.add(gevent.spawn(self.download_img, station_num, i, rewrite))
        pool.join()
        self.log.info(f'[{station_num}:vod] success')

    def test_img(self, img_name: str):
        station_num, h, m, s = re.search(r'(.*?)_(.*?):(.*?):(.*)\.jpg',
                                         img_name).groups()
        param = self._get_thumbnail_param(
            int(station_num),
            Duration.delta(int(h), int(m), int(s)).to_duration())
        print(self.THUMBNAIL_URL + util.join_params(**param))

    def test_download_img(self, station_num: int, t: float):
        param = self._get_thumbnail_param(int(station_num), t)
        print(self.THUMBNAIL_URL + util.join_params(**param))

    def valid_thumbnail(self):
        dirs = os.listdir(self.STATION_PATH)
        dirs = set(dirs) - {'.DS_Store'}

        def del_bad_video_info():
            vod = self.stash.get(VOD_TYPE.VOD, set()) | self.stash.get(
                VOD_TYPE.USER_VOD, set())
            diff = vod - set(map(int, dirs))
            for d in diff:
                del self.stash[self.video_key(d)]

        def del_bad_img():
            for d in dirs:
                fd = os.path.join(self.STATION_PATH, d)
                for f in os.listdir(fd):
                    fp = os.path.join(fd, f)
                    size = os.path.getsize(fp)
                    if size < 1024 * self.rowCount * self.columnCount:
                        os.remove(fp)

        del_bad_img()
        del_bad_video_info()
        self.log.info('valid thumbnail success')

    def _init_spider(self, login=False):
        if login:
            self.login()
        self.get_session()

    def run(self, login=False):
        """
        tv = ThumbnailSpider('rlrlvkvk123')
        tv.run(login=True)
        # print(tv.stash[VOD_TYPE.BAD])
        # tv.test()
        # tv.test_img('36997061_0:0:0.jpg')
        # tv.test_download_img(43764953, 3600)
        :return:
        """
        self._init_spider(login)
        self.log.info('spider start')
        self.valid_thumbnail()
        vod = self.vod(self.VOD_URL_FORMAT, VOD_TYPE.VOD)
        self.log.info('prepare vod')
        # user_vod = self.vod(self.USER_VOD_FORMAT, VOD_TYPE.USER_VOD)
        self.log.info('prepare user vod')
        # vod = vod | user_vod
        # vod = user_vod
        self.log.info('prepare vod success')
        pool = Pool(4)
        for v in vod:
            pool.add(gevent.spawn(self.download_vod, v))
        pool.join()
        self.log.info('spider end')

    def fix(self, station_num: int, rewrite=True, login: bool = False):
        self._init_spider(login)
        self.log.info(f'fix start [{self.user_id}:{station_num}]')
        self.download_vod(station_num=station_num, rewrite=rewrite)
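# --- worked example (assumption, not from the source) of the sheet lookup in _get_thumbnail_param ---
# Each thumbnail covers config.THUMBNAIL_SIZE.DURATION_SEC seconds and one snapshot sheet
# holds ROW_COUNT * COLUMN_COUNT thumbnails; the numbers below are hypothetical config values.
DURATION_SEC, ROW_COUNT, COLUMN_COUNT = 10, 6, 6   # hypothetical config values
pos_time = 3600                                    # one hour into the vod
row_time = pos_time                                # assuming a single source file, so no offset
thumbnail_index = row_time // DURATION_SEC         # 360th thumbnail
column = thumbnail_index // (ROW_COUNT * COLUMN_COUNT) + 1
print(column)                                      # -> 11, the sheet number sent as the 'column' param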
from etc import config
from pathlib import Path

from base.stash import Stash
from PIL import ImageFile

ImageFile.LOAD_TRUNCATED_IMAGES = True


class VOD_TYPE:
    VOD = 'vod'
    USER_VOD = 'user_vod'
    BAD = 'bad_vod'


PIC_TYPE = ['.jpg', '.png', '.jpeg']
cookie_stash = Stash('afreecatv')


class AfreecaTV:
    def __init__(self):
        self.LOGIN_URL = 'https://login.afreecatv.com/app/LoginAction.php'
        self.cookie_stash = cookie_stash
        self.log = None
        self.stash = None
        self.session = None
        # account
        self._init_account()
        # self.account_id = config.AfricaAccount.UID
        # self.account_pwd = config.AfricaAccount.PWD

    def _init_account(self):
def _strategy():
    """
    Wash-trade inside the gap between the best bid and the best ask.
    The goal is to fill our own orders as often as possible, for as long as possible.
    Two operations, sell-then-buy (sell_buy) and buy-then-sell (buy_sell), are executed
    alternately. If one of our orders gets taken by someone else, run the opposite
    operation twice.
    Example: a buy_sell leaves the buy order taken, so do sell_buy twice, then
             buy_sell, sell_buy, ... and keep cycling.
    The operation state is persisted in the stash.
    :return:
    """
    spread = self.ticker[const.SIDE.ASK] - self.ticker[const.SIDE.BID]
    order_price = util.safe_decimal(round(
        float(self.ticker[const.SIDE.BID]) + random.uniform(
            float(self.price_point), float(spread - self.price_point)),
        util.get_round(self.price_point)))
    order_amount = util.safe_decimal(round(
        random.uniform(float(self.amount_point),
                       float(self._get_amount_balance(const.SIDE.ASK))),
        util.get_round(self.amount_point)))
    with Stash(strategy_id) as stash:
        self._balance_asset(order_price, stash)
        if spread > self.price_point:  # there is a gap
            if self._judge_mode(stash) == STRATEGY_FLAG.FLAG_SB:
                is_ok = self.limit_sell(order_price, order_amount)
                if is_ok is None:
                    stash[MODE_KEY] = MODE.FLAG_BS
                    stash[BALANCE_KEY] = const.SIDE.ASK
                    log.info('no suitable sell order')
                    return
                if is_ok:
                    pending_order = self.get_pending_order()
                    self.limit_buy(pending_order[0]['price'],
                                   pending_order[0]['unsettled_amount'])
                    if stash.get(MODE_KEY) == MODE.FILL_B:
                        stash[MODE_KEY] = MODE.FLAG_SB
                    else:
                        stash[MODE_KEY] = MODE.FLAG_BS
                else:
                    log.warn('sell order has filled')
                    stash[MODE_KEY] = MODE.FILL_S
            else:
                is_ok = self.limit_buy(order_price, order_amount)
                if is_ok is None:
                    stash[MODE_KEY] = MODE.FLAG_SB
                    stash[BALANCE_KEY] = const.SIDE.BID
                    log.info('no suitable buy order')
                    return
                if is_ok:
                    pending_order = self.get_pending_order()
                    self.limit_sell(pending_order[0]['price'],
                                    pending_order[0]['unsettled_amount'])
                    if stash.get(MODE_KEY) == MODE.FILL_S:
                        stash[MODE_KEY] = MODE.FLAG_BS
                    else:
                        stash[MODE_KEY] = MODE.FLAG_SB
                else:
                    log.warn('buy order has filled')
                    stash[MODE_KEY] = MODE.FILL_B
        else:
            wait_spread = random.uniform(1, 10)
            msg = 'BID ASK price too close, sleep {} sec'.format(wait_spread)
            log.info(msg)
            print(msg)
            sleep(wait_spread)
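# --- state-transition sketch (assumption, not from the source) ---
# A pure-python restatement of the mode handling above, to make the docstring's
# "run the opposite operation twice" rule explicit. next_mode and the plain string
# constants are hypothetical; the real code uses MODE.* / STRATEGY_FLAG.* and a stash.
def next_mode(current_flag: str, previous_mode: str, order_taken: bool) -> str:
    # current_flag: 'FLAG_SB' (sell then buy) or 'FLAG_BS' (buy then sell)
    if order_taken:
        # our resting order was eaten by someone else
        return 'FILL_S' if current_flag == 'FLAG_SB' else 'FILL_B'
    if current_flag == 'FLAG_SB':
        # after a buy-side fill, repeat sell_buy once more; otherwise alternate
        return 'FLAG_SB' if previous_mode == 'FILL_B' else 'FLAG_BS'
    # symmetric case for buy_sell
    return 'FLAG_BS' if previous_mode == 'FILL_S' else 'FLAG_SB'

# e.g. a buy_sell whose buy order was taken:
#   next_mode('FLAG_BS', previous_mode='FLAG_SB', order_taken=True)  -> 'FILL_B'
# per the docstring, the next rounds then run sell_buy before the alternation resumes.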
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@author = 'wyx'
@time = 2018/8/10 11:43
@annotation = ''
"""
import datetime
import time

from base import util
from base.stash import Stash

with Stash('abcc_eth_btc') as stash:
    print(stash['mode'])

# dt = util.ts2dt(time.time())
# print(dt)
# ts = util.dt2ts(dt)
# print(ts)
#
# print(((util.nowdt()-(util.nowdt()-datetime.timedelta(days=1)))).days)
#
#
# print('{:.4f}'.format(util.safe_decimal(12.99)))
class ViewCnt(AfreecaTV):
    def __init__(self, bj_id: str):
        super().__init__()
        self.bj_id = bj_id
        self.stash = Stash(f'afreecatv_{self.bj_id}')
        self.log = AutoLog.file_log('spider_view_cnt')
        self.VIEW_CNT_URL = 'https://apisabana.afreecatv.com/service/vod_star2_stats.php'
        self.post_data = {
            'szAction': 'view',
            'nDeviceType': 1,
            'nTitleNo': None,
            'szLang': 'zh_CN',
            'nStationNo': None,
            'nBbsNo': None,
            'szType': 'bj',
            'szModule': 'BjViewCnt',
            'szSysType': 'html5',
            'szLoginId': self.account_id,
            'nIdx': 1,
        }
        self.perfect_start_min = 5
        self.smooth_factor = 2
        self.range_factor = 4

    def _init_spider(self, login=False):
        if login:
            self.login()
        self.get_session()

    def run(self) -> dict:
        self._init_spider(False)
        self.log.info('spider start')
        target_dict = {}
        vods = self.stash.get(self.global_key(VOD_TYPE.VOD))
        if vods:
            for station_num in vods:
                # reset per station so a previous hit is not reused
                target = None
                vod_info = self.stash.get(self.video_key(station_num))
                time_duration = vod_info['total']
                if self.view_cnt_key(station_num) in self.stash:
                    target = self._find_top(
                        self.stash[self.view_cnt_key(station_num)], time_duration)
                else:
                    vodparam = self.stash.get(self.station_key(station_num))
                    if vodparam:
                        vod_param = util.get_url_params(vodparam)
                        self.post_data['nTitleNo'] = vod_param['nTitleNo']
                        self.post_data['nStationNo'] = vod_param['nStationNo']
                        self.post_data['nBbsNo'] = vod_param['nBbsNo']
                        resp = self.post(self.VIEW_CNT_URL, self.post_data)
                        if resp:
                            raw_data = resp.json()
                            if raw_data['result'] == 1:
                                cnt_data = raw_data['data']['cnt']
                                self.stash[self.view_cnt_key(station_num)] = cnt_data
                                target = self._find_top(cnt_data, time_duration)
                if target:
                    target_dict[station_num] = target
        return target_dict

    def _find_top(self, raw_data: typing.List,
                  time_duration: int) -> typing.Optional[typing.List]:
        if not raw_data or not time_duration:
            return
        cnt = pd.DataFrame(raw_data, columns=['index', 'value'])
        perfect_duration = (
            Duration.set_duration(time_duration) -
            duration_delta(m=self.perfect_start_min)).to_duration()
        per_index = time_duration // len(cnt)
        diff_duration = time_duration - perfect_duration
        perfect_start = diff_duration // per_index
        y = pd.DataFrame(raw_data[perfect_start:], columns=['index', 'value'])
        sma_period = perfect_start * self.smooth_factor
        Y = ta.SMA(y['value'].values.astype('float64'),
                   timeperiod=sma_period).tolist()
        top = []
        for i, d in enumerate(Y):
            if d > 0 and i < len(Y) - 1:
                if (Y[i - 1] <= d and d >= Y[i + 1]) or (i == 0 and d >= Y[i + 1]):
                    top.append((i + perfect_start, d))

        def row_sma(row):
            cond = (row['start_index'] <= cnt['index']) & (cnt['index'] <= row['index'])
            max_id = cnt.where(cond).dropna()['value'].idxmax()
            result = cnt.loc[max_id]
            row['ori_index'] = result['index']
            row['ori_value'] = result['value']
            ori_duration = result['index'] * per_index
            start_duration = Duration.set_duration(
                ori_duration - MIN_SEC * self.range_factor).to_str()
            end_duration = Duration.set_duration(
                ori_duration + MIN_SEC * self.range_factor).to_str()
            row['ori_range_duration'] = (start_duration, end_duration)
            return row

        top_df = pd.DataFrame(top, columns=['index', 'value'])
        top_df['value'].where(top_df['value'] > top_df['value'].mean(), inplace=True)
        top_df = top_df.where(top_df['value'] > 0).dropna()
        top_df['start_index'] = top_df['index'] - sma_period + 1
        top_df['index'] = top_df['index']
        target = top_df.apply(row_sma, axis=1)
        return target['ori_range_duration'].drop_duplicates().to_list()
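# --- minimal sketch (assumption, not from the source) of the peak search in _find_top ---
# Smooth the view-count series with a simple moving average and keep the local maxima,
# mirroring the SMA/top loop above but without pandas/talib.
def find_peaks(values, period):
    sma = [sum(values[i - period + 1:i + 1]) / period if i >= period - 1 else 0
           for i in range(len(values))]
    peaks = []
    for i in range(len(sma) - 1):
        d = sma[i]
        if d > 0 and (i == 0 or sma[i - 1] <= d) and d >= sma[i + 1]:
            peaks.append((i, d))
    return peaks

print(find_peaks([1, 2, 5, 9, 7, 3, 4, 8, 6, 2], period=2))
# -> [(4, 8.0), (8, 7.0)]: indices of the two smoothed view-count spikes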