def store_user_video(cls, mid, data, session=None, csvwriter=None):
    """Fetch one user's info and video list, then store them or print the user row.

    Per the original note, ``mid`` and ``data`` come from the items of the
    generated queue; ``data`` is not used here and is kept only so the
    signature stays compatible with ``store_video``.

    Args:
        mid: Passed to ``cls.getUserInfo`` and ``cls.getVideoList``.
        data: Unused.
        session: When truthy, the user object and the video objects are handed
            to ``DBOperation.add`` / ``DBOperation.add_all`` (ORM path).
        csvwriter: Used only when ``session`` is falsy. Indexed as a pair:
            ``csvwriter[0]`` receives the user row, ``csvwriter[1]`` receives
            one row per video.

    Returns:
        ``False`` when ``cls.getUserInfo(mid)`` is falsy, otherwise ``True``.
        With neither ``session`` nor ``csvwriter`` only the user row is printed.
    """
    user_row = cls.getUserInfo(mid)
    if not user_row:
        return False

    # The ORM user object is built up front, whichever output path is taken.
    user_obj = BiliUserInfo(**dict(zip(cls.user_field_keys, user_row)))
    video_rows = cls.getVideoList(mid)

    # Lazy generator: video ORM objects are only built if add_all consumes it.
    video_objs = (
        (
            BiliVideoSimpleInfo(**dict(zip(cls.video_field_keys, row)))
            for row in video_rows
        )
        if video_rows
        else None
    )

    if session:
        DBOperation.add(user_obj, session)
        if video_objs:
            DBOperation.add_all(video_objs, session)
    elif csvwriter:
        csvwriter[0].writerow(user_row)
        if video_rows:
            for row in video_rows:
                csvwriter[1].writerow(row)
    else:
        print(user_row)
    return True
def init_via_tid(tid):
    """Walk every archive-rank page for ``tid`` and store each video in the DB.

    Pages are requested 50 items at a time through
    ``BiliApi.get_archive_rank_by_partion``. Every archive whose aid was not
    already added from the previous page becomes a ``Video`` row.

    Changes from the previous version:
      * ``page_total`` is refreshed with ``math.ceil`` inside the loop, the same
        formula used for the initial value. The old ``int(count / 50) + 1``
        asked for one page past the end when ``count`` was a multiple of 50.
      * The session is closed in a ``finally`` block, so it is released even
        if an API call outside the per-page ``try`` raises.

    Args:
        tid: Value passed to the API as the first argument and stored on each
            ``Video`` (presumably the partition/type id -- confirm with BiliApi).
    """
    bapi = BiliApi()
    session = Session()
    try:
        # get page total
        obj = bapi.get_archive_rank_by_partion(tid, 1, 50)
        page_total = math.ceil(obj['data']['page']['count'] / 50)
        logger_01.info('%d page(s) found.' % page_total)

        # get videos data info from api
        page_num = 1
        last_aid_list = []
        last_create_ts = 0
        last_create_ts_offset = 59
        while page_num <= page_total:
            obj = bapi.get_archive_rank_by_partion(tid, page_num, 50)
            # Probe the response: a non-iterable 'archives' (or a non-subscriptable
            # response) raises TypeError, in which case the page is requested again.
            while True:
                try:
                    for _ in obj['data']['archives']:
                        pass
                    break
                except TypeError:
                    logger_01.warning('TypeError caught, re-call page_num = %d' % page_num)
                    time.sleep(1)
                    obj = bapi.get_archive_rank_by_partion(tid, page_num, 50)

            try:
                aid_list = []
                video_list = []
                for arch in obj['data']['archives']:
                    aid = int(arch['aid'])
                    create = arch['create']
                    if aid not in last_aid_list:
                        # Manually spread create_ts: consecutive archives with the
                        # same converted timestamp get offsets 59, 58, ... (never
                        # below 0) so their stored values differ.
                        create_ts = create_time_to_ts(create)
                        if create_ts == last_create_ts:
                            if last_create_ts_offset > 0:
                                last_create_ts_offset -= 1
                        else:
                            last_create_ts = create_ts
                            last_create_ts_offset = 59
                        create_ts += last_create_ts_offset
                        video_list.append(Video(aid=aid, tid=tid, create=create_ts))
                        aid_list.append(aid)
                    else:
                        logger_01.warning('Aid %d already added!' % aid)
                DBOperation.add_all(video_list, session)
                last_aid_list = aid_list
                # The total may change while crawling; refresh it with the same
                # ceil formula used for the initial value.
                page_total = math.ceil(obj['data']['page']['count'] / 50)
                logger_01.info('Page %d / %d done.' % (page_num, page_total))
            except Exception as e:
                # Best effort: log the failure and move on to the next page.
                logger_01.error('Exception caught. Detail: %s' % e)
            page_num += 1
    finally:
        session.close()
    logger_01.info('Success get %d tid videos data info from api!' % tid)
def store_video(cls, aid, session=None, csvwriter=None):
    """Fetch one video's info and store it via ORM or CSV, or print it.

    Pass either ``session`` or ``csvwriter``; when neither is given the info
    row is printed directly.

    Args:
        aid: Passed to ``cls.getVideoInfo``.
        session: When truthy, a ``BiliVideoInfo`` object is handed to
            ``DBOperation.add``.
        csvwriter: Used only when ``session`` is falsy; receives the raw info
            row through ``writerow``.

    Returns:
        ``False`` when ``cls.getVideoInfo(aid)`` is falsy, otherwise ``True``.
    """
    row = cls.getVideoInfo(aid)
    if not row:
        return False

    # The ORM object is built before the output path is chosen.
    record = BiliVideoInfo(**dict(zip(cls.field_keys, row)))
    if session:
        DBOperation.add(record, session)
    elif csvwriter:
        csvwriter.writerow(row)
    else:
        print(row)
    return True
def get_tid_pn(aid, session):
    """Return ``(tid, pn)`` for the video with the given aid.

    ``pn`` is ``ceil(count_later / 50)``, where ``count_later`` is the value
    returned by ``DBOperation.count_later_video_via_tid_and_create`` for the
    video's ``tid`` and ``create``.

    Args:
        aid: Video id to look up (formatted with ``%d`` in the error message).
        session: Passed through to the ``DBOperation`` queries.

    Returns:
        A ``(tid, pn)`` tuple, or ``None`` (after printing a message) when the
        video is not found or the count query returns ``None``.
    """
    record = DBOperation.query_video_via_aid(aid, session)
    if record is None:
        print('Video aid=%d not found!' % aid)
        return None

    tid, created = record.tid, record.create
    later_total = DBOperation.count_later_video_via_tid_and_create(
        tid, created, session)
    if later_total is None:
        print('Fail to count later video!')
        return None

    return tid, math.ceil(later_total / 50)
def store_user(cls, mid, data, session=None, csvwriter=None):
    """Fetch one user's info and store it via ORM or CSV, or print it.

    Per the original note, ``mid`` and ``data`` come from the items of the
    generated queue; ``data`` is not used here and is kept only so the
    signature stays compatible with ``store_video``.

    Args:
        mid: Passed to ``cls.getUserInfo``.
        data: Unused.
        session: When truthy, a ``BiliUserInfo`` object is handed to
            ``DBOperation.add`` (ORM path).
        csvwriter: Used only when ``session`` is falsy; receives the raw info
            row through ``writerow``.

    Returns:
        ``False`` when ``cls.getUserInfo(mid)`` is falsy, otherwise ``True``.
    """
    row = cls.getUserInfo(mid)
    if not row:
        return False

    # The ORM object is built before the output path is chosen.
    user_obj = BiliUserInfo(**dict(zip(cls.field_keys, row)))
    if session:
        DBOperation.add(user_obj, session)
    elif csvwriter:
        csvwriter.writerow(row)
    else:
        print(row)
    return True
def store_video(cls, aid, session=None, csvwriter=None):
    """Fetch one focus-video record and store it via ORM or CSV, or print it.

    Pass either ``session`` or ``csvwriter``; when neither is given the info
    row is printed directly.

    Args:
        aid: Passed to ``cls.getVideoInfo``.
        session: When truthy, a progress line is printed and a
            ``TddFocusVideoRecord`` object is handed to ``DBOperation.add``.
        csvwriter: Used only when ``session`` is falsy; receives the raw info
            row through ``writerow``.

    Returns:
        ``False`` when ``cls.getVideoInfo(aid)`` is falsy, otherwise ``True``.
    """
    row = cls.getVideoInfo(aid)
    if not row:
        return False

    # The ORM object is built before the output path is chosen.
    record = TddFocusVideoRecord(**dict(zip(cls.field_keys, row)))
    if session:
        # The progress line uses row[0] as an epoch timestamp, row[1] as the
        # av number and row[2] as the view count.
        stamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(row[0]))
        print("update av%s with %d views at %s" % (row[1], row[2], stamp))
        DBOperation.add(record, session)
    elif csvwriter:
        csvwriter.writerow(row)
    else:
        print(row)
    return True
def store_video_simpleajax(cls, mid, aid, session=None, csvwriter=None):
    """Store only ``mid``, ``aid`` and the ajax info of one video.

    Pass either ``session`` or ``csvwriter``; when neither is given the row is
    printed directly.

    The row is ``(mid, aid) + cls(aid).getAjaxInfo()``. If the ajax info cannot
    be appended to a tuple (for example it is ``None``), nothing is stored.
    Only ``TypeError`` is caught for that case; the previous bare ``except``
    would also have swallowed ``KeyboardInterrupt`` and ``SystemExit``.

    Args:
        mid: First field of the stored row.
        aid: Second field of the stored row; also used to construct ``cls``.
        session: When truthy, a ``BiliVideoAjaxInfo`` object is handed to
            ``DBOperation.add``.
        csvwriter: Used only when ``session`` is falsy; receives the row
            through ``writerow``.

    Returns:
        ``False`` when the row could not be built, otherwise ``True``.
    """
    info_ajax = cls(aid).getAjaxInfo()
    try:
        info = (mid, aid) + info_ajax
    except TypeError:
        # getAjaxInfo() did not return a tuple, so there is nothing to store.
        return False

    # The ORM object is built before the output path is chosen.
    new_video = BiliVideoAjaxInfo(**dict(zip(cls.field_keys_ajax, info)))
    if session:
        DBOperation.add(new_video, session)
    elif csvwriter:
        csvwriter.writerow(info)
    else:
        print(info)
    return True
def get_update_aids():
    """Return the ``aid`` of every row yielded by ``DBOperation.query(TddFocusVideo, ...)``.

    The session opened for the query is now closed once the aids have been
    collected; previously it was created inline and never released.

    Returns:
        A list of aids, in the order the query yields them.
    """
    session = Session()
    try:
        # Read every aid while the session is still open.
        return [item.aid for item in DBOperation.query(TddFocusVideo, session)]
    finally:
        session.close()
def routine_update_via_tid(tid):
    """Run one routine update round for ``tid``: add new videos, then delete invalid ones.

    The module-level flag ``is_updating`` guards against overlapping rounds: if
    it is already set the call logs a warning and returns immediately.

    Phase 01 walks the archive-rank pages from page 1 and adds every video
    until it meets one of the 10 aids returned by
    ``DBOperation.query_last_x_aid_via_tid``.

    Phase 02 runs only when the DB holds more videos for ``tid`` than the API
    reports. It walks the pages again, compares each page's aids with the DB
    rows in that page's create-time window, and deletes DB rows that the API
    no longer lists and that ``is_aid_valid`` reports as invalid.

    Changes from the previous version:
      * ``is_updating`` is reset and the session closed in a ``finally`` block.
        Before, any exception outside the inner ``try`` blocks left the flag
        set, so every later round was refused.
      * The unsettled list is iterated over a copy, because items are removed
        from it inside the loop (which skipped elements before).
      * An unsettled aid is checked with ``is_aid_valid`` *before* it is
        deleted, matching the diff path and the "Do not remove it" warning.
      * In phase 02 ``page_num`` advances even when a page raises, as in
        phase 01; before, a page that always failed looped forever.

    Args:
        tid: Value passed to the API and DB helpers and stored on new ``Video``
            rows (presumably the partition/type id -- confirm with BiliApi).
    """
    global is_updating
    logger_02.info('Now start routine update %d tid...' % tid)
    if is_updating:
        logger_02.warning('Last round has not finished, stop this round.')
        return
    is_updating = True

    session = None
    try:
        session = Session()
        bapi = BiliApi()

        # ---- 01 add new video ----
        logger_02.info('Now start add new video with tid %d...' % tid)

        # Take several of the latest stored aids rather than just one
        # (original note: "avoid last aid deleted").
        last_aids = [
            v.aid for v in DBOperation.query_last_x_aid_via_tid(tid, 10, session)
        ]
        logger_02.info('Get last aids: %s' % last_aids)

        # get page total
        obj = bapi.get_archive_rank_by_partion(tid, 1, 50)
        page_total = math.ceil(obj['data']['page']['count'] / 50)

        # add new videos data info from api
        page_num = 1
        last_aid_list = []
        last_create_ts = 0
        last_create_ts_offset = 59
        goon = True
        new_video_count = 0
        while page_num <= page_total and goon:
            obj = bapi.get_archive_rank_by_partion(tid, page_num, 50)
            # Probe the response: a non-iterable 'archives' raises TypeError,
            # in which case the page is requested again after a short pause.
            while True:
                try:
                    for _ in obj['data']['archives']:
                        pass
                    break
                except TypeError:
                    logger_02.warning('TypeError caught, re-call page_num = %d' % page_num)
                    time.sleep(1)
                    obj = bapi.get_archive_rank_by_partion(tid, page_num, 50)

            try:
                aid_list = []
                video_list = []
                for arch in obj['data']['archives']:
                    aid = int(arch['aid'])
                    create = arch['create']

                    if aid in last_aids:
                        # Reached a video that is already stored: stop paging.
                        logger_02.info('Meet aid = %d in last_aids, break.' % aid)
                        goon = False
                        break

                    if aid not in last_aid_list:
                        # Manually spread create_ts: consecutive archives with
                        # the same converted timestamp get offsets 59, 58, ...
                        # (never below 0) so their stored values differ.
                        create_ts = create_time_to_ts(create)
                        if create_ts == last_create_ts:
                            if last_create_ts_offset > 0:
                                last_create_ts_offset -= 1
                        else:
                            last_create_ts = create_ts
                            last_create_ts_offset = 59
                        create_ts += last_create_ts_offset

                        video = Video(aid=aid, tid=tid, create=create_ts)
                        new_video_count += 1
                        logger_02.info('Add new video %s' % video)
                        video_list.append(video)
                        aid_list.append(aid)
                    else:
                        logger_02.warning('Aid %d already added!' % aid)

                DBOperation.add_all(video_list, session)
                last_aid_list = aid_list
                page_total = math.ceil(obj['data']['page']['count'] / 50)
            except Exception as e:
                # Best effort: log the failure and move on to the next page.
                logger_02.error('Exception caught. Detail: %s' % e)
            page_num += 1

        if new_video_count == 0:
            logger_02.info('No new video found with %d tid.' % tid)
        else:
            logger_02.info('%d new video(s) found with %d tid.' % (new_video_count, tid))
        logger_02.info('Finish add new video with tid %d!' % tid)

        # ---- 02 delete invalid video ----
        logger_02.info('Now start delete invalid video with tid %d...' % tid)

        # get count in db
        count_db = DBOperation.count_video_via_tid(tid, session)

        # get count via api
        obj = bapi.get_archive_rank_by_partion(tid, 1, 50)
        count_api = int(obj['data']['page']['count'])
        page_total = math.ceil(obj['data']['page']['count'] / 50)
        logger_02.info('Get count_db = %d, count_api = %d' % (count_db, count_api))

        invalid_count = count_db - count_api
        if invalid_count > 0:
            # need to delete
            page_num = 1
            # DB aids missing from a page whose create time falls in the page's
            # last window; they may still show up on the next page.
            unsettled_diff_aids = []
            while page_num <= page_total and invalid_count > 0:
                obj = bapi.get_archive_rank_by_partion(tid, page_num, 50)
                while True:
                    try:
                        for _ in obj['data']['archives']:
                            pass
                        break
                    except TypeError:
                        logger_02.warning(
                            'TypeError caught, re-call page_num = %d' % page_num)
                        time.sleep(1)
                        obj = bapi.get_archive_rank_by_partion(tid, page_num, 50)

                try:
                    # get page aids
                    page_aids = [v['aid'] for v in obj['data']['archives']]

                    # get db aids in this page's create-time window
                    create_ts_from = create_time_to_ts(
                        obj['data']['archives'][0]['create']) + 59  # bigger one
                    create_ts_to = create_time_to_ts(
                        obj['data']['archives'][-1]['create'])  # smaller one
                    db_videos = DBOperation.query_video_between_create_ts(
                        create_ts_from, create_ts_to, session)
                    db_aids = list(map(lambda x: x.aid, db_videos))

                    # process unsettled (iterate a copy: items are removed below)
                    for aid in list(unsettled_diff_aids):
                        if aid not in page_aids:
                            # query create time
                            create = -1
                            if aid in db_aids:
                                for v in db_videos:
                                    if v.aid == aid:
                                        create = v.create
                                        break
                            else:
                                logger_02.warning(
                                    'Cannot find unsettled diff aid %d in db_aids %s!'
                                    % (aid, db_aids))

                            if create_ts_to <= create <= create_ts_to + 59:
                                # maybe in next page
                                logger_02.info('Remain aid %d in unsettled list.' % aid)
                            elif is_aid_valid(aid):
                                # Still a valid video: keep the DB row.
                                logger_02.warning(
                                    'Aid %d is not invalid! Do not remove it.' % aid)
                            else:
                                DBOperation.delete_video_via_aid(aid, session)
                                logger_02.info('Delete unsettled invalid aid %d.' % aid)
                                unsettled_diff_aids.remove(aid)
                                invalid_count -= 1
                        else:
                            # Found on this page after all, so it is not invalid.
                            logger_02.info('Save unsettled aid %d.' % aid)
                            unsettled_diff_aids.remove(aid)

                    # get diff
                    diff_aids = [aid for aid in db_aids if aid not in page_aids]
                    new_aids = [aid for aid in page_aids if aid not in db_aids]

                    # process diff
                    if len(diff_aids) > 0:
                        for aid in diff_aids:
                            # query create time
                            create = -1
                            for v in db_videos:
                                if v.aid == aid:
                                    create = v.create
                                    break

                            if create_ts_to <= create <= create_ts_to + 59:
                                # In the last window of this page: decide on
                                # the next page.
                                unsettled_diff_aids.append(aid)
                                logger_02.info('Add aid %d to unsettled list.' % aid)
                            elif create_ts_from - 59 <= create <= create_ts_from:
                                # counted in last page
                                pass
                            else:
                                logger_02.info('Delete invalid aid %d.' % aid)
                                if is_aid_valid(aid):
                                    logger_02.warning(
                                        'Aid %d is not invalid! Do not remove it.' % aid)
                                else:
                                    DBOperation.delete_video_via_aid(aid, session)
                                    invalid_count -= 1
                    else:
                        logger_02.info('No diff aid!')

                    # process new: videos on this page that the DB does not have
                    last_create_ts = 0
                    last_create_ts_offset = 59
                    for aid in new_aids:
                        for arch in obj['data']['archives']:
                            if arch['aid'] == aid:
                                create = arch['create']
                                create_ts = create_time_to_ts(create)
                                if create_ts == last_create_ts:
                                    if last_create_ts_offset > 0:
                                        last_create_ts_offset -= 1
                                else:
                                    last_create_ts = create_ts
                                    last_create_ts_offset = 59
                                create_ts += last_create_ts_offset
                                video = Video(aid=aid, tid=tid, create=create_ts)
                                logger_02.warning(
                                    'Add new video %s during finding invalid aid.' % video)
                                DBOperation.add(video, session)
                                break

                    page_total = math.ceil(obj['data']['page']['count'] / 50)
                    logger_02.info('Page %d / %d done, %d invalid aid left.'
                                   % (page_num, page_total, invalid_count))
                except Exception as e:
                    logger_02.error('Exception caught. Detail: %s' % e)
                # Advance even after a failure so one bad page cannot stall the loop.
                page_num += 1
        else:
            logger_02.info('No invalid video to delete!')
        logger_02.info('Finish delete invalid video with tid %d!' % tid)

        logger_02.info('Finish routine update %d tid.\n' % tid)
    finally:
        # Always release the session and the re-entrancy flag, even on failure.
        if session is not None:
            session.close()
        is_updating = False
def main():
    """Scan the archive-rank pages of tid 30 forever, recording each aid's page.

    Each round walks all pages (50 items per page). For every archive it
    either updates the existing ``NbphRecord`` (when its ``pn`` differs from
    the current page) or inserts a new one. After a round a summary line is
    logged and the loop sleeps 10 seconds.

    Changes from the previous version:
      * ``session`` is reset at the start of every round and closed only if it
        was created, so a failure before ``Session()`` no longer raises
        ``AttributeError`` in ``finally``.
      * The speed figure is reported as 0 when the round's timespan is 0,
        instead of raising ``ZeroDivisionError`` outside the ``try`` and
        ending the loop.
    """
    round_count = 1
    round_start = 0
    round_end = 0
    round_visit_count = 0
    while True:
        session = None
        try:
            logger_11.info('round %d start' % round_count)
            round_start = get_ts_s()
            round_visit_count = 0
            bapi = BiliApi()
            session = Session()

            # get page total
            obj = bapi.get_archive_rank_by_partion(30, 1, 50)
            page_total = math.ceil(obj['data']['page']['count'] / 50)
            logger_11.info('%d page(s) found' % page_total)

            page_num = 1
            while page_num <= page_total:
                obj = bapi.get_archive_rank_by_partion(30, page_num, 50)
                # Probe the response: a non-iterable 'archives' raises
                # TypeError, in which case the page is requested again.
                while True:
                    try:
                        for _ in obj['data']['archives']:
                            pass
                        break
                    except TypeError:
                        logger_11.warning(
                            'TypeError caught, re-call page_num = %d' % page_num)
                        time.sleep(1)
                        obj = bapi.get_archive_rank_by_partion(
                            30, page_num, 50)

                try:
                    # One timestamp is shared by every record touched on this page.
                    added = get_ts_s()
                    for arch in obj['data']['archives']:
                        aid = int(arch['aid'])
                        nbph_record = DBOperation.query_nbph_record_via_aid(
                            aid, session)
                        if nbph_record:
                            # Only write when the video moved to another page.
                            if nbph_record.pn != page_num:
                                nbph_record.pn = page_num
                                nbph_record.added = added
                                session.commit()
                        else:
                            nbph_record = NbphRecord()
                            nbph_record.aid = aid
                            nbph_record.pn = page_num
                            nbph_record.added = added
                            DBOperation.add(nbph_record, session)
                        round_visit_count += 1
                except Exception as e:
                    # Best effort: log the failure and move on to the next page.
                    logger_11.error('Exception caught. Detail: %s' % e)
                page_num += 1
                time.sleep(0.1)
        except Exception as e:
            logger_11.error(e)
        finally:
            if session is not None:
                session.close()

        round_end = get_ts_s()
        timespan = round_end - round_start
        # Visits per minute; a round that ends within the same timestamp has no
        # meaningful rate.
        speed = round_visit_count / timespan * 60 if timespan > 0 else 0.0
        logger_11.info(
            'round %d, start: %s, end: %s, timespan: %d, visit_count: %d, speed: %.2f'
            % (round_count, ts_s_to_str(round_start), ts_s_to_str(round_end),
               timespan, round_visit_count, speed))
        round_count += 1
        time.sleep(10)