def test_thread_page_reverse_walker_with_login(self):
    """Reverse-walk pages past the gatekeeper (page 100) with a logged-in cookie.

    Expects exactly two pages (101 and the stop page 100), with the stop
    page stripped of replies at/below the stop post.

    Fixes: ``== None`` → ``is None``; ``assertTrue(n in …)`` → ``assertIn``.
    """
    if SimpleTest.user_hash is None:
        self.skipTest(reason="需要登录")
    client = self.new_client()
    # Fetch page 100 directly to learn the gatekeeper post id
    # (the id of the last reply on the gatekeeper page).
    (page100, _) = client.get_thread_page(29184693, page=100)
    gatekeeper_post_id = list(page100.replies)[-1].id
    page_count = 0
    for (n, page, _) in create_walker(
        target=ReversalThreadWalkTarget(
            thread_id=29184693,
            gatekeeper_post_id=gatekeeper_post_id,
            start_page_number=101,
            stop_before_post_id=gatekeeper_post_id,
            expected_stop_page_number=100,
        ),
        client=client,
        options={
            "user_cookie": self.user_cookie,
        },
    ):
        self.assertIn(n, [100, 101])
        page_count += 1
        if n == 100:
            # The stop page must have had every reply filtered out.
            self.assertEqual(len(page.replies), 0)
    self.assertEqual(page_count, 2)
def rescan_board(args: argparse.Namespace, db: DB, client: anobbsclient.Client, stats: Stats):
    """Re-walk the board since ``args.since``, refreshing every thread seen
    and flagging threads that have disappeared.

    Fix: ``now`` used to be assigned only inside the page loop, so a walk
    that yielded zero pages raised NameError in the disappearance loop
    below; it is now initialised before walking.
    """
    # Used to detect threads that disappeared today. Threads whose last
    # bump predates today cannot be detected this way.
    thread_ids_seen_today = set(db.get_thread_ids_seen_since(args.since))

    walker = create_walker(
        target=BoardWalkTarget(
            start_page_number=1,
            board_id=args.board_id,
            stop_before_datetime=args.since,
        ),
        client=client,
    )

    # Fallback timestamp in case the walker yields no pages at all.
    now = datetime.now(tz=local_tz)
    for (_, page, usage) in walker:
        page: List[anobbsclient.BoardThread] = page
        now = datetime.now(tz=local_tz)
        stats.board_request_count += 1
        stats.total_bandwidth_usage.add(usage)
        for thread in page:
            # Still on the board → not disappeared.
            thread_ids_seen_today.discard(thread.id)
            db.record_thread(thread, board_id=args.board_id, updated_at=now)
            db.report_is_thread_disappeared(thread.id, now, False)

    for not_found_thread_id in thread_ids_seen_today:
        # Only report if the thread was not already flagged as disappeared.
        if not db.is_thread_disappeared(not_found_thread_id):
            logging.info(f"发现 {not_found_thread_id} 消失")
            db.report_is_thread_disappeared(not_found_thread_id, now, True)
def find_last_post_with_uuid(
        client: anobbsclient.Client, thread_id: int) -> Optional[Tuple[int, int, str, int]]:
    """Scan a thread from its last page backwards for the first reply whose
    content embeds a "Report ID" UUID.

    Returns
    -------
    [0] : int
        The page number the report itself claims (parsed from the post's
        name field), not the page the reply was found on.
    [1] : int
        The id of the matching reply.
    [2] : str
        The UUID found.
    [3] : int
        The 1-based offset of the matching reply within the thread
        (19 replies per page).

    Returns None when nothing matches (or the first UUID found is invalid).

    Fix: both regex patterns were compiled inside the per-post inner loop;
    they are loop-invariant and are now hoisted.
    """
    # TODO: predict the position from the previous report; most of the time
    #       that would cut the request count down to 1.
    # TODO: if the thread turns out to be SAGEd, stop posting to it, or
    #       check for SAGE up front?
    (page_1, _) = client.get_thread_page(id=thread_id, page=1, for_analysis=1)
    page_1: anobbsclient.ThreadPage = page_1
    # TODO: total page count could really be a property on the API side.
    total_pages = (page_1.body.total_reply_count - 1) // 19 + 1

    # Hoisted: loop-invariant patterns compiled once.
    uuid_rx = re.compile(
        r"(?:.*\n)+" + META_MAIN_DIVIDER + r"\n"
        + r"(?:.*\n)+Report ID = ([0-9a-f\-]+).*(?:\n.*)*",
        re.MULTILINE,
    )
    report_pn_rx = re.compile(r"页 ❬(\d+) / \d+❭")

    walker = create_walker(
        target=ReversalThreadWalkTarget(
            thread_id=thread_id,
            gatekeeper_post_id=None,
            start_page_number=total_pages,
        ),
        client=client,
    )

    for (pn, page, _) in walker:
        page: anobbsclient.ThreadPage = page
        for (i, post) in enumerate(reversed(page.replies)):
            # NOTE(review): BeautifulSoup is called without an explicit
            # parser, so it picks the "best available" one — confirm intended.
            text = BeautifulSoup(post.content).text
            result = uuid_rx.match(text)
            if result is None:
                continue
            uuid = result.group(1)
            # NOTE(review): raises AttributeError if the name field does not
            # match "页 ❬x / y❭" — TODO confirm that crashing is acceptable.
            report_pn = int(report_pn_rx.match(post.name).group(1))
            return (report_pn, post.id, uuid,
                    (pn - 1) * 19 + 1 + (len(page.replies) - 1 - i))
    return None
def case_no_login():
    """Without a login, walking past the gatekeeper must yield no pages."""
    target = ReversalThreadWalkTarget(
        thread_id=29184693,
        gatekeeper_post_id=99999999,
        start_page_number=101,
    )
    walker = create_walker(target=target, client=client)
    for (_n, _page, _usage) in walker:
        # Reaching this body means the walker yielded a page it must not.
        assert False
def test_thread_page_reverse_walker(self):
    """Reverse-walk a thread twice: a full walk from page 3, then a
    bounded walk from page 4 stopping before a known post id.

    Fix: ``assertTrue(n in …)`` → ``assertIn`` (better failure messages).
    """
    client = self.new_client()

    # Phase 1: walk pages 3 → 1 unconditionally.
    walker = create_walker(
        target=ReversalThreadWalkTarget(
            thread_id=29184693,
            gatekeeper_post_id=99999999,
            start_page_number=3,
        ),
        client=client,
    )
    page_count, last_page_max_post_id = 0, None
    for (n, page, _) in walker:
        self.assertIn(n, [1, 2, 3])
        page_count += 1
        if n == 3:
            # Remember the last reply on page 3 to bound the second walk.
            last_page_max_post_id = page.replies[-1].id
    self.assertEqual(page_count, 3)

    # Phase 2: walk from page 4, stopping before that post; the stop page
    # (3) must come back with its replies filtered out.
    walker = create_walker(
        target=ReversalThreadWalkTarget(
            thread_id=29184693,
            gatekeeper_post_id=99999999,
            start_page_number=4,
            stop_before_post_id=last_page_max_post_id,
            expected_stop_page_number=3,
        ),
        client=client,
    )
    page_count = 0
    for (n, page, _) in walker:
        self.assertIn(n, [3, 4])
        page_count += 1
        if n == 3:
            self.assertEqual(len(page.replies), 0)
    self.assertEqual(page_count, 2)
def case_gatekept():
    """With an invalid cookie, walking past the gatekeeper must yield no pages."""
    bogus_cookie = anobbsclient.UserCookie(
        userhash="",  # invalid cookie
    )
    walker = create_walker(
        target=ReversalThreadWalkTarget(
            thread_id=29184693,
            gatekeeper_post_id=gatekeeper_post_id,
            start_page_number=101,
        ),
        client=client,
        options={"user_cookie": bogus_cookie},
    )
    for (_n, _page, _usage) in walker:
        # Reaching this body means the walker yielded a page it must not.
        assert False
def test_board_page_walker(self):
    """Board walk with ``stop_before_datetime`` must only yield threads
    bumped at or after the cutoff.

    Fix: a board page is a *list* of threads, not a single thread — the
    annotation now matches the one used in ``rescan_board``.
    """
    client = self.new_client()
    now = datetime.now(local_tz)
    two_hours_ago = now - timedelta(hours=2)
    for (_, page, _) in create_walker(
        target=BoardWalkTarget(
            board_id=111,
            start_page_number=1,
            stop_before_datetime=two_hours_ago,
        ),
        client=client,
    ):
        page: List[anobbsclient.BoardThread] = page
        for thread in page:
            self.assertGreaterEqual(
                thread.last_modified_time, two_hours_ago)
def test_thread_page_reverse_walker_stop_before_datetime(self):
    """Reverse-walk from page 98, stopping before a fixed datetime; the
    walk must cover exactly pages 98, 97, 96 and the stop page must start
    at a known reply.

    Fixes: ``assertTrue(n in …)`` → ``assertIn``; ``minute=00`` → ``minute=0``.
    """
    client = self.new_client()
    page_count = 0
    for (n, page, _) in create_walker(
        target=ReversalThreadWalkTarget(
            thread_id=29184693,
            gatekeeper_post_id=99999999,
            start_page_number=98,
            # Cutoff: 2020-08-09 (Sunday) 22:00:21, local timezone.
            stop_before_datetime=datetime(
                year=2020, month=8, day=9,
                hour=22, minute=0, second=21,
                tzinfo=local_tz,
            )
        ),
        client=client,
    ):
        page: anobbsclient.ThreadPage = page
        self.assertIn(n, [96, 97, 98])
        page_count += 1
        if n == 96:
            # First surviving reply on the stop page.
            self.assertEqual(page.replies[0].id, 29279607)
    self.assertEqual(page_count, 3)
def fetch_board(db: DB, activity: Activity, client: anobbsclient.Client, board_id: int, fetching_since: datetime, stats: Stats):
    """Walk a board's pages back to `fetching_since`, then fetch and record
    every new reply of every thread found.

    For each thread: replies already present in the board-page preview are
    used when they cover the whole new range; otherwise the thread itself is
    walked backwards (with gatekeeper detection for login-only pages).

    Parameters
    ----------
    db : DB
        Storage for threads/replies and previously-seen reply ids.
    activity : Activity
        Receives the collecting-range report for this run.
    board_id : int
        The board to walk.
    fetching_since : datetime
        Lower bound of the collecting range.
    stats : Stats
        Mutated in place with request counts, bandwidth and post counts.
    """
    logger = logging.getLogger('FETCH')

    # Phase 1: walk board pages newest-first until `fetching_since`.
    walker = create_walker(
        target=BoardWalkTarget(
            board_id=board_id,
            start_page_number=1,
            stop_before_datetime=fetching_since,
        ),
        client=client,
    )

    is_first_found_thread = True
    threads_on_board: List[anobbsclient.BoardThread] = []
    bandwidth_usage_for_board = TotalBandwidthUsage()
    for (pn, page, usage) in walker:
        logger.info(f'获取到版块第 {pn} 页。纳入串数 = {len(page)}')
        bandwidth_usage_for_board.add(usage)
        stats.board_request_count += 1
        threads_on_board += page
    stats.total_bandwidth_usage.add(bandwidth_usage_for_board.total)
    logger.info(f'完成获取版块。总共纳入串数 = {len(threads_on_board)},'
                + f'期间 (上传字节数, 下载字节数) = {bandwidth_usage_for_board.total}')

    # Phase 2: process each collected thread.
    now = datetime.now(tz=local_tz)
    for (i, thread) in enumerate(threads_on_board):
        logger.debug(f'串 #{i}。串号 = {thread.id},'
                     + f'最后修改时间 = {thread.last_modified_time}')

        if is_first_found_thread:
            is_first_found_thread = False
            # The first thread is the most recently bumped one, so its
            # last-modified time is the upper end of the collecting range.
            activity.report_collecting_range(
                since=fetching_since, until=thread.last_modified_time)

        is_thread_recorded = db.is_thread_recorded(thread.id)
        if not is_thread_recorded:
            stats.new_thread_count += 1

        # Record or update the thread itself.
        # current_reply_count is recorded later, together with the replies.
        db.record_thread(thread, board_id=board_id, updated_at=now)

        if len(thread.replies) == 0:
            assert(thread.total_reply_count == 0)
            logger.debug(f'串 #{i} 暂无回应,到此结束')
            continue

        # Decide how to recognise a reply as a fetch target, depending on
        # whether replies of this thread were already fetched before:
        #  - seen before: everything newer than the largest reply id seen;
        #  - never seen:  everything created at/after `fetching_since`.
        latest_seen_reply_id = \
            db.try_find_thread_latest_seen_reply_id(thread_id=thread.id)
        has_old_records = latest_seen_reply_id is not None
        if has_old_records:
            def is_target(x): return x.id > latest_seen_reply_id
            logger.debug(f'串 #{i} 是之前已经抓取过的串,'
                         + f'将会通过之前抓取到的最大串号作为范围的下界')
        else:
            def is_target(x): return x.created_at >= fetching_since
            logger.debug(f'串 #{i} 是之前曾未抓取过的串,'
                         + f'将会通过规定的下界时间作为范围的下界')

        new_responses_in_preview = list(
            [post for post in thread.replies if is_target(post)])

        if thread.total_reply_count <= 5 \
                or not is_target(thread.replies[0]):
            # Everything to fetch is already in the preview; no need to go
            # into the thread and page through it.
            # TODO: the "no remaining replies" check
            #       (len(thread.total_reply_count) <= 5) belongs on the API side.
            if len(new_responses_in_preview) > 0:
                if is_thread_recorded:
                    stats.affected_thread_count += 1
                stats.new_post_count += len(new_responses_in_preview)
                db.record_thread_replies(thread=thread,
                                         replies=new_responses_in_preview,
                                         total_reply_count=thread.total_reply_count,
                                         updated_at=now)
            logger.debug(f'串 #{i} 由于全部需要抓取的回应已在预览之中,记录后到此结束。')
        else:
            # Walk the thread backwards. 19 replies per page.
            start_page_number = (thread.total_reply_count - 1) // 19 + 1
            logger.debug(f'串 #{i} 需要进入以抓取目标范围内的回应。'
                         + f'从回应总数推测出的当前页数 = {start_page_number}')
            if (thread.total_reply_count % 19) <= 5:
                # The newest page is fully contained in the 5-reply preview,
                # so the walk can skip it.
                logger.debug(f'串 #{i} 由于最新一页的回应已全部包含在预览中,抓取时会略过该页')
                start_page_number -= 1

            # Decide whether a gatekeeper post id is needed: only when the
            # pages to fetch require login and no previously-fetched data
            # can be used to detect the "stuck at the gatekeeper" condition.
            needs_gatekeeper_post_id = False
            if has_old_records:
                last_reply_count = \
                    db.get_thread_total_reply_count(thread_id=thread.id)
                if last_reply_count is not None:
                    last_page_count = (last_reply_count - 1) // 19 + 1
                else:
                    last_page_count = None
                    logger.warning(f'串 #{i} 存在曾抓取到的回应,但却没有记录回应总数')
                if (last_page_count is None
                        or not client.thread_page_requires_login(last_page_count)) \
                        and client.thread_page_requires_login(start_page_number):
                    needs_gatekeeper_post_id = True
                    logger.debug(f'串 #{i} 由于要抓取的内容需要登录,'
                                 + f'而之前抓取到的内容在需要登录之前,无法用以判断是否卡页,'
                                 + f'因而需要额外获取第 100 页来确认守门串号')
            elif client.thread_page_requires_login(start_page_number):
                needs_gatekeeper_post_id = True
                logger.debug(f'串 #{i} 由于要抓取的内容需要登录,'
                             + f'而之前曾未抓取过内容,无法用以判断是否卡页,'
                             + f'因而需要额外获取第 100 页来确认守门串号')

            if needs_gatekeeper_post_id:
                # Fetch the gatekeeper page once to learn the id of its last
                # reply. TODO: this chunk belongs on the API side.
                (gatekeeper_page, usage) = client.get_thread_page(
                    id=thread.id, page=client.get_thread_gatekeeper_page_number())
                stats.total_bandwidth_usage.add(usage)
                stats.thread_request_count += 1
                gatekeeper_post_id = gatekeeper_page.replies[-1].id
                logger.debug(f'串 #{i} 确认守门串号。守门串号 = {gatekeeper_post_id}')
            else:
                gatekeeper_post_id = None

            # Build the reversal walker: stop either at the last reply id
            # seen before, or at the configured lower-bound datetime.
            if has_old_records:
                walker = create_walker(
                    target=ReversalThreadWalkTarget(
                        thread_id=thread.id,
                        start_page_number=start_page_number,
                        gatekeeper_post_id=gatekeeper_post_id,
                        stop_before_post_id=latest_seen_reply_id,
                        expected_stop_page_number=last_page_count,
                    ),
                    client=client,
                )
            else:
                walker = create_walker(
                    target=ReversalThreadWalkTarget(
                        thread_id=thread.id,
                        start_page_number=start_page_number,
                        gatekeeper_post_id=gatekeeper_post_id,
                        stop_before_datetime=fetching_since,
                    ),
                    client=client,
                )

            final_reply_count = None
            targets = []
            bandwidth_usage_for_thread = TotalBandwidthUsage()
            thread_walk_page_count = 0
            for (pn, page, usage) in walker:
                thread_walk_page_count += 1
                stats.thread_request_count += 1
                if client.thread_page_requires_login(pn):
                    stats.logged_in_thread_request_count += 1
                logger.debug(f'串 #{i} 页 {pn}。纳入回应数 = {len(page.replies)}')
                page: anobbsclient.ThreadPage = page
                bandwidth_usage_for_thread.add(usage)
                if final_reply_count is None:
                    # The first (highest-numbered) page carries the freshest
                    # total reply count.
                    final_reply_count = page.body.total_reply_count
                targets += page.replies
            # NOTE(review): if the walker yields no pages, final_reply_count
            # stays None and is recorded as-is — confirm DB accepts that.
            targets += new_responses_in_preview
            now_after_fetching_inside_thread = datetime.now(tz=local_tz)
            db.record_thread_replies(thread=thread, replies=targets,
                                     total_reply_count=final_reply_count,
                                     updated_at=now_after_fetching_inside_thread)
            stats.total_bandwidth_usage.add(bandwidth_usage_for_thread.total)
            if len(targets) > 0:
                if is_thread_recorded:
                    stats.affected_thread_count += 1
                stats.new_post_count += len(targets)
            logger.debug(f'串 #{i} 已抓取到范围内所有新回应,记录后到此结束。'
                         + f'遍历访问页数 = {thread_walk_page_count},'
                         + f'期间 (上传字节数, 下载字节数) = {bandwidth_usage_for_thread.total}')
def fetch_page_range_back_to_front(
        client: anobbsclient.Client,
        thread_id: int,
        from_upper_bound_page_number: int,
        to_lower_bound_page_number: int,
        lower_bound_post_id: int,
        gatekeeper_post_id: Optional[int]
) -> Tuple[List[Page], Optional[int], bool]:
    """Walk thread pages backwards from `from_upper_bound_page_number` down
    to `to_lower_bound_page_number`, merging any page below the lower bound
    into the lower-bound page.

    Returns
    -------
    Optional[List[Page]]
        The pages fetched. None means this round's results were abandoned.
    Optional[int]
        The largest post id seen this round (None when nothing usable).
    bool
        Whether the walk was aborted. When True, the caller should
        terminate the program after finishing this round.

    Fix: the final None check used ``pages != None``, which also let an
    *empty* list fall through to ``pages[0]`` and raise IndexError; it is
    now a truthiness check.
    """
    # Whether the walk was interrupted. When True, the page with the
    # smallest page number will be `.previous-page-unchecked`.
    aborted = False
    # Whether the already-fetched pages should be thrown away to avoid
    # corrupting existing data.
    should_abandon = False

    pages: List[Page] = []
    try:
        for (n, page, _) in create_walker(
            target=ReversalThreadWalkTarget(
                thread_id=thread_id,
                start_page_number=from_upper_bound_page_number,
                gatekeeper_post_id=gatekeeper_post_id,
                stop_before_post_id=lower_bound_post_id,
                expected_stop_page_number=to_lower_bound_page_number,
            ),
            client=client,
        ):
            msg = f"范围:从第{from_upper_bound_page_number}页至第{to_lower_bound_page_number}页,"
            msg += f"获取处理:第{n}页"
            if n < to_lower_bound_page_number:
                msg += f"(将合并至第{to_lower_bound_page_number}页)"
            logging.info(msg)

            # Pages below the lower bound are merged into the lower-bound
            # page once it has been collected; otherwise append a new page.
            if len(pages) == 0 or pages[
                    -1].page_number != to_lower_bound_page_number:
                pages.append(
                    Page(
                        thread_body=page.body,
                        page_number=n,
                        replies=page.replies,
                    ))
            else:
                pages[-1].replies.extend(page.replies)
            logging.info(f"获取完成:第{n}页")
    except KeyboardInterrupt:
        logging.warning("收到用户键盘中断,将中断")
        aborted = True
    except anobbsclient.RequiresLoginException:
        logging.error("未登陆,将中断")
        aborted, should_abandon = True, True
    except anobbsclient.GatekeptException as e:
        logging.error(
            f"出现「卡99」现象,疑似登陆失效,将中断。当前页面页数:{e.current_page_number},上下文:{e.context},守门串号:{e.gatekeeper_post_id}"
        )
        aborted, should_abandon = True, True
    except anobbsclient.UnreachableLowerBoundPostIDException as e:
        logging.error(f"由于不明原因,无法到达预定的下界串号,将中断。下界串号: {e.lower_bound_post_id}")
        aborted, should_abandon = True, True
    except anobbsclient.UnexpectedLowerBoundPostIDException as e:
        logging.error(
            f"在预期之外的大于页数下界的页面遇到了下界串号 {e.lower_bound_post_id},当前页面页数:{e.current_page_number},页数下界:{e.expected_lower_bound_page_number}"
        )
        aborted, should_abandon = True, True

    if should_abandon:
        logging.error("将遗弃已获取的页面")
        pages = None

    # Largest post id seen this round. The outer caller processes rounds
    # front-to-back, so once this round crosses the gatekeeper page the
    # next round can effectively detect the "stuck at page 99" condition.
    if pages:  # None (abandoned) and [] (nothing fetched) both yield None below
        if len(pages[0].replies) != 0:
            current_round_max_seen_post_id = int(pages[0].replies[-1].id)
        elif len(pages) > 1:
            # NOTE(review): assumes pages[1] has replies whenever pages[0]
            # is empty — confirm the walker guarantees this.
            current_round_max_seen_post_id = int(pages[1].replies[-1].id)
        else:
            current_round_max_seen_post_id = None
    else:
        current_round_max_seen_post_id = None

    return (pages, current_round_max_seen_post_id, aborted)