Ejemplo n.º 1
0
    def test_thread_page_reverse_walker_with_login(self):
        """Walk a thread backwards from page 101 down to page 100 while logged in.

        The last reply of page 100 serves both as the gatekeeper post and as
        the lower bound (``stop_before_post_id``), so the walk is expected to
        visit exactly pages 101 and 100, with page 100 yielding no replies.

        Skipped when ``SimpleTest.user_hash`` is not set.
        """
        if SimpleTest.user_hash is None:
            self.skipTest(reason="需要登录")

        client = self.new_client()

        # Fetch page 100 up front to learn the ID of its last reply.
        (page100, _) = client.get_thread_page(29184693, page=100)
        gatekeeper_post_id = list(page100.replies)[-1].id

        page_count = 0
        for (n, page, _) in create_walker(
            target=ReversalThreadWalkTarget(
                thread_id=29184693,
                gatekeeper_post_id=gatekeeper_post_id,
                start_page_number=101,
                stop_before_post_id=gatekeeper_post_id,
                expected_stop_page_number=100,
            ),
            client=client,
            options={
                "user_cookie": self.user_cookie,
            },
        ):
            # assertIn reports the offending page number on failure.
            self.assertIn(n, [100, 101])
            page_count += 1
            if n == 100:
                # Everything on page 100 is at or before the lower bound.
                self.assertEqual(len(page.replies), 0)
        self.assertEqual(page_count, 2)
Ejemplo n.º 2
0
def rescan_board(args: argparse.Namespace, db: DB, client: anobbsclient.Client,
                 stats: Stats):
    """Re-walk a board to detect threads that disappeared since ``args.since``.

    Every thread still visible on the board is recorded and marked as not
    disappeared. Threads that the database saw since ``args.since`` but that
    are absent from this walk are reported as disappeared, unless they were
    already flagged.

    Known limitation (from the original author): a thread that disappeared
    today but was last bumped before today cannot be detected this way.

    Parameters
    ----------
    args
        Parsed CLI arguments; ``since`` and ``board_id`` are read.
    db
        Database used to look up and record thread state.
    client
        Client handed to the board walker.
    stats
        Mutated in place: ``board_request_count`` and
        ``total_bandwidth_usage`` are updated once per fetched page.
    """

    thread_ids_seen_today = set(db.get_thread_ids_seen_since(args.since))

    walker = create_walker(
        target=BoardWalkTarget(
            start_page_number=1,
            board_id=args.board_id,
            stop_before_datetime=args.since,
        ),
        client=client,
    )

    # Bind `now` before the loop: if the walker yields no page at all, the
    # disappearance report below would otherwise hit an unbound local.
    now = datetime.now(tz=local_tz)

    # Each `page` is iterated as a collection of board threads.
    for (_, page, usage) in walker:
        now = datetime.now(tz=local_tz)
        stats.board_request_count += 1
        stats.total_bandwidth_usage.add(usage)

        for thread in page:
            thread_ids_seen_today.discard(thread.id)
            db.record_thread(thread, board_id=args.board_id, updated_at=now)
            db.report_is_thread_disappeared(thread.id, now, False)

    for not_found_thread_id in thread_ids_seen_today:
        # Only update when the disappearance has not been recorded before.
        if not db.is_thread_disappeared(not_found_thread_id):
            logging.info(f"发现 {not_found_thread_id} 消失")
            db.report_is_thread_disappeared(not_found_thread_id, now, True)
def find_last_post_with_uuid(
        client: anobbsclient.Client,
        thread_id: int) -> Optional[Tuple[int, int, str, int]]:
    """
    Find the most recent reply in a thread whose text carries a report UUID.

    The thread is walked backwards starting from its last page, and within
    each page the replies are scanned newest-first. The first reply whose
    text matches the report layout (a ``META_MAIN_DIVIDER`` line followed
    later by ``Report ID = <uuid>``) wins.

    Returns
    -------
    [0] : int
        The page number stated by the report (parsed from the reply's
        ``name`` field).

        This is NOT the page the reply itself is on.
    [1] : int
        Post ID of the matching reply.
    [2] : str
        The UUID that was found.
    [3] : int
        Offset of the matching reply, computed as
        ``(page - 1) * 19 + 1 + index_within_page``.

    Returns None when no reply matches.

    Raises
    ------
    AttributeError
        If a reply matches the UUID pattern but its ``name`` field does not
        have the expected ``页 ❬N / M❭`` form.
    """

    # TODO: the position of the previous report could be used to predict
    #       where to look, cutting most lookups down to a single request.
    # TODO: once the thread is found to be SAGE'd, stop posting to it, or
    #       check for SAGE up front?

    # Number of replies per thread page, as assumed throughout this function.
    replies_per_page = 19

    (page_1, _) = client.get_thread_page(id=thread_id, page=1, for_analysis=1)
    # TODO: this could probably be a computed property on the API side.
    total_pages = (page_1.body.total_reply_count - 1) // replies_per_page + 1

    walker = create_walker(
        target=ReversalThreadWalkTarget(
            thread_id=thread_id,
            gatekeeper_post_id=None,
            start_page_number=total_pages,
        ),
        client=client,
    )

    # Loop-invariant, so compile once instead of once per reply.
    uuid_rx = re.compile(
        r"(?:.*\n)+" + META_MAIN_DIVIDER + r"\n" +
        r"(?:.*\n)+Report ID = ([0-9a-f\-]+).*(?:\n.*)*",
        re.MULTILINE,
    )

    for (pn, page, _) in walker:
        reply_count = len(page.replies)
        for (i, post) in enumerate(reversed(page.replies)):
            # NOTE(review): no parser is passed to BeautifulSoup, so the
            # library picks one itself; pin it once the intended parser is
            # confirmed.
            text = BeautifulSoup(post.content).text
            result = uuid_rx.match(text)
            if result is None:
                continue
            uuid = result.group(1)

            # Raises AttributeError when the name field is not in the
            # expected form (behaviour kept from the original).
            report_pn = int(re.match(r"页 ❬(\d+) / \d+❭", post.name).group(1))

            # `i` counts from the end of the page; convert it back to a
            # forward index before turning it into a thread-wide offset.
            index_in_page = reply_count - 1 - i
            return (report_pn, post.id, uuid,
                    (pn - 1) * replies_per_page + 1 + index_in_page)

    return None
Ejemplo n.º 4
0
 def case_no_login():
     """Start a reverse walk at page 101 without any login cookie.

     The walker must not yield a single page here; reaching the loop body is
     a test failure. Presumably the enclosing test expects the walker to
     raise instead (confirm against the caller).
     """
     for (_, _, _) in create_walker(
         target=ReversalThreadWalkTarget(
             thread_id=29184693,
             gatekeeper_post_id=99999999,
             start_page_number=101,
         ),
         client=client,
     ):
         # Explicit raise instead of `assert`, which is stripped under -O.
         raise AssertionError("walker yielded a page without login")
Ejemplo n.º 5
0
    def test_thread_page_reverse_walker(self):
        """Reverse-walk a thread twice and check which pages are visited.

        Pass 1 starts at page 3 with no lower bound and must visit pages
        3, 2 and 1. Pass 2 starts at page 4 and stops before the newest post
        of page 3 (captured in pass 1), so it must visit pages 4 and 3 only,
        with page 3 contributing no replies.
        """
        thread_id = 29184693
        bbs_client = self.new_client()

        # Pass 1: unbounded walk from page 3 downwards.
        first_target = ReversalThreadWalkTarget(
            thread_id=thread_id,
            gatekeeper_post_id=99999999,
            start_page_number=3,
        )
        visited = 0
        newest_post_id_on_page_3 = None
        for (page_number, page, _) in create_walker(
            target=first_target,
            client=bbs_client,
        ):
            self.assertTrue(page_number in [1, 2, 3])
            visited += 1
            if page_number == 3:
                newest_post_id_on_page_3 = page.replies[-1].id
        self.assertEqual(visited, 3)

        # Pass 2: bounded walk from page 4, stopping before the post above.
        second_target = ReversalThreadWalkTarget(
            thread_id=thread_id,
            gatekeeper_post_id=99999999,
            start_page_number=4,
            stop_before_post_id=newest_post_id_on_page_3,
            expected_stop_page_number=3,
        )
        visited = 0
        for (page_number, page, _) in create_walker(
            target=second_target,
            client=bbs_client,
        ):
            self.assertTrue(page_number in [3, 4])
            visited += 1
            if page_number == 3:
                # Everything on page 3 is at or before the lower bound.
                self.assertEqual(len(page.replies), 0)
        self.assertEqual(visited, 2)
Ejemplo n.º 6
0
 def case_gatekept():
     """Start a reverse walk at page 101 with an invalid (empty) user cookie.

     The walker must not yield a single page here; reaching the loop body is
     a test failure. Presumably the enclosing test expects the walker to
     raise instead (confirm against the caller).
     """
     for (_, _, _) in create_walker(
         target=ReversalThreadWalkTarget(
             thread_id=29184693,
             gatekeeper_post_id=gatekeeper_post_id,
             start_page_number=101,
         ),
         client=client,
         options={
             "user_cookie": anobbsclient.UserCookie(
                 userhash="",  # invalid cookie
             ),
         },
     ):
         # Explicit raise instead of `assert`, which is stripped under -O.
         raise AssertionError("walker yielded a page with an invalid cookie")
Ejemplo n.º 7
0
    def test_board_page_walker(self):
        """Walk board 111 from page 1 with a two-hour cutoff.

        Every thread the walker yields must have been modified at or after
        the cutoff passed as ``stop_before_datetime``.
        """
        bbs_client = self.new_client()

        cutoff = datetime.now(local_tz) - timedelta(hours=2)
        target = BoardWalkTarget(
            board_id=111,
            start_page_number=1,
            stop_before_datetime=cutoff,
        )

        # Each yielded page is iterated as a collection of board threads.
        for (_, threads, _) in create_walker(target=target, client=bbs_client):
            for thread in threads:
                self.assertGreaterEqual(thread.last_modified_time, cutoff)
Ejemplo n.º 8
0
    def test_thread_page_reverse_walker_stop_before_datetime(self):
        """Reverse-walk a thread from page 98 with a datetime lower bound.

        With the bound set to 2020-08-09 22:00:21 (local time zone), the walk
        must visit pages 98, 97 and 96, and the first reply kept on page 96
        must be post 29279607.
        """
        bbs_client = self.new_client()

        # 2020-08-09 (Sun) 22:00:21
        lower_bound = datetime(
            year=2020, month=8, day=9,
            hour=22, minute=0, second=21,
            tzinfo=local_tz,
        )
        target = ReversalThreadWalkTarget(
            thread_id=29184693,
            gatekeeper_post_id=99999999,
            start_page_number=98,
            stop_before_datetime=lower_bound,
        )

        visited = 0
        for (page_number, page, _) in create_walker(
            target=target,
            client=bbs_client,
        ):
            self.assertTrue(page_number in [96, 97, 98])
            visited += 1
            if page_number == 96:
                self.assertEqual(page.replies[0].id, 29279607)
        self.assertEqual(visited, 3)
Ejemplo n.º 9
0
def fetch_board(db: DB, activity: Activity, client: anobbsclient.Client,
                board_id: int, fetching_since: datetime, stats: Stats):
    """Fetch a board's recently active threads and record their new replies.

    Steps:

    1. Walk the board from page 1, bounded by ``fetching_since``, and
       collect every yielded thread.
    2. Record each thread. For its replies, decide what counts as "new":
       replies with an ID above the newest one already stored, or, for a
       thread without stored replies, replies created at or after
       ``fetching_since``.
    3. If all new replies are already visible in the board-page preview
       (``thread.replies``), record them directly. Otherwise walk the thread
       backwards page by page, fetching the gatekeeper page first when the
       walk enters pages that require login and no earlier record can serve
       as a reference.

    Parameters
    ----------
    db
        Database used to look up and record threads and replies.
    activity
        Receives the collecting range via ``report_collecting_range``.
    client
        Client used for all board and thread requests.
    board_id
        ID of the board to fetch.
    fetching_since
        Lower time bound of the collection.
    stats
        Mutated in place with request, bandwidth, thread and post counters.
    """

    logger = logging.getLogger('FETCH')

    # Page size and preview size assumed by the page arithmetic below.
    replies_per_page = 19
    max_preview_replies = 5

    walker = create_walker(
        target=BoardWalkTarget(
            board_id=board_id,
            start_page_number=1,
            stop_before_datetime=fetching_since,
        ),
        client=client,
    )
    is_first_found_thread = True
    threads_on_board: List[anobbsclient.BoardThread] = []
    bandwidth_usage_for_board = TotalBandwidthUsage()
    for (pn, page, usage) in walker:
        logger.info(f'获取到版块第 {pn} 页。纳入串数 = {len(page)}')
        bandwidth_usage_for_board.add(usage)
        stats.board_request_count += 1
        threads_on_board += page
    stats.total_bandwidth_usage.add(bandwidth_usage_for_board.total)
    logger.info(f'完成获取版块。总共纳入串数 = {len(threads_on_board)},'
                + f'期间 (上传字节数, 下载字节数) = {bandwidth_usage_for_board.total}')

    now = datetime.now(tz=local_tz)

    for (i, thread) in enumerate(threads_on_board):
        logger.debug(f'串 #{i}。串号 = {thread.id},'
                     + f'最后修改时间 = {thread.last_modified_time}')

        if is_first_found_thread:
            # The first thread collected supplies the upper end of the
            # reported collecting range.
            is_first_found_thread = False
            activity.report_collecting_range(
                since=fetching_since, until=thread.last_modified_time)

        is_thread_recorded = db.is_thread_recorded(thread.id)
        if not is_thread_recorded:
            stats.new_thread_count += 1
        # Record or update the thread.
        # current_reply_count is recorded later, together with the replies.
        db.record_thread(thread, board_id=board_id, updated_at=now)

        if len(thread.replies) == 0:
            assert thread.total_reply_count == 0
            logger.debug(f'串 #{i} 暂无回应,到此结束')
            continue

        # Whether replies of this thread were stored before decides how a
        # reply is judged to be a fetch target.
        latest_seen_reply_id = \
            db.try_find_thread_latest_seen_reply_id(thread_id=thread.id)
        has_old_records = latest_seen_reply_id is not None
        if has_old_records:
            def is_target(x): return x.id > latest_seen_reply_id
            logger.debug(f'串 #{i} 是之前已经抓取过的串,'
                         + f'将会通过之前抓取到的最大串号作为范围的下界')
        else:
            def is_target(x): return x.created_at >= fetching_since
            logger.debug(f'串 #{i} 是之前曾未抓取过的串,'
                         + f'将会通过规定的下界时间作为范围的下界')

        new_responses_in_preview = [
            post for post in thread.replies if is_target(post)]
        if thread.total_reply_count <= max_preview_replies \
                or not is_target(thread.replies[0]):
            # Everything to fetch is already in the preview, so there is no
            # need to go into the thread.
            # TODO: deciding whether no replies remain
            #       (total_reply_count <= 5) belongs on the API side.
            if len(new_responses_in_preview) > 0:
                if is_thread_recorded:
                    stats.affected_thread_count += 1
                stats.new_post_count += len(new_responses_in_preview)
            db.record_thread_replies(thread=thread, replies=new_responses_in_preview,
                                     total_reply_count=thread.total_reply_count,
                                     updated_at=now)
            logger.debug(f'串 #{i} 由于全部需要抓取的回应已在预览之中,记录后到此结束。')
        else:
            # Walk the thread in reverse.
            start_page_number = \
                (thread.total_reply_count - 1) // replies_per_page + 1
            logger.debug(f'串 #{i} 需要进入以抓取目标范围内的回应。' +
                         f'从回应总数推测出的当前页数 = {start_page_number}')
            # A remainder of 0 means the last page is FULL (19 replies), so
            # the preview cannot cover it; only a partially filled last page
            # with at most 5 replies is fully contained in the preview.
            replies_on_last_page = \
                thread.total_reply_count % replies_per_page
            if 0 < replies_on_last_page <= max_preview_replies:
                # The newest page is entirely covered by the preview: skip it.
                logger.debug(f'串 #{i} 由于最新一页的回应已全部包含在预览中,抓取时会略过该页')
                start_page_number -= 1

            needs_gatekeeper_post_id = False
            if has_old_records:
                last_reply_count = \
                    db.get_thread_total_reply_count(thread_id=thread.id)
                if last_reply_count is not None:
                    last_page_count = \
                        (last_reply_count - 1) // replies_per_page + 1
                else:
                    last_page_count = None
                    logger.warning(f'串 #{i} 存在曾抓取到的回应,但却没有记录回应总数')
                # The old record sits before the login wall while the walk
                # starts behind it, so the old record cannot be used to
                # detect a stuck page.
                if (last_page_count is None or not client.thread_page_requires_login(last_page_count)) \
                        and client.thread_page_requires_login(start_page_number):
                    needs_gatekeeper_post_id = True
                    logger.debug(f'串 #{i} 由于要抓取的内容需要登录,'
                                 + f'而之前抓取到的内容在需要登录之前,无法用以判断是否卡页,'
                                 + f'因而需要额外获取第 100 页来确认守门串号')
            elif client.thread_page_requires_login(start_page_number):
                needs_gatekeeper_post_id = True
                logger.debug(f'串 #{i} 由于要抓取的内容需要登录,'
                             + f'而之前曾未抓取过内容,无法用以判断是否卡页,'
                             + f'因而需要额外获取第 100 页来确认守门串号')

            if needs_gatekeeper_post_id:
                # TODO: this part belongs on the API side.
                (gatekeeper_page, usage) = client.get_thread_page(
                    id=thread.id, page=client.get_thread_gatekeeper_page_number())
                stats.total_bandwidth_usage.add(usage)
                stats.thread_request_count += 1
                gatekeeper_post_id = gatekeeper_page.replies[-1].id
                logger.debug(f'串 #{i} 确认守门串号。守门串号 = {gatekeeper_post_id}')
            else:
                gatekeeper_post_id = None

            if has_old_records:
                # Bounded below by the newest reply already stored.
                walker = create_walker(
                    target=ReversalThreadWalkTarget(
                        thread_id=thread.id,
                        start_page_number=start_page_number,
                        gatekeeper_post_id=gatekeeper_post_id,
                        stop_before_post_id=latest_seen_reply_id,
                        expected_stop_page_number=last_page_count,
                    ),
                    client=client,
                )
            else:
                # Bounded below by the collection's start time.
                walker = create_walker(
                    target=ReversalThreadWalkTarget(
                        thread_id=thread.id,
                        start_page_number=start_page_number,
                        gatekeeper_post_id=gatekeeper_post_id,
                        stop_before_datetime=fetching_since,
                    ),
                    client=client,
                )

            final_reply_count = None
            targets = []
            bandwidth_usage_for_thread = TotalBandwidthUsage()
            thread_walk_page_count = 0
            for (pn, page, usage) in walker:

                thread_walk_page_count += 1
                stats.thread_request_count += 1
                if client.thread_page_requires_login(pn):
                    stats.logged_in_thread_request_count += 1
                logger.debug(f'串 #{i} 页 {pn}。纳入回应数 = {len(page.replies)}')
                bandwidth_usage_for_thread.add(usage)
                if final_reply_count is None:
                    # Take the reply total from the first page fetched.
                    final_reply_count = page.body.total_reply_count
                targets += page.replies
            targets += new_responses_in_preview
            now_after_fetching_inside_thread = datetime.now(tz=local_tz)
            db.record_thread_replies(thread=thread, replies=targets,
                                     total_reply_count=final_reply_count,
                                     updated_at=now_after_fetching_inside_thread)
            stats.total_bandwidth_usage.add(bandwidth_usage_for_thread.total)
            if len(targets) > 0:
                if is_thread_recorded:
                    stats.affected_thread_count += 1
                stats.new_post_count += len(targets)
            logger.debug(f'串 #{i} 已抓取到范围内所有新回应,记录后到此结束。'
                         + f'遍历访问页数 = {thread_walk_page_count},'
                         + f'期间 (上传字节数, 下载字节数) = {bandwidth_usage_for_thread.total}')
Ejemplo n.º 10
0
def fetch_page_range_back_to_front(
    client: anobbsclient.Client, thread_id: int,
    from_upper_bound_page_number: int, to_lower_bound_page_number: int,
    lower_bound_post_id: int, gatekeeper_post_id: Optional[int]
) -> Tuple[Optional[List[Page]], Optional[int], bool]:
    """
    Fetch a range of thread pages, walking from the upper bound page back
    towards the lower bound page.

    Pages the walker yields after the lower-bound page has been collected
    are merged into that lower-bound page's entry rather than stored as
    separate pages.

    Returns
    -------
    Optional[List[Page]]
        The pages that were fetched.
        None means this round's result has been abandoned.
        The list may be empty when the walk was interrupted before the first
        page arrived.

    Optional[int]
        The largest post ID seen in this round, or None when it cannot be
        determined.

    bool
        Whether the caller should stop.
        If true, the program should terminate after handling this round.
    """

    # Whether the walk was interrupted.
    # If true, the lowest-numbered page will be `.previous-page-unchecked`.
    aborted = False
    # Whether the pages fetched so far must be thrown away so that existing
    # data is not damaged.
    should_abandon = False

    pages: List[Page] = []

    try:
        for (n, page, _) in create_walker(
                target=ReversalThreadWalkTarget(
                    thread_id=thread_id,
                    start_page_number=from_upper_bound_page_number,
                    gatekeeper_post_id=gatekeeper_post_id,
                    stop_before_post_id=lower_bound_post_id,
                    expected_stop_page_number=to_lower_bound_page_number,
                ),
                client=client,
        ):
            msg = f"范围:从第{from_upper_bound_page_number}页至第{to_lower_bound_page_number}页,"
            msg += f"获取处理:第{n}页"
            if n < to_lower_bound_page_number:
                msg += f"(将合并至第{to_lower_bound_page_number}页)"
            logging.info(msg)
            if len(pages) == 0 or pages[
                    -1].page_number != to_lower_bound_page_number:
                pages.append(
                    Page(
                        thread_body=page.body,
                        page_number=n,
                        replies=page.replies,
                    ))
            else:
                # The lower-bound page is already collected: fold this
                # page's replies into it.
                pages[-1].replies.extend(page.replies)
            logging.info(f"获取完成:第{n}页")
    except KeyboardInterrupt:
        # Keep what was fetched so far; only signal the caller to stop.
        logging.warning("收到用户键盘中断,将中断")
        aborted = True
    except anobbsclient.RequiresLoginException:
        logging.error("未登陆,将中断")
        aborted, should_abandon = True, True
    except anobbsclient.GatekeptException as e:
        logging.error(
            f"出现「卡99」现象,疑似登陆失效,将中断。当前页面页数:{e.current_page_number},上下文:{e.context},守门串号:{e.gatekeeper_post_id}"
        )
        aborted, should_abandon = True, True
    except anobbsclient.UnreachableLowerBoundPostIDException as e:
        logging.error(f"由于不明原因,无法到达预定的下界串号,将中断。下界串号: {e.lower_bound_post_id}")
        aborted, should_abandon = True, True
    except anobbsclient.UnexpectedLowerBoundPostIDException as e:
        logging.error(
            f"在预期之外的大于页数下界的页面遇到了下界串号 {e.lower_bound_post_id},当前页面页数:{e.current_page_number},页数下界:{e.expected_lower_bound_page_number}"
        )
        aborted, should_abandon = True, True

    if should_abandon:
        logging.error("将遗弃已获取的页面")
        pages = None

    # The largest post ID seen in this round.
    # The outer loop handles rounds front to back, so once this round's
    # pages pass the gatekeeper page, this value lets the next round detect
    # the "stuck at 99" situation.
    #
    # Taken from the highest-numbered page, falling back to the next one
    # when that page is empty. Abandoned (None) and empty results (e.g. an
    # interrupt before the first page) yield None instead of raising.
    current_round_max_seen_post_id = None
    if pages is not None:
        for fetched_page in pages[:2]:
            if len(fetched_page.replies) != 0:
                current_round_max_seen_post_id = int(
                    fetched_page.replies[-1].id)
                break

    return (pages, current_round_max_seen_post_id, aborted)