Example 1
def do_find_feed(
    ctx: ActorContext,
    feed_creation_id: T.int,
    url: T.url,
):
    # immediately send message to update status
    ctx.ask('harbor_rss.update_feed_creation_status', dict(
        feed_creation_id=feed_creation_id,
        status=FeedStatus.UPDATING,
    ))

    messages = []

    def message_handler(msg):
        LOG.info(msg)
        messages.append(msg)

    options = dict(message_handler=message_handler, **_get_proxy_options())
    options.update(allow_private_address=CONFIG.allow_private_address)
    with FeedFinder(url, **options) as finder:
        found = finder.find()
    try:
        feed = _parse_found(found) if found else None
    except (Invalid, FeedParserError) as ex:
        LOG.error('invalid feed url=%r: %s', unquote(url), ex, exc_info=ex)
        message_handler(f'invalid feed: {ex}')
        feed = None
    ctx.tell('harbor_rss.save_feed_creation_result', dict(
        feed_creation_id=feed_creation_id,
        messages=messages,
        feed=feed,
    ))
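In rssant these handlers are registered with actorlib and addressed by dotted names, which is what the string arguments to ctx.ask and ctx.tell refer to: ask sends a message and waits for the reply, tell is fire-and-forget. A minimal sketch of the wiring, assuming actorlib's @actor decorator (the decorator usage and registration name below are illustrative, not copied from this listing):

from actorlib import actor, ActorContext

@actor('worker_rss.find_feed')  # register under the name callers pass to ctx.tell/ctx.ask
def do_find_feed(ctx: ActorContext, feed_creation_id, url):
    ...

# a caller then fires the message without waiting for a result:
# ctx.tell('worker_rss.find_feed', dict(feed_creation_id=123, url='https://example.com/feed.xml'))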
Example 2
def do_find_feed(
    ctx: ActorContext,
    feed_creation_id: T.int,
    url: T.url,
):
    # immediately send message to update status
    ctx.ask(
        'harbor_rss.update_feed_creation_status',
        dict(
            feed_creation_id=feed_creation_id,
            status=FeedStatus.UPDATING,
        ))

    messages = []

    def message_handler(msg):
        LOG.info(msg)
        messages.append(msg)

    options = dict(message_handler=message_handler, **_get_proxy_options())
    with FeedFinder(url, **options) as finder:
        found = finder.find()
    try:
        feed = _parse_found(found) if found else None
    except Invalid as ex:
        message_handler(f'invalid feed: {ex}')
        feed = None
    ctx.tell(
        'harbor_rss.save_feed_creation_result',
        dict(
            feed_creation_id=feed_creation_id,
            messages=messages,
            feed=feed,
        ))
Example 3
def do_save_registery(ctx: ActorContext):
    LOG.info('save registery info for {}'.format(
        ctx.registery.registery_node.name))
    registery_node = ctx.registery.registery_node.to_spec()
    nodes = ctx.registery.to_spec()
    Registery.create_or_update(registery_node, nodes)
    ctx.tell('scheduler.boardcast_registery')
Example 4
File: rss.py Project: XZYCR7/rssant
def do_sync_feed(
    ctx: ActorContext,
    feed_id: T.int,
    url: T.url,
    content_hash_base64: T.str.optional,
    etag: T.str.optional,
    last_modified: T.str.optional,
):
    params = dict(etag=etag, last_modified=last_modified)
    with FeedReader() as reader:
        status_code, response = reader.read(url, **params)
    LOG.info(f'read feed#{feed_id} url={unquote(url)} status_code={status_code}')
    if status_code != 200 or not response:
        return
    new_hash = compute_hash_base64(response.content)
    if new_hash == content_hash_base64:
        LOG.info(f'feed#{feed_id} url={unquote(url)} not modified, content hash unchanged')
        return
    LOG.info(f'parse feed#{feed_id} url={unquote(url)}')
    parsed = FeedParser.parse_response(response)
    if parsed.bozo:
        LOG.warning(f'failed parse feed#{feed_id} url={unquote(url)}: {parsed.bozo_exception}')
        return
    try:
        feed = _parse_found(parsed)
    except Invalid as ex:
        LOG.warning(f'invalid feed#{feed_id} url={unquote(url)}: {ex}', exc_info=ex)
        return
    ctx.tell('harbor_rss.update_feed', dict(feed_id=feed_id, feed=feed))
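Examples like this one skip re-parsing when the response body hashes to the same value as the stored content_hash_base64. A plausible sketch of compute_hash_base64, assuming a SHA-256 digest (the actual rssant helper may use a different algorithm):

import base64
import hashlib

def compute_hash_base64(content: bytes) -> str:
    # hash the raw body and base64-encode it for cheap string comparison;
    # this is change detection, not security, so any stable digest works
    return base64.standard_b64encode(hashlib.sha256(content).digest()).decode()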
Example 5
def do_ping(ctx: ActorContext, message: T.str) -> T.dict(message=T.str):
    LOG.info(ctx.message)
    r = ctx.ask('registery.query')
    LOG.info(r)
    ctx.tell('worker.pong', dict(message=message))
    if message == 'error':
        raise ValueError(message)
    return dict(message=message)
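The T.int, T.url, T.str.optional and T.dict(message=T.str) annotations are validr schemas, which rssant uses to validate actor parameters and return values. The same schema can be exercised standalone:

from validr import T, Compiler, Invalid

compiler = Compiler()
validate = compiler.compile(T.dict(message=T.str))

print(validate({'message': 'pong'}))  # {'message': 'pong'}
try:
    validate({'message': None})  # a required str rejects None
except Invalid as ex:
    print(ex)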
Example 6
def do_dns_service_refresh(ctx: ActorContext):
    DNS_SERVICE.refresh()
    records = {}
    for host, ip_set in DNS_SERVICE.records.items():
        records[host] = list(ip_set)
    msg = dict(records=records)
    expire_at = time.time() + 60 * 60
    for node in ctx.registery.remote_nodes:
        ctx.tell('actor.dns_service_update', msg, dst_node=node.name, expire_at=expire_at)
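The receiving handler is not part of this listing; a hypothetical counterpart would simply install the broadcast records into its local resolver. Everything in this sketch beyond the message shape is an assumption, including the DNS_SERVICE.update method:

def do_dns_service_update(ctx: ActorContext, records: T.dict):
    # hypothetical receiver: replace the local host -> IP-list records;
    # the expire_at set by the sender presumably lets stale broadcasts be dropped
    DNS_SERVICE.update(records)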
Example 7
def do_update_feed(
    ctx: ActorContext,
    feed_id: T.int,
    feed: FeedSchema,
    is_refresh: T.bool.default(False).desc('Deprecated'),
):
    with transaction.atomic():
        feed_dict = feed
        storys = feed_dict.pop('storys')
        feed = Feed.get_by_pk(feed_id)
        is_feed_url_changed = feed.url != feed_dict['url']
        if is_feed_url_changed:
            target_feed = Feed.get_first_by_url(feed_dict['url'])
            if target_feed:
                LOG.info(f'merge feed#{feed.id} url={feed.url} into '
                         f'feed#{target_feed.id} url={target_feed.url}')
                target_feed.merge(feed)
                return
        for k, v in feed_dict.items():
            if v != '' and v is not None:
                setattr(feed, k, v)
        now = timezone.now()
        now_sub_30d = now - timezone.timedelta(days=30)
        if not feed.dt_updated:
            feed.dt_updated = now
        feed.dt_checked = feed.dt_synced = now
        feed.status = FeedStatus.READY
        feed.save()
        for s in storys:
            if not s['dt_updated']:
                s['dt_updated'] = now
            if not s['dt_published']:
                # set dt_published to now - 30d so these storys don't
                # take over the mushroom page, i.e. Story.query_recent_by_user
                s['dt_published'] = now_sub_30d
        modified_storys = Story.bulk_save_by_feed(feed.id, storys)
        LOG.info(
            'feed#%s save storys total=%s num_modified=%s',
            feed.id, len(storys), len(modified_storys)
        )
    feed.refresh_from_db()
    if modified_storys:
        feed.unfreeze()
    need_fetch_story = _is_feed_need_fetch_storys(feed)
    for story in modified_storys:
        if not story.link:
            continue
        if need_fetch_story and (not is_fulltext_story(feed, story)):
            ctx.tell('worker_rss.fetch_story', dict(
                url=story.link,
                story_id=str(story.id)
            ))
        else:
            _detect_story_images(ctx, story)
Example 8
def do_load_registery(ctx: ActorContext):
    registery_node = ctx.registery.registery_node.name
    LOG.info(f'load registery info for {registery_node}')
    registery = Registery.get(registery_node)
    if registery:
        ctx.registery.update(registery.node_specs)
        title = 'loaded'
    else:
        title = 'current'
    LOG.info(f'{title} registery info:\n' +
             pretty_format_json(ctx.registery.to_spec()))
    ctx.tell('scheduler.boardcast_registery')
Example 9
def do_sync_feed(
    ctx: ActorContext,
    feed_id: T.int,
    url: T.url,
    use_proxy: T.bool.default(False),
    content_hash_base64: T.str.optional,
    etag: T.str.optional,
    last_modified: T.str.optional,
):
    params = dict(etag=etag, last_modified=last_modified, use_proxy=use_proxy)
    options = _get_proxy_options()
    options.update(allow_private_address=CONFIG.allow_private_address)
    with FeedReader(**options) as reader:
        response = reader.read(url, **params)
    LOG.info(
        f'read feed#{feed_id} url={unquote(url)} response.status={response.status}'
    )
    if response.status != 200 or not response.content:
        return
    new_hash = compute_hash_base64(response.content)
    if new_hash == content_hash_base64:
        LOG.info(
            f'feed#{feed_id} url={unquote(url)} not modified, content hash unchanged'
        )
        return
    LOG.info(f'parse feed#{feed_id} url={unquote(url)}')
    try:
        raw_result = RawFeedParser().parse(response)
    except FeedParserError as ex:
        LOG.warning('failed parse feed#%s url=%r: %s', feed_id, unquote(url),
                    ex)
        return
    if raw_result.warnings:
        warnings = '; '.join(raw_result.warnings)
        LOG.warning('warning parse feed#%s url=%r: %s', feed_id, unquote(url),
                    warnings)
        return
    try:
        feed = _parse_found((response, raw_result))
    except (Invalid, FeedParserError) as ex:
        LOG.error('invalid feed#%s url=%r: %s',
                  feed_id,
                  unquote(url),
                  ex,
                  exc_info=ex)
        return
    ctx.tell('harbor_rss.update_feed', dict(feed_id=feed_id, feed=feed))
Example 10
def do_sync_feed(
        ctx: ActorContext,
        feed_id: T.int,
        url: T.url,
        use_proxy: T.bool.default(False),
        checksum_data: T.bytes.maxlen(4096).optional,
        content_hash_base64: T.str.optional,
        etag: T.str.optional,
        last_modified: T.str.optional,
        is_refresh: T.bool.default(False),
):
    params = {}
    if not is_refresh:
        params = dict(etag=etag, last_modified=last_modified)
    options = _proxy_helper.get_proxy_options()
    if DNS_SERVICE.is_resolved_url(url):
        use_proxy = False
    switch_prob = 0.25  # probability of switching from proxy to direct access
    with FeedReader(**options) as reader:
        use_proxy = reader.has_proxy and use_proxy
        if use_proxy and random.random() < switch_prob:
            use_proxy = False
        response = reader.read(url, **params, use_proxy=use_proxy)
        LOG.info(
            f'read feed#{feed_id} url={unquote(url)} status={response.status}')
        need_proxy = FeedResponseStatus.is_need_proxy(response.status)
        if (not use_proxy) and reader.has_proxy and need_proxy:
            LOG.info(f'try use proxy read feed#{feed_id} url={unquote(url)}')
            proxy_response = reader.read(url, **params, use_proxy=True)
            LOG.info(
                f'proxy read feed#{feed_id} url={unquote(url)} status={proxy_response.status}'
            )
            if proxy_response.ok:
                response = proxy_response
    if (not response.ok) or (not response.content):
        status = FeedStatus.READY if response.status == 304 else FeedStatus.ERROR
        _update_feed_info(ctx, feed_id, status=status, response=response)
        return
    new_hash = compute_hash_base64(response.content)
    if (not is_refresh) and (new_hash == content_hash_base64):
        LOG.info(
            f'feed#{feed_id} url={unquote(url)} not modified, content hash unchanged'
        )
        _update_feed_info(ctx, feed_id, response=response)
        return
    LOG.info(f'parse feed#{feed_id} url={unquote(url)}')
    try:
        raw_result = RawFeedParser().parse(response)
    except FeedParserError as ex:
        LOG.warning('failed parse feed#%s url=%r: %s', feed_id, unquote(url),
                    ex)
        _update_feed_info(ctx,
                          feed_id,
                          status=FeedStatus.ERROR,
                          response=response,
                          warnings=str(ex))
        return
    if raw_result.warnings:
        warnings = '; '.join(raw_result.warnings)
        LOG.warning('warning parse feed#%s url=%r: %s', feed_id, unquote(url),
                    warnings)
    try:
        feed = _parse_found((response, raw_result),
                            checksum_data=checksum_data,
                            is_refresh=is_refresh)
    except (Invalid, FeedParserError) as ex:
        LOG.error('invalid feed#%s url=%r: %s',
                  feed_id,
                  unquote(url),
                  ex,
                  exc_info=ex)
        _update_feed_info(ctx,
                          feed_id,
                          status=FeedStatus.ERROR,
                          response=response,
                          warnings=str(ex))
        return
    ctx.tell('harbor_rss.update_feed',
             dict(feed_id=feed_id, feed=feed, is_refresh=is_refresh))
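The proxy handling above is the subtle part of Example 10: DNS-resolved URLs are always read directly, a proxied feed is switched to a direct read 25% of the time (so the use_proxy flag can recover once a feed becomes reachable again), and a failed direct read is retried through the proxy when the status code suggests a proxy would help. The same decision flow in isolation, reusing reader and FeedResponseStatus from the example:

import random

def read_with_proxy_fallback(reader, url, params, use_proxy, is_resolved):
    if is_resolved:
        use_proxy = False  # a locally resolved host never needs the proxy
    # honor use_proxy only when a proxy is configured, and occasionally
    # probe the direct route anyway
    use_proxy = reader.has_proxy and use_proxy
    if use_proxy and random.random() < 0.25:
        use_proxy = False
    response = reader.read(url, **params, use_proxy=use_proxy)
    # retry via proxy when the direct attempt failed in a proxy-fixable way
    if (not use_proxy) and reader.has_proxy \
            and FeedResponseStatus.is_need_proxy(response.status):
        proxy_response = reader.read(url, **params, use_proxy=True)
        if proxy_response.ok:
            response = proxy_response
    return response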
Example 11
def do_load_registery(ctx: ActorContext):
    registery_node = ctx.registery.registery_node.name
    registery_info = pretty_format_json(ctx.registery.to_spec())
    LOG.info(f'load registery info for {registery_node}:\n' + registery_info)
    ctx.tell('scheduler.boardcast_registery')
Example 12
def do_save_registery(ctx: ActorContext):
    LOG.info('save registery info for {}'.format(
        ctx.registery.registery_node.name))
    ctx.tell('scheduler.boardcast_registery')
Example 13
def do_update_feed(
        ctx: ActorContext,
        feed_id: T.int,
        feed: FeedSchema,
        is_refresh: T.bool.default(False),
):
    with transaction.atomic():
        feed_dict = feed
        storys = feed_dict.pop('storys')
        feed = Feed.get_by_pk(feed_id)
        is_feed_url_changed = feed.url != feed_dict['url']
        if is_feed_url_changed:
            target_feed = Feed.get_first_by_url(feed_dict['url'])
            # FIXME: feed merge cannot handle subscription redirects correctly.
            # For now, keep the old subscription; a thorough fix will come later.
            # if target_feed:
            #     LOG.info(f'merge feed#{feed.id} url={feed.url} into '
            #              f'feed#{target_feed.id} url={target_feed.url}')
            #     target_feed.merge(feed)
            #     return
            if target_feed:
                LOG.warning(
                    f'FIXME: redirect feed#{feed.id} url={feed.url!r} into '
                    f'feed#{target_feed.id} url={target_feed.url!r}')
                feed_dict.pop('url')
        # only update dt_updated when there are storys or feed fields changed
        is_feed_updated = bool(storys)
        for k, v in feed_dict.items():
            if k == 'dt_updated':
                continue
            if (v != '' and v is not None) or k in {'warnings'}:
                old_v = getattr(feed, k, None)
                if v != old_v:
                    is_feed_updated = True
                    setattr(feed, k, v)
        now = timezone.now()
        now_sub_30d = now - timezone.timedelta(days=30)
        if is_feed_updated:
            # set dt_updated to now; don't trust the rss date
            feed.dt_updated = now
        feed.dt_checked = feed.dt_synced = now
        feed.reverse_url = reverse_url(feed.url)
        feed.status = FeedStatus.READY
        feed.save()
    # save storys; bulk_save_by_feed runs in its own transaction
    for s in storys:
        if not s['dt_updated']:
            s['dt_updated'] = now
        if not s['dt_published']:
            # set dt_published to now - 30d so these storys don't
            # take over the mushroom page, i.e. Story.query_recent_by_user
            s['dt_published'] = now_sub_30d
    modified_storys = STORY_SERVICE.bulk_save_by_feed(feed.id,
                                                      storys,
                                                      is_refresh=is_refresh)
    LOG.info('feed#%s save storys total=%s num_modified=%s', feed.id,
             len(storys), len(modified_storys))
    feed = Feed.get_by_pk(feed_id)
    is_freezed = feed.freeze_level is None or feed.freeze_level > 1
    if modified_storys and is_freezed:
        Feed.unfreeze_by_id(feed_id)
    need_fetch_story = _is_feed_need_fetch_storys(feed, modified_storys)
    for story in modified_storys:
        if not story.link:
            continue
        if need_fetch_story and (not _is_fulltext_story(story)):
            text = processor.story_html_to_text(story.content)
            num_sub_sentences = len(split_sentences(text))
            ctx.tell(
                'worker_rss.fetch_story',
                dict(
                    url=story.link,
                    use_proxy=feed.use_proxy,
                    feed_id=story.feed_id,
                    offset=story.offset,
                    num_sub_sentences=num_sub_sentences,
                ))
Example 14
def do_update_feed(
    ctx: ActorContext,
    feed_id: T.int,
    feed: FeedSchema,
    is_refresh: T.bool.default(False),
):
    with transaction.atomic():
        feed_dict = feed
        storys = feed_dict.pop('storys')
        feed = Feed.get_by_pk(feed_id)
        is_feed_url_changed = feed.url != feed_dict['url']
        if is_feed_url_changed:
            target_feed = Feed.get_first_by_url(feed_dict['url'])
            if target_feed:
                LOG.info(f'merge feed#{feed.id} url={feed.url} into '
                         f'feed#{target_feed.id} url={target_feed.url}')
                target_feed.merge(feed)
                return
        # only update dt_updated when there are storys or feed fields changed
        is_feed_updated = bool(storys)
        for k, v in feed_dict.items():
            if k == 'dt_updated':
                continue
            if v != '' and v is not None:
                old_v = getattr(feed, k, None)
                if v != old_v:
                    is_feed_updated = True
                    setattr(feed, k, v)
        now = timezone.now()
        now_sub_30d = now - timezone.timedelta(days=30)
        if is_feed_updated:
            # set dt_updated to now; don't trust the rss date
            feed.dt_updated = now
        feed.dt_checked = feed.dt_synced = now
        feed.status = FeedStatus.READY
        feed.save()
        for s in storys:
            if not s['dt_updated']:
                s['dt_updated'] = now
            if not s['dt_published']:
                # set dt_published to now - 30d so these storys don't
                # take over the mushroom page, i.e. Story.query_recent_by_user
                s['dt_published'] = now_sub_30d
        modified_storys = Story.bulk_save_by_feed(feed.id, storys, is_refresh=is_refresh)
        LOG.info(
            'feed#%s save storys total=%s num_modified=%s',
            feed.id, len(storys), len(modified_storys)
        )
    feed.refresh_from_db()
    if modified_storys:
        feed.unfreeze()
    need_fetch_story = _is_feed_need_fetch_storys(feed, modified_storys)
    for story in modified_storys:
        if not story.link:
            continue
        if need_fetch_story and (not is_fulltext_story(story)):
            text = processor.story_html_to_text(story.content)
            num_sub_sentences = len(split_sentences(text))
            ctx.tell('worker_rss.fetch_story', dict(
                url=story.link,
                use_proxy=feed.use_proxy,
                story_id=str(story.id),
                num_sub_sentences=num_sub_sentences,
            ))
        else:
            _detect_story_images(ctx, story)