Beispiel #1
0
def story_fetch_fulltext(
    request,
    feed_id: T.feed_unionid.object,
    offset: T.int.min(0),
) -> T.dict(
        feed_id=T.feed_unionid,
        offset=T.int.min(0),
        response_status=T.int,
        response_status_name=T.str,
        use_proxy=T.bool.optional,
        accept=T_ACCEPT.optional,
        story=StorySchema.optional,
):
    """Request a fulltext sync for one story and report the outcome.

    The scheduler is asked for ``harbor_rss.sync_story_fulltext`` with an
    expiry 60 seconds from now. When the ask raises one of
    ``_TIMEOUT_ERRORS`` the error is logged and the response status becomes
    ``FeedResponseStatus.CONNECTION_TIMEOUT``; ``use_proxy`` and ``accept``
    then stay ``None``.

    Args:
        request: The incoming request; handed to ``check_unionid`` together
            with the feed union id.
        feed_id: Feed union id, unpacked below as a two-item pair whose
            second item is sent to the scheduler.
        offset: Offset of the story inside the feed.

    Returns:
        A dict with ``feed_id`` (the union id as received), ``offset``,
        ``response_status``, ``response_status_name``, ``use_proxy``,
        ``accept`` and ``story``. ``story`` is ``None`` only when ``accept``
        equals ``FulltextAcceptStrategy.REJECT.value``.
    """
    feed_unionid = feed_id
    check_unionid(request, feed_unionid)
    _user_id, feed_id = feed_unionid
    payload = {'feed_id': feed_id, 'offset': offset}
    deadline = int(time.time() + 60)
    use_proxy = accept = None
    try:
        reply = scheduler.ask(
            'harbor_rss.sync_story_fulltext',
            payload,
            expire_at=deadline,
        )
    except _TIMEOUT_ERRORS as ex:
        LOG.error(f'Ask harbor_rss.sync_story_fulltext timeout: {ex}')
        response_status = FeedResponseStatus.CONNECTION_TIMEOUT
    else:
        response_status = reply['response_status']
        use_proxy = reply['use_proxy']
        accept = reply['accept']
    # NOTE(review): after a timeout ``accept`` is still None, so the story is
    # loaded anyway -- presumably to return the stored version; confirm.
    if accept != FulltextAcceptStrategy.REJECT.value:
        story = UnionStory.get_by_feed_offset(
            feed_unionid, offset, detail=True).to_dict()
    else:
        story = None
    return {
        'feed_id': feed_unionid,
        'offset': offset,
        'response_status': response_status,
        'response_status_name': FeedResponseStatus.name_of(response_status),
        'use_proxy': use_proxy,
        'accept': accept,
        'story': story,
    }
Beispiel #2
0
def do_sync_feed(
        ctx: ActorContext,
        feed_id: T.int,
        url: T.url,
        use_proxy: T.bool.default(False),
        checksum_data: T.bytes.maxlen(4096).optional,
        content_hash_base64: T.str.optional,
        etag: T.str.optional,
        last_modified: T.str.optional,
        is_refresh: T.bool.default(False),
):
    """Read a feed URL, parse it, and forward the result for storage.

    Flow:
      1. Read ``url`` with ``FeedReader``. ``etag`` / ``last_modified`` are
         passed to the reader only when ``is_refresh`` is false.
      2. If the first read did not use the proxy, the reader has one, and
         ``FeedResponseStatus.is_need_proxy`` says so, read again through the
         proxy and keep that response when it is ok.
      3. A failed or empty response, an unchanged content hash, or a parse /
         validation error is recorded via ``_update_feed_info`` and the
         function returns early.
      4. Otherwise the parsed feed is sent to ``harbor_rss.update_feed``.

    Args:
        ctx: Actor context used to report feed info and send the update.
        feed_id: Id of the feed being synced.
        url: Feed URL to read.
        use_proxy: Whether the caller wants the proxy; may be overridden
            below.
        checksum_data: Forwarded to ``_parse_found``.
        content_hash_base64: Previous content hash; compared with the hash of
            the new content unless ``is_refresh`` is set.
        etag: Passed to the reader when not refreshing.
        last_modified: Passed to the reader when not refreshing.
        is_refresh: When true, skip the etag / last_modified arguments and
            the content-hash shortcut.

    Returns:
        None.
    """
    if is_refresh:
        read_kwargs = {}
    else:
        read_kwargs = {'etag': etag, 'last_modified': last_modified}
    reader_options = _proxy_helper.get_proxy_options()
    if DNS_SERVICE.is_resolved_url(url):
        use_proxy = False
    # Probability of switching a proxied read back to a direct read.
    switch_prob = 0.25
    with FeedReader(**reader_options) as reader:
        use_proxy = reader.has_proxy and use_proxy
        if use_proxy and random.random() < switch_prob:
            use_proxy = False
        response = reader.read(url, **read_kwargs, use_proxy=use_proxy)
        LOG.info(
            f'read feed#{feed_id} url={unquote(url)} status={response.status}')
        need_proxy = FeedResponseStatus.is_need_proxy(response.status)
        retry_with_proxy = (not use_proxy) and reader.has_proxy and need_proxy
        if retry_with_proxy:
            LOG.info(f'try use proxy read feed#{feed_id} url={unquote(url)}')
            proxy_response = reader.read(url, **read_kwargs, use_proxy=True)
            LOG.info(
                f'proxy read feed#{feed_id} url={unquote(url)} status={proxy_response.status}'
            )
            if proxy_response.ok:
                response = proxy_response

    def report_error(error):
        # Record the feed as errored, keeping the error text as its warnings.
        _update_feed_info(ctx,
                          feed_id,
                          status=FeedStatus.ERROR,
                          response=response,
                          warnings=str(error))

    if not (response.ok and response.content):
        # A 304 response leaves the feed READY; anything else is an error.
        if response.status == 304:
            status = FeedStatus.READY
        else:
            status = FeedStatus.ERROR
        _update_feed_info(ctx, feed_id, status=status, response=response)
        return

    new_hash = compute_hash_base64(response.content)
    if not is_refresh and new_hash == content_hash_base64:
        LOG.info(
            f'feed#{feed_id} url={unquote(url)} not modified by compare content hash!'
        )
        _update_feed_info(ctx, feed_id, response=response)
        return

    LOG.info(f'parse feed#{feed_id} url={unquote(url)}')
    try:
        raw_result = RawFeedParser().parse(response)
    except FeedParserError as ex:
        LOG.warning('failed parse feed#%s url=%r: %s', feed_id, unquote(url),
                    ex)
        report_error(ex)
        return
    if raw_result.warnings:
        # Parser warnings are only logged; they do not stop the sync.
        LOG.warning('warning parse feed#%s url=%r: %s', feed_id, unquote(url),
                    '; '.join(raw_result.warnings))

    try:
        feed = _parse_found((response, raw_result),
                            checksum_data=checksum_data,
                            is_refresh=is_refresh)
    except (Invalid, FeedParserError) as ex:
        LOG.error('invalid feed#%s url=%r: %s',
                  feed_id,
                  unquote(url),
                  ex,
                  exc_info=ex)
        report_error(ex)
        return

    message = {'feed_id': feed_id, 'feed': feed, 'is_refresh': is_refresh}
    ctx.tell('harbor_rss.update_feed', message)
Beispiel #3
0
 def response_status_name(self) -> str:
     """Return the name of ``self.response_status``.

     Yields ``None`` (despite the ``str`` annotation) when no response status
     is set; otherwise the lookup is delegated to
     ``FeedResponseStatus.name_of``.
     """
     status = self.response_status
     return None if status is None else FeedResponseStatus.name_of(status)