Ejemplo n.º 1
0
def oauth_api_request(method, url, **kwargs):
    """
    Perform an HTTP request through RSSProxyClient.

    The client's proxy strategy makes it fall back to the rss proxy
    when a network error occurs.
    """
    proxy_options = _proxy_helper.get_proxy_options()
    proxy_client = RSSProxyClient(
        **proxy_options,
        proxy_strategy=_proxy_strategy,
    )
    return proxy_client.request(method, url, **kwargs)
Ejemplo n.º 2
0
def _setup():
    """Create and return a DNSService configured with proxy options."""
    options = _proxy_helper.get_proxy_options()
    dns_service = DNSService.create(
        allow_private_address=CONFIG.allow_private_address,
        **options,
    )
    return dns_service
Ejemplo n.º 3
0
def test_find_real(start_url: str):
    """Run the finder against a real URL and assert a feed was found."""
    proxy_options = _proxy_helper.get_proxy_options()
    finder, messages = _create_finder(start_url, **proxy_options)
    with finder:
        found = finder.find()
        if found:
            response, result = found
            print(f"Got: response={response} result={result}")
        # messages collected during the search explain the failure
        assert found, messages
Ejemplo n.º 4
0
async def do_fetch_story(
    ctx: ActorContext,
    feed_id: T.int,
    offset: T.int.min(0),
    url: T.url,
    use_proxy: T.bool.default(False),
    num_sub_sentences: T.int.optional,
) -> SCHEMA_FETCH_STORY_RESULT:
    """
    Fetch a story's web page and forward the content to
    worker_rss.process_story_webpage for processing.

    Returns a dict with feed_id, offset, url, response_status and
    use_proxy; when the message is an ask, the processing result is
    merged into that dict before returning.
    """
    LOG.info(f'fetch story#{feed_id},{offset} url={unquote(url)} begin')
    options = _proxy_helper.get_proxy_options()
    # URLs already resolved by the local DNS service are fetched directly
    if DNS_SERVICE.is_resolved_url(url):
        use_proxy = False
    # make timeout less than actor default 30s to avoid ask timeout
    options.update(request_timeout=25)
    async with AsyncFeedReader(**options) as reader:
        # only honor use_proxy when a proxy is actually configured
        use_proxy = use_proxy and reader.has_proxy
        url, content, response = await _fetch_story(reader,
                                                    feed_id,
                                                    offset,
                                                    url,
                                                    use_proxy=use_proxy)
    # baseline result returned when there is no content or no reply expected
    DEFAULT_RESULT = dict(feed_id=feed_id,
                          offset=offset,
                          url=url,
                          response_status=response.status,
                          use_proxy=response.use_proxy)
    if not content:
        return DEFAULT_RESULT
    # oversized HTML: clean first, then fall back to plain-text truncation
    if len(content) >= _MAX_STORY_HTML_LENGTH:
        content = story_html_clean(content)
        if len(content) >= _MAX_STORY_HTML_LENGTH:
            msg = 'too large story#%s,%s size=%s url=%r'
            LOG.warning(msg, feed_id, offset, len(content), url)
            content = story_html_to_text(content)[:_MAX_STORY_HTML_LENGTH]
    # ask when the caller expects a reply, otherwise fire-and-forget hope
    msg_func = ctx.ask if ctx.message.is_ask else ctx.hope
    result = await msg_func(
        'worker_rss.process_story_webpage',
        dict(
            feed_id=feed_id,
            offset=offset,
            url=url,
            text=content,
            num_sub_sentences=num_sub_sentences,
        ))
    if not ctx.message.is_ask:
        return DEFAULT_RESULT
    # merge baseline fields into the processing result for the asker
    result.update(DEFAULT_RESULT)
    return result
Ejemplo n.º 5
0
def update_feed_use_proxy():
    """
    Find feeds that are only reachable through the rss proxy and set
    their use_proxy flag.

    Reads each candidate feed directly; if the response status suggests
    a proxy is needed and the proxy read returns 200, the feed is marked
    use_proxy=True and saved in a single transaction.
    """
    if not CONFIG.rss_proxy_enable:
        click.echo('rss proxy not enable!')
        return
    # title patterns (SQL LIKE) excluded from the proxy check
    blacklist = [
        '%博客园%',
        '%微信%',
        '%新浪%',
        '%的评论%',
        '%Comments on%',
    ]
    # candidates: recently created feeds, or stale feeds with few storys
    sql = """
    select * from rssant_api_feed
    where (NOT title LIKE ANY(%s)) AND (
        dt_created >= '2020-04-01' or
        (total_storys <= 5 and dt_updated <= '2019-12-01')
    )
    """
    feeds = list(Feed.objects.raw(sql, [blacklist]))
    click.echo(f'{len(feeds)} feeds need check')
    reader = FeedReader(**_proxy_helper.get_proxy_options())
    proxy_feeds = []
    with reader:
        for i, feed in enumerate(feeds):
            click.echo(f'#{i} {feed}')
            # first try a direct (non-proxy) read
            status = reader.read(feed.url).status
            click.echo(f'    #{i} status={FeedResponseStatus.name_of(status)}')
            if FeedResponseStatus.is_need_proxy(status):
                # retry through the proxy; only a 200 qualifies the feed
                proxy_status = reader.read(feed.url, use_proxy=True).status
                click.echo(
                    f'    #{i} proxy_status={FeedResponseStatus.name_of(proxy_status)}'
                )
                if proxy_status == 200:
                    proxy_feeds.append(feed)
    click.echo(f'{len(proxy_feeds)} feeds need use proxy')
    if proxy_feeds:
        with transaction.atomic():
            for feed in tqdm.tqdm(proxy_feeds, ncols=80, ascii=True):
                # refresh to avoid clobbering concurrent updates
                feed.refresh_from_db()
                feed.use_proxy = True
                feed.save()
Ejemplo n.º 6
0
def do_find_feed(
    ctx: ActorContext,
    feed_creation_id: T.int,
    url: T.url,
):
    """
    Discover a feed starting from *url* and report the outcome (feed or
    None, plus progress messages) to harbor_rss.save_feed_creation_result.
    """
    # immediately send message to update status
    ctx.ask(
        'harbor_rss.update_feed_creation_status',
        dict(
            feed_creation_id=feed_creation_id,
            status=FeedStatus.UPDATING,
        ))

    messages = []

    # collect finder progress messages so they can be reported back
    def message_handler(msg):
        LOG.info(msg)
        messages.append(msg)

    options = _proxy_helper.get_proxy_options()
    options.update(message_handler=message_handler)
    options.update(request_timeout=CONFIG.feed_reader_request_timeout)
    options.update(dns_service=DNS_SERVICE)
    with FeedFinder(url, **options) as finder:
        found = finder.find()
    try:
        feed = _parse_found(found) if found else None
    except (Invalid, FeedParserError) as ex:
        # record the parse failure as a message; result is saved with feed=None
        LOG.error('invalid feed url=%r: %s', unquote(url), ex, exc_info=ex)
        message_handler(f'invalid feed: {ex}')
        feed = None
    ctx.tell(
        'harbor_rss.save_feed_creation_result',
        dict(
            feed_creation_id=feed_creation_id,
            messages=messages,
            feed=feed,
        ))
Ejemplo n.º 7
0
Archivo: rss.py Proyecto: zuzhi/rssant
async def do_fetch_story(
        ctx: ActorContext,
        feed_id: T.int,
        offset: T.int,
        url: T.url,
        use_proxy: T.bool.default(False),
        num_sub_sentences: T.int.optional,
):
    """
    Fetch a story's web page and hand the content to
    worker_rss.process_story_webpage via fire-and-forget hope.

    NOTE(review): older variant — here _fetch_story yields a
    (url, content) pair and nothing is returned to the caller.
    """
    LOG.info(f'fetch story#{feed_id},{offset} url={unquote(url)} begin')
    options = _proxy_helper.get_proxy_options()
    # URLs already resolved by the local DNS service are fetched directly
    if DNS_SERVICE.is_resolved_url(url):
        use_proxy = False
    async with AsyncFeedReader(**options) as reader:
        # only honor use_proxy when a proxy is actually configured
        use_proxy = use_proxy and reader.has_proxy
        url_content = await _fetch_story(reader,
                                         feed_id,
                                         offset,
                                         url,
                                         use_proxy=use_proxy)
    if not url_content:
        return
    url, content = url_content
    # oversized HTML: clean first, then fall back to plain-text truncation
    if len(content) >= _MAX_STORY_HTML_LENGTH:
        content = story_html_clean(content)
        if len(content) >= _MAX_STORY_HTML_LENGTH:
            msg = 'too large story#%s,%s size=%s url=%r'
            LOG.warning(msg, feed_id, offset, len(content), url)
            content = story_html_to_text(content)[:_MAX_STORY_HTML_LENGTH]
    await ctx.hope(
        'worker_rss.process_story_webpage',
        dict(
            feed_id=feed_id,
            offset=offset,
            url=url,
            text=content,
            num_sub_sentences=num_sub_sentences,
        ))
Ejemplo n.º 8
0
def do_sync_feed(
        ctx: ActorContext,
        feed_id: T.int,
        url: T.url,
        use_proxy: T.bool.default(False),
        checksum_data: T.bytes.maxlen(4096).optional,
        content_hash_base64: T.str.optional,
        etag: T.str.optional,
        last_modified: T.str.optional,
        is_refresh: T.bool.default(False),
):
    """
    Fetch a feed, skip it when unchanged (304 or identical content
    hash, unless is_refresh), parse it, and tell harbor_rss.update_feed.

    Errors (bad response, parse failure, invalid feed) are reported via
    _update_feed_info instead of raising.
    """
    params = {}
    # conditional request headers only apply to non-forced refreshes
    if not is_refresh:
        params = dict(etag=etag, last_modified=last_modified)
    options = _proxy_helper.get_proxy_options()
    # URLs already resolved by the local DNS service are fetched directly
    if DNS_SERVICE.is_resolved_url(url):
        use_proxy = False
    switch_prob = 0.25  # the prob of switch from use proxy to not use proxy
    with FeedReader(**options) as reader:
        use_proxy = reader.has_proxy and use_proxy
        # occasionally retry without proxy so feeds can recover direct access
        if use_proxy and random.random() < switch_prob:
            use_proxy = False
        response = reader.read(url, **params, use_proxy=use_proxy)
        LOG.info(
            f'read feed#{feed_id} url={unquote(url)} status={response.status}')
        need_proxy = FeedResponseStatus.is_need_proxy(response.status)
        # direct read failed in a proxy-fixable way: retry through the proxy
        if (not use_proxy) and reader.has_proxy and need_proxy:
            LOG.info(f'try use proxy read feed#{feed_id} url={unquote(url)}')
            proxy_response = reader.read(url, **params, use_proxy=True)
            LOG.info(
                f'proxy read feed#{feed_id} url={unquote(url)} status={proxy_response.status}'
            )
            if proxy_response.ok:
                response = proxy_response
    if (not response.ok) or (not response.content):
        # 304 Not Modified is a healthy outcome; anything else is an error
        status = FeedStatus.READY if response.status == 304 else FeedStatus.ERROR
        _update_feed_info(ctx, feed_id, status=status, response=response)
        return
    new_hash = compute_hash_base64(response.content)
    # unchanged content: record the response but skip re-parsing
    if (not is_refresh) and (new_hash == content_hash_base64):
        LOG.info(
            f'feed#{feed_id} url={unquote(url)} not modified by compare content hash!'
        )
        _update_feed_info(ctx, feed_id, response=response)
        return
    LOG.info(f'parse feed#{feed_id} url={unquote(url)}')
    try:
        raw_result = RawFeedParser().parse(response)
    except FeedParserError as ex:
        LOG.warning('failed parse feed#%s url=%r: %s', feed_id, unquote(url),
                    ex)
        _update_feed_info(ctx,
                          feed_id,
                          status=FeedStatus.ERROR,
                          response=response,
                          warnings=str(ex))
        return
    if raw_result.warnings:
        # non-fatal parser warnings are logged but don't abort the sync
        warnings = '; '.join(raw_result.warnings)
        LOG.warning('warning parse feed#%s url=%r: %s', feed_id, unquote(url),
                    warnings)
    try:
        feed = _parse_found((response, raw_result),
                            checksum_data=checksum_data,
                            is_refresh=is_refresh)
    except (Invalid, FeedParserError) as ex:
        LOG.error('invalid feed#%s url=%r: %s',
                  feed_id,
                  unquote(url),
                  ex,
                  exc_info=ex)
        _update_feed_info(ctx,
                          feed_id,
                          status=FeedStatus.ERROR,
                          response=response,
                          warnings=str(ex))
        return
    ctx.tell('harbor_rss.update_feed',
             dict(feed_id=feed_id, feed=feed, is_refresh=is_refresh))