def test_raw_parse_bad_encoding():
    content = os.urandom(16 * 1024)
    response = _create_builder(content=content).build()
    parser = RawFeedParser()
    with pytest.raises(FeedParserError) as ex:
        parser.parse(response)
    assert ex
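# The test above relies on a `_create_builder` helper that is not shown here.
# A minimal sketch, assuming it simply wraps the FeedResponseBuilder calls
# used in the manual variant of this test further below; the placeholder URL
# is an assumption, not taken from the original source:
def _create_builder(content: bytes) -> FeedResponseBuilder:
    builder = FeedResponseBuilder()
    builder.url('https://blog.example.com/feed')  # hypothetical placeholder URL
    builder.content(content)
    return builder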
def test_raw_parse_warn(filename):
    response = _read_response(_data_dir / 'warn', filename)
    parser = RawFeedParser()
    result = parser.parse(response)
    assert result
    assert result.warnings and isinstance(result.warnings, list)
    assert result.storys
    assert result.feed['version']
    assert result.feed['title']
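# `_read_response` is a test helper defined elsewhere in the suite. A minimal
# sketch, assuming it reads a fixture file from the given data directory and
# wraps the raw bytes in a FeedResponse via FeedResponseBuilder; the exact
# signature and the placeholder URL are assumptions:
def _read_response(dirname, filename):
    content = (dirname / filename).read_bytes()  # dirname is assumed to be a pathlib.Path
    builder = FeedResponseBuilder()
    builder.url('https://blog.example.com/feed')  # hypothetical placeholder URL
    builder.content(content)
    return builder.build()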
def test_raw_parse_bad_encoding():
    content = os.urandom(16 * 1024)
    builder = FeedResponseBuilder()
    builder.url('https://blog.example.com/feed')
    builder.content(content)
    response = builder.build()
    parser = RawFeedParser()
    with pytest.raises(FeedParserError) as ex:
        parser.parse(response)
    assert ex
def test_parser_and_checksum(filepath):
    response = _read_response(_data_dir, filepath)
    raw_parser = RawFeedParser()
    raw_result = raw_parser.parse(response)
    assert raw_result.feed
    assert raw_result.storys
    parser = FeedParser()
    result = parser.parse(raw_result)
    assert result.feed
    assert result.storys
    assert result.checksum.size() == len(result.storys)
def test_raw_parser_incomplete_content():
    response = _read_response(_data_dir / 'warn', 'https-tmioe-com-feed.xml')
    parser = RawFeedParser()
    result = parser.parse(response)
    assert len(result.storys) == 5
    assert result.feed['version'] == 'rss20'
    assert result.feed['title'] == 'ZAPRO · 杂铺'
    expect_title = "TikTok 抖音国际版 v18.6.2 解锁全部国家任意切换"
    expect_url = "https://tmioe.com/1463.html"
    got_storys = [x for x in result.storys if x['url'] == expect_url]
    assert got_storys
    assert got_storys[0]['title'] == expect_title
def test_parse_story_no_id_no_summary_no_url():
    # the feed contains 3 storys in total; the story without an id is skipped
    # story#0: no content, no summary, no url
    # story#1: has content, no summary, no url, but its id is a valid url
    filename = 'well/v2ex-no-id-no-summary-no-url.xml'
    response = _read_response(_data_dir, filename)
    raw_result = RawFeedParser().parse(response)
    assert raw_result.storys
    # assert the no-id story is skipped
    assert len(raw_result.storys) == 2
    # assert no summary
    assert not raw_result.storys[0]['summary']
    assert not raw_result.storys[1]['summary']
    # assert content
    assert not raw_result.storys[0]['content']
    assert raw_result.storys[1]['content']
    # assert the id is picked as url and the invalid one is discarded
    assert not raw_result.storys[0]['url']
    assert raw_result.storys[1]['url']
    result = FeedParser().parse(raw_result)
    assert result.storys
    assert len(raw_result.storys) == len(result.storys)
    # assert content
    assert not result.storys[0]['content']
    assert result.storys[1]['content']
    # assert summary is extracted from content
    assert not result.storys[0]['summary']
    assert result.storys[1]['summary']
    # assert the id is picked as url and the invalid one is discarded
    assert not result.storys[0]['url']
    assert result.storys[1]['url']
def test_parse_too_many_storys():
    items = []
    num_storys = 2000
    base = datetime.datetime.now()
    for i in range(num_storys):
        if i < num_storys // 2:
            date_published = None
        else:
            date_published = (base + datetime.timedelta(seconds=i)).isoformat()
        items.append({
            "id": f"{i}",
            "content_html": f"content_{i}",
            "summary": f"summary_{i}",
            "url": f"https://example.org/post/{i}",
            "date_published": date_published,
        })
    feed = {
        "version": "https://jsonfeed.org/version/1",
        "title": "Too many storys",
        "home_page_url": "https://example.org/",
        "feed_url": "https://example.org/feed.json",
        "items": items,
    }
    data = json.dumps(feed).encode('utf-8')
    response = _create_builder(data).build()
    raw_result = RawFeedParser().parse(response)
    assert len(raw_result.storys) == num_storys
    result = FeedParser().parse(raw_result)
    assert len(result.storys) == _MAX_STORYS
    expected = set(range(num_storys - _MAX_STORYS, num_storys))
    story_ids = {int(x['ident']) for x in result.storys}
    assert story_ids == expected
def _parse_well_feed(filename) -> FeedResult:
    response = _read_response(_data_dir / 'well', filename)
    raw_result = RawFeedParser().parse(response)
    assert raw_result.feed
    assert raw_result.storys
    assert not raw_result.warnings
    result = FeedParser().parse(raw_result)
    assert len(result.storys) == len(raw_result.storys)
    return result
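# Example of how `_parse_well_feed` can be used in a test; the fixture
# filename below is hypothetical and only illustrates the call pattern:
def test_parse_well_feed_example():
    result = _parse_well_feed('some-well-formed-feed.xml')  # hypothetical fixture filename
    assert result.feed['title']
    assert all(story['ident'] for story in result.storys)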
def do_sync_feed(
    ctx: ActorContext,
    feed_id: T.int,
    url: T.url,
    use_proxy: T.bool.default(False),
    content_hash_base64: T.str.optional,
    etag: T.str.optional,
    last_modified: T.str.optional,
):
    params = dict(etag=etag, last_modified=last_modified, use_proxy=use_proxy)
    options = _get_proxy_options()
    options.update(allow_private_address=CONFIG.allow_private_address)
    with FeedReader(**options) as reader:
        response = reader.read(url, **params)
    LOG.info(
        f'read feed#{feed_id} url={unquote(url)} response.status={response.status}')
    if response.status != 200 or not response.content:
        return
    new_hash = compute_hash_base64(response.content)
    if new_hash == content_hash_base64:
        LOG.info(
            f'feed#{feed_id} url={unquote(url)} not modified by compare content hash!')
        return
    LOG.info(f'parse feed#{feed_id} url={unquote(url)}')
    try:
        raw_result = RawFeedParser().parse(response)
    except FeedParserError as ex:
        LOG.warning('failed parse feed#%s url=%r: %s', feed_id, unquote(url), ex)
        return
    if raw_result.warnings:
        warnings = '; '.join(raw_result.warnings)
        LOG.warning('warning parse feed#%s url=%r: %s', feed_id, unquote(url), warnings)
        return
    try:
        feed = _parse_found((response, raw_result))
    except (Invalid, FeedParserError) as ex:
        LOG.error('invalid feed#%s url=%r: %s', feed_id, unquote(url), ex, exc_info=ex)
        return
    ctx.tell('harbor_rss.update_feed', dict(feed_id=feed_id, feed=feed))
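# `compute_hash_base64` is imported from a shared helper module. A minimal
# sketch of the idea, assuming an MD5 digest encoded with URL-safe base64;
# the real helper may use a different digest or encoding:
import base64
import hashlib


def compute_hash_base64(*datas) -> str:
    h = hashlib.md5()
    for data in datas:
        if isinstance(data, str):
            data = data.encode('utf-8')
        h.update(data)
    return base64.urlsafe_b64encode(h.digest()).decode()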
def test_raw_parse_date_timestamp():
    content = '''
    <rss version="2.0">
    <channel>
        <title>博客中国</title>
        <generator>http://www.blogchina.com</generator>
        <item>
            <title><![CDATA[ 新能源车崛起,传统汽车如何避免诺基亚式危机? ]]></title>
            <link>http://jianghulaoliu.blogchina.com/956463775.html</link>
            <description><![CDATA[ 近日,特斯拉降价16万使得特斯拉消费激增 ]]></description>
            <source></source>
            <pubDate>1611146768</pubDate>
        </item>
    '''.encode('utf-8')
    response = _create_builder(content=content).build()
    result = RawFeedParser().parse(response)
    assert len(result.storys) == 1
    dt: datetime.datetime = result.storys[0]['dt_published']
    assert dt == datetime.datetime.fromtimestamp(1611146768, tz=UTC)
def test_parse_large_content(template_name, content_length, summary_length):
    content_snip = "<span>12345678</span>"
    summary_snip = '<span>123</span>'
    content_repeat = (content_length // len(content_snip)) + 1
    content = content_snip * content_repeat
    summary_repeat = (summary_length // len(summary_snip)) + 1
    summary = summary_snip * summary_repeat
    template = large_feed_templates[template_name]
    # use replace instead of str.format to avoid KeyError on the braces in JSON templates
    data = template\
        .replace('${content}', content)\
        .replace('${summary}', summary)\
        .encode('utf-8')
    response = _create_builder(content=data).build()
    raw_result = RawFeedParser().parse(response)
    assert raw_result and len(raw_result.storys) == 1
    assert len(raw_result.storys[0]['content']) <= _RAW_MAX_CONTENT_LENGTH
    assert len(raw_result.storys[0]['summary']) <= _RAW_MAX_SUMMARY_LENGTH
    result = FeedParser().parse(raw_result)
    assert result and len(result.storys) == 1
    assert len(result.storys[0]['content']) <= _MAX_CONTENT_LENGTH
    assert len(result.storys[0]['summary']) <= _MAX_SUMMARY_LENGTH
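# `large_feed_templates` maps template names to feed documents containing
# ${content} and ${summary} placeholders. A minimal RSS example of what such
# a template might look like; the template name and markup are assumptions,
# not the original fixtures:
large_feed_templates = {
    'rss': '''<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
    <channel>
        <title>Large content feed</title>
        <item>
            <title>story</title>
            <link>https://example.org/post/1</link>
            <description>${summary}</description>
            <content:encoded>${content}</content:encoded>
        </item>
    </channel>
</rss>''',
}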
def do_sync_feed(
    ctx: ActorContext,
    feed_id: T.int,
    url: T.url,
    use_proxy: T.bool.default(False),
    checksum_data: T.bytes.maxlen(4096).optional,
    content_hash_base64: T.str.optional,
    etag: T.str.optional,
    last_modified: T.str.optional,
    is_refresh: T.bool.default(False),
):
    params = {}
    if not is_refresh:
        params = dict(etag=etag, last_modified=last_modified)
    options = _proxy_helper.get_proxy_options()
    if DNS_SERVICE.is_resolved_url(url):
        use_proxy = False
    # probability of switching from a proxy request to a direct request
    switch_prob = 0.25
    with FeedReader(**options) as reader:
        use_proxy = reader.has_proxy and use_proxy
        if use_proxy and random.random() < switch_prob:
            use_proxy = False
        response = reader.read(url, **params, use_proxy=use_proxy)
        LOG.info(
            f'read feed#{feed_id} url={unquote(url)} status={response.status}')
        need_proxy = FeedResponseStatus.is_need_proxy(response.status)
        if (not use_proxy) and reader.has_proxy and need_proxy:
            LOG.info(f'try use proxy read feed#{feed_id} url={unquote(url)}')
            proxy_response = reader.read(url, **params, use_proxy=True)
            LOG.info(
                f'proxy read feed#{feed_id} url={unquote(url)} status={proxy_response.status}')
            if proxy_response.ok:
                response = proxy_response
    if (not response.ok) or (not response.content):
        status = FeedStatus.READY if response.status == 304 else FeedStatus.ERROR
        _update_feed_info(ctx, feed_id, status=status, response=response)
        return
    new_hash = compute_hash_base64(response.content)
    if (not is_refresh) and (new_hash == content_hash_base64):
        LOG.info(
            f'feed#{feed_id} url={unquote(url)} not modified by compare content hash!')
        _update_feed_info(ctx, feed_id, response=response)
        return
    LOG.info(f'parse feed#{feed_id} url={unquote(url)}')
    try:
        raw_result = RawFeedParser().parse(response)
    except FeedParserError as ex:
        LOG.warning('failed parse feed#%s url=%r: %s', feed_id, unquote(url), ex)
        _update_feed_info(
            ctx, feed_id, status=FeedStatus.ERROR, response=response, warnings=str(ex))
        return
    if raw_result.warnings:
        warnings = '; '.join(raw_result.warnings)
        LOG.warning('warning parse feed#%s url=%r: %s', feed_id, unquote(url), warnings)
    try:
        feed = _parse_found(
            (response, raw_result), checksum_data=checksum_data, is_refresh=is_refresh)
    except (Invalid, FeedParserError) as ex:
        LOG.error('invalid feed#%s url=%r: %s', feed_id, unquote(url), ex, exc_info=ex)
        _update_feed_info(
            ctx, feed_id, status=FeedStatus.ERROR, response=response, warnings=str(ex))
        return
    ctx.tell('harbor_rss.update_feed', dict(feed_id=feed_id, feed=feed, is_refresh=is_refresh))
def test_raw_parse_failed(filename):
    response = _read_response(_data_dir / 'failed', filename)
    parser = RawFeedParser()
    with pytest.raises(FeedParserError) as ex:
        parser.parse(response)
    assert ex
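# The filename/filepath parameters of the tests above are supplied by
# pytest.mark.parametrize. A plausible way such parameters could be collected
# from the data directories; the actual fixtures in the test module may differ:
def _collect_filenames(subdir: str):
    return [path.name for path in sorted((_data_dir / subdir).glob('*'))]

# usage sketch:
# @pytest.mark.parametrize('filename', _collect_filenames('failed'))
# def test_raw_parse_failed(filename): ...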