def test_search_entries_order_content(reader):
    parser = Parser()
    reader._parser = parser

    feed = parser.feed(1, datetime(2010, 1, 1))
    one = parser.entry(
        1,
        1,
        datetime(2010, 1, 1),
        summary='word word',
        content=[
            Content('word'),
            Content('does not match'),
            Content('word word word word'),
            Content('word word word'),
        ],
    )

    reader.add_feed(feed.url)
    reader.update_feeds()
    reader.enable_search()
    reader.update_search()

    # there should be exactly one result
    (rv,) = reader.search_entries('word')
    assert list(rv.content) == [
        '.content[2].value',
        '.content[3].value',
        '.summary',
        '.content[0].value',
    ]
def reader_without_and_with_entries(request, reader):
    if not request.param:
        return reader

    parser = Parser()
    reader._parser = parser

    feed = parser.feed(1, datetime(2010, 1, 1))
    parser.entry(
        1,
        1,
        datetime(2010, 1, 1),
        title='feed one',
        summary='summary',
        content=[Content('content'), Content('another content')],
    )
    parser.entry(1, 2, datetime(2010, 1, 1), title='feed one')
    parser.entry(1, 3, datetime(2010, 1, 1), title='feed one')
    parser.entry(1, 4, datetime(2010, 1, 1), title='feed one')
    parser.entry(1, 5, datetime(2010, 1, 1), title='feed one')

    reader.add_feed(feed.url)
    reader.update_feeds()

    return reader
def content(self):
    rv = []
    for path, highlight in self._search_result.content.items():
        # TODO: find a more correct way to match .content[0].value
        if path.startswith('.content[') and path.endswith('].value'):
            rv.append(Content(str(highlight), 'text/plain'))
            rv.append(Content(highlighted(highlight), 'text/html'))
    return rv
def test_search_entries_order_weights(reader, chunk_size):
    """Entry title beats feed title beats entry content/summary."""

    # TODO: may need fixing once we finish tuning the weights (it should fail)

    reader._search.storage.chunk_size = chunk_size

    parser = Parser()
    reader._parser = parser

    feed_one = parser.feed(1, datetime(2010, 1, 1), title='one')
    entry_one = parser.entry(1, 1, datetime(2010, 1, 1))
    feed_two = parser.feed(2, datetime(2010, 1, 1), title='two')
    entry_two = parser.entry(2, 2, datetime(2010, 1, 1), title='one')
    entry_three = parser.entry(2, 3, datetime(2010, 1, 1), content=[Content('one')])
    entry_four = parser.entry(2, 4, datetime(2010, 1, 1), summary='one')
    entry_five = parser.entry(2, 5, datetime(2010, 1, 1), content=[Content('one')] * 2)
    entry_six = parser.entry(
        2, 6, datetime(2010, 1, 1), summary='one', content=[Content('one')]
    )
    entry_seven = parser.entry(2, 7, datetime(2010, 1, 1), title="does not match")

    reader.add_feed(feed_one.url)
    reader.add_feed(feed_two.url)
    reader.update_feeds()
    reader.enable_search()
    reader.update_search()

    rv = [(e.id, e.feed_url) for e in reader.search_entries('one')]

    assert rv[:2] == [(entry_two.id, feed_two.url), (entry_one.id, feed_one.url)]

    # TODO: how do we check these have the same exact rank?
    assert sorted(rv[2:]) == [
        (entry_three.id, feed_two.url),
        (entry_four.id, feed_two.url),
        (entry_five.id, feed_two.url),
        (entry_six.id, feed_two.url),
    ]
def test_update_search_concurrent_calls(db_path, monkeypatch):
    """Test concurrent calls to reader.update_search() don't interfere
    with one another.

    https://github.com/lemon24/reader/issues/175#issuecomment-652489019

    """
    # This is a very intrusive test, maybe we should move it somewhere else.

    reader = make_reader(db_path)
    parser = reader._parser = Parser()

    feed = parser.feed(1, datetime(2010, 1, 1), title='feed')
    parser.entry(
        1,
        1,
        datetime(2010, 1, 1),
        title='entry',
        summary='summary',
        content=[Content('content')],
    )
    reader.add_feed(feed.url)
    reader.update_feeds()
    reader.enable_search()

    barrier = threading.Barrier(2)

    def target():
        from reader._search import Search

        class MySearch(Search):
            @staticmethod
            def strip_html(*args, **kwargs):
                barrier.wait()
                return Search.strip_html(*args, **kwargs)

        # TODO: remove monkeypatching when make_reader() gets a search_cls argument
        monkeypatch.setattr('reader.core.Search', MySearch)

        reader = make_reader(db_path)
        reader.update_search()

    threads = [threading.Thread(target=target) for _ in range(2)]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()

    (result,) = reader.search_entries('entry')
    assert len(result.content) == 2

    ((rowcount,),) = reader._search.db.execute("select count(*) from entries_search;")
    assert rowcount == 2
def test_search_entries_order_content_recent(reader):
    """When sort='recent' is used, the .content of any individual result
    should still be sorted by relevance.

    """
    parser = Parser()
    reader._parser = parser

    feed = parser.feed(1, datetime(2010, 1, 1))
    one = parser.entry(
        1,
        1,
        datetime(2010, 1, 1),
        title='word',
        content=[Content('word word'), Content('word'), Content('word word word')],
    )
    two = parser.entry(1, 2, datetime(2010, 1, 2), summary='word')

    reader.add_feed(feed.url)
    reader.update_feeds()
    reader.enable_search()
    reader.update_search()

    # sanity check, one is more relevant
    assert [e.id for e in reader.search_entries('word')] == ['1, 1', '1, 2']

    results = list(reader.search_entries('word', sort='recent'))
    # two is first because of updated
    assert [e.id for e in results] == ['1, 2', '1, 1']

    # but within one, the content keys are still sorted by relevance
    assert list(results[1].content) == [
        '.content[2].value',
        '.content[0].value',
        '.content[1].value',
    ]
def test_search_entries_order_title_content_beats_title(reader):
    parser = Parser()
    reader._parser = parser

    feed = parser.feed(1, datetime(2010, 1, 1))
    one = parser.entry(1, 1, datetime(2010, 1, 1), title='one')
    two = parser.entry(1, 2, datetime(2010, 1, 1), title='two')
    three = parser.entry(
        1, 3, datetime(2010, 1, 1), title='one', content=[Content('one')]
    )

    reader.add_feed(feed.url)
    reader.update_feeds()
    reader.enable_search()
    reader.update_search()

    assert [(e.id, e.feed_url) for e in reader.search_entries('one')] == [
        (three.id, feed.url),
        (one.id, feed.url),
    ]
import datetime

from reader import Content
from reader import Enclosure
from reader._types import EntryData
from reader._types import FeedData

feed = FeedData(
    url='{}relative.rss'.format(url_base),
    link='{}file.html'.format(rel_base),
)

entries = [
    EntryData(
        feed_url=feed.url,
        id='7bd204c6-1655-4c27-aeee-53f933c5395f',
        updated=None,
        link='{}blog/post/1'.format(rel_base),
        summary='one <a href="{}target">two</a> three'.format(rel_base),
        content=(
            Content(
                value='<script>evil</script> content', type='text/plain', language=None
            ),
            Content(value='content', type='text/html', language=None),
        ),
        enclosures=(
            # for RSS, feedparser doesn't make relative links absolute
            # (it does for Atom)
            Enclosure(href='enclosure?q=a#fragment'),
        ),
    )
]
def make_entry(title=None, summary=None, content=None):
    entry = Entry('id', None, title=title, summary=summary)
    if content:
        entry = entry._replace(content=[Content(*content)])
    return entry
    author='John Doe',
)

entries = [
    EntryData(
        feed_url=feed.url,
        id='urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a',
        updated=datetime.datetime(2003, 12, 13, 18, 30, 2),
        title='Atom-Powered Robots Run Amok',
        link='http://example.org/2003/12/13/atom03',
        author='John Doe',
        published=datetime.datetime(2003, 12, 13, 17, 17, 51),
        summary='Some text.',
        content=(
            # the text/plain type comes from feedparser
            Content(value='content', type='text/plain'),
            Content(value='content with type', type='text/whatever'),
            Content(value='content with lang', type='text/plain', language='en'),
        ),
        enclosures=(
            # the text/html type comes from feedparser
            Enclosure(href='http://example.org/enclosure', type='text/html'),
            Enclosure(href='http://example.org/enclosure-with-type', type='text/whatever'),
            Enclosure(
                href='http://example.org/enclosure-with-length',
                type='text/html',
                length=1000,
            ),
import datetime

from reader import Content
from reader import Enclosure
from reader._types import EntryData
from reader._types import FeedData

feed = FeedData(
    url='{}empty.json'.format(url_base),
)

entries = [
    EntryData(
        feed_url=feed.url,
        id='1',
        updated=None,
        content=(
            Content(
                value='content',
                type='text/plain',
            ),
        ),
    ),
]
def test_search_entries_basic(reader, sort):
    parser = Parser()
    reader._parser = parser

    feed = parser.feed(1, datetime(2010, 1, 1))
    one = parser.entry(1, 1, datetime(2010, 1, 1), title='one')
    two = parser.entry(1, 2, datetime(2010, 1, 1), title='two', summary='summary')
    three = parser.entry(
        1,
        3,
        datetime(2010, 1, 1),
        title='shall not be named',
        summary='does not match',
        # The emoji is to catch a bug in the json_extract() SQLite function.
        # As of reader 1.4 we're not using it anymore, and the workaround
        # was removed; we keep the emoji in case of regressions.
        # Bug: https://bugs.python.org/issue38749
        # Workaround and more details: https://github.com/lemon24/reader/blob/d4363f683fc18ca12f597809ceca4e7dbd0a303a/src/reader/_sqlite_utils.py#L332
        content=[Content('three 🤩 content')],
    )

    reader.add_feed(feed.url)
    reader.update_feeds()
    reader.enable_search()

    assert list(reader.search_entries('one')) == []

    reader.update_search()

    search = lambda *a, **kw: reader.search_entries(*a, sort=sort, **kw)
    search_counts = lambda *a, **kw: reader.search_entry_counts(*a, **kw)

    # TODO: the asserts below look parametrizable

    assert list(search('zero')) == []
    assert search_counts('zero') == EntrySearchCounts(0, 0, 0, 0)
    assert list(search('one')) == [
        EntrySearchResult(
            feed.url,
            one.id,
            {
                '.title': HighlightedString(one.title, (slice(0, 3),)),
                '.feed.title': HighlightedString(feed.title),
            },
        )
    ]
    assert search_counts('one') == EntrySearchCounts(1, 0, 0, 0)
    assert list(search('two')) == [
        EntrySearchResult(
            feed.url,
            two.id,
            {
                '.title': HighlightedString(two.title, (slice(0, 3),)),
                '.feed.title': HighlightedString(feed.title),
            },
            {'.summary': HighlightedString('summary')},
        )
    ]
    assert list(search('three')) == [
        EntrySearchResult(
            feed.url,
            three.id,
            {
                '.title': HighlightedString(three.title),
                '.feed.title': HighlightedString(feed.title),
            },
            {
                '.content[0].value': HighlightedString(
                    three.content[0].value, (slice(0, 5),)
                )
            },
        )
    ]

    # TODO: fix inconsistent naming

    feed_two = parser.feed(2, datetime(2010, 1, 2))
    feed_two_entry = parser.entry(2, 1, datetime(2010, 1, 2), title=None)
    feed_three = parser.feed(3, datetime(2010, 1, 1), title=None)
    feed_three_entry = parser.entry(3, 1, datetime(2010, 1, 1), title='entry summary')

    reader.add_feed(feed_two.url)
    reader.add_feed(feed_three)
    reader.set_feed_user_title(feed_two, 'a summary of things')
    reader.update_feeds()
    feed_two_entry = reader.get_entry((feed_two.url, feed_two_entry.id))

    reader.update_search()

    # We can't use a set here because the dicts in EntrySearchResult aren't hashable.
    assert {(e.feed_url, e.id): e for e in search('summary')} == {
        (e.feed_url, e.id): e
        for e in [
            EntrySearchResult(
                feed_three.url,
                feed_three_entry.id,
                {'.title': HighlightedString(feed_three_entry.title, (slice(6, 13),))},
            ),
            EntrySearchResult(
                feed_two.url,
                feed_two_entry.id,
                {
                    '.feed.user_title': HighlightedString(
                        feed_two_entry.feed.user_title, (slice(2, 9),)
                    )
                },
            ),
            EntrySearchResult(
                feed.url,
                two.id,
                {
                    '.title': HighlightedString(two.title),
                    '.feed.title': HighlightedString(feed.title),
                },
                {'.summary': HighlightedString(two.summary, (slice(0, 7),))},
            ),
        ]
    }
    assert search_counts('summary') == EntrySearchCounts(3, 0, 0, 0)
def test_update_triggers_no_change(db_path, monkeypatch, set_user_title):
    """update_search() should *not* update the search index if anything
    other than the indexed fields changes.

    """
    from reader._search import Search

    strip_html_called = 0

    class MySearch(Search):
        @staticmethod
        def strip_html(*args, **kwargs):
            nonlocal strip_html_called
            strip_html_called += 1
            return Search.strip_html(*args, **kwargs)

    # TODO: remove monkeypatching when make_reader() gets a search_cls argument
    monkeypatch.setattr('reader.core.Search', MySearch)

    reader = make_reader(db_path)
    reader._parser = parser = Parser()

    feed = parser.feed(1, datetime(2010, 1, 1), title='feed')
    entry = parser.entry(
        1,
        1,
        datetime(2010, 1, 1),
        title='entry',
        summary='summary',
        content=[Content('content')],
    )
    reader.add_feed(feed.url)
    reader.update_feeds()
    if set_user_title:
        reader.set_feed_user_title(feed, 'user title')

    reader.enable_search()
    reader.update_search()

    assert strip_html_called > 0
    strip_html_called = 0

    (old_result,) = reader.search_entries('entry OR feed')

    feed = parser.feed(
        1, datetime(2010, 1, 2), title='feed', link='link', author='author'
    )
    """
    entry = parser.entry(
        1,
        1,
        datetime(2010, 1, 2),
        title='entry',
        summary='summary',
        content=[Content('content')],
        link='link',
        author='author',
        published=datetime(2010, 1, 2),
        enclosures=[Enclosure('enclosure')],
    )
    """
    # NOTE: As of 1.4, updating entries normally (above) uses INSERT OR REPLACE.
    # REPLACE == DELETE + INSERT (https://www.sqlite.org/lang_conflict.html),
    # so updating the entry normally *will not* fire the ON UPDATE trigger,
    # but the ON DELETE and ON INSERT ones (basically, the ON UPDATE trigger
    # never fires at the moment).
    #
    # Meanwhile, we do a (more intrusive/brittle) manual update:
    with reader._search.db as db:
        db.execute(
            """
            UPDATE entries
            SET (
                title,
                link,
                updated,
                author,
                published,
                summary,
                content,
                enclosures
            ) = (
                'entry',
                'http://www.example.com/entries/1',
                '2010-01-02 00:00:00',
                'author',
                '2010-01-02 00:00:00',
                'summary',
                '[{"value": "content", "type": null, "language": null}]',
                '[{"href": "enclosure", "type": null, "length": null}]'
            )
            WHERE (id, feed) = ('1, 1', '1');
            """
        )
    # TODO: Change this test when updating entries uses UPDATE instead of INSERT OR REPLACE

    reader.mark_as_read(entry)
    reader.mark_as_important(entry)

    reader.update_feeds()
    if set_user_title:
        reader.set_feed_user_title(feed, 'user title')

    reader.update_search()

    (new_result,) = reader.search_entries('entry OR feed')

    assert old_result == new_result
    assert strip_html_called == 0
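# A minimal standalone sketch (not part of the reader test suite) illustrating the
# NOTE in test_update_triggers_no_change above: in SQLite, INSERT OR REPLACE on a
# conflicting row fires the DELETE and INSERT triggers rather than the UPDATE one.
# Per https://www.sqlite.org/lang_conflict.html, the DELETE trigger only fires when
# the recursive_triggers pragma is on, so this sketch enables it explicitly (whether
# reader does so is not shown here). Table and trigger names are made up for the demo.
import sqlite3

db = sqlite3.connect(':memory:')
db.execute("PRAGMA recursive_triggers = ON;")
db.executescript("""
    CREATE TABLE t (id INTEGER PRIMARY KEY, value TEXT);
    CREATE TABLE log (event TEXT);
    CREATE TRIGGER t_ai AFTER INSERT ON t BEGIN INSERT INTO log VALUES ('insert'); END;
    CREATE TRIGGER t_au AFTER UPDATE ON t BEGIN INSERT INTO log VALUES ('update'); END;
    CREATE TRIGGER t_ad AFTER DELETE ON t BEGIN INSERT INTO log VALUES ('delete'); END;
    INSERT INTO t VALUES (1, 'old');
    DELETE FROM log;
    INSERT OR REPLACE INTO t VALUES (1, 'new');
""")
# the REPLACE shows up as a delete plus an insert; the UPDATE trigger never fires
assert sorted(row[0] for row in db.execute("SELECT event FROM log")) == ['delete', 'insert']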
        ['.title', '.feed.title', '.summary'],
    ),
    (
        lambda r: r._parser.entry(1, 1, datetime(2010, 1, 3), summary='new'),
        ['.title', '.feed.title', '.summary'],
    ),
],
"after update on entries: content": [
    (
        lambda r: r._parser.entry(1, 1, datetime(2010, 1, 1)),
        ['.title', '.feed.title'],
    ),
    (
        lambda r: r._parser.entry(
            1, 1, datetime(2010, 1, 2), content=[Content('old')]
        ),
        ['.title', '.feed.title', '.content[0].value'],
    ),
    (
        lambda r: r._parser.entry(
            1, 1, datetime(2010, 1, 3), content=[Content('new')]
        ),
        ['.title', '.feed.title', '.content[0].value'],
    ),
    (
        lambda r: r._parser.entry(
            1,
            1,
            datetime(2010, 1, 4),
            content=[Content('new'), Content('another one')],
        ),
    author='Example editor ([email protected])',
)

entries = [
    EntryData(
        feed_url=feed.url,
        id='7bd204c6-1655-4c27-aeee-53f933c5395f',
        updated=datetime.datetime(2009, 9, 6, 16, 20),
        title='Example entry',
        link='http://www.example.com/blog/post/1',
        author='Example editor',
        published=None,
        summary='Here is some text containing an interesting description.',
        content=(
            # the text/plain type comes from feedparser
            Content(value='Example content', type='text/plain'),
        ),
        enclosures=(
            Enclosure(href='http://example.com/enclosure'),
            Enclosure(href='http://example.com/enclosure-with-type', type='image/jpeg'),
            Enclosure(href='http://example.com/enclosure-with-length', length=100000),
            Enclosure(href='http://example.com/enclosure-with-bad-length'),
        ),
    ),
    EntryData(
        feed_url=feed.url,
        id='00000000-1655-4c27-aeee-00000000',
        updated=datetime.datetime(2009, 9, 6, 0, 0, 0),
        title='Example entry, again',
    ),
]