def test_delete(ctx):
    """Deleting a RemoteFeed removes the feed doc, its entry refs and its NewsItems."""
    from melkman.db import RemoteFeed, NewsItem, NewsItemRef

    feed_url = 'http://example.org/feeds/1'
    feed_xml = random_atom_feed(feed_url, 25)
    item_ids = melk_ids_in(feed_xml, feed_url)

    feed = RemoteFeed.create_from_url(feed_url, ctx)
    feed.update_from_feed(feed_xml, 'test')
    feed.save()

    feed_id = feed.id
    assert feed_id in ctx.db

    # every entry is tracked on the feed, has a ref document in the db,
    # and a news item was created too...
    ref_ids = []
    for item_id in item_ids:
        assert item_id in feed.entries
        ref_id = NewsItemRef.dbid(feed_id, item_id)
        ref_ids.append(ref_id)
        assert ref_id in ctx.db
        assert item_id in ctx.db

    # now destroy! the feed doc, refs and items should all disappear
    feed.delete()
    assert feed_id not in ctx.db
    for ref_id in ref_ids:
        assert ref_id not in ctx.db
    for item_id in item_ids:
        assert item_id not in ctx.db
def test_item_trace_update(ctx):
    """An item's fields only refresh when the entry's timestamp moves forward."""
    from melkman.db import NewsItem, RemoteFeed

    def _assert_fields(item, info):
        # compare every field except the atom id itself
        for key, expected in info.items():
            if key == 'id':
                continue
            actual = getattr(item, key)
            assert actual == expected, "Key %s: Expected %s, got %s" % (key, expected, actual)

    def assert_item_matches(f, iid, info):
        # the trace on the feed and the standalone NewsItem doc must agree
        _assert_fields(f.entries[iid], info)
        _assert_fields(NewsItem.get(iid, ctx), info)

    feed_url = 'http://example.org/feed'
    feed = RemoteFeed.create_from_url(feed_url, ctx)

    atom_id = 'http://example.org/articles.php?id=1'
    time1 = no_micro(datetime.utcnow())
    info1 = {'id': atom_id,
             'title': 'Title1',
             'author': 'author1',
             'link': 'http://example.org/link1',
             'summary': 'summary text 1',
             'timestamp': time1}
    doc_v1 = make_atom_feed(feed_url, [make_atom_entry(**info1)])
    feed.update_from_feed(doc_v1, method='test')
    feed.save()

    melk_id = melk_ids_in(doc_v1, feed_url)[0]
    assert_item_matches(feed, melk_id, info1)

    # change a field but not the timestamp: the item must stay the same
    info2 = dict(info1, title='Title 2')
    doc_v2 = make_atom_feed(feed_url, [make_atom_entry(**info2)])
    feed.update_from_feed(doc_v2, method='test')
    feed.save()
    assert_item_matches(feed, melk_id, info1)  # still matches info1 (no update)

    # now bump the timestamp along with the other fields: the item updates
    time3 = no_micro(time1 + timedelta(seconds=1))
    info3 = {'id': atom_id,
             'title': 'Title3',
             'author': 'author3',
             'link': 'http://example.org/link3',
             'summary': 'summary text 3',
             'timestamp': time3}
    doc_v3 = make_atom_feed(feed_url, [make_atom_entry(**info3)])
    feed.update_from_feed(doc_v3, method='test')
    feed.save()
    assert_item_matches(feed, melk_id, info3)
def test_disabled_unsubscribes(ctx):
    """
    Tests that if pubsub is disabled for a feed, it becomes
    unsubscribed from its hub.
    """
    from eventlet import sleep, spawn
    from melkman.db import RemoteFeed
    from melkman.fetch.pubsubhubbub import WSGISubClient, callback_url_for
    from melkman.fetch.pubsubhubbub import hubbub_sub, update_pubsub_state

    # spin up the subscription callback client and a fake hub
    w = WSGISubClient(ctx)
    client = spawn(w.run)
    hub = FakeHub()
    hub_proc = spawn(hub.run)
    hub_url = 'http://localhost:%d/' % hub.port

    feed_url = 'http://example.org/feeds/99'
    rf = RemoteFeed.create_from_url(feed_url, ctx)
    # feed advertises the fake hub via its link relations
    rf.feed_info = {'links': [{'rel': 'self', 'href': feed_url},
                              {'rel': 'hub', 'href': hub_url}]}
    rf.save()

    # subscribe to the feed on the hub
    cb = callback_url_for(feed_url, ctx)
    assert not hub.is_verified(cb, feed_url)
    r, c = hubbub_sub(rf, ctx)
    assert r.status == 202, 'Expected 202, got %d' % r.status
    sleep(.5)  # give the async verification handshake a moment to finish
    assert hub.is_verified(cb, feed_url)

    # disable pubsub for the feed
    rf = RemoteFeed.get_by_url(feed_url, ctx)
    assert rf.hub_info.enabled == True and rf.hub_info.subscribed == True
    rf.hub_info.enabled = False
    rf.save()

    # trigger an update
    update_pubsub_state(rf, ctx)

    # check that it is now unsubscribed.
    sleep(.5)
    assert not hub.is_verified(cb, feed_url)
    rf = RemoteFeed.get_by_url(feed_url, ctx)
    assert rf.hub_info.enabled == False and rf.hub_info.subscribed == False

    # tear down the background workers
    client.kill()
    client.wait()
    hub_proc.kill()
    hub_proc.wait()
def index_feed_polling(url, context, timeout=15, request_info=None):
    """
    Poll the feed at the url given and index it immediately on the
    calling thread.

    url -- feed url to fetch and index
    context -- melkman context (db, config, message bus)
    timeout -- http timeout in seconds for the fetch
    request_info -- optional dict of request flags; the key
        'skip_reschedule' suppresses scheduling of the next poll.
    """
    if request_info is None:
        request_info = {}

    # look the feed up, creating it on first sight of this url
    feed = RemoteFeed.get_by_url(url, context)
    if feed is None:
        feed = RemoteFeed.create_from_url(url, context)

    # NOTE: deliberately '== False' -- only an explicit rejection aborts;
    # other falsy return values are treated as approval.
    if check_request_approved(feed, request_info, context) == False:
        log.warn("Rejected index request for %s" % url)
        return

    reschedule = not request_info.get('skip_reschedule', False)
    http_cache = context.config.get('http', {}).get('cache', None)

    # fetch
    http = Http(cache=http_cache, timeout=timeout)
    http.force_exception_to_status_code = True
    response, content = http.request(url, 'GET')

    if response.fromcache:
        # content unchanged since last fetch: record a successful no-op
        feed.record_update_info(success=True, updates=0, method=METHOD_POLL)
    elif response.status != 200:
        feed.record_update_info(success=False, updates=0,
                                reason=response.reason, method=METHOD_POLL)
    else:
        # 200 status code, not from cache, do update...
        feed.update_from_feed(content, method=METHOD_POLL)

    # compute the next time to check...
    next_interval = compute_next_fetch_interval(feed.update_history)
    log.debug("next update interval for %s = %s" % (feed.url, next_interval))
    feed.next_poll_time = datetime.utcnow() + next_interval
    feed.poll_in_progress = False
    feed.save()

    log.info("Updated feed %s success: %s, %d new items" %
             (feed.url, feed.update_history[0].success,
              feed.update_history[0].updates))

    # whee... request at the next time !
    if reschedule:
        message_id = 'periodic_index_%s' % RemoteFeed.id_for_url(feed.url)
        schedule_feed_index(feed.url, feed.next_poll_time, context,
                            message_id=message_id)

    run_post_index_hooks(feed, context)
def test_max_history_len(ctx):
    """update_history grows to MAX_HISTORY entries, then acts as a bounded FIFO."""
    from melkman.db.remotefeed import RemoteFeed, MAX_HISTORY

    rf = RemoteFeed.create_from_url('http://example.org/feeds/1', ctx)
    for i in range(5 * MAX_HISTORY):
        reason = 'update %d' % i
        rf.record_update_info(reason=reason)
        if i < MAX_HISTORY:
            # still filling up: one entry per update recorded
            assert len(rf.update_history) == i + 1
        else:
            # full: length is capped and the newest entry is in front
            assert len(rf.update_history) == MAX_HISTORY
            assert rf.update_history[0].reason == reason
def test_hub_lease_renew(ctx):
    """
    tests that we resubscribe with a hub within the hub specified lease window.
    """
    from eventlet import sleep, spawn
    from melkman.db import RemoteFeed
    from melkman.fetch.pubsubhubbub import WSGISubClient, callback_url_for
    from melkman.fetch.pubsubhubbub import hubbub_sub, update_pubsub_state

    # spin up the subscription callback client
    w = WSGISubClient(ctx)
    client = spawn(w.run)

    # create a hub with a very short lease time.
    hub = FakeHub(lease_seconds=2)
    hub_proc = spawn(hub.run)
    hub_url = 'http://localhost:%d/' % hub.port

    feed_url = 'http://example.org/feeds/99'
    rf = RemoteFeed.create_from_url(feed_url, ctx)
    # feed advertises the fake hub via its link relations
    rf.feed_info = {'links': [{'rel': 'self', 'href': feed_url},
                              {'rel': 'hub', 'href': hub_url}]}
    rf.save()

    # subscribe to the feed on the hub
    cb = callback_url_for(feed_url, ctx)
    assert not hub.is_verified(cb, feed_url)
    r, c = hubbub_sub(rf, ctx)
    assert r.status == 202, 'Expected 202, got %d' % r.status
    sleep(.5)  # let the async verification handshake finish
    assert hub.is_verified(cb, feed_url)

    rf = RemoteFeed.get_by_url(feed_url, ctx)
    assert hub.renewals(cb, feed_url) == 0
    # wait past the 2 second lease window before recomputing pubsub state
    sleep(2)
    update_pubsub_state(rf, ctx)
    # make sure we triggered a lease renewal
    assert hub.renewals(cb, feed_url) == 1

    # tear down the background workers
    client.kill()
    client.wait()
    hub_proc.kill()
    hub_proc.wait()
def test_update_feed_partial_repeat(ctx):
    """
    Re-indexing a document that mixes previously seen entries with new
    ones only counts the new entries as updates.
    """
    from melkman.db import RemoteFeed

    feed_url = 'http://example.org/%s' % random_id()
    first_batch = dummy_atom_entries(10)
    second_batch = dummy_atom_entries(10)

    # index the first batch and note its melk ids
    doc = make_atom_feed(feed_url, first_batch)
    first_ids = melk_ids_in(doc, feed_url)
    assert len(first_ids) == 10

    feed = RemoteFeed.create_from_url(feed_url, ctx)
    updates = feed.update_from_feed(doc, method='test')
    feed.save()
    # everything was new/updated and landed in the feed
    assert updates == 10
    assert all(feed.has_news_item(iid) for iid in first_ids)

    # re-index with the second batch prepended to the first
    doc = make_atom_feed(feed_url, second_batch + first_batch)
    new_ids = {x for x in melk_ids_in(doc, feed_url) if x not in first_ids}
    assert len(new_ids) == 10

    updates = feed.update_from_feed(doc, method='test')
    # this list should be ready to push to the db with no probs
    feed.save()
    # only the 10 genuinely new entries count as updates
    assert updates == 10
    assert all(feed.has_news_item(iid) for iid in new_ids)
def index_feed_push(url, content, context, request_info=None):
    """
    Index feed content that was pushed to us for the feed at the given
    url (rather than polled), on the calling thread.

    url -- the feed url the pushed content belongs to
    content -- the feed document to index
    context -- melkman context (db, config, message bus)
    request_info -- optional dict of request flags
    """
    if request_info is None:
        request_info = {}

    # look the feed up, creating it on first sight of this url
    feed = RemoteFeed.get_by_url(url, context)
    if feed is None:
        feed = RemoteFeed.create_from_url(url, context)

    # NOTE: deliberately '== False' -- only an explicit rejection aborts;
    # other falsy return values are treated as approval.
    if check_request_approved(feed, request_info, context) == False:
        log.warn("Rejected index request for %s" % url)
        return

    # 200 status code, not from cache, do update...
    feed.update_from_feed(content, method=METHOD_PUSH)
    feed.save()

    log.info("Updated feed %s success: %s, %d new items" %
             (feed.url, feed.update_history[0].success,
              feed.update_history[0].updates))

    run_post_index_hooks(feed, context)
def test_update_feed_repeat_index(ctx):
    """
    Indexing identical content a second (and third) time is a no-op.
    """
    from melkman.db import RemoteFeed

    # build a 10 entry feed document
    feed_url = 'http://example.org/%s' % random_id()
    content = random_atom_feed(feed_url, 10)
    ids = melk_ids_in(content, feed_url)
    assert len(ids) == 10

    feed = RemoteFeed.create_from_url(feed_url, ctx)

    # first pass: everything is new
    updates = feed.update_from_feed(content, method='test')
    feed.save()
    assert updates == 10
    assert all(feed.has_news_item(iid) for iid in ids)

    # second pass with identical content: no effect
    updates = feed.update_from_feed(content, method='test')
    assert updates == 0
    feed.save()

    # even after a round-trip through the database: still no effect
    feed.reload()
    updates = feed.update_from_feed(content, method='test')
    assert updates == 0
def test_hub_lease_renew_failover(ctx):
    """
    tests that if we fail to renew a lease with a hub we will failover
    to a different hub if one is available.
    """
    from eventlet import sleep, spawn
    from melkman.db import RemoteFeed
    from melkman.fetch.pubsubhubbub import WSGISubClient, callback_url_for
    from melkman.fetch.pubsubhubbub import hubbub_sub, update_pubsub_state

    # spin up the subscription callback client
    w = WSGISubClient(ctx)
    client = spawn(w.run)

    # create a hub with a very short lease time.
    hub = FakeHub(lease_seconds=2)
    hub_proc = spawn(hub.run)
    hub_url = 'http://localhost:%d/' % hub.port

    # ... and a second hub on another port to fail over to
    hub2 = FakeHub(port=9298)
    hub2_proc = spawn(hub2.run)
    hub2_url = 'http://localhost:%d/' % hub2.port

    feed_url = 'http://example.org/feeds/99'
    rf = RemoteFeed.create_from_url(feed_url, ctx)
    # the feed advertises both hubs
    rf.feed_info = {'links': [{'rel': 'self', 'href': feed_url},
                              {'rel': 'hub', 'href': hub_url},
                              {'rel': 'hub', 'href': hub2_url}]}
    rf.save()

    # subscribe to the feed on the hub
    cb = callback_url_for(feed_url, ctx)
    assert not hub.is_verified(cb, feed_url)
    r, c = hubbub_sub(rf, ctx)
    assert r.status == 202, 'Expected 202, got %d' % r.status
    sleep(.5)  # let the async verification handshake finish
    rf = RemoteFeed.get_by_url(feed_url, ctx)
    # subscribed to the first hub only at this point
    assert rf.hub_info.subscribed == True
    assert rf.hub_info.hub_url == hub_url
    assert hub.is_verified(cb, feed_url)
    assert not hub2.is_verified(cb, feed_url)
    assert hub.renewals(cb, feed_url) == 0

    # let the 2 second lease elapse
    sleep(2)
    # kill the first hub so that when we update,
    # the renewal will fail...
    hub_proc.kill()
    # when this update is triggered, renewal should fail and
    # we should instead subscribe to the alternate hub.
    update_pubsub_state(rf, ctx)
    assert hub.renewals(cb, feed_url) == 0
    rf = RemoteFeed.get_by_url(feed_url, ctx)
    assert rf.hub_info.subscribed == True
    assert rf.hub_info.hub_url == hub2_url
    assert hub2.is_verified(cb, feed_url)

    # tear down the background workers (hub_proc was already killed above)
    client.kill()
    client.wait()
    hub2_proc.kill()
    hub2_proc.wait()
def test_hub_invalidation_resub(ctx):
    """
    tests that if a currently subscribed hub is no longer listed,
    we subscribe to a different hub if any are listed.
    """
    from eventlet import sleep, spawn
    from melkman.db import RemoteFeed
    from melkman.fetch.pubsubhubbub import WSGISubClient, callback_url_for
    from melkman.fetch.pubsubhubbub import hubbub_sub, update_pubsub_state

    # spin up the subscription callback client
    w = WSGISubClient(ctx)
    client = spawn(w.run)

    # create two hubs
    hub = FakeHub()
    hub_proc = spawn(hub.run)
    hub_url = 'http://localhost:%d/' % hub.port
    hub2 = FakeHub(port=9298)
    hub2_proc = spawn(hub2.run)
    hub2_url = 'http://localhost:%d/' % hub2.port

    feed_url = 'http://example.org/feeds/99'
    rf = RemoteFeed.create_from_url(feed_url, ctx)
    # initially only the first hub is advertised
    rf.feed_info = {'links': [{'rel': 'self', 'href': feed_url},
                              {'rel': 'hub', 'href': hub_url}]}
    rf.save()

    # subscribe to the feed on the hub
    cb = callback_url_for(feed_url, ctx)
    assert not hub.is_verified(cb, feed_url)
    r, c = hubbub_sub(rf, ctx)
    assert r.status == 202, 'Expected 202, got %d' % r.status
    sleep(.5)  # let the async verification handshake finish
    assert hub.is_verified(cb, feed_url)

    # remove the hub from the list of hubs, but replace it with another
    rf = RemoteFeed.get_by_url(feed_url, ctx)
    rf.feed_info = {'links': [{'rel': 'self', 'href': feed_url},
                              {'rel': 'hub', 'href': hub2_url}]}
    rf.save()

    # trigger an update
    update_pubsub_state(rf, ctx)

    # check that it is now unsubscribed from the original hub, and
    # is now subscribed to the new hub.
    sleep(2)
    assert not hub.is_verified(cb, feed_url)
    assert hub2.is_verified(cb, feed_url)
    rf = RemoteFeed.get_by_url(feed_url, ctx)
    assert rf.hub_info.enabled == True and rf.hub_info.subscribed == True

    # tear down the background workers
    client.kill()
    client.wait()
    hub_proc.kill()
    hub_proc.wait()
    hub2_proc.kill()
    hub2_proc.wait()
def test_push_index_digest(ctx):
    """Pushed content is only indexed when it carries the correct hub
    signature digest for the feed's configured shared secret."""
    from melk.util.nonce import nonce_str
    from melkman.db.remotefeed import RemoteFeed
    from melkman.fetch import push_feed_index
    from melkman.fetch.worker import run_feed_indexer
    from eventlet import sleep, spawn
    from melkman.fetch.pubsubhubbub import psh_digest

    # start a feed indexer
    indexer = spawn(run_feed_indexer, ctx)

    # a feed that is subscribed to a hub (but has no secret yet)
    url = 'http://www.example.com/feeds/2'
    rf = RemoteFeed.create_from_url(url, ctx)
    rf.hub_info.enabled = True
    rf.hub_info.subscribed = True
    rf.save()

    secret = nonce_str()
    content = random_atom_feed(url, 10)
    ids = melk_ids_in(content, url)
    correct_digest = 'sha1=%s' % psh_digest(content, secret)
    wrong_digest = 'wrong digest'

    #
    # no hub secret is specified on the feed
    #
    push_feed_index(url, content, ctx, digest=wrong_digest, from_hub=True)
    sleep(.5)  # let the indexer worker consume the message
    rf = RemoteFeed.get_by_url(url, ctx)
    for iid in ids:
        assert iid not in rf.entries

    push_feed_index(url, content, ctx, digest=None, from_hub=True)
    sleep(.5)
    rf = RemoteFeed.get_by_url(url, ctx)
    for iid in ids:
        assert iid not in rf.entries

    # even the correct digest fails as no digest has been set
    push_feed_index(url, content, ctx, digest=correct_digest, from_hub=True)
    sleep(.5)
    rf = RemoteFeed.get_by_url(url, ctx)
    for iid in ids:
        assert iid not in rf.entries

    #
    # now set the hub secret
    #
    rf.hub_info.secret = secret
    rf.save()

    push_feed_index(url, content, ctx, digest=wrong_digest, from_hub=True)
    sleep(.5)
    rf = RemoteFeed.get_by_url(url, ctx)
    for iid in ids:
        assert iid not in rf.entries

    push_feed_index(url, content, ctx, digest=None, from_hub=True)
    sleep(.5)
    rf = RemoteFeed.get_by_url(url, ctx)
    for iid in ids:
        assert iid not in rf.entries

    # finally, the correct digest should work now...
    push_feed_index(url, content, ctx, digest=correct_digest, from_hub=True)
    sleep(.5)
    rf = RemoteFeed.get_by_url(url, ctx)
    for iid in ids:
        assert iid in rf.entries

    # tear down the indexer worker
    indexer.kill()
    indexer.wait()
def test_sub_to_hub(ctx):
    """
    test make_sub_request and make_unsub_request
    """
    from httplib2 import Http
    from eventlet import sleep, spawn
    from melk.util.nonce import nonce_str
    import traceback
    from melkman.db import RemoteFeed
    from melkman.fetch.worker import run_feed_indexer
    from melkman.fetch.pubsubhubbub import WSGISubClient
    from melkman.fetch.pubsubhubbub import callback_url_for
    from melkman.fetch.pubsubhubbub import hubbub_sub
    from melkman.fetch.pubsubhubbub import hubbub_unsub
    from melkman.fetch.pubsubhubbub import psh_digest
    import logging

    logging.basicConfig(level=logging.WARN)

    # background workers: subscription callback client, indexer, fake hub
    w = WSGISubClient(ctx)
    client = spawn(w.run)
    indexer = spawn(run_feed_indexer, ctx)
    hub = FakeHub()
    hub_proc = spawn(hub.run)
    hub_url = 'http://localhost:%d/' % hub.port

    feed_url = 'http://example.org/feeds/99'
    rf = RemoteFeed.create_from_url(feed_url, ctx)
    # the feed advertises the fake hub
    rf.feed_info = {'links': [{'rel': 'self', 'href': feed_url},
                              {'rel': 'hub', 'href': hub_url}]}
    rf.save()

    cb = callback_url_for(feed_url, ctx)

    # subscribe to the hub
    assert not hub.is_verified(cb, feed_url)
    r, c = hubbub_sub(rf, ctx)
    assert r.status == 202, 'Expected 202, got %d' % r.status
    sleep(.5)  # let the async verification handshake finish
    assert hub.is_verified(cb, feed_url)
    secret = hub.secret_for(cb, feed_url)

    http = Http()

    # simulate hub posting to callback URL
    content = random_atom_feed(feed_url, 10, link=feed_url, hub_urls=[hub_url])
    digest = 'sha1=%s' % psh_digest(content, secret)
    r, c = http.request(cb, 'POST', body=content,
                        headers={'X-Hub-Signature': digest})
    assert r.status == 200, 'Expected 200, got %d' % r.status
    sleep(0.5)

    # since we are subscribed, new items should be in the feed now
    rf = RemoteFeed.get_by_url(feed_url, ctx)
    assert len(rf.entries) == 10
    for iid in melk_ids_in(content, feed_url):
        assert iid in rf.entries

    # unsubscribe from hub
    r, c = hubbub_unsub(rf, ctx)
    assert r.status == 202, 'Expected 202, got %d' % r.status
    sleep(.5)
    assert not hub.is_verified(cb, feed_url)

    # simulate another POST to the callback URL
    # this time it should fail (we are not subscribed)
    content = random_atom_feed(feed_url, 10, link=feed_url, hub_urls=[hub_url])
    digest = "sha1=%s" % psh_digest(content, secret)
    r, c = http.request(cb, 'POST', body=content,
                        headers={'X-Hub-Signature': digest})
    assert r.status == 200, 'Expected 200, got %d' % r.status
    sleep(0.5)

    # items should be the same as before (not subscribed)
    rf = RemoteFeed.get_by_url(feed_url, ctx)
    assert len(rf.entries) == 10
    for iid in melk_ids_in(content, feed_url):
        assert not iid in rf.entries

    # tear down the background workers
    client.kill()
    client.wait()
    indexer.kill()
    indexer.wait()
    hub_proc.kill()
    hub_proc.wait()
def test_sub_push(ctx):
    """Pushed hub content is only indexed for an existing, subscribed
    feed presented with a valid signature for the feed's secret."""
    from httplib2 import Http
    from eventlet import sleep, spawn
    from melk.util.nonce import nonce_str
    from melkman.db import RemoteFeed
    from melkman.fetch.worker import run_feed_indexer
    from melkman.fetch.pubsubhubbub import WSGISubClient, callback_url_for, psh_digest
    import logging

    logging.basicConfig(level=logging.WARN)

    # background workers: subscription callback client and indexer
    w = WSGISubClient(ctx)
    client = spawn(w.run)
    indexer = spawn(run_feed_indexer, ctx)

    http = Http()
    url = 'http://example.org/feed/0'
    content = random_atom_feed(url, 10)
    secret = nonce_str()
    digest = 'sha1=%s' % psh_digest(content, secret)
    cb = callback_url_for(url, ctx)

    assert RemoteFeed.get_by_url(url, ctx) == None

    # try posting something that is not subscribed
    r, c = http.request(cb, 'POST', body=content,
                        headers={'X-Hub-Signature': digest})
    assert r.status == 200, 'Expected 200, got %d' % r.status
    sleep(1)
    # nothing should happen...
    # (fixed: was RemoteFeed.get(url, ctx) -- a url is not a document id,
    # so that lookup was vacuously None; use get_by_url like the rest
    # of this test)
    assert RemoteFeed.get_by_url(url, ctx) == None

    # set up the feed, but don't subscribe
    rf = RemoteFeed.create_from_url(url, ctx)
    rf.save()
    r, c = http.request(cb, 'POST', body=content,
                        headers={'X-Hub-Signature': digest})
    assert r.status == 200, 'Expected 200, got %d' % r.status
    sleep(1)
    # nothing should happen...
    rf = RemoteFeed.get_by_url(url, ctx)
    assert len(rf.entries) == 0

    # now set it up
    rf.hub_info.enabled = True
    rf.hub_info.subscribed = True
    rf.hub_info.secret = secret
    rf.save()

    # try with wrong digest...
    r, c = http.request(cb, 'POST', body=content,
                        headers={'X-Hub-Signature': 'wrong'})
    assert r.status == 200, 'Expected 200, got %d' % r.status
    sleep(0.5)
    # nothing should happen...
    rf = RemoteFeed.get_by_url(url, ctx)
    assert len(rf.entries) == 0

    # try with no digest
    r, c = http.request(cb, 'POST', body=content)
    assert r.status == 200, 'Expected 200, got %d' % r.status
    sleep(0.5)
    # nothing should happen...
    rf = RemoteFeed.get_by_url(url, ctx)
    assert len(rf.entries) == 0

    # finally, try with correct digest -- this time the items are indexed
    r, c = http.request(cb, 'POST', body=content,
                        headers={'X-Hub-Signature': digest})
    assert r.status == 200, 'Expected 200, got %d' % r.status
    sleep(0.5)
    rf = RemoteFeed.get_by_url(url, ctx)
    assert len(rf.entries) == 10
    for iid in melk_ids_in(content, url):
        assert iid in rf.entries

    # tear down the background workers
    client.kill()
    client.wait()
    indexer.kill()
    indexer.wait()
def test_sub_verify(ctx):
    """Hub verification GETs on the callback URL succeed only when the
    feed exists and its hub state matches the requested hub.mode."""
    from httplib2 import Http
    from eventlet import spawn
    from melk.util.nonce import nonce_str
    from melkman.db import RemoteFeed
    from melkman.fetch.pubsubhubbub import WSGISubClient, callback_url_for
    import logging

    logging.basicConfig(level=logging.WARN)

    # spin up the subscription callback client
    w = WSGISubClient(ctx)
    client = spawn(w.run)

    http = Http()
    url = 'http://example.org/feed/0'
    challenge = nonce_str()
    verify_token = nonce_str()
    secret = nonce_str()

    # build a subscribe-verification callback URL
    cb = callback_url_for(url, ctx)
    cb += '?hub.mode=subscribe'
    cb += '&hub.topic=%s' % url
    cb += '&hub.challenge=%s' % challenge
    cb += '&hub.verify_token=%s' % verify_token

    # try verifying something that doesn't exist
    r, c = http.request(cb, 'GET')
    assert r.status == 404, 'Expected 404, got %d' % r.status

    # now create it
    rf = RemoteFeed.create_from_url(url, ctx)
    rf.feed_info = {"links": [{"rel": "self", "href": url}]}
    rf.save()

    # still should not verify (hub info not yet configured)
    r, c = http.request(cb, 'GET')
    assert r.status == 404, 'Expected 404, got %d' % r.status

    # now set appropriate fields on the feed object
    rf.hub_info.enabled = True
    rf.hub_info.verify_token = verify_token
    rf.hub_info.secret = secret
    rf.save()

    # now it should accept verification and echo the challenge...
    # (fixed: failure messages referenced undefined name 'challence',
    # which would raise NameError instead of a useful assertion message)
    for i in range(3):
        r, c = http.request(cb, 'GET')
        assert r.status == 200, 'Expected 200, got %d' % r.status
        assert c == challenge, 'expected %s, got %s' % (challenge, c)

    # create unsubscribe callback...
    cb = callback_url_for(url, ctx)
    cb += '?hub.mode=unsubscribe'
    cb += '&hub.topic=%s' % url
    cb += '&hub.challenge=%s' % challenge
    cb += '&hub.verify_token=%s' % verify_token

    # currently it should fail, we are not unsubscribed
    r, c = http.request(cb, 'GET')
    assert r.status == 404, 'Expected 404, got %d' % r.status

    # after disabling, the unsub verify should be okay
    rf.reload()
    rf.hub_info.enabled = False
    rf.save()
    r, c = http.request(cb, 'GET')
    assert r.status == 200, 'Expected 200, got %d' % r.status
    assert c == challenge, 'expected %s, got %s' % (challenge, c)

    # now destroy the feed entirely,
    # unsub request for stuff that
    # does not exist should also
    # verify.
    del ctx.db[rf.id]
    r, c = http.request(cb, 'GET')
    assert r.status == 200, 'Expected 200, got %d' % r.status
    assert c == challenge, 'expected %s, got %s' % (challenge, c)

    # tear down the callback client
    client.kill()
    client.wait()