def test_new_icon_found_domain_but_is_already_indexed(
        session, requests_mock, bg_client: TestAdapter[PickleMessage], mock_s3):
    icon_url = URL.from_string(
        f"http://{random_string()}.example.com/favicon.ico")
    image_buff = random_image_fileobj()
    hash_bytes = hashlib.blake2b(image_buff.read()).digest()
    image_buff.seek(0)
    requests_mock.add(
        responses.GET,
        url=icon_url.to_string(),
        body=image_buff.read(),
        status=200,
        stream=True,
    )
    requests_mock.start()
    upsert_url(session, icon_url)
    session.commit()

    event = NewIconFound(icon_url_uuid=icon_url.url_uuid)
    bg_client.send(PickleMessage.from_obj(event))
    # send the same event again: the second delivery should take the
    # "already indexed" path and not create a duplicate
    bg_client.send(PickleMessage.from_obj(event))

    icon, domain_icon = (session.query(
        Icon, DomainIcon).join(DomainIcon).filter(
            DomainIcon.scheme == icon_url.scheme,
            DomainIcon.netloc == icon_url.netloc).one())
    assert icon.source_blake2b_hash == hash_bytes
    assert domain_icon.scheme == icon_url.scheme
    assert domain_icon.netloc == icon_url.netloc


def test_new_icon_found_for_page_url_duplicated_by_content(
        session, requests_mock, bg_client: TestAdapter[PickleMessage], mock_s3):
    """Test that when a new page icon is found whose content hashes the same
    as an existing icon, the existing icon is reused."""
    page_url_1 = URL.from_string(
        f"http://{random_string()}.example.com/index.html")
    page_url_2 = page_url_1.follow("/otherindex.html")
    icon_url_1 = page_url_1.follow("favicon1.png")
    icon_url_2 = page_url_2.follow("favicon2.png")
    image_buff = random_image_fileobj()
    hash_bytes = hashlib.blake2b(image_buff.read()).digest()
    image_buff.seek(0)
    requests_mock.add(
        responses.GET,
        url=icon_url_1.to_string(),
        body=image_buff.read(),
        status=200,
        stream=True,
    )
    image_buff.seek(0)
    requests_mock.add(
        responses.GET,
        url=icon_url_2.to_string(),
        body=image_buff.read(),
        status=200,
        stream=True,
    )
    requests_mock.start()
    upsert_url(session, page_url_1)
    upsert_url(session, page_url_2)
    upsert_url(session, icon_url_1)
    upsert_url(session, icon_url_2)
    session.commit()

    event = NewIconFound(icon_url_uuid=icon_url_1.url_uuid,
                         page_url_uuid=page_url_1.url_uuid)
    bg_client.send(PickleMessage.from_obj(event))

    event = NewIconFound(icon_url_uuid=icon_url_2.url_uuid,
                         page_url_uuid=page_url_2.url_uuid)
    bg_client.send(PickleMessage.from_obj(event))

    url_icon_obj_1, url_icon_obj_2 = (session.query(URLIcon).join(
        SQLAUrl, URLIcon.url_uuid == SQLAUrl.url_uuid).filter(
            SQLAUrl.netloc == page_url_1.netloc).order_by(SQLAUrl.path).all())

    assert url_icon_obj_1.icon == url_icon_obj_2.icon
    assert url_icon_obj_1.icon.source_blake2b_hash == hash_bytes


def test_recrawl_of_hn_api(session, bg_client: TestAdapter[PickleMessage],
                           mock_s3, requests_mock):
    url = random_url()
    upsert_url(session, url)
    session.commit()

    hn_id = random_numeric_id()
    api_url = get_hn_api_url(url)
    requests_mock.add(
        responses.GET,
        url=api_url.to_string(),
        json=make_algolia_resp(
            hits=[make_algolia_hit(objectID=hn_id, url=url.to_string())]),
        status=200,
    )

    event = FetchDiscussionsCommand(url_uuid=url.url_uuid,
                                    source=DiscussionSource.HN)
    bg_client.send(PickleMessage.from_obj(event))

    # And again, but with a different comment count
    requests_mock.remove(responses.GET, url=api_url.to_string())
    requests_mock.add(
        responses.GET,
        url=api_url.to_string(),
        json=make_algolia_resp(hits=[
            make_algolia_hit(
                objectID=hn_id,
                url=url.to_string(),
                num_comments=5,
                title="Other example",
                created_at_i=int(datetime(2018, 1, 4).timestamp()),
            )
        ]),
        status=200,
    )
    # and again
    bg_client.send(PickleMessage.from_obj(event))

    discussion = (session.query(SQLDiscussion).filter(
        SQLDiscussion.discussion_source_id == DiscussionSource.HN.value
    ).filter(SQLDiscussion.external_discussion_id == str(hn_id)).one())
    assert discussion.comment_count == 5
    assert discussion.url_uuid == url.url_uuid
    assert discussion.title == "Other example"
    assert discussion.created_at == datetime(2018, 1, 4, tzinfo=timezone.utc)


def on_bookmark_created(message: PickleMessage, ctx: missive.HandlingContext):
    """When a new bookmark is created, we want to:

    - crawl it, if it's not yet crawled
    - fetch discussions of it from each discussion source
    - (tbc) other things

    """
    event = cast(BookmarkCreated, message.get_obj())
    session = get_session(ctx)
    url = get_url_by_url_uuid(session, event.url_uuid)
    if url is None:
        raise RuntimeError("url requested to crawl does not exist in the db")
    if not is_crawled(session, url):
        publish_message(
            CrawlRequested(crawl_request=CrawlRequest(
                request=Request(verb=HTTPVerb.GET, url=url),
                reason=BookmarkCrawlReason(),
            )),
            environ["QM_RABBITMQ_BG_WORKER_TOPIC"],
        )
    for source in DiscussionSource:
        publish_message(
            FetchDiscussionsCommand(url_uuid=url.url_uuid, source=source),
            environ["QM_RABBITMQ_BG_WORKER_TOPIC"],
        )
    ctx.ack()


def test_crawl_hn_api_no_discussions(session, bg_client, requests_mock):
    url = random_url()
    upsert_url(session, url)
    session.commit()

    api_url = get_hn_api_url(url)
    requests_mock.add(
        responses.GET,
        url=api_url.to_string(),
        json=make_algolia_resp(hits=[]),
        status=200,
    )

    event = FetchDiscussionsCommand(url_uuid=url.url_uuid,
                                    source=DiscussionSource.HN)
    bg_client.send(PickleMessage.from_obj(event))

    discussion = (session.query(SQLDiscussion).filter(
        SQLDiscussion.discussion_source_id == DiscussionSource.HN.value).
                  filter(SQLDiscussion.url_uuid == url.url_uuid).first())
    assert discussion is None

    discussion_fetch = (session.query(SQLDiscussionFetch).filter(
        SQLDiscussionFetch.url_uuid == url.url_uuid).one())
    assert discussion_fetch.discussion_source_id == DiscussionSource.HN.value
    assert discussion_fetch.status_code == 200
    assert discussion_fetch.retrieved == datetime(2018, 1, 3,
                                                  tzinfo=timezone.utc)


def test_crawl_hn_api(session, bg_client: TestAdapter[PickleMessage], mock_s3,
                      requests_mock):
    url = random_url()
    upsert_url(session, url)
    session.commit()

    hn_id = random_numeric_id()
    api_url = get_hn_api_url(url)
    requests_mock.add(
        responses.GET,
        url=api_url.to_string(),
        json=make_algolia_resp(
            hits=[make_algolia_hit(objectID=hn_id, url=url.to_string())]),
        status=200,
    )

    event = FetchDiscussionsCommand(url_uuid=url.url_uuid,
                                    source=DiscussionSource.HN)
    bg_client.send(PickleMessage.from_obj(event))

    discussion = (session.query(SQLDiscussion).filter(
        SQLDiscussion.discussion_source_id == DiscussionSource.HN.value
    ).filter(SQLDiscussion.external_discussion_id == str(hn_id)).one())
    assert discussion.external_discussion_id == str(hn_id)
    assert discussion.discussion_source_id == DiscussionSource.HN.value
    assert discussion.comment_count == 1
    assert discussion.url_uuid == url.url_uuid


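# For reference: the Algolia HN Search API (https://hn.algolia.com/api)
# returns JSON of roughly the shape below, which is what the
# make_algolia_resp/make_algolia_hit helpers are assumed to emulate. Their
# real defaults are defined elsewhere in the test suite; this sketch is
# illustrative, not their actual implementation.
def _example_algolia_resp() -> dict:
    return {
        "hits": [
            {
                "objectID": "12345",  # used as the external discussion id
                "url": "http://example.com/",  # the submitted url
                "title": "Example",
                "num_comments": 1,
                "created_at_i": 1514937600,  # unix time (2018-01-03T00:00:00Z)
            }
        ],
        "page": 0,
        "nbPages": 1,
        "hitsPerPage": 20,
    }

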
def test_new_icon_found_for_page_url_duplicated_by_url(
        session, bg_client: TestAdapter[PickleMessage], mock_s3, requests_mock):
    """Test that when a new page icon is found at an icon URL that is already
    indexed, the existing icon is reused (without recrawling the icon)."""
    page_url_1 = URL.from_string(
        f"http://{random_string()}.example.com/index.html")
    page_url_2 = page_url_1.follow("/otherindex.html")
    icon_url = page_url_1.follow("favicon1.png")
    hash_bytes = bytes(random.getrandbits(8) for _ in range(64))

    upsert_url(session, page_url_1)
    upsert_url(session, page_url_2)
    upsert_url(session, icon_url)
    icon_uuid = uuid4()
    session.add(Icon(icon_uuid=icon_uuid, source_blake2b_hash=hash_bytes))
    session.add(IconSource(icon_uuid=icon_uuid, url_uuid=icon_url.url_uuid))
    session.add(URLIcon(url_uuid=page_url_1.url_uuid, icon_uuid=icon_uuid))
    session.commit()

    event = NewIconFound(icon_url_uuid=icon_url.url_uuid,
                         page_url_uuid=page_url_2.url_uuid)
    bg_client.send(PickleMessage.from_obj(event))

    url_icon_obj_1, url_icon_obj_2 = (session.query(URLIcon).join(
        SQLAUrl, URLIcon.url_uuid == SQLAUrl.url_uuid).filter(
            SQLAUrl.netloc == page_url_1.netloc).order_by(SQLAUrl.path).all())

    assert url_icon_obj_1.icon == url_icon_obj_2.icon
    assert url_icon_obj_1.icon.source_blake2b_hash == hash_bytes


def on_discussion_crawl_requested(message: PickleMessage,
                                  ctx: missive.HandlingContext):
    event = cast(FetchDiscussionsCommand, message.get_obj())
    session = get_session(ctx)
    http_client = get_http_client(ctx)
    url = get_url_by_url_uuid(session, event.url_uuid)
    if url is None:
        # FIXME: improve this...
        raise RuntimeError("url does not exist!")
    log.info("fetching discussions for %s from %s", url, event.source)
    client: Union[discussion_clients.HNAlgoliaClient,
                  discussion_clients.RedditDiscussionClient]
    if event.source == DiscussionSource.HN:
        client = discussion_clients.HNAlgoliaClient(http_client)
    else:
        client = get_reddit_client(ctx)
    try:
        upsert_discussions(session, client.discussions_for_url(url))
        record_discussion_fetch(session, url, event.source)
    except discussion_clients.DiscussionAPIError as e:
        log.error(
            "got bad response (%s) from %s: %s",
            e.response_status(),
            e.source,
            e.response_text(),
        )
        session.rollback()
    else:
        session.commit()
    ctx.ack()


def on_new_icon_found(message: PickleMessage, ctx: missive.HandlingContext):
    event = cast(NewIconFound, message.get_obj())
    session = get_session(ctx)
    http_client = get_http_client(ctx)
    icon_url = get_url_by_url_uuid(session, event.icon_url_uuid)
    if icon_url is None:
        raise RuntimeError("icon url not in db")
    if event.page_url_uuid is not None:
        page_url = get_url_by_url_uuid(session, event.page_url_uuid)
    else:
        page_url = None
    existing_icon_uuid = icon_at_url(session, icon_url)
    if existing_icon_uuid is not None:
        # An icon from this exact url is already indexed; at most we need to
        # link the page to it
        log.info("already have icon at %s", icon_url)
        if page_url is not None:
            upsert_icon_for_url(session, page_url, existing_icon_uuid)
    else:
        # New icon url: crawl it and index it (indexing deduplicates by
        # content hash)
        blake2b_hash, response = crawler.crawl_icon(
            session, http_client, Request(verb=HTTPVerb.GET, url=icon_url))
        body = cast(RewindingIO, response.body)
        with body as wind:
            indexing.index_icon(session,
                                icon_url,
                                wind,
                                blake2b_hash,
                                page_url=page_url)
    session.commit()
    ctx.ack()


def test_index_requested_new_page_and_known_page_icon_url(
        session, bg_worker, mock_s3, requests_mock):
    """Test that when a page uses an icon url we already have in the index, we
    reuse it."""
    icon_url = URL.from_string(
        f"http://{random_string()}.example.com/favicon.png")
    icon_uuid = uuid4()
    html = f"""
    <html>
    <head>
    <link rel="icon" type="image/png" href="{icon_url.to_string()}">
    </head>
    </html>
    """
    sqla_url, crawl_req, crawl_resp = make_crawl_with_response(
        session, response_body=BytesIO(html.encode("utf-8")))
    image_buff = random_image_fileobj()
    hash_bytes = hashlib.blake2b(image_buff.read()).digest()
    upsert_url(session, icon_url)
    session.add(Icon(icon_uuid=icon_uuid, source_blake2b_hash=hash_bytes))
    session.add(IconSource(icon_uuid=icon_uuid, url_uuid=icon_url.url_uuid))
    session.commit()

    bg_worker.send(
        PickleMessage.from_obj(IndexRequested(crawl_resp.crawl_uuid)))

    url_icon = (session.query(URLIcon).filter(
        URLIcon.url_uuid == sqla_url.url_uuid).one())
    assert url_icon.url_uuid == sqla_url.url_uuid
    assert url_icon.icon_uuid == icon_uuid


def test_hello_event(bg_client: TestAdapter[PickleMessage], caplog):
    caplog.set_level(logging.INFO, logger="quarchive.bg_worker")

    event = HelloEvent("greetings earthling")
    bg_client.send(PickleMessage.from_obj(event))

    logs = [r.getMessage() for r in caplog.records]
    expected = "greetings earthling"
    # FIXME: this is pretty ropey and fragile
    assert expected in logs[-1]


def print_hellos(message: PickleMessage, ctx: missive.HandlingContext):
    event: HelloEvent = cast(HelloEvent, message.get_obj())
    time_taken_ms = (datetime.now(timezone.utc) -
                     event.created).total_seconds() * 1000
    log.info(
        "got hello event (in %.3fms), message: '%s'",
        round(time_taken_ms, 3),
        event.message,
    )
    ctx.ack()


def on_index_requested(message: PickleMessage, ctx: missive.HandlingContext):
    event = cast(IndexRequested, message.get_obj())
    session = get_session(ctx)
    metadata = indexing.index(session, event.crawl_uuid)
    if metadata:
        icon_message = icon_message_if_necessary(session, metadata)
    else:
        icon_message = None
    session.commit()
    ctx.ack()
    # publish (if needed) only after the commit, so the icon handler sees the
    # committed index rows
    if icon_message:
        publish_message(icon_message, environ["QM_RABBITMQ_BG_WORKER_TOPIC"])


def on_bookmark_crawl_requested(message: PickleMessage,
                                ctx: missive.HandlingContext):
    event = cast(CrawlRequested, message.get_obj())
    session = get_session(ctx)
    http_client = get_http_client(ctx)
    crawl_result = crawler.crawl(session, http_client,
                                 event.crawl_request.request)
    session.commit()
    publish_message(
        IndexRequested(crawl_uuid=crawl_result.crawl_uuid),
        environ["QM_RABBITMQ_BG_WORKER_TOPIC"],
    )
    ctx.ack()


def test_new_icon_found_for_page_icon(session, requests_mock,
                                      bg_client: TestAdapter[PickleMessage],
                                      mock_s3):
    """Test that when a new page icon is found (one that doesn't match any
    existing icon) it is retrieved, indexed and stored.

    """
    url = URL.from_string(f"http://{random_string()}.example.com/")
    icon_url = url.follow("/favicon.png")
    image_buff = random_image_fileobj()
    hash_bytes = hashlib.blake2b(image_buff.read()).digest()
    image_buff.seek(0)
    requests_mock.add(
        responses.GET,
        url=icon_url.to_string(),
        body=image_buff.read(),
        status=200,
        stream=True,
    )
    requests_mock.start()
    upsert_url(session, url)
    upsert_url(session, icon_url)
    session.commit()

    event = NewIconFound(icon_url_uuid=icon_url.url_uuid,
                         page_url_uuid=url.url_uuid)
    bg_client.send(PickleMessage.from_obj(event))

    icon, url_icon = (session.query(Icon, URLIcon).join(URLIcon).filter(
        URLIcon.url_uuid == url.url_uuid).first())
    assert icon.source_blake2b_hash == hash_bytes
    assert url_icon.url_uuid == url.url_uuid

    icon_bucket = file_storage.get_icon_bucket()
    (s3_obj, ) = list(
        icon_bucket.objects.filter(Prefix=f"{icon.icon_uuid}.png"))
    assert s3_obj.key == f"{icon.icon_uuid}.png"
    response = s3_obj.get()
    assert response["ResponseMetadata"]["HTTPHeaders"][
        "content-type"] == "image/png"


def test_index_requested_new_page_and_new_page_icon(session, bg_worker,
                                                    mock_s3, requests_mock):
    """Test that new pages are indexed properly and their icon is
    downloaded."""
    icon_url = URL.from_string(
        f"http://{random_string()}.example.com/favicon.png")
    html = f"""
    <html>
    <head>
    <link rel="icon" type="image/png" href="{icon_url.to_string()}">
    </head>
    </html>
    """
    sqla_url, crawl_req, crawl_resp = make_crawl_with_response(
        session, response_body=BytesIO(html.encode("utf-8")))
    session.commit()
    image_buff = random_image_fileobj()
    hash_bytes = hashlib.blake2b(image_buff.read()).digest()
    image_buff.seek(0)
    requests_mock.add(
        responses.GET,
        url=icon_url.to_string(),
        body=image_buff.read(),
        status=200,
        stream=True,
    )

    bg_worker.send(
        PickleMessage.from_obj(IndexRequested(crawl_resp.crawl_uuid)))

    fulltext_exists = session.query(
        session.query(FullText).filter(
            FullText.crawl_uuid == crawl_req.crawl_uuid).exists()).scalar()
    icon_exists = session.query(
        session.query(Icon).filter(
            Icon.source_blake2b_hash == hash_bytes).exists()).scalar()
    assert fulltext_exists, "crawl not indexed!"
    assert icon_exists, "icon not crawled!"


def test_crawl_requested(session, bg_worker, mock_s3, requests_mock):
    url = URL.from_string("http://example.com/" + random_string())
    requests_mock.add(
        responses.GET,
        url=url.to_string(),
        body="Hello!",
        status=200,
        stream=True,
    )

    bg_worker.send(
        PickleMessage.from_obj(
            CrawlRequested(
                CrawlRequest(
                    request=Request(HTTPVerb.GET, url=url),
                    reason=BookmarkCrawlReason(),
                ))))

    response_exists = session.query(
        session.query(SQLACrawlResponse).join(SQLACrawlRequest).join(SQLAUrl).
        filter(SQLAUrl.url_uuid == url.url_uuid).exists()).scalar()
    assert response_exists


def test_new_icon_found_domain(session, requests_mock,
                               bg_client: TestAdapter[PickleMessage], mock_s3):
    icon_url = URL.from_string(
        f"http://{random_string()}.example.com/favicon.ico")
    image_buff = random_image_fileobj()
    hash_bytes = hashlib.blake2b(image_buff.read()).digest()
    image_buff.seek(0)
    requests_mock.add(
        responses.GET,
        url=icon_url.to_string(),
        body=image_buff.read(),
        status=200,
        stream=True,
    )
    requests_mock.start()
    upsert_url(session, icon_url)
    session.commit()

    event = NewIconFound(icon_url_uuid=icon_url.url_uuid)
    bg_client.send(PickleMessage.from_obj(event))

    icon, domain_icon = (session.query(
        Icon, DomainIcon).join(DomainIcon).filter(
            DomainIcon.scheme == icon_url.scheme,
            DomainIcon.netloc == icon_url.netloc).first())
    assert icon.source_blake2b_hash == hash_bytes
    assert domain_icon.scheme == icon_url.scheme
    assert domain_icon.netloc == icon_url.netloc

    icon_bucket = file_storage.get_icon_bucket()
    (s3_obj, ) = list(
        icon_bucket.objects.filter(Prefix=f"{icon.icon_uuid}.png"))
    assert s3_obj.key == f"{icon.icon_uuid}.png"
    response = s3_obj.get()
    assert response["ResponseMetadata"]["HTTPHeaders"][
        "content-type"] == "image/png"


def test_bookmark_created(session, bg_worker, mock_s3, requests_mock,
                          test_user):
    url = URL.from_string("http://example.com/" + random_string())
    upsert_url(session, url)
    session.commit()
    requests_mock.add(
        responses.GET,
        url=url.to_string(),
        body="Hello!",
        status=200,
        stream=True,
    )

    bg_worker.send(
        PickleMessage.from_obj(
            BookmarkCreated(user_uuid=test_user.user_uuid,
                            url_uuid=url.url_uuid)))

    response_exists = session.query(
        session.query(SQLACrawlResponse).join(SQLACrawlRequest).join(SQLAUrl).
        filter(SQLAUrl.url_uuid == url.url_uuid).exists()).scalar()
    assert response_exists


def test_multi_page_hn_api(session, bg_client: TestAdapter[PickleMessage],
                           mock_s3, requests_mock):
    url = random_url()
    upsert_url(session, url)
    session.commit()

    api_url1 = get_hn_api_url(url)
    # the second page of results is requested via the `page` query param
    api_url2 = URL.from_string(api_url1.to_string() + "&page=1")
    requests_mock.add(
        responses.GET,
        url=api_url1.to_string(),
        json=make_algolia_resp(nbPages=2,
                               hitsPerPage=1,
                               hits=[make_algolia_hit(url=url.to_string())]),
        status=200,
    )
    requests_mock.add(
        responses.GET,
        url=api_url2.to_string(),
        json=make_algolia_resp(
            nbPages=2,
            page=1,
            hitsPerPage=1,
            hits=[make_algolia_hit(url=url.to_string())],
        ),
        status=200,
    )

    event = FetchDiscussionsCommand(url_uuid=url.url_uuid,
                                    source=DiscussionSource.HN)
    bg_client.send(PickleMessage.from_obj(event))

    discussion_count = (session.query(SQLDiscussion).filter(
        SQLDiscussion.discussion_source_id == DiscussionSource.HN.value).
                        filter(SQLDiscussion.url_uuid == url.url_uuid).count())
    assert discussion_count == 2


def __call__(self, message: PickleMessage) -> bool:
    return isinstance(message.get_obj(), self.required_class)
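
# For context: a __call__ like the one above reads as the body of a small
# matcher class that routes messages by the type of their pickled payload.
# A minimal sketch of such a class follows; the class name and __init__ are
# assumptions for illustration, not taken from the source.
class _ObjTypeMatcher:
    """Match a PickleMessage whose deserialised object has a given class."""

    def __init__(self, required_class: type) -> None:
        self.required_class = required_class

    def __call__(self, message: PickleMessage) -> bool:
        # True iff the pickled payload is an instance of required_class
        return isinstance(message.get_obj(), self.required_class)


# e.g. _ObjTypeMatcher(HelloEvent) returns True for messages wrapping a
# HelloEvent and False for everything else.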