def test_user_netloc_page(signed_in_client, test_user):
    epoch_start = datetime(1970, 1, 1, tzinfo=timezone.utc)
    bm1 = make_bookmark(
        url=URL.from_string("http://pokemon.example.com/"),
        title="Pokemon",
        tag_triples=frozenset([("pokemon", epoch_start, False)]),
    )
    bm2 = make_bookmark(
        url=URL.from_string("http://digimon.example.com/"),
        title="Digimon",
        tag_triples=frozenset([("digimon", epoch_start, False)]),
    )

    sync_bookmarks(signed_in_client, test_user, [bm1, bm2])

    response = signed_in_client.get(
        flask.url_for(
            "quarchive.user_netloc",
            username=test_user.username,
            netloc="pokemon.example.com",
        )
    )
    assert response.status_code == 200

    (present,) = get_bookmarks_from_response(response)
    assert present["url"] == "http://pokemon.example.com/"
    assert present["title"] == "Pokemon"
def test_most_recent_successful_crawls(session, cache, test_user):
    # Crawl 1 is an old crawl of url 1 (should not be present)
    # Crawl 2 is a more recent crawl of url 1
    # Crawl 3 is a crawl of url 3 that didn't get a response
    # Crawl 4 is a crawl of url 4 that returned a non-2xx status code
    # Only crawl 2 should be present
    url_1 = SQLAUrl.from_url(URL.from_string(f"http://example.com/{random_string()}"))
    bm_1 = make_bookmark(url=url_1.to_url())
    set_bookmark(session, cache, test_user.user_uuid, bm_1)
    crawl_req_1 = CrawlRequest(  # type: ignore
        crawl_uuid=uuid4(),
        requested=datetime(2018, 1, 3),
        got_response=True,
        url_uuid=url_1.url_uuid,
        response_obj=CrawlResponse(body_uuid=uuid4(), headers={}, status_code=200),
    )
    crawl_req_2 = CrawlRequest(  # type: ignore
        crawl_uuid=uuid4(),
        requested=datetime(2018, 1, 4),
        got_response=True,
        url_uuid=url_1.url_uuid,
        response_obj=CrawlResponse(body_uuid=uuid4(), headers={}, status_code=200),
    )

    url_3 = SQLAUrl.from_url(URL.from_string(f"http://example.com/{random_string()}"))
    bm_3 = make_bookmark(url=url_3.to_url())
    set_bookmark(session, cache, test_user.user_uuid, bm_3)
    crawl_req_3 = CrawlRequest(
        crawl_uuid=uuid4(),
        requested=datetime(2018, 1, 3),
        got_response=False,
        url_uuid=url_3.url_uuid,
    )

    url_4 = SQLAUrl.from_url(URL.from_string(f"http://example.com/{random_string()}"))
    bm_4 = make_bookmark(url=url_4.to_url())
    set_bookmark(session, cache, test_user.user_uuid, bm_4)
    crawl_req_4 = CrawlRequest(  # type: ignore
        crawl_uuid=uuid4(),
        requested=datetime(2018, 1, 3),
        got_response=True,
        url_uuid=url_4.url_uuid,
        response_obj=CrawlResponse(body_uuid=uuid4(), headers={}, status_code=404),
    )

    session.add_all([crawl_req_1, crawl_req_2, crawl_req_3, crawl_req_4])
    session.commit()

    rv = set(most_recent_successful_bookmark_crawls(session))
    assert crawl_req_1.crawl_uuid not in rv
    assert crawl_req_2.crawl_uuid in rv
    assert crawl_req_3.crawl_uuid not in rv
    assert crawl_req_4.crawl_uuid not in rv
def test_tags_page(signed_in_client, test_user):
    # FIXME: include deleted, etc
    epoch_start = datetime(1970, 1, 1, tzinfo=timezone.utc)
    bm1 = make_bookmark(
        url=URL.from_string("http://example.com/pokemon"),
        title="Pokemon",
        tag_triples=frozenset([("pokemon", epoch_start, False)]),
    )
    bm2 = make_bookmark(
        url=URL.from_string("http://example.com/digimon"),
        title="Digimon",
        tag_triples=frozenset([("digimon", epoch_start, False)]),
    )

    sync_bookmarks(signed_in_client, test_user, [bm1, bm2])

    response = signed_in_client.get(
        flask.url_for("quarchive.user_tags", username=test_user.username)
    )
    assert response.status_code == 200

    html_parser = etree.HTMLParser()
    root = etree.fromstring(response.get_data(), html_parser)
    tags = {e.text for e in CSSSelector(".tag-link")(root)}
    assert {"pokemon", "digimon"} == tags
def url(self) -> URL:
    if self.discussion.source == DiscussionSource.HN:
        return URL.from_string(
            f"https://news.ycombinator.com/item?id={self.discussion.external_id}"
        )
    else:
        return URL.from_string(
            f"https://old.reddit.com/{self.discussion.external_id}"
        )
def test_crawl_when_response_is_received(
    session, http_client, status_code, mock_s3, requests_mock
):
    url = URL.from_string("http://example.com/" + random_string())
    upsert_url(session, url)
    requests_mock.add(
        responses.GET, url.to_string(), body=b"hello", status=status_code, stream=True
    )

    request = Request(verb=HTTPVerb.GET, url=url)
    response = crawler.crawl(session, http_client, request)

    sql_request = session.query(SQLACrawlRequest).get(response.crawl_uuid)
    sql_response = session.query(CrawlResponse).get(response.crawl_uuid)

    assert sql_request.requested == datetime(2018, 1, 3, tzinfo=timezone.utc)
    assert sql_request.got_response
    assert sql_response.status_code == status_code
    assert sql_request.crawl_uuid == sql_response.crawl_uuid
    assert sql_response.headers == {"content-type": "text/plain"}

    s3_obj = (
        file_storage.get_s3()
        .Object(environ["QM_RESPONSE_BODY_BUCKET_NAME"], str(sql_response.body_uuid))
        .get()
    )
    response_body = s3_obj["Body"].read()
    assert response_body == gzip.compress(b"hello")
def test_from_string():
    url = URL.from_string("http://example.com/a?b=c#d")
    assert url.scheme == "http"
    assert url.netloc == "example.com"
    assert url.path == "/a"
    assert url.query == "b=c"
    assert url.fragment == "d"
def best_icon(metadata: HTMLMetadata) -> Icon:
    """Return the most suitable icon for our purposes, falling back to the
    domain-level favicon.ico if nothing else is available."""
    # We don't currently consider SVG as we can't read them (yet)
    possible_icons = sorted(
        (i for i in metadata.icons if i.mimetype() != "image/svg+xml"),
        key=lambda i: (i.size_rank(), i.mimetype_rank()),
        reverse=True,
    )
    if len(possible_icons) > 0:
        best_icon = possible_icons[0]
        # If the size is wonky, skip it and use the fallback instead
        if best_icon.size_rank() > 0:
            log.debug("picked %s as icon for %s", best_icon, metadata.url)
            return best_icon
    url = metadata.url
    fallback_icon = Icon(
        url=URL.from_string(f"{url.scheme}://{url.netloc}/favicon.ico"),
        rel_text="shortcut icon",
        scope=IconScope.DOMAIN,
    )
    log.debug("no icons found on %s, falling back to favicon.ico", metadata.url)
    return fallback_icon
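# A minimal usage sketch (not from the source): it assumes HTMLMetadata can be
# constructed with just `url` and `icons`, since the other fields appear
# optional in the tests below. With no page-level icons, best_icon should fall
# back to the domain-level favicon.ico.
def _example_best_icon_fallback():
    metadata = HTMLMetadata(
        url=URL.from_string("http://example.com/some/page.html"),
        icons=[],  # nothing declared in the page's <head>
    )
    icon = best_icon(metadata)
    assert icon.scope == IconScope.DOMAIN
    assert icon.url.to_string() == "http://example.com/favicon.ico"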
def test_new_icon_found_for_page_url_duplicated_by_url(
    session, bg_client: TestAdapter[PickleMessage], mock_s3, requests_mock
):
    """Test that when a new page icon is found that is the same icon (by hash)
    as an existing icon, it is recorded."""
    page_url_1 = URL.from_string(f"http://{random_string()}.example.com/index.html")
    page_url_2 = page_url_1.follow("/otherindex.html")
    icon_url = page_url_1.follow("favicon1.png")
    hash_bytes = bytes(random.getrandbits(8) for _ in range(64))

    upsert_url(session, page_url_1)
    upsert_url(session, page_url_2)
    upsert_url(session, icon_url)
    icon_uuid = uuid4()
    session.add(Icon(icon_uuid=icon_uuid, source_blake2b_hash=hash_bytes))
    session.add(IconSource(icon_uuid=icon_uuid, url_uuid=icon_url.url_uuid))
    session.add(URLIcon(url_uuid=page_url_1.url_uuid, icon_uuid=icon_uuid))
    session.commit()

    event = NewIconFound(
        icon_url_uuid=icon_url.url_uuid, page_url_uuid=page_url_2.url_uuid
    )
    bg_client.send(PickleMessage.from_obj(event))

    url_icon_obj_1, url_icon_obj_2 = (
        session.query(URLIcon)
        .join(SQLAUrl, URLIcon.url_uuid == SQLAUrl.url_uuid)
        .filter(SQLAUrl.netloc == page_url_1.netloc)
        .order_by(SQLAUrl.path)
        .all()
    )

    assert url_icon_obj_1.icon == url_icon_obj_2.icon
    assert url_icon_obj_1.icon.source_blake2b_hash == hash_bytes
def test_new_icon_found_domain_but_is_already_indexed(
    session, requests_mock, bg_client: TestAdapter[PickleMessage], mock_s3
):
    icon_url = URL.from_string(f"http://{random_string()}.example.com/favicon.ico")
    image_buff = random_image_fileobj()
    hash_bytes = hashlib.blake2b(image_buff.read()).digest()
    image_buff.seek(0)
    requests_mock.add(
        responses.GET,
        url=icon_url.to_string(),
        body=image_buff.read(),
        status=200,
        stream=True,
    )
    requests_mock.start()
    upsert_url(session, icon_url)
    session.commit()

    event = NewIconFound(icon_url_uuid=icon_url.url_uuid)
    bg_client.send(PickleMessage.from_obj(event))
    bg_client.send(PickleMessage.from_obj(event))

    icon, domain_icon = (
        session.query(Icon, DomainIcon)
        .join(DomainIcon)
        .filter(
            DomainIcon.scheme == icon_url.scheme,
            DomainIcon.netloc == icon_url.netloc,
        )
        .one()
    )
    assert icon.source_blake2b_hash == hash_bytes
    assert domain_icon.scheme == icon_url.scheme
    assert domain_icon.netloc == icon_url.netloc
def test_index_requested_new_page_and_known_page_icon_url(
    session, bg_worker, mock_s3, requests_mock
):
    """Test that when a page uses an icon url we already have in the index, we
    reuse it."""
    icon_url = URL.from_string(f"http://{random_string()}.example.com/favicon.png")
    icon_uuid = uuid4()
    html = f"""
    <html>
    <head>
    <link rel="icon" type="image/png" href="{icon_url.to_string()}">
    </head>
    </html>
    """
    sqla_url, crawl_req, crawl_resp = make_crawl_with_response(
        session, response_body=BytesIO(html.encode("utf-8"))
    )
    image_buff = random_image_fileobj()
    hash_bytes = hashlib.blake2b(image_buff.read()).digest()
    upsert_url(session, icon_url)
    session.add(Icon(icon_uuid=icon_uuid, source_blake2b_hash=hash_bytes))
    session.add(IconSource(icon_uuid=icon_uuid, url_uuid=icon_url.url_uuid))
    session.commit()

    bg_worker.send(PickleMessage.from_obj(IndexRequested(crawl_resp.crawl_uuid)))

    url_icon = (
        session.query(URLIcon).filter(URLIcon.url_uuid == sqla_url.url_uuid).one()
    )
    assert url_icon.url_uuid == sqla_url.url_uuid
    assert url_icon.icon_uuid == icon_uuid
def reindex_url(url: str, log_level: str):
    url_obj = URL.from_string(url)
    Session = get_session_cls()
    with contextlib.closing(Session()) as session:
        crawl_uuid = get_most_recent_crawl(session, url_obj)
        publish_message(
            IndexRequested(crawl_uuid), environ["QM_RABBITMQ_BG_WORKER_TOPIC"]
        )
        log.info("requested index of %s (crawl_uuid: %s)", url_obj, crawl_uuid)
def test_calpaterson():
    url = URL.from_string("http://calpaterson.com/calpaterson.html")
    with open(path.join(test_data_path, "calpaterson.html"), "rb") as html_f:
        metadata = extract_metadata_from_html(url, html_f)
    words = WORDS_REGEX.findall(metadata.text)  # type: ignore

    # pass/fail
    assert len(words) > 0
def get_bookmark_by_url(
    session: Session, user_uuid: UUID, url_string: str
) -> Optional[Bookmark]:
    url = URL.from_string(url_string)
    sqla_bookmark = (
        session.query(SQLABookmark)
        .filter(
            SQLABookmark.user_uuid == user_uuid,
            SQLABookmark.url_uuid == url.url_uuid,
        )
        .first()
    )
    if sqla_bookmark is None:
        return None
    return bookmark_from_sqla(url, sqla_bookmark)
def _discussion_from_child_data(self, child_data: Mapping) -> Discussion:
    return Discussion(
        external_id=child_data["id"],
        source=DiscussionSource.REDDIT,
        url=URL.from_string(child_data["url"], coerce_canonicalisation=True),
        comment_count=child_data["num_comments"],
        created_at=datetime.utcfromtimestamp(child_data["created_utc"]),
        title=f'{child_data["subreddit_name_prefixed"]}: {child_data["title"]}',
    )
def get_archive_links(
    url: URL, circa: Optional[datetime] = None
) -> Mapping[Archive, URL]:
    if circa is None:
        circa = datetime.utcnow().replace(tzinfo=timezone.utc)
    # This is the internet archive's timestamp format, which archive_today
    # helpfully also supports
    ia_timestamp = circa.strftime("%Y%m%d%H%M%S")
    links = {}
    links[Archive.WAYBACK] = URL.from_string(
        f"https://web.archive.org/web/{ia_timestamp}/{url.to_string()}"
    )
    links[Archive.ARCHIVE_TODAY] = URL.from_string(
        f"https://archive.today/{ia_timestamp}/{url.to_string()}"
    )
    links[Archive.GOOGLE_CACHE] = URL.from_string(
        f"https://webcache.googleusercontent.com/search?q=cache:{url.to_string()}"
    )
    return links
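# A brief usage sketch (hypothetical example values, grounded only in the
# function above): the returned mapping is keyed by the Archive enum and the
# values are fully-formed URL objects.
def _example_get_archive_links():
    circa = datetime(2018, 1, 3, tzinfo=timezone.utc)
    links = get_archive_links(URL.from_string("http://example.com/"), circa=circa)
    assert (
        links[Archive.WAYBACK].to_string()
        == "https://web.archive.org/web/20180103000000/http://example.com/"
    )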
def extract_hn_discussions(response_body: Mapping) -> Iterator[Discussion]:
    log.debug("hn search api returned: %s", response_body)
    for hit in response_body["hits"]:
        yield Discussion(
            comment_count=hit.get("num_comments", 0) or 0,
            created_at=datetime.utcfromtimestamp(hit["created_at_i"]),
            external_id=hit["objectID"],
            title=hit.get("title", ""),
            url=URL.from_string(hit["url"], coerce_canonicalisation=True),
            source=DiscussionSource.HN,
        )
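# An illustrative sketch with a hypothetical minimal payload containing only
# the keys the function above reads, showing how the generator is consumed.
def _example_extract_hn_discussions():
    payload = {
        "hits": [
            {
                "objectID": "123456",
                "title": "Example post",
                "url": "http://example.com/",
                "num_comments": 10,
                "created_at_i": 1514764800,  # 2018-01-01T00:00:00Z
            }
        ]
    }
    (discussion,) = list(extract_hn_discussions(payload))
    assert discussion.source == DiscussionSource.HN
    assert discussion.comment_count == 10
    assert discussion.external_id == "123456"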
def test_upsert_metadata_update(session, mock_s3):
    """Test upsert_metadata with a new crawl where things have changed"""
    url, crawl_req, _ = make_crawl_with_response(session)
    link_1 = URL.from_string("http://example.com/" + random_string() + "/more")
    link_2 = URL.from_string("http://example.com/" + random_string() + "/even-more")
    canon_1 = URL.from_string("http://example.com/" + random_string() + "/index")
    metadata_1 = HTMLMetadata(
        url=url.to_url(),
        # icons=[],  # FIXME: try a page-level icon
        # title="Example page",
        # meta_desc="An example page",
        links={link_1, link_2},
        canonical=canon_1,
    )
    upsert_metadata(session, crawl_req.crawl_uuid, metadata_1)

    url, crawl_req, _ = make_crawl_with_response(session, url.to_url())
    link_3 = URL.from_string("http://example.com/" + random_string() + "/yet-more")
    canon_2 = URL.from_string("http://example.com/" + random_string() + "/index2")
    metadata_2 = HTMLMetadata(
        url=url.to_url(),
        # icons=[],  # FIXME: try a page-level icon
        # title="Example page",
        # meta_desc="An example page",
        canonical=canon_2,
        links={link_1, link_3},
    )
    upsert_metadata(session, crawl_req.crawl_uuid, metadata_2)

    sqla_url_obj = (
        session.query(SQLAUrl).filter(SQLAUrl.url_uuid == url.url_uuid).one()
    )
    link_urls = {o.to_url_obj.to_url() for o in sqla_url_obj.links}
    assert link_urls == {link_1, link_3}
    assert sqla_url_obj.canonical_url_obj.canonical_url_uuid == canon_2.url_uuid
def test_new_icon_found_for_page_url_duplicated_by_content(
    session, requests_mock, bg_client: TestAdapter[PickleMessage], mock_s3
):
    """Test that when a new page icon is found that is the same icon (by hash)
    as an existing icon, it is recorded."""
    page_url_1 = URL.from_string(f"http://{random_string()}.example.com/index.html")
    page_url_2 = page_url_1.follow("/otherindex.html")
    icon_url_1 = page_url_1.follow("favicon1.png")
    icon_url_2 = page_url_2.follow("favicon2.png")
    image_buff = random_image_fileobj()
    hash_bytes = hashlib.blake2b(image_buff.read()).digest()
    image_buff.seek(0)
    requests_mock.add(
        responses.GET,
        url=icon_url_1.to_string(),
        body=image_buff.read(),
        status=200,
        stream=True,
    )
    image_buff.seek(0)
    requests_mock.add(
        responses.GET,
        url=icon_url_2.to_string(),
        body=image_buff.read(),
        status=200,
        stream=True,
    )
    requests_mock.start()

    upsert_url(session, page_url_1)
    upsert_url(session, page_url_2)
    upsert_url(session, icon_url_1)
    upsert_url(session, icon_url_2)
    session.commit()

    event = NewIconFound(
        icon_url_uuid=icon_url_1.url_uuid, page_url_uuid=page_url_1.url_uuid
    )
    bg_client.send(PickleMessage.from_obj(event))

    event = NewIconFound(
        icon_url_uuid=icon_url_2.url_uuid, page_url_uuid=page_url_2.url_uuid
    )
    bg_client.send(PickleMessage.from_obj(event))

    url_icon_obj_1, url_icon_obj_2 = (
        session.query(URLIcon)
        .join(SQLAUrl, URLIcon.url_uuid == SQLAUrl.url_uuid)
        .filter(SQLAUrl.netloc == page_url_1.netloc)
        .order_by(SQLAUrl.path)
        .all()
    )

    assert url_icon_obj_1.icon == url_icon_obj_2.icon
    assert url_icon_obj_1.icon.source_blake2b_hash == hash_bytes
def test_archives(signed_in_client, test_user):
    url = URL.from_string("http://example.com/")
    bm = make_bookmark(url=url, created=datetime(2018, 1, 3))

    sync_bookmarks(signed_in_client, test_user, [bm])

    archive_response = signed_in_client.get(
        flask.url_for(
            "quarchive.bookmark_archives",
            url_uuid=url.url_uuid,
            username=test_user.username,
        )
    )
    assert archive_response.status_code == 200
def bookmark_from_sqla(url: str, sqla_obj: SQLABookmark) -> Bookmark:
    return Bookmark(
        url=URL.from_string(url),
        created=sqla_obj.created,
        description=sqla_obj.description,
        updated=sqla_obj.updated,
        unread=sqla_obj.unread,
        deleted=sqla_obj.deleted,
        title=sqla_obj.title,
        tag_triples=frozenset(
            (btag.tag_obj.tag_name, btag.updated, btag.deleted)
            for btag in sqla_obj.bookmark_tag_objs
        ),
    )
def upsert_url(session: Session, url_string: str) -> UUID:
    url = URL.from_string(url_string)
    url_stmt = (
        pg_insert(SQLAUrl.__table__)
        .values(
            url_uuid=url.url_uuid,
            scheme=url.scheme,
            netloc=url.netloc,
            path=url.path,
            query=url.query,
            fragment=url.fragment,
        )
        .on_conflict_do_nothing(
            index_elements=["scheme", "netloc", "path", "query", "fragment"]
        )
    )
    session.execute(url_stmt)
    return url.url_uuid
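# A small sketch of the intended idempotence (an assumption based on the
# ON CONFLICT DO NOTHING clause above and on url_uuid being derived
# deterministically from the URL itself): upserting the same URL string twice
# should return the same UUID without creating a second row.
def _example_upsert_url_idempotent(session):
    first = upsert_url(session, "http://example.com/some-page")
    second = upsert_url(session, "http://example.com/some-page")
    assert first == second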
def test_upsert_metadata_wholly_new(session, mock_s3):
    """Test upsert_metadata called with a wholly new index"""
    url, crawl_req, _ = make_crawl_with_response(session)
    link_1 = URL.from_string("http://example.com/" + random_string() + "/more")
    link_2 = URL.from_string("http://example.com/" + random_string() + "/even-more")
    canon = URL.from_string("http://example.com/" + random_string() + "/index")
    metadata = HTMLMetadata(
        url=url.to_url(),
        # icons=[],  # FIXME: try a page-level icon
        # title="Example page",
        # meta_desc="An example page",
        links={link_1, link_2},
        canonical=canon,
    )

    upsert_metadata(session, crawl_req.crawl_uuid, metadata)

    sqla_url_obj = (
        session.query(SQLAUrl).filter(SQLAUrl.url_uuid == url.url_uuid).one()
    )
    link_urls = {o.to_url_obj.to_url() for o in sqla_url_obj.links}
    assert link_urls == {link_1, link_2}
    assert sqla_url_obj.canonical_url_obj.canonical_url_uuid == canon.url_uuid
def test_simple():
    url = URL.from_string("http://example.com/webpage-with-full-metadata.html")
    with open(
        path.join(test_data_path, "webpage-with-full-metadata.html"), "rb"
    ) as html_f:
        metadata = extract_metadata_from_html(url, html_f)

    text_words = set(WORDS_REGEX.findall(metadata.text))  # type: ignore
    assert "Simple" in text_words
    assert {"This", "is", "a", "basic", "html", "document"} <= text_words

    meta_words = set(WORDS_REGEX.findall(metadata.meta_desc))  # type: ignore
    assert {"some", "meta", "description"} == meta_words

    assert metadata.url == url
    assert set(metadata.icons) == set(
        [
            Icon(
                url=URL.from_string("http://example.com/favicon.png"),
                scope=IconScope.PAGE,
                type="image/png",
                rel_text="icon",
            ),
            Icon(
                url=URL.from_string("http://example.com/favicon-somewhere.ico"),
                scope=IconScope.PAGE,
                rel_text="shortcut icon",
            ),
            Icon(
                url=URL.from_string("http://example.com/apple-touch-icon.png"),
                scope=IconScope.PAGE,
                rel_text="apple-touch-icon",
                sizes="152x152",
            ),
            Icon(
                url=URL.from_string("http://example.com/favicon-alternative.png"),
                scope=IconScope.PAGE,
                rel_text="alternate icon",
            ),
        ]
    )
    assert metadata.canonical == URL.from_string("http://example.com/simple")
    assert metadata.title == "Simple"
    assert metadata.links == {URL.from_string("http://example.com/other")}
    assert metadata.meta_desc == "some meta description"
    assert metadata.headings == {
        "h1": ["This document"],
        "h2": ["Other documents"],
    }
def create_bookmark(username: str) -> flask.Response:
    owner = get_user_or_fail(db.session, username)
    # FIXME: sort out optional url_uuid
    require_access_or_fail(
        UserBookmarksAccessObject(user_uuid=owner.user_uuid),
        Access.WRITE,
    )
    form = flask.request.form
    creation_time = datetime.utcnow().replace(tzinfo=timezone.utc)
    tag_triples = tag_triples_from_form(form)

    url_str = form["url"]
    try:
        # As it's a user entering this url, help them along with getting a
        # sufficiently canonicalised url
        url = URL.from_string(url_str, coerce_canonicalisation=True)
    except DisallowedSchemeException:
        log.warning("user tried to create url: %s (disallowed scheme)", url_str)
        flask.abort(400, "invalid url (disallowed scheme)")

    bookmark = Bookmark(
        url=url,
        title=form["title"],
        description=form["description"],
        unread="unread" in form,
        deleted=False,
        updated=creation_time,
        created=creation_time,
        tag_triples=tag_triples,
    )
    url_uuid = set_bookmark(db.session, get_cache(), owner.user_uuid, bookmark)
    db.session.commit()
    publish_message(
        message_lib.BookmarkCreated(user_uuid=owner.user_uuid, url_uuid=url.url_uuid),
        environ["QM_RABBITMQ_BG_WORKER_TOPIC"],
    )
    flask.flash("Bookmarked: %s" % bookmark.title)
    response = flask.make_response("Redirecting...", 303)
    response.headers["Location"] = flask.url_for(
        "quarchive.edit_bookmark_form",
        url_uuid=url_uuid,
        username=owner.username,
    )
    return response
def test_crawl_when_no_response(session, http_client, requests_mock):
    url = URL.from_string("http://example.com/" + random_string())
    upsert_url(session, url)
    requests_mock.add(
        responses.GET,
        url.to_string(),
        body=requests.exceptions.ConnectTimeout("connect timeout"),
    )

    response = crawler.crawl(session, http_client, Request(verb=HTTPVerb.GET, url=url))

    sql_request = session.query(SQLACrawlRequest).get(response.crawl_uuid)
    sql_response = session.query(CrawlResponse).get(response.crawl_uuid)

    assert sql_request is not None
    assert sql_response is None
def test_new_icon_found_for_page_icon(
    session, requests_mock, bg_client: TestAdapter[PickleMessage], mock_s3
):
    """Test that when a new page icon is found (one that doesn't match any
    existing icons) it is retrieved, indexed and stored."""
    url = URL.from_string(f"http://{random_string()}.example.com/")
    icon_url = url.follow("/favicon.png")
    image_buff = random_image_fileobj()
    hash_bytes = hashlib.blake2b(image_buff.read()).digest()
    image_buff.seek(0)
    requests_mock.add(
        responses.GET,
        url=icon_url.to_string(),
        body=image_buff.read(),
        status=200,
        stream=True,
    )
    requests_mock.start()

    upsert_url(session, url)
    upsert_url(session, icon_url)
    session.commit()

    event = NewIconFound(icon_url_uuid=icon_url.url_uuid, page_url_uuid=url.url_uuid)
    bg_client.send(PickleMessage.from_obj(event))

    icon, url_icon = (
        session.query(Icon, URLIcon)
        .join(URLIcon)
        .filter(URLIcon.url_uuid == url.url_uuid)
        .first()
    )

    assert icon.source_blake2b_hash == hash_bytes
    assert url_icon.url_uuid == url.url_uuid

    icon_bucket = file_storage.get_icon_bucket()
    (s3_obj,) = list(icon_bucket.objects.filter(Prefix=f"{icon.icon_uuid}.png"))
    assert s3_obj.key == f"{icon.icon_uuid}.png"
    response = s3_obj.get()
    assert response["ResponseMetadata"]["HTTPHeaders"]["content-type"] == "image/png"
def test_icon_uuids_url_icon(session, test_user, canonical_url):
    bm1 = make_bookmark()
    set_bookmark(session, test_user.user_uuid, bm1)
    if canonical_url:
        canonical_url = bm1.url.follow("canonical.html")
        upsert_url(session, canonical_url)
        session.add(
            CanonicalUrl(
                non_canonical_url_uuid=bm1.url.url_uuid,
                canonical_url_uuid=canonical_url.url_uuid,
            )
        )

    icon_url = URL.from_string("http://example.com/" + random_string() + "/icon.png")
    upsert_url(session, icon_url)
    random_hash = random_bytes(64)
    icon_uuid = record_page_icon(session, icon_url, bm1.url, random_hash)

    (bm1_view,) = (f for f in BookmarkViewQueryBuilder(session, test_user).execute())
    assert bm1_view.icon_uuid == icon_uuid
def test_index_requested_new_page_and_new_page_icon(
    session, bg_worker, mock_s3, requests_mock
):
    """Test that new pages are indexed properly and their icon is downloaded."""
    icon_url = URL.from_string(f"http://{random_string()}.example.com/favicon.png")
    html = f"""
    <html>
    <head>
    <link rel="icon" type="image/png" href="{icon_url.to_string()}">
    </head>
    </html>
    """
    sqla_url, crawl_req, crawl_resp = make_crawl_with_response(
        session, response_body=BytesIO(html.encode("utf-8"))
    )
    session.commit()

    image_buff = random_image_fileobj()
    hash_bytes = hashlib.blake2b(image_buff.read()).digest()
    image_buff.seek(0)
    requests_mock.add(
        responses.GET,
        url=icon_url.to_string(),
        body=image_buff.read(),
        status=200,
        stream=True,
    )

    bg_worker.send(PickleMessage.from_obj(IndexRequested(crawl_resp.crawl_uuid)))

    fulltext_exists = session.query(
        session.query(FullText)
        .filter(FullText.crawl_uuid == crawl_req.crawl_uuid)
        .exists()
    ).scalar()
    icon_exists = session.query(
        session.query(Icon).filter(Icon.source_blake2b_hash == hash_bytes).exists()
    ).scalar()
    assert fulltext_exists, "crawl not indexed!"
    assert icon_exists, "icon not crawled!"
def test_crawl_requested(session, bg_worker, mock_s3, requests_mock):
    url = URL.from_string("http://example.com/" + random_string())
    requests_mock.add(
        responses.GET,
        url=url.to_string(),
        body="Hello!",
        status=200,
        stream=True,
    )

    bg_worker.send(
        PickleMessage.from_obj(
            CrawlRequested(
                CrawlRequest(
                    request=Request(HTTPVerb.GET, url=url),
                    reason=BookmarkCrawlReason(),
                )
            )
        )
    )

    response_exists = session.query(
        session.query(SQLACrawlResponse)
        .join(SQLACrawlRequest)
        .join(SQLAUrl)
        .filter(SQLAUrl.url_uuid == url.url_uuid)
        .exists()
    ).scalar()
    assert response_exists
def create_bookmark() -> flask.Response:
    form = flask.request.form
    creation_time = datetime.utcnow().replace(tzinfo=timezone.utc)
    tag_triples = tag_triples_from_form(form)
    bookmark = Bookmark(
        url=URL.from_string(form["url"]),
        title=form["title"],
        description=form["description"],
        unread="unread" in form,
        deleted=False,
        updated=creation_time,
        created=creation_time,
        tag_triples=tag_triples,
    )
    url_uuid = set_bookmark(db.session, get_current_user().user_uuid, bookmark)
    db.session.commit()
    flask.flash("Bookmarked: %s" % bookmark.title)
    response = flask.make_response("Redirecting...", 303)
    response.headers["Location"] = flask.url_for(
        "quarchive.edit_bookmark_form", url_uuid=url_uuid
    )
    return response