Example #1
def test_user_netloc_page(signed_in_client, test_user):
    epoch_start = datetime(1970, 1, 1, tzinfo=timezone.utc)
    bm1 = make_bookmark(
        url=URL.from_string("http://pokemon.example.com/"),
        title="Pokemon",
        tag_triples=frozenset([("pokemon", epoch_start, False)]),
    )
    bm2 = make_bookmark(
        url=URL.from_string("http://digimon.example.com/"),
        title="Digimon",
        tag_triples=frozenset([("digimon", epoch_start, False)]),
    )

    sync_bookmarks(signed_in_client, test_user, [bm1, bm2])

    response = signed_in_client.get(
        flask.url_for(
            "quarchive.user_netloc",
            username=test_user.username,
            netloc="pokemon.example.com",
        )
    )
    assert response.status_code == 200

    (present,) = get_bookmarks_from_response(response)
    assert present["url"] == "http://pokemon.example.com/"
    assert present["title"] == "Pokemon"
Example #2
def test_most_recent_successful_crawls(session, cache, test_user):
    # Crawl 1 is an old crawl of url 1 (should not be present)
    # Crawl 2 is a more recent crawl of url 1
    # Crawl 3 is a crawl of url 3 that didn't get a response
    # Crawl 4 is a crawl of url 4 that returned a non-2xx status code
    # Only crawl 2 should be present

    url_1 = SQLAUrl.from_url(
        URL.from_string(f"http://example.com/{random_string()}"))
    bm_1 = make_bookmark(url=url_1.to_url())
    set_bookmark(session, cache, test_user.user_uuid, bm_1)
    crawl_req_1 = CrawlRequest(  # type: ignore
        crawl_uuid=uuid4(),
        requested=datetime(2018, 1, 3),
        got_response=True,
        url_uuid=url_1.url_uuid,
        response_obj=CrawlResponse(body_uuid=uuid4(),
                                   headers={},
                                   status_code=200),
    )
    crawl_req_2 = CrawlRequest(  # type: ignore
        crawl_uuid=uuid4(),
        requested=datetime(2018, 1, 4),
        got_response=True,
        url_uuid=url_1.url_uuid,
        response_obj=CrawlResponse(body_uuid=uuid4(),
                                   headers={},
                                   status_code=200),
    )
    url_3 = SQLAUrl.from_url(
        URL.from_string(f"http://example.com/{random_string()}"))
    bm_3 = make_bookmark(url=url_3.to_url())
    set_bookmark(session, cache, test_user.user_uuid, bm_3)
    crawl_req_3 = CrawlRequest(
        crawl_uuid=uuid4(),
        requested=datetime(2018, 1, 3),
        got_response=False,
        url_uuid=url_3.url_uuid,
    )
    url_4 = SQLAUrl.from_url(
        URL.from_string(f"http://example.com/{random_string()}"))
    bm_4 = make_bookmark(url=url_4.to_url())
    set_bookmark(session, cache, test_user.user_uuid, bm_4)
    crawl_req_4 = CrawlRequest(  # type: ignore
        crawl_uuid=uuid4(),
        requested=datetime(2018, 1, 3),
        got_response=True,
        url_uuid=url_4.url_uuid,
        response_obj=CrawlResponse(body_uuid=uuid4(),
                                   headers={},
                                   status_code=404),
    )
    session.add_all([crawl_req_1, crawl_req_2, crawl_req_3, crawl_req_4])
    session.commit()

    rv = set(most_recent_successful_bookmark_crawls(session))
    assert crawl_req_1.crawl_uuid not in rv
    assert crawl_req_2.crawl_uuid in rv
    assert crawl_req_3.crawl_uuid not in rv
    assert crawl_req_4.crawl_uuid not in rv
Example #3
def test_tags_page(signed_in_client, test_user):
    # FIXME: include deleted, etc
    epoch_start = datetime(1970, 1, 1, tzinfo=timezone.utc)
    bm1 = make_bookmark(
        url=URL.from_string("http://example.com/pokemon"),
        title="Pokemon",
        tag_triples=frozenset([("pokemon", epoch_start, False)]),
    )
    bm2 = make_bookmark(
        url=URL.from_string("http://example.com/digimon"),
        title="Digimon",
        tag_triples=frozenset([("digimon", epoch_start, False)]),
    )

    sync_bookmarks(signed_in_client, test_user, [bm1, bm2])

    response = signed_in_client.get(
        flask.url_for("quarchive.user_tags", username=test_user.username))
    assert response.status_code == 200

    html_parser = etree.HTMLParser()
    root = etree.fromstring(response.get_data(), html_parser)
    tags = set([e.text for e in CSSSelector(".tag-link")(root)])

    assert {"pokemon", "digimon"} == tags
Example #4
def url(self) -> URL:
    if self.discussion.source == DiscussionSource.HN:
        return URL.from_string(
            f"https://news.ycombinator.com/item?id={self.discussion.external_id}"
        )
    else:
        return URL.from_string(
            f"https://old.reddit.com/{self.discussion.external_id}")
Example #5
def test_crawl_when_response_is_received(session, http_client, status_code,
                                         mock_s3, requests_mock):
    url = URL.from_string("http://example.com/" + random_string())
    upsert_url(session, url)

    requests_mock.add(responses.GET,
                      url.to_string(),
                      body=b"hello",
                      status=status_code,
                      stream=True)

    request = Request(verb=HTTPVerb.GET, url=url)
    response = crawler.crawl(session, http_client, request)

    sql_request = session.query(SQLACrawlRequest).get(response.crawl_uuid)
    sql_response = session.query(CrawlResponse).get(response.crawl_uuid)

    assert sql_request.requested == datetime(2018, 1, 3, tzinfo=timezone.utc)
    assert sql_request.got_response
    assert sql_response.status_code == status_code
    assert sql_response.crawl_uuid == sql_request.crawl_uuid
    assert sql_response.headers == {"content-type": "text/plain"}

    s3_obj = (file_storage.get_s3().Object(
        environ["QM_RESPONSE_BODY_BUCKET_NAME"],
        str(sql_response.body_uuid)).get())
    response_body = s3_obj["Body"].read()
    assert response_body == gzip.compress(b"hello")
Example #6
def test_from_string():
    url = URL.from_string("http://example.com/a?b=c#d")
    assert url.scheme == "http"
    assert url.netloc == "example.com"
    assert url.path == "/a"
    assert url.query == "b=c"
    assert url.fragment == "d"
Example #7
def best_icon(metadata: HTMLMetadata) -> Icon:
    """Will return the most suitable icon for our purposes, falling back to the
    domain level favicon.ico if nothing else is available."""
    # We don't currently consider SVG as we can't read them (yet)
    possible_icons = list(
        sorted(
            (i for i in metadata.icons if i.mimetype() != "image/svg+xml"),
            key=lambda i: (i.size_rank(), i.mimetype_rank()),
            reverse=True,
        ))
    if len(possible_icons) > 0:
        best_icon = possible_icons[0]
        if best_icon.size_rank() > 0:
            log.debug("picked %s as icon for %s", best_icon, metadata.url)
            return best_icon
        # otherwise the size is wonky, so skip it and use the fallback below
    url = metadata.url
    fallback_icon = Icon(
        url=URL.from_string(f"{url.scheme}://{url.netloc}/favicon.ico"),
        rel_text="shortcut icon",
        scope=IconScope.DOMAIN,
    )

    log.debug("no icons found on %s, falling back favicon.ico", metadata.url)
    return fallback_icon
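A minimal sketch of the fallback path above (a hypothetical test: the page URL is made up, and it assumes HTMLMetadata's remaining fields default to empty, as the upsert_metadata examples below suggest):
def test_best_icon_falls_back_to_domain_favicon():
    # hypothetical sketch: a page with no page-level icons at all
    metadata = HTMLMetadata(
        url=URL.from_string("http://example.com/some/page.html"),
        icons=[],
    )
    fallback = best_icon(metadata)
    # with nothing to pick from, the domain-level favicon.ico is returned
    assert fallback.url == URL.from_string("http://example.com/favicon.ico")
    assert fallback.scope == IconScope.DOMAIN
    assert fallback.rel_text == "shortcut icon"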
Example #8
def test_new_icon_found_for_page_url_duplicated_by_url(
        session, bg_client: TestAdapter[PickleMessage], mock_s3,
        requests_mock):
    """Test that when a new page icon is found that is the same icon by hash as
    an existing icon, that it is recorded."""
    page_url_1 = URL.from_string(
        f"http://{random_string()}.example.com/index.html")
    page_url_2 = page_url_1.follow("/otherindex.html")

    icon_url = page_url_1.follow("favicon1.png")

    hash_bytes = bytes(random.getrandbits(8) for _ in range(64))

    upsert_url(session, page_url_1)
    upsert_url(session, page_url_2)
    upsert_url(session, icon_url)
    icon_uuid = uuid4()
    session.add(Icon(icon_uuid=icon_uuid, source_blake2b_hash=hash_bytes))
    session.add(IconSource(icon_uuid=icon_uuid, url_uuid=icon_url.url_uuid))
    session.add(URLIcon(url_uuid=page_url_1.url_uuid, icon_uuid=icon_uuid))
    session.commit()

    event = NewIconFound(icon_url_uuid=icon_url.url_uuid,
                         page_url_uuid=page_url_2.url_uuid)
    bg_client.send(PickleMessage.from_obj(event))

    url_icon_obj_1, url_icon_obj_2 = (session.query(URLIcon).join(
        SQLAUrl, URLIcon.url_uuid == SQLAUrl.url_uuid).filter(
            SQLAUrl.netloc == page_url_1.netloc).order_by(SQLAUrl.path).all())

    assert url_icon_obj_1.icon == url_icon_obj_2.icon
    assert url_icon_obj_1.icon.source_blake2b_hash == hash_bytes
Example #9
def test_new_icon_found_domain_but_is_already_indexed(
        session, requests_mock, bg_client: TestAdapter[PickleMessage],
        mock_s3):
    icon_url = URL.from_string(
        f"http://{random_string()}.example.com/favicon.ico")
    image_buff = random_image_fileobj()
    hash_bytes = hashlib.blake2b(image_buff.read()).digest()
    image_buff.seek(0)
    requests_mock.add(
        responses.GET,
        url=icon_url.to_string(),
        body=image_buff.read(),
        status=200,
        stream=True,
    )
    requests_mock.start()

    upsert_url(session, icon_url)
    session.commit()

    event = NewIconFound(icon_url_uuid=icon_url.url_uuid)
    bg_client.send(PickleMessage.from_obj(event))
    bg_client.send(PickleMessage.from_obj(event))

    icon, domain_icon = (session.query(
        Icon, DomainIcon).join(DomainIcon).filter(
            DomainIcon.scheme == icon_url.scheme,
            DomainIcon.netloc == icon_url.netloc).one())
    assert icon.source_blake2b_hash == hash_bytes

    assert domain_icon.scheme == icon_url.scheme
    assert domain_icon.netloc == icon_url.netloc
Example #10
def test_index_requested_new_page_and_known_page_icon_url(
        session, bg_worker, mock_s3, requests_mock):
    """Test that when a page uses an icon url we already have in the index, we reuse it."""
    icon_url = URL.from_string(
        f"http://{random_string()}.example.com/favicon.png")
    icon_uuid = uuid4()
    html = f"""
    <html>
    <head>
    <link rel="icon" type="image/png" href="{icon_url.to_string()}">
    </head>
    </html>
    """

    sqla_url, crawl_req, crawl_resp = make_crawl_with_response(
        session, response_body=BytesIO(html.encode("utf-8")))
    image_buff = random_image_fileobj()
    hash_bytes = hashlib.blake2b(image_buff.read()).digest()
    upsert_url(session, icon_url)
    session.add(Icon(icon_uuid=icon_uuid, source_blake2b_hash=hash_bytes))
    session.add(IconSource(icon_uuid=icon_uuid, url_uuid=icon_url.url_uuid))
    session.commit()

    bg_worker.send(
        PickleMessage.from_obj(IndexRequested(crawl_resp.crawl_uuid)))

    url_icon = (session.query(URLIcon).filter(
        URLIcon.url_uuid == sqla_url.url_uuid).one())
    assert url_icon.url_uuid == sqla_url.url_uuid
    assert url_icon.icon_uuid == icon_uuid
Example #11
def reindex_url(url: str, log_level: str):
    url_obj = URL.from_string(url)
    Session = get_session_cls()
    with contextlib.closing(Session()) as session:
        crawl_uuid = get_most_recent_crawl(session, url_obj)
    publish_message(IndexRequested(crawl_uuid),
                    environ["QM_RABBITMQ_BG_WORKER_TOPIC"])
    log.info("requested index of %s (crawl_uuid: %s)", url_obj, crawl_uuid)
Example #12
def test_calpaterson():
    url = URL.from_string("http://calpaterson.com/calpaterson.html")
    with open(path.join(test_data_path, "calpaterson.html"), "rb") as html_f:
        metadata = extract_metadata_from_html(url, html_f)

    words = WORDS_REGEX.findall(metadata.text)  # type: ignore
    # pass/fail
    assert len(words) > 0
Example #13
def get_bookmark_by_url(session: Session, user_uuid: UUID,
                        url_string: str) -> Optional[Bookmark]:
    url = URL.from_string(url_string)
    sqla_bookmark = (session.query(SQLABookmark).filter(
        SQLABookmark.user_uuid == user_uuid,
        SQLABookmark.url_uuid == url.url_uuid).first())
    if sqla_bookmark is None:
        return None
    return bookmark_from_sqla(url, sqla_bookmark)
Example #14
def _discussion_from_child_data(self, child_data: Mapping) -> Discussion:
    return Discussion(
        external_id=child_data["id"],
        source=DiscussionSource.REDDIT,
        url=URL.from_string(child_data["url"], coerce_canonicalisation=True),
        comment_count=child_data["num_comments"],
        created_at=datetime.utcfromtimestamp(child_data["created_utc"]),
        title=f'{child_data["subreddit_name_prefixed"]}: {child_data["title"]}',
    )
Example #15
def get_archive_links(url: URL,
                      circa: Optional[datetime] = None
                      ) -> Mapping[Archive, URL]:
    if circa is None:
        circa = datetime.utcnow().replace(tzinfo=timezone.utc)

    # This is the internet archive's timestamp format, which archive_today
    # helpfully also supports
    ia_timestamp = circa.strftime("%Y%m%d%H%M%S")

    links = {}
    links[Archive.WAYBACK] = URL.from_string(
        f"https://web.archive.org/web/{ia_timestamp}/{url.to_string()}")
    links[Archive.ARCHIVE_TODAY] = URL.from_string(
        f"https://archive.today/{ia_timestamp}/{url.to_string()}")
    links[Archive.GOOGLE_CACHE] = URL.from_string(
        f"https://webcache.googleusercontent.com/search?q=cache:{url.to_string()}"
    )
    return links
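A quick usage sketch of get_archive_links (hypothetical URL and date), showing that the shared timestamp format ends up in the generated links:
def test_get_archive_links_timestamp_format():
    # hypothetical sketch: a fixed date so the timestamp is predictable
    links = get_archive_links(
        URL.from_string("http://example.com/"),
        circa=datetime(2018, 1, 3, tzinfo=timezone.utc),
    )
    assert set(links) == {
        Archive.WAYBACK,
        Archive.ARCHIVE_TODAY,
        Archive.GOOGLE_CACHE,
    }
    # datetime(2018, 1, 3) renders as "20180103000000"
    assert "20180103000000" in links[Archive.WAYBACK].to_string()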
Example #16
def extract_hn_discussions(response_body: Mapping) -> Iterator[Discussion]:
    log.debug("hn search api returned: %s", response_body)
    for hit in response_body["hits"]:
        yield Discussion(
            comment_count=hit.get("num_comments", 0) or 0,
            created_at=datetime.utcfromtimestamp(hit["created_at_i"]),
            external_id=hit["objectID"],
            title=hit.get("title", ""),
            url=URL.from_string(hit["url"], coerce_canonicalisation=True),
            source=DiscussionSource.HN,
        )
Example #17
def test_upsert_metadata_update(session, mock_s3):
    """Test upsert metadata with an new crawl where things have changed"""
    url, crawl_req, _ = make_crawl_with_response(session)
    link_1 = URL.from_string("http://example.com/" + random_string() + "/more")
    link_2 = URL.from_string("http://example.com/" + random_string() +
                             "/even-more")
    canon_1 = URL.from_string("http://example.com/" + random_string() +
                              "/index")

    metadata_1 = HTMLMetadata(
        url=url.to_url(),
        # icons=[],  # FIXME: try a page-level icon
        # title="Example page",
        # meta_desc="An example page",
        links={link_1, link_2},
        canonical=canon_1,
    )
    upsert_metadata(session, crawl_req.crawl_uuid, metadata_1)

    url, crawl_req, _ = make_crawl_with_response(session, url.to_url())
    link_3 = URL.from_string("http://example.com/" + random_string() +
                             "/yet-more")
    canon_2 = URL.from_string("http://example.com/" + random_string() +
                              "/index2")

    metadata_2 = HTMLMetadata(
        url=url.to_url(),
        # icons=[],  # FIXME: try a page-level icon
        # title="Example page",
        # meta_desc="An example page",
        canonical=canon_2,
        links={link_1, link_3},
    )
    upsert_metadata(session, crawl_req.crawl_uuid, metadata_2)

    sqla_url_obj = session.query(SQLAUrl).filter(
        SQLAUrl.url_uuid == url.url_uuid).one()
    link_urls = {o.to_url_obj.to_url() for o in sqla_url_obj.links}
    assert link_urls == {link_1, link_3}
    assert sqla_url_obj.canonical_url_obj.canonical_url_uuid == canon_2.url_uuid
Example #18
def test_new_icon_found_for_page_url_duplicated_by_content(
        session, requests_mock, bg_client: TestAdapter[PickleMessage],
        mock_s3):
    """Test that when a new page icon is found that is the same icon by hash as
    an existing icon, that it is recorded."""
    page_url_1 = URL.from_string(
        f"http://{random_string()}.example.com/index.html")
    page_url_2 = page_url_1.follow("/otherindex.html")

    icon_url_1 = page_url_1.follow("favicon1.png")
    icon_url_2 = page_url_2.follow("favicon2.png")

    image_buff = random_image_fileobj()
    hash_bytes = hashlib.blake2b(image_buff.read()).digest()
    image_buff.seek(0)
    requests_mock.add(
        responses.GET,
        url=icon_url_1.to_string(),
        body=image_buff.read(),
        status=200,
        stream=True,
    )
    image_buff.seek(0)
    requests_mock.add(
        responses.GET,
        url=icon_url_2.to_string(),
        body=image_buff.read(),
        status=200,
        stream=True,
    )
    requests_mock.start()

    upsert_url(session, page_url_1)
    upsert_url(session, page_url_2)
    upsert_url(session, icon_url_1)
    upsert_url(session, icon_url_2)
    session.commit()

    event = NewIconFound(icon_url_uuid=icon_url_1.url_uuid,
                         page_url_uuid=page_url_1.url_uuid)
    bg_client.send(PickleMessage.from_obj(event))

    event = NewIconFound(icon_url_uuid=icon_url_2.url_uuid,
                         page_url_uuid=page_url_2.url_uuid)
    bg_client.send(PickleMessage.from_obj(event))

    url_icon_obj_1, url_icon_obj_2 = (session.query(URLIcon).join(
        SQLAUrl, URLIcon.url_uuid == SQLAUrl.url_uuid).filter(
            SQLAUrl.netloc == page_url_1.netloc).order_by(SQLAUrl.path).all())

    assert url_icon_obj_1.icon == url_icon_obj_2.icon
    assert url_icon_obj_1.icon.source_blake2b_hash == hash_bytes
Example #19
def test_archives(signed_in_client, test_user):
    url = URL.from_string("http://example.com/")
    bm = make_bookmark(url=url, created=datetime(2018, 1, 3))
    sync_bookmarks(signed_in_client, test_user, [bm])

    archive_response = signed_in_client.get(
        flask.url_for(
            "quarchive.bookmark_archives",
            url_uuid=url.url_uuid,
            username=test_user.username,
        ))

    assert archive_response.status_code == 200
Example #20
def bookmark_from_sqla(url: str, sqla_obj: SQLABookmark) -> Bookmark:
    return Bookmark(
        url=URL.from_string(url),
        created=sqla_obj.created,
        description=sqla_obj.description,
        updated=sqla_obj.updated,
        unread=sqla_obj.unread,
        deleted=sqla_obj.deleted,
        title=sqla_obj.title,
        tag_triples=frozenset(
            (btag.tag_obj.tag_name, btag.updated, btag.deleted)
            for btag in sqla_obj.bookmark_tag_objs),
    )
Example #21
def upsert_url(session: Session, url_string: str) -> UUID:
    url = URL.from_string(url_string)
    url_stmt = (pg_insert(SQLAUrl.__table__).values(
        url_uuid=url.url_uuid,
        scheme=url.scheme,
        netloc=url.netloc,
        path=url.path,
        query=url.query,
        fragment=url.fragment,
    ).on_conflict_do_nothing(
        index_elements=["scheme", "netloc", "path", "query", "fragment"]))
    session.execute(url_stmt)

    return url.url_uuid
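A short sketch of the idempotency this gives (assuming, as the other examples rely on, that URL.from_string derives the same url_uuid for the same string): a repeated call conflicts on the URL columns, inserts nothing, and returns the same UUID.
def test_upsert_url_is_idempotent(session):
    # hypothetical sketch; relies on url_uuid being deterministic per URL
    url_string = f"http://example.com/{random_string()}"
    first = upsert_url(session, url_string)
    second = upsert_url(session, url_string)  # conflict: nothing new inserted
    assert first == second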
Example #22
def test_upsert_metadata_wholly_new(session, mock_s3):
    """Test upsert_metadata called with a wholly new index"""
    url, crawl_req, _ = make_crawl_with_response(session)
    link_1 = URL.from_string("http://example.com/" + random_string() + "/more")
    link_2 = URL.from_string("http://example.com/" + random_string() +
                             "/even-more")
    canon = URL.from_string("http://example.com/" + random_string() + "/index")

    metadata = HTMLMetadata(
        url=url.to_url(),
        # icons=[],  # FIXME: try a page-level icon
        # title="Example page",
        # meta_desc="An example page",
        links={link_1, link_2},
        canonical=canon,
    )
    upsert_metadata(session, crawl_req.crawl_uuid, metadata)

    sqla_url_obj = session.query(SQLAUrl).filter(
        SQLAUrl.url_uuid == url.url_uuid).one()
    link_urls = {o.to_url_obj.to_url() for o in sqla_url_obj.links}
    assert link_urls == {link_1, link_2}
    assert sqla_url_obj.canonical_url_obj.canonical_url_uuid == canon.url_uuid
Example #23
def test_simple():
    url = URL.from_string("http://example.com/webpage-with-full-metadata.html")
    with open(
        path.join(test_data_path, "webpage-with-full-metadata.html"), "rb"
    ) as html_f:
        metadata = extract_metadata_from_html(url, html_f)

    text_words = set(WORDS_REGEX.findall(metadata.text))  # type: ignore
    assert "Simple" in text_words
    assert {"This", "is", "a", "basic", "html", "document"} <= text_words

    meta_words = set(WORDS_REGEX.findall(metadata.meta_desc))  # type: ignore
    assert {"some", "meta", "description"} == meta_words

    assert metadata.url == url
    assert set(metadata.icons) == set(
        [
            Icon(
                url=URL.from_string("http://example.com/favicon.png"),
                scope=IconScope.PAGE,
                type="image/png",
                rel_text="icon",
            ),
            Icon(
                url=URL.from_string("http://example.com/favicon-somewhere.ico"),
                scope=IconScope.PAGE,
                rel_text="shortcut icon",
            ),
            Icon(
                url=URL.from_string("http://example.com/apple-touch-icon.png"),
                scope=IconScope.PAGE,
                rel_text="apple-touch-icon",
                sizes="152x152",
            ),
            Icon(
                url=URL.from_string("http://example.com/favicon-alternative.png"),
                scope=IconScope.PAGE,
                rel_text="alternate icon",
            ),
        ]
    )
    assert metadata.canonical == URL.from_string("http://example.com/simple")
    assert metadata.title == "Simple"
    assert metadata.links == {URL.from_string("http://example.com/other")}
    assert metadata.meta_desc == "some meta description"
    assert metadata.headings == {
        "h1": ["This document"],
        "h2": ["Other documents"],
    }
Example #24
def create_bookmark(username: str) -> flask.Response:
    owner = get_user_or_fail(db.session, username)
    # FIXME: sort out optional url_uuid
    require_access_or_fail(
        UserBookmarksAccessObject(user_uuid=owner.user_uuid),
        Access.WRITE,
    )
    form = flask.request.form
    creation_time = datetime.utcnow().replace(tzinfo=timezone.utc)
    tag_triples = tag_triples_from_form(form)

    url_str = form["url"]
    try:
        # As it's a user entering this url, help them along with getting a
        # sufficiently canonicalised url
        url = URL.from_string(url_str, coerce_canonicalisation=True)
    except DisallowedSchemeException:
        log.warning("user tried to create url: %s (disallowed scheme)",
                    url_str)
        flask.abort(400, "invalid url (disallowed scheme)")

    bookmark = Bookmark(
        url=url,
        title=form["title"],
        description=form["description"],
        unread="unread" in form,
        deleted=False,
        updated=creation_time,
        created=creation_time,
        tag_triples=tag_triples,
    )
    url_uuid = set_bookmark(db.session, get_cache(), owner.user_uuid, bookmark)
    db.session.commit()
    publish_message(
        message_lib.BookmarkCreated(user_uuid=owner.user_uuid,
                                    url_uuid=url.url_uuid),
        environ["QM_RABBITMQ_BG_WORKER_TOPIC"],
    )
    flask.flash("Bookmarked: %s" % bookmark.title)
    response = flask.make_response("Redirecting...", 303)
    response.headers["Location"] = flask.url_for(
        "quarchive.edit_bookmark_form",
        url_uuid=url_uuid,
        username=owner.username,
    )
    return response
Example #25
def test_crawl_when_no_response(session, http_client, requests_mock):
    url = URL.from_string("http://example.com/" + random_string())
    upsert_url(session, url)

    requests_mock.add(
        responses.GET,
        url.to_string(),
        body=requests.exceptions.ConnectTimeout("connect timeout"),
    )

    response = crawler.crawl(session, http_client,
                             Request(verb=HTTPVerb.GET, url=url))

    sql_request = session.query(SQLACrawlRequest).get(response.crawl_uuid)
    sql_response = session.query(CrawlResponse).get(response.crawl_uuid)
    assert sql_request is not None
    assert sql_response is None
Example #26
def test_new_icon_found_for_page_icon(session, requests_mock,
                                      bg_client: TestAdapter[PickleMessage],
                                      mock_s3):
    """Test that when a new page icon is found (that doesn't match any existing
    icons) that it is retrieved, indexed and stored.

    """
    url = URL.from_string(f"http://{random_string()}.example.com/")
    icon_url = url.follow("/favicon.png")
    image_buff = random_image_fileobj()
    hash_bytes = hashlib.blake2b(image_buff.read()).digest()
    image_buff.seek(0)
    requests_mock.add(
        responses.GET,
        url=icon_url.to_string(),
        body=image_buff.read(),
        status=200,
        stream=True,
    )
    requests_mock.start()

    upsert_url(session, url)
    upsert_url(session, icon_url)
    session.commit()

    event = NewIconFound(icon_url_uuid=icon_url.url_uuid,
                         page_url_uuid=url.url_uuid)
    bg_client.send(PickleMessage.from_obj(event))

    icon, url_icon = (session.query(Icon, URLIcon).join(URLIcon).filter(
        URLIcon.url_uuid == url.url_uuid).first())
    assert icon.source_blake2b_hash == hash_bytes

    assert url_icon.url_uuid == url.url_uuid

    icon_bucket = file_storage.get_icon_bucket()
    (s3_obj, ) = list(
        icon_bucket.objects.filter(Prefix=f"{icon.icon_uuid}.png"))
    assert s3_obj.key == f"{icon.icon_uuid}.png"
    response = s3_obj.get()
    assert response["ResponseMetadata"]["HTTPHeaders"][
        "content-type"] == "image/png"
Example #27
def test_icon_uuids_url_icon(session, test_user, canonical_url):
    bm1 = make_bookmark()
    set_bookmark(session, test_user.user_uuid, bm1)
    if canonical_url:
        canonical_url = bm1.url.follow("canonical.html")
        upsert_url(session, canonical_url)
        session.add(
            CanonicalUrl(
                non_canonical_url_uuid=bm1.url.url_uuid,
                canonical_url_uuid=canonical_url.url_uuid,
            )
        )

    icon_url = URL.from_string("http://example.com/" + random_string() + "/icon.png")
    upsert_url(session, icon_url)
    random_hash = random_bytes(64)
    icon_uuid = record_page_icon(session, icon_url, bm1.url, random_hash)

    (bm1_view,) = (f for f in BookmarkViewQueryBuilder(session, test_user).execute())
    assert bm1_view.icon_uuid == icon_uuid
Example #28
def test_index_requested_new_page_and_new_page_icon(session, bg_worker,
                                                    mock_s3, requests_mock):
    """Test that new pages are indexed properly and their icon is downloaded."""
    icon_url = URL.from_string(
        f"http://{random_string()}.example.com/favicon.png")
    html = f"""
    <html>
    <head>
    <link rel="icon" type="image/png" href="{icon_url.to_string()}">
    </head>
    </html>
    """

    sqla_url, crawl_req, crawl_resp = make_crawl_with_response(
        session, response_body=BytesIO(html.encode("utf-8")))
    session.commit()

    image_buff = random_image_fileobj()
    hash_bytes = hashlib.blake2b(image_buff.read()).digest()
    image_buff.seek(0)
    requests_mock.add(
        responses.GET,
        url=icon_url.to_string(),
        body=image_buff.read(),
        status=200,
        stream=True,
    )

    bg_worker.send(
        PickleMessage.from_obj(IndexRequested(crawl_resp.crawl_uuid)))

    fulltext_exists = session.query(
        session.query(FullText).filter(
            FullText.crawl_uuid == crawl_req.crawl_uuid).exists()).scalar()

    icon_exists = session.query(
        session.query(Icon).filter(
            Icon.source_blake2b_hash == hash_bytes).exists()).scalar()

    assert fulltext_exists, "crawl not indexed!"
    assert icon_exists, "icon not crawled!"
Example #29
def test_crawl_requested(session, bg_worker, mock_s3, requests_mock):
    url = URL.from_string("http://example.com/" + random_string())
    requests_mock.add(
        responses.GET,
        url=url.to_string(),
        body="Hello!",
        status=200,
        stream=True,
    )

    bg_worker.send(
        PickleMessage.from_obj(
            CrawlRequested(
                CrawlRequest(
                    request=Request(HTTPVerb.GET, url=url),
                    reason=BookmarkCrawlReason(),
                ))))
    response_exists = session.query(
        session.query(SQLACrawlResponse).join(SQLACrawlRequest).join(SQLAUrl).
        filter(SQLAUrl.url_uuid == url.url_uuid).exists()).scalar()
    assert response_exists
Example #30
def create_bookmark() -> flask.Response:
    form = flask.request.form
    creation_time = datetime.utcnow().replace(tzinfo=timezone.utc)
    tag_triples = tag_triples_from_form(form)
    bookmark = Bookmark(
        url=URL.from_string(form["url"]),
        title=form["title"],
        description=form["description"],
        unread="unread" in form,
        deleted=False,
        updated=creation_time,
        created=creation_time,
        tag_triples=tag_triples,
    )
    url_uuid = set_bookmark(db.session, get_current_user().user_uuid, bookmark)
    db.session.commit()
    flask.flash("Bookmarked: %s" % bookmark.title)
    response = flask.make_response("Redirecting...", 303)
    response.headers["Location"] = flask.url_for(
        "quarchive.edit_bookmark_form", url_uuid=url_uuid
    )
    return response