Example 1
def test_user_netloc_page(signed_in_client, test_user):
    epoch_start = datetime(1970, 1, 1, tzinfo=timezone.utc)
    bm1 = make_bookmark(
        url=URL.from_string("http://pokemon.example.com/"),
        title="Pokemon",
        tag_triples=frozenset([("pokemon", epoch_start, False)]),
    )
    bm2 = make_bookmark(
        url=URL.from_string("http://digimon.example.com/"),
        title="Digimon",
        tag_triples=frozenset([("digimon", epoch_start, False)]),
    )

    sync_bookmarks(signed_in_client, test_user, [bm1, bm2])

    response = signed_in_client.get(
        flask.url_for(
            "quarchive.user_netloc",
            username=test_user.username,
            netloc="pokemon.example.com",
        )
    )
    assert response.status_code == 200

    (present,) = get_bookmarks_from_response(response)
    assert present["url"] == "http://pokemon.example.com/"
    assert present["title"] == "Pokemon"
Example 2
def test_most_recent_successful_crawls(session, cache, test_user):
    # Crawl 1 is an old crawl of url 1 (should not be present)
    # Crawl 2 is a more recent crawl of url 1
    # Crawl 3 is a crawl of url 3 that didn't get a response
    # Crawl 4 is a crawl of url 4 that returned a non-2xx status code
    # Only crawl 2 should be present

    url_1 = SQLAUrl.from_url(
        URL.from_string(f"http://example.com/{random_string()}"))
    bm_1 = make_bookmark(url=url_1.to_url())
    set_bookmark(session, cache, test_user.user_uuid, bm_1)
    crawl_req_1 = CrawlRequest(  # type: ignore
        crawl_uuid=uuid4(),
        requested=datetime(2018, 1, 3),
        got_response=True,
        url_uuid=url_1.url_uuid,
        response_obj=CrawlResponse(body_uuid=uuid4(),
                                   headers={},
                                   status_code=200),
    )
    crawl_req_2 = CrawlRequest(  # type: ignore
        crawl_uuid=uuid4(),
        requested=datetime(2018, 1, 4),
        got_response=True,
        url_uuid=url_1.url_uuid,
        response_obj=CrawlResponse(body_uuid=uuid4(),
                                   headers={},
                                   status_code=200),
    )
    url_3 = SQLAUrl.from_url(
        URL.from_string(f"http://example.com/{random_string()}"))
    bm_3 = make_bookmark(url=url_3.to_url())
    set_bookmark(session, cache, test_user.user_uuid, bm_3)
    crawl_req_3 = CrawlRequest(
        crawl_uuid=uuid4(),
        requested=datetime(2018, 1, 3),
        got_response=False,
        url_uuid=url_3.url_uuid,
    )
    url_4 = SQLAUrl.from_url(
        URL.from_string(f"http://example.com/{random_string()}"))
    bm_4 = make_bookmark(url=url_4.to_url())
    set_bookmark(session, cache, test_user.user_uuid, bm_4)
    crawl_req_4 = CrawlRequest(  # type: ignore
        crawl_uuid=uuid4(),
        requested=datetime(2018, 1, 3),
        got_response=True,
        url_uuid=url_4.url_uuid,
        response_obj=CrawlResponse(body_uuid=uuid4(),
                                   headers={},
                                   status_code=404),
    )
    session.add_all([crawl_req_1, crawl_req_2, crawl_req_3, crawl_req_4])
    session.commit()

    rv = set(most_recent_successful_bookmark_crawls(session))
    assert crawl_req_1.crawl_uuid not in rv
    assert crawl_req_2.crawl_uuid in rv
    assert crawl_req_3.crawl_uuid not in rv
    assert crawl_req_4.crawl_uuid not in rv
Example 3
def test_tags_page(signed_in_client, test_user):
    # FIXME: include deleted, etc
    epoch_start = datetime(1970, 1, 1, tzinfo=timezone.utc)
    bm1 = make_bookmark(
        url=URL.from_string("http://example.com/pokemon"),
        title="Pokemon",
        tag_triples=frozenset([("pokemon", epoch_start, False)]),
    )
    bm2 = make_bookmark(
        url=URL.from_string("http://example.com/digimon"),
        title="Digimon",
        tag_triples=frozenset([("digimon", epoch_start, False)]),
    )

    sync_bookmarks(signed_in_client, test_user, [bm1, bm2])

    response = signed_in_client.get(
        flask.url_for("quarchive.user_tags", username=test_user.username))
    assert response.status_code == 200

    html_parser = etree.HTMLParser()
    root = etree.fromstring(response.get_data(), html_parser)
    tags = set([e.text for e in CSSSelector(".tag-link")(root)])

    assert {"pokemon", "digimon"} == tags
Example 4
 def url(self) -> URL:
     if self.discussion.source == DiscussionSource.HN:
         return URL.from_string(
             f"https://news.ycombinator.com/item?id={self.discussion.external_id}"
         )
     else:
         return URL.from_string(
             f"https://old.reddit.com/{self.discussion.external_id}")
Example 5
def test_new_icon_found_for_page_url_duplicated_by_url(
        session, bg_client: TestAdapter[PickleMessage], mock_s3,
        requests_mock):
    """Test that when a new page icon is found that is the same icon by hash as
    an existing icon, that it is recorded."""
    page_url_1 = URL.from_string(
        f"http://{random_string()}.example.com/index.html")
    page_url_2 = page_url_1.follow("/otherindex.html")

    icon_url = page_url_1.follow("favicon1.png")

    hash_bytes = bytes(random.getrandbits(8) for _ in range(64))

    upsert_url(session, page_url_1)
    upsert_url(session, page_url_2)
    upsert_url(session, icon_url)
    icon_uuid = uuid4()
    session.add(Icon(icon_uuid=icon_uuid, source_blake2b_hash=hash_bytes))
    session.add(IconSource(icon_uuid=icon_uuid, url_uuid=icon_url.url_uuid))
    session.add(URLIcon(url_uuid=page_url_1.url_uuid, icon_uuid=icon_uuid))
    session.commit()

    event = NewIconFound(icon_url_uuid=icon_url.url_uuid,
                         page_url_uuid=page_url_2.url_uuid)
    bg_client.send(PickleMessage.from_obj(event))

    url_icon_obj_1, url_icon_obj_2 = (session.query(URLIcon).join(
        SQLAUrl, URLIcon.url_uuid == SQLAUrl.url_uuid).filter(
            SQLAUrl.netloc == page_url_1.netloc).order_by(SQLAUrl.path).all())

    assert url_icon_obj_1.icon == url_icon_obj_2.icon
    assert url_icon_obj_1.icon.source_blake2b_hash == hash_bytes
Example 6
def test_index_requested_new_page_and_known_page_icon_url(
        session, bg_worker, mock_s3, requests_mock):
    """Test that when a page uses an icon url we already have in the index, we reuse it."""
    icon_url = URL.from_string(
        f"http://{random_string()}.example.com/favicon.png")
    icon_uuid = uuid4()
    html = f"""
    <html>
    <head>
    <link rel="icon" type="image/png" href="{icon_url.to_string()}">
    </head>
    </html>
    """

    sqla_url, crawl_req, crawl_resp = make_crawl_with_response(
        session, response_body=BytesIO(html.encode("utf-8")))
    image_buff = random_image_fileobj()
    hash_bytes = hashlib.blake2b(image_buff.read()).digest()
    upsert_url(session, icon_url)
    session.add(Icon(icon_uuid=icon_uuid, source_blake2b_hash=hash_bytes))
    session.add(IconSource(icon_uuid=icon_uuid, url_uuid=icon_url.url_uuid))
    session.commit()

    bg_worker.send(
        PickleMessage.from_obj(IndexRequested(crawl_resp.crawl_uuid)))

    url_icon = (session.query(URLIcon).filter(
        URLIcon.url_uuid == sqla_url.url_uuid).one())
    assert url_icon.url_uuid == sqla_url.url_uuid
    assert url_icon.icon_uuid == icon_uuid
Example 7
def test_new_icon_found_domain_but_is_already_indexed(
        session, requests_mock, bg_client: TestAdapter[PickleMessage],
        mock_s3):
    icon_url = URL.from_string(
        f"http://{random_string()}.example.com/favicon.ico")
    image_buff = random_image_fileobj()
    hash_bytes = hashlib.blake2b(image_buff.read()).digest()
    image_buff.seek(0)
    requests_mock.add(
        responses.GET,
        url=icon_url.to_string(),
        body=image_buff.read(),
        status=200,
        stream=True,
    )
    requests_mock.start()

    upsert_url(session, icon_url)
    session.commit()

    event = NewIconFound(icon_url_uuid=icon_url.url_uuid)
    bg_client.send(PickleMessage.from_obj(event))
    bg_client.send(PickleMessage.from_obj(event))

    icon, domain_icon = (session.query(
        Icon, DomainIcon).join(DomainIcon).filter(
            DomainIcon.scheme == icon_url.scheme,
            DomainIcon.netloc == icon_url.netloc).one())
    assert icon.source_blake2b_hash == hash_bytes

    assert domain_icon.scheme == icon_url.scheme
    assert domain_icon.netloc == icon_url.netloc
Example 8
def test_crawl_when_response_is_received(session, http_client, status_code,
                                         mock_s3, requests_mock):
    url = URL.from_string("http://example.com/" + random_string())
    upsert_url(session, url)

    requests_mock.add(responses.GET,
                      url.to_string(),
                      body=b"hello",
                      status=status_code,
                      stream=True)

    request = Request(verb=HTTPVerb.GET, url=url)
    response = crawler.crawl(session, http_client, request)

    sql_request = session.query(SQLACrawlRequest).get(response.crawl_uuid)
    sql_response = session.query(CrawlResponse).get(response.crawl_uuid)

    assert sql_request.requested == datetime(2018, 1, 3, tzinfo=timezone.utc)
    assert sql_request.got_response
    assert sql_response.status_code == status_code
    assert sql_response.crawl_uuid == sql_request.crawl_uuid
    assert sql_response.headers == {"content-type": "text/plain"}

    s3_obj = (file_storage.get_s3().Object(
        environ["QM_RESPONSE_BODY_BUCKET_NAME"],
        str(sql_response.body_uuid)).get())
    response_body = s3_obj["Body"].read()
    assert response_body == gzip.compress(b"hello")
Example 9
def best_icon(metadata: HTMLMetadata) -> Icon:
    """Will return the most suitable icon for our purposes, falling back to the
    domain level favicon.ico if nothing else is available."""
    # We don't currently consider SVG as we can't read them (yet)
    possible_icons = list(
        sorted(
            (i for i in metadata.icons if i.mimetype() != "image/svg+xml"),
            key=lambda i: (i.size_rank(), i.mimetype_rank()),
            reverse=True,
        ))
    if len(possible_icons) > 0:
        best_icon = possible_icons[0]
        if best_icon.size_rank() > 0:
            # Only use icons whose size isn't wonky; otherwise fall through
            # to the domain-level fallback below
            log.debug("picked %s as icon for %s", best_icon, metadata.url)
            return best_icon
    url = metadata.url
    fallback_icon = Icon(
        url=URL.from_string(f"{url.scheme}://{url.netloc}/favicon.ico"),
        rel_text="shortcut icon",
        scope=IconScope.DOMAIN,
    )

    log.debug("no icons found on %s, falling back favicon.ico", metadata.url)
    return fallback_icon
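
A minimal sketch of the fallback path, assuming HTMLMetadata's other fields are optional and that Icon/IconScope behave as in the surrounding examples (the page URL below is hypothetical):

# Hypothetical page that declared no icons at all: best_icon falls back to the
# domain-level favicon.ico.
metadata = HTMLMetadata(
    url=URL.from_string("http://example.com/article.html"), icons=[])
fallback = best_icon(metadata)
assert fallback.url == URL.from_string("http://example.com/favicon.ico")
assert fallback.scope == IconScope.DOMAIN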
Example 10
def test_from_string():
    url = URL.from_string("http://example.com/a?b=c#d")
    assert url.scheme == "http"
    assert url.netloc == "example.com"
    assert url.path == "/a"
    assert url.query == "b=c"
    assert url.fragment == "d"
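
The to_string() calls used throughout the other examples are presumably the inverse operation; a hedged round-trip sketch:

# Assumption: to_string() reassembles exactly the components parsed above.
assert URL.from_string("http://example.com/a?b=c#d").to_string() == (
    "http://example.com/a?b=c#d")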
Example 11
def test_calpaterson():
    url = URL.from_string("http://calpaterson.com/calpaterson.html")
    with open(path.join(test_data_path, "calpaterson.html"), "rb") as html_f:
        metadata = extract_metadata_from_html(url, html_f)

    words = WORDS_REGEX.findall(metadata.text)  # type: ignore
    # pass/fail
    assert len(words) > 0
Example 12
def reindex_url(url: str, log_level: str):
    url_obj = URL.from_string(url)
    Session = get_session_cls()
    with contextlib.closing(Session()) as session:
        crawl_uuid = get_most_recent_crawl(session, url_obj)
    publish_message(IndexRequested(crawl_uuid),
                    environ["QM_RABBITMQ_BG_WORKER_TOPIC"])
    log.info("requested index of %s (crawl_uuid: %s)", url_obj, crawl_uuid)
Example 13
 def to_url(self) -> URL:
     return URL(
         self.url_uuid,
         self.scheme,
         self.netloc,
         self.path,
         self.query,
         self.fragment,
     )
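
A hedged round-trip sketch, assuming SQLAUrl.from_url (used in the test examples above) populates the same columns that to_url reads back:

# Assumption: from_url/to_url round-trip and URL instances compare by value.
url = URL.from_string("http://example.com/a?b=c#d")
assert SQLAUrl.from_url(url).to_url() == url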
Example 14
 def _discussion_from_child_data(self, child_data: Mapping) -> Discussion:
     return Discussion(
         external_id=child_data["id"],
         source=DiscussionSource.REDDIT,
         url=URL.from_string(child_data["url"], coerce_canonicalisation=True),
         comment_count=child_data["num_comments"],
         created_at=datetime.utcfromtimestamp(child_data["created_utc"]),
         title=f'{child_data["subreddit_name_prefixed"]}: {child_data["title"]}',
     )
Example 15
def get_bookmark_by_url(session: Session, user_uuid: UUID,
                        url_string: str) -> Optional[Bookmark]:
    url = URL.from_string(url_string)
    sqla_bookmark = (session.query(SQLABookmark).filter(
        SQLABookmark.user_uuid == user_uuid,
        SQLABookmark.url_uuid == url.url_uuid).first())
    if sqla_bookmark is None:
        return None
    return bookmark_from_sqla(url, sqla_bookmark)
Example 16
def hn_turn_page(url: URL, response_body: Mapping) -> Optional[URL]:
    final_page = response_body["nbPages"] - 1
    current_page = response_body["page"]
    if current_page < final_page:
        q_dict = parse_qs(url.query)
        q_dict["page"] = current_page + 1
        new_url = url.follow("?" + urlencode(q_dict, doseq=True))
        return new_url
    return None
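
A minimal sketch of the paging behaviour with hypothetical Algolia-style response bodies (only the page and nbPages keys matter here; the search URL is an assumption):

url = URL.from_string(
    "https://hn.algolia.com/api/v1/search?query=example&page=0")
# Not yet on the final page: the query string is advanced to page=1.
next_url = hn_turn_page(url, {"page": 0, "nbPages": 3})
assert next_url is not None and "page=1" in next_url.query
# Already on the final page (page 2 of 3): no further URL is returned.
assert hn_turn_page(url, {"page": 2, "nbPages": 3}) is None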
Example 17
def extract_links(root, url: URL) -> Set[URL]:
    rv: Set[URL] = set()
    for anchor in root.xpath("//a"):
        if "href" in anchor.attrib:
            href: str = anchor.attrib["href"]
            try:
                rv.add(url.follow(href, coerce_canonicalisation=True))
            except URLException:
                log.debug("bad link: %s (from: %s)", href, url)
    return rv
Example 18
def get_archive_links(url: URL,
                      circa: Optional[datetime] = None
                      ) -> Mapping[Archive, URL]:
    if circa is None:
        circa = datetime.utcnow().replace(tzinfo=timezone.utc)

    # This is the internet archive's timestamp format, which archive_today
    # helpfully also supports
    ia_timestamp = circa.strftime("%Y%m%d%H%M%S")

    links = {}
    links[Archive.WAYBACK] = URL.from_string(
        f"https://web.archive.org/web/{ia_timestamp}/{url.to_string()}")
    links[Archive.ARCHIVE_TODAY] = URL.from_string(
        f"https://archive.today/{ia_timestamp}/{url.to_string()}")
    links[Archive.GOOGLE_CACHE] = URL.from_string(
        f"https://webcache.googleusercontent.com/search?q=cache:{url.to_string()}"
    )
    return links
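
To make the timestamp format concrete, a hedged example with circa fixed at 2018-01-03T00:00:00Z (so ia_timestamp is "20180103000000"):

# Illustrative only: checks the timestamp embedded in the Wayback-style link,
# e.g. https://web.archive.org/web/20180103000000/http://example.com/page
links = get_archive_links(URL.from_string("http://example.com/page"),
                          circa=datetime(2018, 1, 3, tzinfo=timezone.utc))
assert "20180103000000" in links[Archive.WAYBACK].to_string()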
Example 19
def extract_hn_discussions(response_body: Mapping) -> Iterator[Discussion]:
    log.debug("hn search api returned: %s", response_body)
    for hit in response_body["hits"]:
        yield Discussion(
            comment_count=hit.get("num_comments", 0) or 0,
            created_at=datetime.utcfromtimestamp(hit["created_at_i"]),
            external_id=hit["objectID"],
            title=hit.get("title", ""),
            url=URL.from_string(hit["url"], coerce_canonicalisation=True),
            source=DiscussionSource.HN,
        )
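
A minimal sketch feeding one hypothetical Algolia hit through the generator:

# Hypothetical hit; 1514764800 is 2018-01-01T00:00:00Z.
(discussion,) = extract_hn_discussions({
    "hits": [{
        "objectID": "123",
        "created_at_i": 1514764800,
        "num_comments": 2,
        "title": "Example",
        "url": "http://example.com/",
    }]
})
assert discussion.source == DiscussionSource.HN
assert discussion.external_id == "123"
assert discussion.created_at == datetime(2018, 1, 1)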
Example 20
def test_upsert_metadata_update(session, mock_s3):
    """Test upsert metadata with an new crawl where things have changed"""
    url, crawl_req, _ = make_crawl_with_response(session)
    link_1 = URL.from_string("http://example.com/" + random_string() + "/more")
    link_2 = URL.from_string("http://example.com/" + random_string() +
                             "/even-more")
    canon_1 = URL.from_string("http://example.com/" + random_string() +
                              "/index")

    metadata_1 = HTMLMetadata(
        url=url.to_url(),
        # icons=[],  # FIXME: try a page-level icon
        # title="Example page",
        # meta_desc="An example page",
        links={link_1, link_2},
        canonical=canon_1,
    )
    upsert_metadata(session, crawl_req.crawl_uuid, metadata_1)

    url, crawl_req, _ = make_crawl_with_response(session, url.to_url())
    link_3 = URL.from_string("http://example.com/" + random_string() +
                             "/yet-more")
    canon_2 = URL.from_string("http://example.com/" + random_string() +
                              "/index2")

    metadata_2 = HTMLMetadata(
        url=url.to_url(),
        # icons=[],  # FIXME: try a page-level icon
        # title="Example page",
        # meta_desc="An example page",
        canonical=canon_2,
        links={link_1, link_3},
    )
    upsert_metadata(session, crawl_req.crawl_uuid, metadata_2)

    sqla_url_obj = session.query(SQLAUrl).filter(
        SQLAUrl.url_uuid == url.url_uuid).one()
    link_urls = {o.to_url_obj.to_url() for o in sqla_url_obj.links}
    assert link_urls == {link_1, link_3}
    assert sqla_url_obj.canonical_url_obj.canonical_url_uuid == canon_2.url_uuid
Example 21
def test_new_icon_found_for_page_url_duplicated_by_content(
        session, requests_mock, bg_client: TestAdapter[PickleMessage],
        mock_s3):
    """Test that when a new page icon is found that is the same icon by hash as
    an existing icon, that it is recorded."""
    page_url_1 = URL.from_string(
        f"http://{random_string()}.example.com/index.html")
    page_url_2 = page_url_1.follow("/otherindex.html")

    icon_url_1 = page_url_1.follow("favicon1.png")
    icon_url_2 = page_url_2.follow("favicon2.png")

    image_buff = random_image_fileobj()
    hash_bytes = hashlib.blake2b(image_buff.read()).digest()
    image_buff.seek(0)
    requests_mock.add(
        responses.GET,
        url=icon_url_1.to_string(),
        body=image_buff.read(),
        status=200,
        stream=True,
    )
    image_buff.seek(0)
    requests_mock.add(
        responses.GET,
        url=icon_url_2.to_string(),
        body=image_buff.read(),
        status=200,
        stream=True,
    )
    requests_mock.start()

    upsert_url(session, page_url_1)
    upsert_url(session, page_url_2)
    upsert_url(session, icon_url_1)
    upsert_url(session, icon_url_2)
    session.commit()

    event = NewIconFound(icon_url_uuid=icon_url_1.url_uuid,
                         page_url_uuid=page_url_1.url_uuid)
    bg_client.send(PickleMessage.from_obj(event))

    event = NewIconFound(icon_url_uuid=icon_url_2.url_uuid,
                         page_url_uuid=page_url_2.url_uuid)
    bg_client.send(PickleMessage.from_obj(event))

    url_icon_obj_1, url_icon_obj_2 = (session.query(URLIcon).join(
        SQLAUrl, URLIcon.url_uuid == SQLAUrl.url_uuid).filter(
            SQLAUrl.netloc == page_url_1.netloc).order_by(SQLAUrl.path).all())

    assert url_icon_obj_1.icon == url_icon_obj_2.icon
    assert url_icon_obj_1.icon.source_blake2b_hash == hash_bytes
Example 22
def test_archives(signed_in_client, test_user):
    url = URL.from_string("http://example.com/")
    bm = make_bookmark(url=url, created=datetime(2018, 1, 3))
    sync_bookmarks(signed_in_client, test_user, [bm])

    archive_response = signed_in_client.get(
        flask.url_for(
            "quarchive.bookmark_archives",
            url_uuid=url.url_uuid,
            username=test_user.username,
        ))

    assert archive_response.status_code == 200
Example 23
def bookmark_from_sqla(url: str, sqla_obj: SQLABookmark) -> Bookmark:
    return Bookmark(
        url=URL.from_string(url),
        created=sqla_obj.created,
        description=sqla_obj.description,
        updated=sqla_obj.updated,
        unread=sqla_obj.unread,
        deleted=sqla_obj.deleted,
        title=sqla_obj.title,
        tag_triples=frozenset(
            (btag.tag_obj.tag_name, btag.updated, btag.deleted)
            for btag in sqla_obj.bookmark_tag_objs),
    )
Example 24
def upsert_url(session: Session, url_string: str) -> UUID:
    url = URL.from_string(url_string)
    url_stmt = (pg_insert(SQLAUrl.__table__).values(
        url_uuid=url.url_uuid,
        scheme=url.scheme,
        netloc=url.netloc,
        path=url.path,
        query=url.query,
        fragment=url.fragment,
    ).on_conflict_do_nothing(
        index_elements=["scheme", "netloc", "path", "query", "fragment"]))
    session.execute(url_stmt)

    return url.url_uuid
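
Because of the on_conflict_do_nothing, repeating the upsert is a no-op; a hedged sketch assuming a SQLAlchemy session bound to the quarchive schema:

# The UUID is derived from the URL itself, so upserting the same string twice
# returns the same value and inserts nothing new the second time.
first_uuid = upsert_url(session, "http://example.com/some-page")
second_uuid = upsert_url(session, "http://example.com/some-page")
assert first_uuid == second_uuid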
Example 25
def extract_canonical_link(root, url: URL) -> Optional[URL]:
    rel_canonicals = root.xpath("//head/link[@rel='canonical']")
    if len(rel_canonicals) > 0:
        if "href" in rel_canonicals[0].attrib:
            href = rel_canonicals[0].attrib["href"]
            try:
                return url.follow(href, coerce_canonicalisation=True)
            except URLException:
                log.debug("bad canonical link: %s (from %s)", href, url)
        else:
            log.debug("canonical link with no href on %s", url)
            return None
    log.debug("no canonical link found for %s", url)
    return None
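
A hedged sketch of the happy path, reusing the lxml parsing style shown in the earlier test example (the page content is hypothetical):

# A relative canonical href is resolved against the page URL.
html = (b"<html><head>"
        b"<link rel='canonical' href='/canonical.html'>"
        b"</head></html>")
root = etree.fromstring(html, etree.HTMLParser())
canonical = extract_canonical_link(
    root, URL.from_string("http://example.com/page"))
assert canonical == URL.from_string("http://example.com/canonical.html")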
Example 26
def test_upsert_metadata_wholly_new(session, mock_s3):
    """Test upsert_metadata called with a wholly new index"""
    url, crawl_req, _ = make_crawl_with_response(session)
    link_1 = URL.from_string("http://example.com/" + random_string() + "/more")
    link_2 = URL.from_string("http://example.com/" + random_string() +
                             "/even-more")
    canon = URL.from_string("http://example.com/" + random_string() + "/index")

    metadata = HTMLMetadata(
        url=url.to_url(),
        # icons=[],  # FIXME: try a page-level icon
        # title="Example page",
        # meta_desc="An example page",
        links={link_1, link_2},
        canonical=canon,
    )
    upsert_metadata(session, crawl_req.crawl_uuid, metadata)

    sqla_url_obj = session.query(SQLAUrl).filter(
        SQLAUrl.url_uuid == url.url_uuid).one()
    link_urls = {o.to_url_obj.to_url() for o in sqla_url_obj.links}
    assert link_urls == {link_1, link_2}
    assert sqla_url_obj.canonical_url_obj.canonical_url_uuid == canon.url_uuid
Example 27
def extract_icons(root, url: URL) -> Sequence[Icon]:
    icon_elements = root.xpath(
        "//head/link[(@rel='icon' or @rel='shortcut icon' or @rel='apple-touch-icon' or @rel='alternate icon')]"
    )
    icons = []
    for icon_element in icon_elements:
        icons.append(
            Icon(
                url=url.follow(icon_element.attrib.get("href"),
                               coerce_canonicalisation=True),
                scope=IconScope.PAGE,
                type=icon_element.attrib.get("type"),
                rel_text=icon_element.attrib["rel"],
                sizes=icon_element.attrib.get("sizes"),
            ))
    return icons
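
Similarly, a hedged sketch for icon extraction with a single page-level icon (hypothetical page content):

html = (b"<html><head>"
        b"<link rel='icon' type='image/png' href='/favicon.png'>"
        b"</head></html>")
root = etree.fromstring(html, etree.HTMLParser())
(icon,) = extract_icons(root, URL.from_string("http://example.com/page"))
assert icon.url == URL.from_string("http://example.com/favicon.png")
assert icon.rel_text == "icon"
assert icon.scope == IconScope.PAGE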
Example 28
def test_simple():
    url = URL.from_string("http://example.com/webpage-with-full-metadata.html")
    with open(
        path.join(test_data_path, "webpage-with-full-metadata.html"), "rb"
    ) as html_f:
        metadata = extract_metadata_from_html(url, html_f)

    text_words = set(WORDS_REGEX.findall(metadata.text))  # type: ignore
    assert "Simple" in text_words
    assert {"This", "is", "a", "basic", "html", "document"} <= text_words

    meta_words = set(WORDS_REGEX.findall(metadata.meta_desc))  # type: ignore
    assert {"some", "meta", "description"} == meta_words

    assert metadata.url == url
    assert set(metadata.icons) == set(
        [
            Icon(
                url=URL.from_string("http://example.com/favicon.png"),
                scope=IconScope.PAGE,
                type="image/png",
                rel_text="icon",
            ),
            Icon(
                url=URL.from_string("http://example.com/favicon-somewhere.ico"),
                scope=IconScope.PAGE,
                rel_text="shortcut icon",
            ),
            Icon(
                url=URL.from_string("http://example.com/apple-touch-icon.png"),
                scope=IconScope.PAGE,
                rel_text="apple-touch-icon",
                sizes="152x152",
            ),
            Icon(
                url=URL.from_string("http://example.com/favicon-alternative.png"),
                scope=IconScope.PAGE,
                rel_text="alternate icon",
            ),
        ]
    )
    assert metadata.canonical == URL.from_string("http://example.com/simple")
    assert metadata.title == "Simple"
    assert metadata.links == {URL.from_string("http://example.com/other")}
    assert metadata.meta_desc == "some meta description"
    assert metadata.headings == {
        "h1": ["This document"],
        "h2": ["Other documents"],
    }
Example 29
def create_bookmark(username: str) -> flask.Response:
    owner = get_user_or_fail(db.session, username)
    # FIXME: sort out optional url_uuid
    require_access_or_fail(
        UserBookmarksAccessObject(user_uuid=owner.user_uuid),
        Access.WRITE,
    )
    form = flask.request.form
    creation_time = datetime.utcnow().replace(tzinfo=timezone.utc)
    tag_triples = tag_triples_from_form(form)

    url_str = form["url"]
    try:
        # As it's a user entering this url, help them along with getting a
        # sufficiently canonicalised url
        url = URL.from_string(url_str, coerce_canonicalisation=True)
    except DisallowedSchemeException:
        log.warning("user tried to create url: %s (disallowed scheme)",
                    url_str)
        flask.abort(400, "invalid url (disallowed scheme)")

    bookmark = Bookmark(
        url=url,
        title=form["title"],
        description=form["description"],
        unread="unread" in form,
        deleted=False,
        updated=creation_time,
        created=creation_time,
        tag_triples=tag_triples,
    )
    url_uuid = set_bookmark(db.session, get_cache(), owner.user_uuid, bookmark)
    db.session.commit()
    publish_message(
        message_lib.BookmarkCreated(user_uuid=owner.user_uuid,
                                    url_uuid=url.url_uuid),
        environ["QM_RABBITMQ_BG_WORKER_TOPIC"],
    )
    flask.flash("Bookmarked: %s" % bookmark.title)
    response = flask.make_response("Redirecting...", 303)
    response.headers["Location"] = flask.url_for(
        "quarchive.edit_bookmark_form",
        url_uuid=url_uuid,
        username=owner.username,
    )
    return response
Example 30
def test_crawl_when_no_response(session, http_client, requests_mock):
    url = URL.from_string("http://example.com/" + random_string())
    upsert_url(session, url)

    requests_mock.add(
        responses.GET,
        url.to_string(),
        body=requests.exceptions.ConnectTimeout("connect timeout"),
    )

    response = crawler.crawl(session, http_client,
                             Request(verb=HTTPVerb.GET, url=url))

    sql_request = session.query(SQLACrawlRequest).get(response.crawl_uuid)
    sql_response = session.query(CrawlResponse).get(response.crawl_uuid)
    assert sql_request is not None
    assert sql_response is None