Ejemplo n.º 1
0
def hn_turn_page(url: URL, response_body: Mapping) -> Optional[URL]:
    """Return the URL of the page after the one in ``response_body``.

    ``response_body`` must provide ``"nbPages"`` and ``"page"``.  Pages are
    treated as zero-based: the last page index is ``nbPages - 1``.

    The next URL is ``url`` with its query string re-encoded and the
    ``page`` parameter set to the following page number.  Note that
    ``parse_qs`` is called with its defaults, so blank-valued query
    parameters are not carried over.

    Returns ``None`` when the current page is already the last one.
    """
    last_index = response_body["nbPages"] - 1
    this_page = response_body["page"]
    if this_page >= last_index:
        return None

    params = parse_qs(url.query)
    # Replaces any existing "page" entry; urlencode stringifies the int.
    params["page"] = this_page + 1
    return url.follow("?" + urlencode(params, doseq=True))
Ejemplo n.º 2
0
def extract_links(root, url: URL) -> Set[URL]:
    """Collect the targets of every ``<a href=...>`` under ``root``.

    Each ``href`` is resolved against ``url`` via ``url.follow`` with
    ``coerce_canonicalisation=True``.  Anchors without an ``href`` are
    ignored; hrefs for which ``url.follow`` raises ``URLException`` are
    logged at debug level and skipped.

    Returns the set of resolved URLs (duplicates collapse).
    """
    found: Set[URL] = set()
    hrefs = (
        a.attrib["href"] for a in root.xpath("//a") if "href" in a.attrib
    )
    for target in hrefs:
        try:
            resolved = url.follow(target, coerce_canonicalisation=True)
        except URLException:
            log.debug("bad link: %s (from: %s)", target, url)
        else:
            found.add(resolved)
    return found
Ejemplo n.º 3
0
def extract_canonical_link(root, url: URL) -> Optional[URL]:
    """Return the page's canonical URL, if it declares a usable one.

    Looks for ``<link rel="canonical">`` elements under ``<head>`` and
    considers only the first match.  Its ``href`` is resolved against
    ``url`` via ``url.follow`` with ``coerce_canonicalisation=True``.

    Returns ``None`` (after a single debug log line explaining why) when
    there is no canonical link, when the link has no ``href``, or when
    ``url.follow`` raises ``URLException`` for the ``href``.
    """
    rel_canonicals = root.xpath("//head/link[@rel='canonical']")
    if not rel_canonicals:
        log.debug("no canonical link found for %s", url)
        return None

    first = rel_canonicals[0]
    if "href" not in first.attrib:
        log.debug("canonical link with no href on %s", url)
        return None

    href = first.attrib["href"]
    try:
        return url.follow(href, coerce_canonicalisation=True)
    except URLException:
        log.debug("bad canonical link: %s (from %s)", href, url)
        # Return here so we do not additionally (and misleadingly) report
        # that no canonical link was found.
        return None
Ejemplo n.º 4
0
def extract_icons(root, url: URL) -> Sequence[Icon]:
    """Build an ``Icon`` for each icon ``<link>`` declared in ``<head>``.

    Matches ``<link>`` elements whose ``rel`` is exactly ``icon``,
    ``shortcut icon``, ``apple-touch-icon`` or ``alternate icon``.  Each
    ``href`` is resolved against ``url`` via ``url.follow`` with
    ``coerce_canonicalisation=True``; ``type`` and ``sizes`` are passed
    through as found (``None`` when absent).  All icons are given
    ``IconScope.PAGE``.

    Elements without an ``href`` are skipped, matching how the other
    extractors treat href-less elements, rather than passing ``None`` to
    ``url.follow``.

    Returns the icons in document order as returned by ``root.xpath``.
    """
    icon_elements = root.xpath(
        "//head/link[(@rel='icon' or @rel='shortcut icon' or @rel='apple-touch-icon' or @rel='alternate icon')]"
    )
    icons = []
    for icon_element in icon_elements:
        attrib = icon_element.attrib
        href = attrib.get("href")
        if href is None:
            # An icon link with no target cannot be resolved to a URL.
            continue
        # NOTE(review): unlike the link extractors, a URLException raised by
        # url.follow is not caught here and will propagate - confirm that is
        # intended.
        icons.append(
            Icon(
                url=url.follow(href, coerce_canonicalisation=True),
                scope=IconScope.PAGE,
                type=attrib.get("type"),
                rel_text=attrib["rel"],
                sizes=attrib.get("sizes"),
            )
        )
    return icons