def test_determine_base_url(html, url, expected):
    """Check that _determine_base_url extracts the expected base URL."""
    doc = html5lib.parse(
        html,
        transport_encoding=None,
        namespaceHTMLElements=False,
    )
    assert _determine_base_url(doc, url) == expected
def parse_links(
    html,      # type: bytes
    encoding,  # type: Optional[str]
    url,       # type: str
):
    # type: (...) -> Iterable[Link]
    """
    Parse an HTML document, and yield its anchor elements as Link objects.

    :param url: the URL from which the HTML was downloaded.
    """
    document = html5lib.parse(
        html,
        transport_encoding=encoding,
        namespaceHTMLElements=False,
    )
    base_url = _determine_base_url(document, url)
    # Every <a> element is a candidate link; the helper returns None for
    # anchors that do not resolve to a usable Link.
    for anchor in document.findall(".//a"):
        link = _create_link_from_element(anchor, page_url=url, base_url=base_url)
        if link is not None:
            yield link
def test_parse_links_caches_same_page():
    """Two pages with identical content/encoding/url must hit the parse cache.

    Bug fix: the original bound the patcher and the mock to the same name
    (``with mock_parse as mock_parse``).  After the first ``with`` exited,
    the name referred to the MagicMock, so the second ``with`` merely
    entered the MagicMock's context-manager protocol — it did NOT re-patch
    ``html5lib.parse`` — and ``assert_not_called()`` ran against a fresh
    child mock, verifying nothing.  Patch independently in each block.
    """
    html = (
        # Mark this as a unicode string for Python 2 since anchor_html
        # can contain non-ascii.
        u'<html><head><meta charset="utf-8"><head>'
        '<body><a href="/pkg1-1.0.tar.gz"></a></body></html>'
    )
    html_bytes = html.encode('utf-8')
    page_1 = HTMLPage(
        html_bytes,
        encoding=None,
        url='https://example.com/simple/',
    )
    page_2 = HTMLPage(
        html_bytes,
        encoding=None,
        url='https://example.com/simple/',
    )

    with mock.patch("pip._internal.index.collector.html5lib.parse") as mock_parse:
        mock_parse.return_value = html5lib.parse(
            page_1.content,
            transport_encoding=page_1.encoding,
            namespaceHTMLElements=False,
        )
        parsed_links_1 = list(parse_links(page_1))
        mock_parse.assert_called()

    with mock.patch("pip._internal.index.collector.html5lib.parse") as mock_parse:
        parsed_links_2 = list(parse_links(page_2))
        assert parsed_links_2 == parsed_links_1
        # Cached result: html5lib.parse must not run a second time.
        mock_parse.assert_not_called()
def __init__(self, content, url, headers=None):
    """Store a fetched index page and parse it into a DOM.

    :param content: raw response body (bytes).
    :param url: the URL the content was downloaded from.
    :param headers: optional response headers; a ``charset`` in the
        Content-Type header is forwarded to the HTML parser.
    """
    # Determine if we have any encoding information in our headers
    encoding = None
    if headers and "Content-Type" in headers:
        content_type, params = cgi.parse_header(headers["Content-Type"])
        if "charset" in params:
            encoding = params["charset"]

    self.content = content
    # html5lib renamed its ``encoding`` keyword to ``transport_encoding``;
    # the old name is rejected by current html5lib releases and every other
    # parse() call in this file already uses the new keyword.
    self.parsed = html5lib.parse(
        self.content,
        transport_encoding=encoding,
        namespaceHTMLElements=False,
    )
    self.url = url
    self.headers = headers
def iter_links(self):
    """Yields all links in the page"""
    document = html5lib.parse(
        self.content,
        transport_encoding=_get_encoding_from_headers(self.headers),
        namespaceHTMLElements=False,
    )
    base_url = _determine_base_url(document, self.url)
    for anchor in document.findall(".//a"):
        href = anchor.get("href")
        if not href:
            # Anchors without an href cannot produce a link.
            continue
        url = _clean_link(urllib_parse.urljoin(base_url, href))
        pyrequire = anchor.get('data-requires-python')
        if pyrequire:
            pyrequire = unescape(pyrequire)
        else:
            pyrequire = None
        yield Link(url, self.url, requires_python=pyrequire)
def __init__(self, content, url, headers=None):
    """Capture a fetched page and parse it, honouring any declared charset."""
    # Pull a charset out of the Content-Type header, if one was sent.
    encoding = None
    if headers and "Content-Type" in headers:
        _, params = cgi.parse_header(headers["Content-Type"])
        encoding = params.get("charset")

    self.content = content
    self.parsed = html5lib.parse(
        self.content,
        transport_encoding=encoding,
        namespaceHTMLElements=False,
    )
    self.url = url
    self.headers = headers
def parse_links(page):
    # type: (HTMLPage) -> Iterable[Link]
    """
    Parse an HTML document, and yield its anchor elements as Link objects.
    """
    page_url = page.url
    document = html5lib.parse(
        page.content,
        transport_encoding=page.encoding,
        namespaceHTMLElements=False,
    )
    base = _determine_base_url(document, page_url)
    for anchor in document.findall(".//a"):
        link = _create_link_from_element(anchor, page_url=page_url, base_url=base)
        if link is not None:
            yield link
def iter_links(self):
    # type: () -> Iterable[Link]
    """Yields all links in the page"""
    doc = html5lib.parse(
        self.content,
        transport_encoding=_get_encoding_from_headers(self.headers),
        namespaceHTMLElements=False,
    )
    base = _determine_base_url(doc, self.url)
    for anchor in doc.findall(".//a"):
        link = _create_link_from_element(
            anchor,
            page_url=self.url,
            base_url=base,
        )
        if link is not None:
            yield link
def search(self, query: str) -> SearchResult:
    """Search the configured index's ``/search`` endpoint for *query*.

    Falls back to ``DEFAULT_INDEX_URL`` when the configured index returns
    404 for the search endpoint.

    :param query: free-text search string.
    :return: list of ``Package(name, version, description)`` results.
    :raises requests.HTTPError: when the (fallback) search request fails.
    """
    pypi_simple = self.sources[0]["url"].rstrip("/")

    # ".../simple" -> ".../search" (the trailing slash survives the slice).
    if pypi_simple.endswith("/simple"):
        search_url = pypi_simple[:-6] + "search"
    else:
        search_url = pypi_simple + "/search"

    with self.environment.get_finder() as finder:
        session = finder.session
        resp = session.get(search_url, params={"q": query})
        if resp.status_code == 404:
            self.environment.project.core.ui.echo(
                termui.yellow(
                    f"{pypi_simple!r} doesn't support '/search' endpoint, fallback "
                    f"to {self.DEFAULT_INDEX_URL!r} now.\n"
                    "This may take longer depending on your network condition."
                ),
                err=True,
            )
            resp = session.get(
                f"{self.DEFAULT_INDEX_URL}/search", params={"q": query}
            )
        resp.raise_for_status()

    content = parse(resp.content, namespaceHTMLElements=False)
    results = []
    for snippet in content.findall(".//*[@class='package-snippet']"):
        # Guard against markup changes: find() returns None when an
        # expected element is missing, and the original code would then
        # raise AttributeError on ``.text``.  Skip incomplete snippets.
        name_elem = snippet.find("h3/*[@class='package-snippet__name']")
        version_elem = snippet.find("h3/*[@class='package-snippet__version']")
        if name_elem is None or version_elem is None:
            continue
        name = name_elem.text
        version = version_elem.text
        if not name or not version:
            continue
        desc_elem = snippet.find("p[@class='package-snippet__description']")
        description = desc_elem.text if desc_elem is not None else None
        if not description:
            description = ""
        results.append(Package(name, version, description))
    return results
def _parse_links_html5lib(page: "HTMLPage") -> Iterable[Link]:
    """
    Parse an HTML document, and yield its anchor elements as Link objects.

    TODO: Remove when `html5lib` is dropped.
    """
    document = html5lib.parse(
        page.content,
        transport_encoding=page.encoding,
        namespaceHTMLElements=False,
    )
    page_url = page.url
    base_url = _determine_base_url(document, page_url)
    for anchor in document.findall(".//a"):
        # Note: the attribute mapping, not the element, is handed over here.
        link = _create_link_from_element(
            anchor.attrib,
            page_url=page_url,
            base_url=base_url,
        )
        if link is not None:
            yield link
def __init__(self, content, url, headers=None, trusted=None):
    """Capture the raw page plus request metadata, and parse the DOM."""
    self.url = url
    self.headers = headers
    self.trusted = trusted
    self.content = content
    # Parse without XHTML namespacing so lookups can use plain tag names.
    self.parsed = html5lib.parse(self.content, namespaceHTMLElements=False)
return wrapper_wrapper @with_cached_html_pages def parse_links(page): # type: (HTMLPage) -> Iterable[Link] """ Parse an HTML document, and yield its anchor elements as Link objects. """ <<<<<<< HEAD document = html5lib.bbc_parse(page.content) ======= document = html5lib.parse( page.content, transport_encoding=page.encoding, namespaceHTMLElements=False, ) >>>>>>> 241b678... create predictions url = page.url base_url = _determine_base_url(document, url) for anchor in document.findall(".//a"): link = _create_link_from_element( anchor, page_url=url, base_url=base_url, ) if link is None: continue yield link