Example #1
    def pre_parse(self):

        http_content_type = self.response.headers.get('content-type', '')
        target = HTMLEncodings(http_content_type)
        # parser will fail on non-ascii unless we set it explicitly
        parser = HTMLParser(target=target, encoding='ISO-8859-1')
        total_bytes = 0

        while target:
            chunk = self.response.read(PRE_PARSE_CHUNK_SIZE)
            if not chunk:
                try:
                    parser.close()
                except XMLSyntaxError:
                    pass
                break

            if self.bom is None:
                assert PRE_PARSE_CHUNK_SIZE >= 4
                self.bom = b''
                for i in range(4, 1, -1):
                    if chunk[:i] in BOM_ENC:
                        self.bom = chunk[:i]
                        target.encodings.append(('bom', BOM_ENC[self.bom]))
                        # there can only be one BOM - stop here
                        break

            parser.feed(chunk)
            total_bytes += len(chunk)
            if total_bytes >= MAX_PRE_PARSE_BYTES:
                break

        return target.encodings
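The BOM lookup above relies on BOM_ENC and PRE_PARSE_CHUNK_SIZE defined elsewhere in the module. A minimal, self-contained sketch of the same sniffing idea, with an assumed BOM table built from the codecs constants (not necessarily the module's own table):

# Hypothetical BOM table and helper illustrating the lookup above.
import codecs

BOM_ENC = {
    codecs.BOM_UTF32_LE: 'utf-32-le',
    codecs.BOM_UTF32_BE: 'utf-32-be',
    codecs.BOM_UTF8: 'utf-8-sig',
    codecs.BOM_UTF16_LE: 'utf-16-le',
    codecs.BOM_UTF16_BE: 'utf-16-be',
}

def sniff_bom(chunk):
    # Check the longest prefixes first: the UTF-32 little-endian BOM begins
    # with the UTF-16 one, so a 4-byte match must win over a 2-byte match.
    for i in range(4, 1, -1):
        if chunk[:i] in BOM_ENC:
            return chunk[:i], BOM_ENC[chunk[:i]]
    return b'', None

sniff_bom(b'\xef\xbb\xbf<html>')  # -> (b'\xef\xbb\xbf', 'utf-8-sig')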
Example #2
def parse_html(html, encoding="utf8"):
    # Python 2 snippet: ``unicode`` is the builtin text type; etree,
    # HTMLParser and soupparser come from lxml imports elsewhere in the file.
    if not html:
        return html
    if not isinstance(html, unicode):
        html = html.decode(encoding)
    try:
        # strict XML parse first
        html_obj = etree.XML(html)
    except Exception:
        try:
            # feed-parser interface: close() returns the root element
            parser = HTMLParser()
            parser.feed(html)
            html_obj = parser.close()
        except Exception:
            try:
                html_obj = etree.HTML(html)
            except Exception:
                # last resort: BeautifulSoup-based parsing
                html_obj = soupparser.fromstring(html)
    return html_obj
Example #3
 def _slice(resp: str, index: int = 1) -> GoogleResponse:
     utf8_parser = HTMLParser(encoding="utf-8")
     d = PyQuery(fromstring(resp, parser=utf8_parser))
     data = d.find(".g")
     pages = list(d.find("td").items())[1:-1]
     return GoogleResponse(data, pages, index)
Example #4
    .replace(' \u2022 ', '\xa0\u2022 ')  # add nbsp before bullet
    .replace('\xad', '')  # remove soft hyphen
)

drop_a = lambda el: [
    a.drop_tag() for a in el.xpath("descendant::a|descendant::img")
]


def remove_attr(el):
    # copy the keys first; deleting while iterating el.attrib directly is fragile
    for key in list(el.attrib):
        del el.attrib[key]


parser = HTMLParser(encoding='cp1251',
                    remove_blank_text=True,
                    remove_comments=True)


def books_from_category(cat, catalog):
    a = catalog.xpath("//a[@name='{}']".format(cat['slug']))[0]  # noqa
    head_tr = a.xpath('ancestor::tr[1]')[0]

    next_head_tr = head_tr.xpath(
        "following-sibling::tr[@bgcolor='#333399'][1]")
    if len(next_head_tr) > 0:
        next_head_tr = next_head_tr[0]
        return intersection(
            catalog,  # noqa
            set1=head_tr.xpath("following-sibling::tr[descendant::a]"),
            set2=next_head_tr.xpath("preceding-sibling::tr[descendant::a]"))
Example #5
import logging
from StringIO import StringIO
from lxml.html import HTMLParser, parse
from scrapy.http import HtmlResponse

logger = logging.getLogger()

_HTML_PARSER = HTMLParser(encoding='utf8')


class HtmlParser(object):
    def __call__(self, response):
        if not isinstance(response, HtmlResponse):
            return
        response.html = parse(
            StringIO(response.body_as_unicode().encode('utf8')), _HTML_PARSER)
Example #6
from ._compat import (
    to_bytes,
    to_unicode,
    unicode,
    unicode_compatible,
)
from .utils import (
    cached_property,
    ignored,
)

logger = logging.getLogger("breadability")

TAG_MARK_PATTERN = re.compile(to_bytes(r"</?[^>]*>\s*"))
UTF8_PARSER = HTMLParser(encoding="utf8")
CHARSET_META_TAG_PATTERN = re.compile(
    br"""<meta[^>]+charset=["']?([^'"/>\s]+)""", re.IGNORECASE)


def decode_html(html):
    """
    Converts bytes stream containing an HTML page into Unicode.
    Tries to guess the character encoding from the meta tag or by the "chardet" library.
    """
    if isinstance(html, unicode):
        return html

    match = CHARSET_META_TAG_PATTERN.search(html)
    if match:
        declared_encoding = match.group(1).decode("ASCII")
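The example is cut off here, but the docstring spells out the approach: prefer a charset declared in a <meta> tag, otherwise fall back to a detector such as chardet. A standalone, hedged sketch of that technique (an illustration, not breadability's actual continuation):

# Sketch of meta-charset detection with an optional chardet fallback.
import re

try:
    import chardet  # optional detector, used only if installed
except ImportError:
    chardet = None

CHARSET_RE = re.compile(br"""<meta[^>]+charset=["']?([^'"/>\s]+)""", re.IGNORECASE)

def guess_decode(html_bytes, default="utf-8"):
    match = CHARSET_RE.search(html_bytes)
    if match:
        try:
            return html_bytes.decode(match.group(1).decode("ascii"), "replace")
        except LookupError:
            pass  # the page declared a codec name Python does not know
    if chardet is not None:
        guessed = chardet.detect(html_bytes).get("encoding")
        if guessed:
            return html_bytes.decode(guessed, "replace")
    return html_bytes.decode(default, "replace")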
Example #7
import requests

url = 'http://www.safekorea.go.kr/idsiSFK/neo/sfk/cs/sfc/dis/disasterMsgView.jsp?menuSeq=679'
response = requests.get(url).content  # keep raw bytes so the parser's explicit encoding applies

from lxml.html import fromstring, tostring, HTMLParser
lxml_response = fromstring(response, parser=HTMLParser(encoding="utf-8"))
print(lxml_response)
lxml_table = lxml_response.xpath('//*[@id="cn"]/text()[1]')
print(lxml_table)
Example #8
def _make_etree(html, url, charset="UTF-8", absolute_links=True):
    from lxml.html import HTMLParser, document_fromstring
    parser = HTMLParser(encoding=charset)  # honour the charset argument
    root = document_fromstring(html, parser=parser, base_url=url)
    x = fetch_images(root)
Example #9
def make_etree(html, url):
    parser = HTMLParser(encoding='UTF-8')
    root = document_fromstring(html, parser=parser, base_url=url)
    return root
Example #10
def the_aaas_tango(pma, verify=True):
    '''Note: "verify" param recommended here (page navigation usually necessary).
    
         :param: pma (PubMedArticle object)
         :param: verify (bool) [default: True]
         :return: url (string)
         :raises: AccessDenied, NoPDFLink
    '''
    #try:
    #    pma = rectify_pma_for_vip_links(pma)
    #    pdfurl = aaas_format.format(ja=aaas_journals[pma.journal]['ja'], a=pma)
    #except NoPDFLink:
    # try the pmid-based approach
    baseurl = 'http://www.sciencemag.org/cgi/pmidlookup?view=long&pmid=%s' % pma.pmid
    res = requests.get(baseurl)
    pdfurl = res.url.replace('.long', '.full') + '.pdf'

    if not verify:
        return pdfurl

    response = requests.get(pdfurl)
    if response.status_code == 200 and response.headers['content-type'].find(
            'pdf') > -1:
        return response.url

    elif response.status_code == 200 and response.headers['content-type'].find(
            'html') > -1:
        tree = etree.fromstring(response.content, HTMLParser())

        if not tree.find('head/title').text.find('Sign In') > -1:
            raise NoPDFLink(
                'TXERROR: AAAS returned unexpected HTML response for url %s' %
                (pdfurl))
        elif not (AAAS_USERNAME and AAAS_PASSWORD):
            # some items are acquirable via free account registration... but
            # without credentials we can't go any further here.
            raise NoPDFLink(
                'DENIED: AAAS paper subscription-only or requires site registration (url: %s)'
                % pdfurl)

        # a sign-in page came back and credentials are configured: attempt login

        form = tree.cssselect('form')[0]
        fbi = form.fields.get('form_build_id')

        baseurl = urlsplit(response.url)
        post_url = baseurl.scheme + '://' + baseurl.hostname + form.action

        payload = {
            'pass': AAAS_PASSWORD,
            'name': AAAS_USERNAME,
            'form_build_id': fbi,
            'remember_me': 1
        }
        print("SUBMITTING TO AAAS")
        print(payload)

        response = requests.post(post_url, data=payload)
        if response.status_code == 403:
            raise AccessDenied(
                'DENIED: AAAS subscription-only paper (url: %s)' % pdfurl)
        elif response.headers['content-type'].find('pdf') > -1:
            return response.url
        elif response.headers['content-type'].find('html') > -1:
            #if response.content.find('access-denied') > -1:
            raise NoPDFLink('DENIED: AAAS subscription-only paper (url: %s)' %
                            pdfurl)
    else:
        raise NoPDFLink('TXERROR: AAAS returned %s for url %s' %
                        (response.status_code, pdfurl))
Example #11
    def parse(self, parser=None, base_url=None):
        """Parses the underlying html source using `lxml` library.

        This parsed tree is stored in the :attr:`root` attribute of this
        object, which can be used to perform numerous operations.

        Returns
        -------
            ElementTree
        """
        utx = self._get_utx()

        assert utx is not None, "UrlTransformer not Implemented."  # internal error
        assert utx.base_path is not None, "Base Path is not set!"
        assert utx.base_url is not None, "Base url is not Set!"
        if parser is not None and not isinstance(parser, HTMLParser):
            raise TypeError("Expected instance of <%r>, got <%r>" %
                            (HTMLParser, parser))

        if not parser:
            parser = HTMLParser(encoding=self.encoding, collect_ids=False)

        source = self.get_source()

        assert source is not None, "Source is not Set!"
        assert hasattr(source, 'read'), "File like object is required!"
        # assert self._element_factory is not None
        # assert hasattr(self._element_factory, 'make_element')
        LOGGER.info(
            'Parsing tree with source: <%r> encoding <%s> and parser <%r>' %
            (self._source, self.encoding, parser))

        context_tree = lxml_parse(source, parser=parser, base_url=base_url)
        # The tree generated by the parse is stored in the self.root
        # variable and can be utilised further for any number of use cases
        self._tree = context_tree
        self.root = context_tree.getroot()

        # if self.root is not None:
        #     # WaterMarking :)
        #     self.root.insert(0, Comment(MARK.format('', __version__, utx.url, utc_now(), '')))

        # There are internal links present on the html page, such as links
        # containing `#`, `javascript:` or `data:base64;` type urls,
        # or a simple `/` url referring to an anchor tag;
        # these links need to be left as they are.
        factory = getattr(self, 'make_element', None)
        assert callable(factory), "Element generator is not callable!"

        # Modify the tree elements
        for el in context_tree.iter():
            # An element can contain multiple urls
            for pack in self._handle_lxml_elem(el):

                if pack is not None:
                    elem, attr, url, pos = pack
                else:  # pragma: no cover
                    continue

                if elem is not None:
                    o = factory(elem, attr, url, pos)
                    if o is not None:
                        self._stack.append(o)

        self._parseComplete = True
        return self.root
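The element loop above consumes (element, attribute, url, position) packs produced by a private helper; lxml.html exposes the same shape directly through iterlinks(). A small self-contained sketch of that underlying pattern, with illustrative names only:

# Parse with an explicit HTMLParser and collect every link the tree contains.
from io import BytesIO
from lxml.html import HTMLParser, parse as lxml_parse

def collect_links(html_bytes, base_url=None, encoding='utf-8'):
    parser = HTMLParser(encoding=encoding, collect_ids=False)
    tree = lxml_parse(BytesIO(html_bytes), parser=parser, base_url=base_url)
    root = tree.getroot()
    # iterlinks() yields (element, attribute, link, pos) for every URL found.
    return [(el.tag, attr, url) for el, attr, url, pos in root.iterlinks()]

collect_links(b'<html><body><a href="/about">About</a></body></html>',
              base_url='https://example.com/')  # -> [('a', 'href', '/about')]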
Example #12
def _cleaned_html_tree(html: str) -> HtmlElement:
    parser = HTMLParser(encoding='utf8')
    tree = fromstring(html.encode('utf8'), parser=parser)
    return _clean_html(tree)
Example #13
    def search_filings(self):
        search_form = {'datePostedStart': datetime.strftime(self.start_date,
                                                            '%m/%d/%Y'),
                       'datePostedEnd': datetime.strftime(self.end_date,
                                                          '%m/%d/%Y')}

        _search_url_rgx = re.compile(r"window\.open\('(.*?)'\)",
                                     re.IGNORECASE)

        search_params = {'event': 'processSearchCriteria'}

        for filing_type in self.filing_types:
            _form = search_form.copy()
            _form['reportType'] = filing_type['code']
            _form.update(search_params)
            self.debug('making request with {f}'.format(f=_form))
            _, response = self.urlretrieve(
                self.base_url,
                method='POST',
                body=_form,
            )
            d = etree.fromstring(response.text, parser=HTMLParser())
            results = d.xpath('//*[@id="searchResults"]/tbody/tr')

            if len(results) >= 3000:
                error_msg = "More than 3000 results for params:\n{}".format(
                            json.dumps(search_params, indent=2))
                raise Exception(error_msg)

            # we're going to skip duplicate submissions on the same day
            results_seen = []

            for result in results:
                filing_type = result.xpath('td[3]')[0].text
                registrant_name = result.xpath('td[1]')[0].text
                client_name = result.xpath('td[2]')[0].text
                filing_date = result.xpath('td[5]')[0].text

                # this is how we define duplicates
                result_key = (registrant_name, client_name, filing_type, filing_date)
                if result_key not in results_seen:
                    try:
                        m = re.search(_search_url_rgx, result.attrib['onclick'])
                    except KeyError:
                        self.error('element {} has no onclick attribute'.format(
                            etree.tostring(result)))
                        continue
                    try:
                        _doc_path = m.groups()[0]
                    except AttributeError:
                        self.error('no matches found for search_rgx')
                        self.debug('\n{r}\n{a}\n{u}'.format(
                            r=_search_url_rgx.pattern,
                            a=result.attrib['onclick'],
                            u=response.request.url
                        ))
                        continue
                    _params = dict(parse_qsl(
                                   urlparse(_doc_path).query))
                    if _params:
                        results_seen.append(result_key)
                        yield _params
                    else:
                        self.error('unable to parse {}'.format(
                            etree.tostring(result)))
                else:
                    continue
Example #14
def description_checks():
    args = parser.parse_args()

    # set up a basic logging config
    logger = logging.getLogger()
    if args.verbose == 1:
        logger.setLevel(logging.INFO)
    elif args.verbose >= 2:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.ERROR)

    files_to_check = []
    for f in args.file:
        if os.path.basename(f).startswith('DESCRIPTION.'):
            files_to_check.append(f)
        else:
            fb.error("'{}' is not a DESCRIPTION file.".format(f))
            continue

    if len(files_to_check) == 0:
        fb.error("None of the specified files "
                 "seem to be valid DESCRIPTION files.")
        exit(-1)

    for f in files_to_check:
        try:
            contents = open(f).read()
        except (IOError, OSError):
            fb.error("File '{}' does not exist.".format(f))
            continue

# ---------------------------------------------------------------------
        fb.new_check("Does DESCRIPTION file contain broken links ?")
        doc = defusedxml.lxml.fromstring(contents, parser=HTMLParser())
        broken_links = []
        for link in doc.xpath('//a/@href'):
            try:
                response = requests.head(link)
                if response.status_code != requests.codes.ok:
                    broken_links.append(link)
            except requests.exceptions.RequestException:
                broken_links.append(link)

        if len(broken_links) > 0:
            fb.error(("The following links are broken"
                      " in the DESCRIPTION file:"
                      " '{}'").format("', '".join(broken_links)))
        else:
            fb.ok("All links in the DESCRIPTION file look good!")

# ---------------------------------------------------------------------
        fb.new_check("Is this a propper HTML snippet ?")
        contenttype = magic.from_file(f)
        if "HTML" not in contenttype:
            data = open(f).read()
            if "<p>" in data and "</p>" in data:
              fb.ok(("{} is a propper"
                     " HTML snippet.").format(f))
            else:
              fb.error(("{} is not a propper"
                        " HTML snippet.").format(f))
        else:
            fb.ok("{} is a propper HTML file.".format(f))

# ---------------------------------------------------------------------
        fb.new_check("DESCRIPTION.en_us.html is more than 200 bytes ?")
        statinfo = os.stat(f)
        if statinfo.st_size <= 200:
            fb.error("{} must have size larger than 200 bytes".format(f))
        else:
            fb.ok("{} is larger than 200 bytes".format(f))

# ---------------------------------------------------------------------
        fb.new_check("DESCRIPTION.en_us.html is less than 1000 bytes ?")
        statinfo = os.stat(f)
        if statinfo.st_size >= 1000:
            fb.error("{} must have size smaller than 1000 bytes".format(f))
        else:
            fb.ok("{} is smaller than 1000 bytes".format(f))
        fb.save_json_report()
Example #15
def parse_from_string(s):
    parser = HTMLParser()
    tree = etree.fromstring(s, parser=parser)
    return parse_tree(tree)
Example #16
def parse_file(fn):
    parser = HTMLParser()
    tree = etree.parse(fn, parser=parser).getroot()
    return parse_tree(tree)
Example #17
from lxml.html import parse, HTMLParser
import scraperwiki

parser = HTMLParser(encoding='utf-8')

page = parse('http://www.sescsp.org.br/sesc/programa_new/busca.cfm',
             parser).getroot()

for box in page.cssselect('#box'):
    if box.cssselect('.tit a'):
        evento = {
            'title': box.cssselect('.tit a')[0].text_content(),
            'url': box.cssselect('.tit a')[0].get('href')
        }
        scraperwiki.sqlite.save(['url'], evento)
Example #18
 def _slice(resp: str) -> IqdbResponse:
     utf8_parser = HTMLParser(encoding="utf-8")
     d = PyQuery(fromstring(resp, parser=utf8_parser))
     return IqdbResponse(d)
Example #19
 def _make_tree(self, fstring):
     root = etree.fromstring(
         fstring, parser=HTMLParser(encoding=get_encoding(fstring)))
     return root
Example #20
    def scrape_super_pacs(self):
        def reformat_date(datestring):
            dt = datetime.strptime(datestring, '%m/%d/%Y')
            return dt.strftime('%Y-%m-%d')

        def find_the_table(html_document):
            return html_document.xpath(
                '//*[@id="ctl00_ctl00_MainPanel"]/table')[0]

        def separate_name_and_address(cell):
            name = cell.text
            address = ', '.join([br.tail for br in cell.xpath('br')])
            return name, address

        def scrape_table(table_element):
            scraped_rows = []
            for row in table_element.xpath('tr'):
                _data = {}
                columns = row.xpath('td')
                if len(columns) == 5:
                    _data['org_id'] = columns[0].text_content()
                    _name, _address = separate_name_and_address(columns[1])
                    _data['org_name'] = _name
                    _data['org_address'] = _address
                    _data['org_phone'] = columns[2].text_content()
                    _data['org_begin_date'] = reformat_date(
                        columns[3].text_content())
                    _data['org_end_date'] = reformat_date(
                        columns[4].text_content())
                    scraped_rows.append(_data)
            return scraped_rows

        PAC_LIST_URL = "http://apps.azsos.gov/apps/election/cfs/search/SuperPACList.aspx"

        tmp, resp = self.urlretrieve(PAC_LIST_URL)

        html_document = etree.fromstring(resp.content, parser=HTMLParser())

        target_table = find_the_table(html_document)

        results = scrape_table(target_table)

        for result in results:

            _org = Organization(name=result['org_name'],
                                classification='political action committee',
                                founding_date=result['org_begin_date'],
                                dissolution_date=result['org_end_date'])

            _org.add_identifier(identifier=result['org_id'],
                                scheme='urn:az-state:committee')

            _org.add_contact_detail(type='address',
                                    value=result['org_address'])

            _org.add_contact_detail(type='voice', value=result['org_phone'])

            _org.add_source(url=PAC_LIST_URL)

            _org.source_identified = True

            yield _org
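separate_name_and_address leans on the lxml convention that text following a <br> lands in that element's .tail. A quick standalone illustration (the sample markup is invented):

# The <br>-tail trick used by separate_name_and_address above.
from lxml.html import fromstring

table = fromstring('<table><tr><td>Acme PAC<br>123 Main St<br>Phoenix, AZ</td></tr></table>')
cell = table.xpath('//td')[0]
name = cell.text                                   # 'Acme PAC'
address = ', '.join(br.tail for br in cell.xpath('br'))
print(name, '|', address)                          # Acme PAC | 123 Main St, Phoenix, AZ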
Example #21
import codecs
import os.path
import re
from datetime import datetime

import js2py
import pyquery
import requests
from lxml.html import HTMLParser, fromstring
from tzlocal import get_localzone

from libs.comic_downloader import Host
from .ComicInfo import ComicInfo
from .StreamLine import *

UTF8_PARSER = HTMLParser(encoding='utf-8')


def reverse(queue):
    left = 0
    right = queue_length(queue) - 1

    while left < right:
        swap(queue, left, right)
        left += 1
        right -= 1


class DmedenHost(Host):
    LOCAL_TZ = get_localzone()
    BASE_URL = "http://www.dmeden.com/"
Example #22
    def append_html_table(self, html_string, start_row=1, start_col=1):
        html_string = document_fromstring(html_string,
                                          HTMLParser(encoding='utf8'))

        last_row = start_row - 1
        last_col = start_col

        for table_el in html_string.xpath('//table'):
            last_row += 1

            for row_i, row in enumerate(table_el.xpath('./tr'),
                                        start=last_row):
                for col_i, col in enumerate(row.xpath('./td|./th'),
                                            start=last_col):
                    colspan = int(col.get('colspan', 0))
                    rowspan = int(col.get('rowspan', 0))

                    font_bold = False
                    font_size = 11
                    font_color = BLACK_COLOR

                    if rowspan:
                        rowspan -= 1
                    if colspan:
                        colspan -= 1

                    col_data = col.text_content().encode("utf8")

                    valign = 'center' if col_i == start_col and col.tag != 'th' else 'top'

                    while (row_i, col_i) in self.list:
                        col_i += 1

                    cell = self.worksheet.cell(row=row_i, column=col_i)
                    if rowspan or colspan:
                        self.worksheet.merge_cells(start_row=row_i,
                                                   end_row=row_i + rowspan,
                                                   start_column=col_i,
                                                   end_column=col_i + colspan)
                    cell.value = col_data
                    cell.alignment = Alignment(
                        horizontal=row.get('align', col.get('align'))
                        or 'left',
                        vertical=row.get('valign', col.get('valign'))
                        or valign,
                        shrink_to_fit=True,
                        wrap_text=True)

                    bgcolor = row.get('bgcolor', col.get('bgcolor'))

                    if bgcolor:
                        cell.fill = PatternFill(fill_type='solid',
                                                start_color=bgcolor,
                                                end_color=bgcolor)

                    for el in col.iter():
                        if el.tag == 'font':
                            font_color = el.get('color')
                        elif el.tag == 'b':
                            font_bold = True
                        elif el.tag in _TEXT_SIZE:
                            font_bold = True
                            font_size = _TEXT_SIZE.get(el.tag)

                    cell.font = Font(
                        color=font_color,
                        bold=font_bold,
                        size=font_size,
                    )

                    if col.tag == 'th':
                        cell.font = Font(bold=True)
                        cell.fill = PatternFill(fill_type='solid',
                                                start_color=TH_COLOR,
                                                end_color=TH_COLOR)

                    for i in range(0, rowspan + 1, 1):
                        for j in range(0, colspan + 1, 1):
                            if i == rowspan:
                                last_row = row_i + i
                            self.list.append((row_i + i, col_i + j))
                            cell = self.worksheet.cell(row=row_i + i,
                                                       column=col_i + j)
                            cell.border = Border(
                                left=Side(border_style=_BORDER_STYLE.get(
                                    table_el.get('border') or None),
                                          color=BORDER_COLOR),
                                right=Side(border_style=_BORDER_STYLE.get(
                                    table_el.get('border') or None),
                                           color=BORDER_COLOR),
                                top=Side(border_style=_BORDER_STYLE.get(
                                    table_el.get('border') or None),
                                         color=BORDER_COLOR),
                                bottom=Side(border_style=_BORDER_STYLE.get(
                                    table_el.get('border') or None),
                                            color=BORDER_COLOR),
                            )
        return last_row, last_col
Example #23
from .settings import MAX_FILE_SIZE, MIN_FILE_SIZE

LOGGER = logging.getLogger(__name__)

UNICODE_ALIASES = {'utf-8', 'utf_8'}

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
RETRY_STRATEGY = urllib3.util.Retry(
    total=3,
    connect=0,
    status_forcelist=[429, 500, 502, 503, 504],
)
HTTP_POOL = urllib3.PoolManager(retries=RETRY_STRATEGY)

HTML_PARSER = HTMLParser(collect_ids=False,
                         default_doctype=False,
                         encoding='utf-8',
                         remove_pis=True)


def isutf8(data):
    """Simple heuristic to determine if a bytestring uses standard unicode encoding"""
    try:
        data.decode('UTF-8')
    except UnicodeDecodeError:
        return False
    else:
        return True


def detect_encoding(bytesobject):
    """Read all input or first chunk and return a list of encodings"""
Example #24
def make_etree(html, url):
    from lxml.html import HTMLParser, document_fromstring
    parser = HTMLParser(encoding="UTF-8")
    root = document_fromstring(html, parser=parser, base_url=url)
    return root
Example #25
 def _make_tree(self, fstring):
     root = etree.fromstring(fstring, parser=HTMLParser(encoding=get_encoding(fstring, guesses='utf-8', is_html=True)))
     return root
Example #26
def parse(html,
          transport_encoding=None,
          namespace_elements=False,
          treebuilder='lxml',
          fallback_encoding=None,
          keep_doctype=True,
          maybe_xhtml=False,
          return_root=True,
          line_number_attr=None,
          sanitize_names=True,
          stack_size=16 * 1024):
    '''
    Parse the specified :attr:`html` and return the parsed representation.

    :param html: The HTML to be parsed. Can be either bytes or a unicode string.

    :param transport_encoding: If specified, assume the passed in bytes are in this encoding.
        Ignored if :attr:`html` is unicode.

    :param namespace_elements:
        Add XML namespaces when parsing so that the resulting tree is XHTML.

    :param treebuilder:
        The type of tree to return. Note that only the lxml treebuilder is fast, as all
        other treebuilders are implemented in python, not C. Supported values are:
          * `lxml <https://lxml.de>`_  -- the default, and fastest
          * `lxml_html <https://lxml.de>`_  -- tree of lxml.html.HtmlElement, same speed as lxml
          * etree (the python stdlib :mod:`xml.etree.ElementTree`)
          * dom (the python stdlib :mod:`xml.dom.minidom`)
          * `soup <https://www.crummy.com/software/BeautifulSoup>`_ -- BeautifulSoup,
            which must be installed or it will raise an :class:`ImportError`

    :param fallback_encoding: If no encoding could be detected, then use this encoding.
        Defaults to an encoding based on system locale.

    :param keep_doctype: Keep the <DOCTYPE> (if any).

    :param maybe_xhtml: Useful when it is unknown if the HTML to be parsed is
        actually XHTML. Changes the HTML 5 parsing algorithm to be more
        suitable for XHTML. In particular handles self-closed CDATA elements.
        So a ``<title/>`` or ``<style/>`` in the HTML will not completely break
        parsing. Also preserves namespaced tags and attributes even for namespaces
        not supported by HTML 5 (this works only with the ``lxml`` and ``lxml_html``
        treebuilder).
        Note that setting this also implicitly sets ``namespace_elements``.

    :param return_root: If True, return the root node of the document, otherwise
        return the tree object for the document.

    :param line_number_attr: The optional name of an attribute used to store the line number
        of every element. If set, this attribute will be added to each element with the
        element's line number.

    :param sanitize_names: Ensure tag and attributes contain only ASCII alphanumeric
        characters, underscores, hyphens and periods. This ensures that the resulting
        tree is also valid XML. Any characters outside this set are replaced by
        underscores. Note that this is not strictly HTML 5 spec compliant, so turn it
        off if you need strict spec compliance.

    :param stack_size: The initial size (number of items) in the stack. The
        default is sufficient to avoid memory allocations for all but the
        largest documents.

    '''
    data = as_utf8(html or b'', transport_encoding, fallback_encoding)
    treebuilder = normalize_treebuilder(treebuilder)
    if treebuilder == 'soup':
        from .soup import parse
        return parse(data,
                     return_root=return_root,
                     keep_doctype=keep_doctype,
                     stack_size=stack_size)
    if treebuilder not in NAMESPACE_SUPPORTING_BUILDERS:
        namespace_elements = False

    capsule = html_parser.parse(data,
                                namespace_elements=namespace_elements
                                or maybe_xhtml,
                                keep_doctype=keep_doctype,
                                maybe_xhtml=maybe_xhtml,
                                line_number_attr=line_number_attr,
                                sanitize_names=sanitize_names,
                                stack_size=stack_size)

    interpreter = None
    if treebuilder == 'lxml_html':
        from lxml.html import HTMLParser
        interpreter = HTMLParser()
    ans = etree.adopt_external_document(capsule, parser=interpreter)
    if treebuilder in ('lxml', 'lxml_html'):
        return ans.getroot() if return_root else ans
    m = importlib.import_module('html5_parser.' + treebuilder)
    return m.adapt(ans, return_root=return_root)
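Assuming the function above is the top-level parse() from the html5-parser package, a typical call using only parameters documented in the docstring might look like this:

# Hedged usage sketch for html5_parser.parse with the lxml_html treebuilder.
from html5_parser import parse

raw = b'<p>caf\xc3\xa9 <a href="/x">link</a>'
root = parse(raw, transport_encoding='utf-8', treebuilder='lxml_html')
print(root.xpath('//a/@href'))  # ['/x'], as lxml.html elements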
Example #27
from contextlib import contextmanager
from platform import system

from lxml.etree import _Element
from lxml.html import (fromstring as _html_fromstring, tostring as
                       _html_tostring, Element, HtmlElement, HTMLParser)

__all__ = [
    'DoNotWriteBack', 'make_html_element', 'html_fromstring', 'html_tostring',
    'ctx_edit_html'
]

_PLATFORM_IS_WINDOWS = system() == 'Windows'
_HTML_DOCTYPE = b'<!DOCTYPE html>'
_XHTML_DOCTYPE = (b'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" '
                  b'"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">')
_HTML_PARSER = HTMLParser(default_doctype=False)


class DoNotWriteBack(Exception):
    '''If changes do not require writing back to the file, 
    you can raise this exception'''


def _ensure_bytes(o):
    'Ensure the return value is `bytes` type'
    if isinstance(o, bytes):
        return o
    elif isinstance(o, str):
        return bytes(o, encoding='utf-8')
    else:
        return bytes(o)
Example #28
 def get(self, url, html=False):
     self.browser.open(url)
     if html:
         return etree.fromstring(self.browser.contents, parser=HTMLParser())
     return self.browser.contents