def pre_parse(self):
    http_content_type = self.response.headers.get('content-type', '')
    target = HTMLEncodings(http_content_type)
    # parser will fail on non-ascii unless we set it explicitly
    parser = HTMLParser(target=target, encoding='ISO-8859-1')
    total_bytes = 0
    while target:
        chunk = self.response.read(PRE_PARSE_CHUNK_SIZE)
        if not chunk:
            try:
                parser.close()
            except XMLSyntaxError:
                pass
            break

        if self.bom is None:
            assert PRE_PARSE_CHUNK_SIZE >= 4
            self.bom = b''
            for i in range(4, 1, -1):
                if chunk[:i] in BOM_ENC:
                    self.bom = chunk[:i]
                    target.encodings.append(('bom', BOM_ENC[self.bom]))
                    # there can only be one BOM - stop here
                    break
        parser.feed(chunk)
        total_bytes += len(chunk)
        if total_bytes >= MAX_PRE_PARSE_BYTES:
            break
    return target.encodings
def parse_html(html, encoding="utf8"):
    if not html:
        return html
    if type(html) != unicode:
        html = html.decode(encoding)
    try:
        html_obj = etree.XML(html)
    except:
        try:
            parser = HTMLParser()
            parser.feed(html)
            html_obj = parser.close()
        except:
            try:
                html_obj = etree.HTML(html)
            except:
                html_obj = soupparser.fromstring(html)
    return html_obj
def _slice(resp: str, index: int = 1) -> GoogleResponse:
    utf8_parser = HTMLParser(encoding="utf-8")
    d = PyQuery(fromstring(resp, parser=utf8_parser))
    data = d.find(".g")
    pages = list(d.find("td").items())[1:-1]
    return GoogleResponse(data, pages, index)
    .replace(' \u2022 ', '\xa0\u2022 ')  # add nbsp before bullet
    .replace('\xad', '')  # remove soft hyphen
)

drop_a = lambda el: [
    a.drop_tag() for a in el.xpath("descendant::a|descendant::img")
]


def remove_attr(el):
    # copy the keys before deleting so the attrib mapping is not
    # modified while being iterated
    for key in list(el.attrib):
        del el.attrib[key]


parser = HTMLParser(encoding='cp1251', remove_blank_text=True,
                    remove_comments=True)


def books_from_category(cat, catalog):
    a = catalog.xpath("//a[@name='{}']".format(cat['slug']))[0]  # noqa
    head_tr = a.xpath('ancestor::tr[1]')[0]
    next_head_tr = head_tr.xpath(
        "following-sibling::tr[@bgcolor='#333399'][1]")
    if len(next_head_tr) > 0:
        next_head_tr = next_head_tr[0]
    return intersection(
        catalog,  # noqa
        set1=head_tr.xpath("following-sibling::tr[descendant::a]"),
        set2=next_head_tr.xpath("preceding-sibling::tr[descendant::a]"))
import logging
from StringIO import StringIO

from lxml.html import HTMLParser, parse
from scrapy.http import HtmlResponse

logger = logging.getLogger()

_HTML_PARSER = HTMLParser(encoding='utf8')


class HtmlParser(object):
    def __call__(self, response):
        if not isinstance(response, HtmlResponse):
            return
        response.html = parse(
            StringIO(response.body_as_unicode().encode('utf8')),
            _HTML_PARSER)
from ._compat import (
    to_bytes,
    to_unicode,
    unicode,
    unicode_compatible,
)
from .utils import (
    cached_property,
    ignored,
)

logger = logging.getLogger("breadability")

TAG_MARK_PATTERN = re.compile(to_bytes(r"</?[^>]*>\s*"))
UTF8_PARSER = HTMLParser(encoding="utf8")
CHARSET_META_TAG_PATTERN = re.compile(
    br"""<meta[^>]+charset=["']?([^'"/>\s]+)""", re.IGNORECASE)


def decode_html(html):
    """
    Converts a bytes stream containing an HTML page into Unicode.
    Tries to guess the character encoding from the meta tag or by using
    the "chardet" library.
    """
    if isinstance(html, unicode):
        return html

    match = CHARSET_META_TAG_PATTERN.search(html)
    if match:
        declared_encoding = match.group(1).decode("ASCII")
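# A standalone, hypothetical illustration (not part of the original module)
# of how a meta-charset regex like CHARSET_META_TAG_PATTERN above extracts
# the declared encoding from raw bytes; the sample markup is made up.
import re

CHARSET_META_TAG_PATTERN = re.compile(
    br"""<meta[^>]+charset=["']?([^'"/>\s]+)""", re.IGNORECASE)

sample = b'<html><head><meta charset="windows-1251"></head><body>...</body></html>'
match = CHARSET_META_TAG_PATTERN.search(sample)
if match:
    declared_encoding = match.group(1).decode("ASCII")
    print(declared_encoding)  # windows-1251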
import requests

url = 'http://www.safekorea.go.kr/idsiSFK/neo/sfk/cs/sfc/dis/disasterMsgView.jsp?menuSeq=679'
response = requests.get(url).text

from lxml.html import fromstring, tostring, HTMLParser

lxml_response = fromstring(response, parser=HTMLParser(encoding="utf-8"))
print(lxml_response)

lxml_table = lxml_response.xpath('//*[@id="cn"]/text()[1]')
print(lxml_table)
def _make_etree(html, url, charset="UTF-8", absolute_links=True):
    from lxml.html import HTMLParser, document_fromstring
    parser = HTMLParser(encoding="UTF-8")
    root = document_fromstring(html, parser=parser, base_url=url)
    x = fetch_images(root)
def make_etree(html, url):
    parser = HTMLParser(encoding='UTF-8')
    root = document_fromstring(html, parser=parser, base_url=url)
    return root
def the_aaas_tango(pma, verify=True):
    '''Note: "verify" param recommended here (page navigation usually necessary).

    :param: pma (PubMedArticle object)
    :param: verify (bool) [default: True]
    :return: url (string)
    :raises: AccessDenied, NoPDFLink
    '''
    #try:
    #    pma = rectify_pma_for_vip_links(pma)
    #    pdfurl = aaas_format.format(ja=aaas_journals[pma.journal]['ja'], a=pma)
    #except NoPDFLink:
    # try the pmid-based approach
    baseurl = 'http://www.sciencemag.org/cgi/pmidlookup?view=long&pmid=%s' % pma.pmid
    res = requests.get(baseurl)
    pdfurl = res.url.replace('.long', '.full') + '.pdf'

    if not verify:
        return pdfurl

    response = requests.get(pdfurl)
    if response.status_code == 200 and response.headers['content-type'].find('pdf') > -1:
        return response.url

    elif response.status_code == 200 and response.headers['content-type'].find('html') > -1:
        tree = etree.fromstring(response.content, HTMLParser())
        if not tree.find('head/title').text.find('Sign In') > -1:
            raise NoPDFLink(
                'TXERROR: AAAS returned unexpected HTML response for url %s' % (pdfurl))
        else:
            # some items are acquirable via free account registration...
            # but let's not mess with this just yet.
            raise NoPDFLink(
                'DENIED: AAAS paper subscription-only or requires site registration (url: %s)' % pdfurl)

        form = tree.cssselect('form')[0]
        fbi = form.fields.get('form_build_id')

        baseurl = urlsplit(response.url)
        post_url = baseurl.scheme + '://' + baseurl.hostname + form.action

        payload = {
            'pass': AAAS_PASSWORD,
            'name': AAAS_USERNAME,
            'form_build_id': fbi,
            'remember_me': 1
        }
        print("SUBMITTING TO AAAS")
        print(payload)
        response = requests.post(post_url, data=payload)
        if response.status_code == 403:
            raise AccessDenied('DENIED: AAAS subscription-only paper (url: %s)' % pdfurl)
        elif response.headers['content-type'].find('pdf') > -1:
            return response.url
        elif response.headers['content-type'].find('html') > -1:
            #if response.content.find('access-denied') > -1:
            raise NoPDFLink('DENIED: AAAS subscription-only paper (url: %s)' % pdfurl)
    else:
        raise NoPDFLink('TXERROR: AAAS returned %s for url %s'
                        % (response.status_code, pdfurl))
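# A minimal usage sketch for the_aaas_tango above. The PubMedArticle stand-in
# below is hypothetical: with verify=False only pma.pmid is consulted (the
# journal-based branch is commented out), so any object exposing .pmid works
# for illustration.
class _FakePMA:
    pmid = '12345678'  # made-up PMID, for demonstration only

pdf_url = the_aaas_tango(_FakePMA(), verify=False)
print(pdf_url)  # the pmidlookup redirect URL, rewritten to end in '.full.pdf'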
def parse(self, parser=None, base_url=None):
    """Parses the underlying html source using the `lxml` library.

    The parsed tree is stored in the :attr:`root` attribute of this object,
    which can be used to perform numerous operations.

    Returns
    -------
    ElementTree
    """
    utx = self._get_utx()

    assert utx is not None, "UrlTransformer not Implemented."  # internal error
    assert utx.base_path is not None, "Base Path is not set!"
    assert utx.base_url is not None, "Base url is not Set!"

    if parser is not None and not isinstance(parser, HTMLParser):
        raise TypeError("Expected instance of <%r>, got <%r>"
                        % (HTMLParser, parser))

    if not parser:
        parser = HTMLParser(encoding=self.encoding, collect_ids=False)

    source = self.get_source()

    assert source is not None, "Source is not Set!"
    assert hasattr(source, 'read'), "File like object is required!"
    # assert self._element_factory is not None
    # assert hasattr(self._element_factory, 'make_element')

    LOGGER.info(
        'Parsing tree with source: <%r> encoding <%s> and parser <%r>'
        % (self._source, self.encoding, parser))

    context_tree = lxml_parse(source, parser=parser, base_url=base_url)
    # The tree generated by the parse is stored in the self.root
    # variable and can be utilised further for any number of use cases
    self._tree = context_tree
    self.root = context_tree.getroot()

    # if self.root is not None:
    #     # WaterMarking :)
    #     self.root.insert(0, Comment(MARK.format('', __version__, utx.url, utc_now(), '')))

    # There are internal links present on the html page which are files
    # that include `#`, `javascript:` and `data:base64;` type links,
    # or a simple `/` url referring to an anchor tag;
    # these links need to be left as is.
    factory = getattr(self, 'make_element', None)
    assert callable(factory), "Element generator is not callable!"

    # Modify the tree elements
    for el in context_tree.iter():
        # An element can contain multiple urls
        for pack in self._handle_lxml_elem(el):

            if pack is not None:
                elem, attr, url, pos = pack
            else:  # pragma: no cover
                continue

            if elem is not None:
                o = factory(elem, attr, url, pos)
                if o is not None:
                    self._stack.append(o)

    self._parseComplete = True
    return self.root
def _cleaned_html_tree(html: str) -> HtmlElement:
    parser = HTMLParser(encoding='utf8')
    tree = fromstring(html.encode('utf8'), parser=parser)
    return _clean_html(tree)
def search_filings(self):
    search_form = {'datePostedStart': datetime.strftime(self.start_date,
                                                        '%m/%d/%Y'),
                   'datePostedEnd': datetime.strftime(self.end_date,
                                                      '%m/%d/%Y')}
    _search_url_rgx = re.compile(r"window\.open\('(.*?)'\)", re.IGNORECASE)
    search_params = {'event': 'processSearchCriteria'}

    for filing_type in self.filing_types:
        _form = search_form.copy()
        _form['reportType'] = filing_type['code']
        _form.update(search_params)

        self.debug('making request with {f}'.format(f=_form))
        _, response = self.urlretrieve(
            self.base_url,
            method='POST',
            body=_form,
        )

        d = etree.fromstring(response.text, parser=HTMLParser())
        results = d.xpath('//*[@id="searchResults"]/tbody/tr')

        if len(results) >= 3000:
            error_msg = "More than 3000 results for params:\n{}".format(
                json.dumps(search_params, indent=2))
            raise Exception(error_msg)

        # we're going to skip duplicate submissions on the same day
        results_seen = []

        for result in results:
            filing_type = result.xpath('td[3]')[0].text
            registrant_name = result.xpath('td[1]')[0].text
            client_name = result.xpath('td[2]')[0].text
            filing_date = result.xpath('td[5]')[0].text

            # this is how we define duplicates
            result_key = (registrant_name, client_name, filing_type,
                          filing_date)

            if result_key not in results_seen:
                try:
                    m = re.search(_search_url_rgx, result.attrib['onclick'])
                except KeyError:
                    self.error('element {} has no onclick attribute'.format(
                        etree.tostring(result)))

                try:
                    _doc_path = m.groups()[0]
                except AttributeError:
                    self.error('no matches found for search_rgx')
                    self.debug('\n{r}\n{a}\n{u}'.format(
                        r=_search_url_rgx.pattern,
                        a=result.attrib['onclick'],
                        u=response.request.url
                    ))

                _params = dict(parse_qsl(urlparse(_doc_path).query))

                if _params:
                    results_seen.append(result_key)
                    yield _params
                else:
                    self.error('unable to parse {}'.format(
                        etree.tostring(result)))
            else:
                continue
def description_checks():
    args = parser.parse_args()

    # set up a basic logging config
    logger = logging.getLogger()
    if args.verbose == 1:
        logger.setLevel(logging.INFO)
    elif args.verbose >= 2:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.ERROR)

    files_to_check = []
    for f in args.file:
        if os.path.basename(f).startswith('DESCRIPTION.'):
            files_to_check.append(f)
        else:
            fb.error("'{}' is not a DESCRIPTION file.".format(f))
            continue

    if len(files_to_check) == 0:
        fb.error("None of the specified files "
                 "seem to be valid DESCRIPTION files.")
        exit(-1)

    for f in files_to_check:
        try:
            contents = open(f).read()
        except:
            fb.error("File '{}' does not exist.".format(f))
            continue

        # ---------------------------------------------------------------------
        fb.new_check("Does DESCRIPTION file contain broken links?")
        doc = defusedxml.lxml.fromstring(contents, parser=HTMLParser())
        broken_links = []
        for link in doc.xpath('//a/@href'):
            try:
                response = requests.head(link)
                if response.status_code != requests.codes.ok:
                    broken_links.append(link)
            except requests.exceptions.RequestException:
                broken_links.append(link)

        if len(broken_links) > 0:
            fb.error(("The following links are broken"
                      " in the DESCRIPTION file:"
                      " '{}'").format("', '".join(broken_links)))
        else:
            fb.ok("All links in the DESCRIPTION file look good!")

        # ---------------------------------------------------------------------
        fb.new_check("Is this a proper HTML snippet?")
        contenttype = magic.from_file(f)
        if "HTML" not in contenttype:
            data = open(f).read()
            if "<p>" in data and "</p>" in data:
                fb.ok("{} is a proper HTML snippet.".format(f))
            else:
                fb.error("{} is not a proper HTML snippet.".format(f))
        else:
            fb.ok("{} is a proper HTML file.".format(f))

        # ---------------------------------------------------------------------
        fb.new_check("DESCRIPTION.en_us.html is more than 200 bytes?")
        statinfo = os.stat(f)
        if statinfo.st_size <= 200:
            fb.error("{} must have size larger than 200 bytes".format(f))
        else:
            fb.ok("{} is larger than 200 bytes".format(f))

        # ---------------------------------------------------------------------
        fb.new_check("DESCRIPTION.en_us.html is less than 1000 bytes?")
        statinfo = os.stat(f)
        if statinfo.st_size >= 1000:
            fb.error("{} must have size smaller than 1000 bytes".format(f))
        else:
            fb.ok("{} is smaller than 1000 bytes".format(f))

    fb.save_json_report()
def parse_from_string(s):
    parser = HTMLParser()
    tree = etree.fromstring(s, parser=parser)
    return parse_tree(tree)
def parse_file(fn):
    parser = HTMLParser()
    tree = etree.parse(fn, parser=parser).getroot()
    return parse_tree(tree)
from lxml.html import parse, HTMLParser

import scraperwiki

parser = HTMLParser(encoding='utf-8')
page = parse('http://www.sescsp.org.br/sesc/programa_new/busca.cfm',
             parser).getroot()

for box in page.cssselect('#box'):
    if box.cssselect('.tit a'):
        evento = {
            'title': box.cssselect('.tit a')[0].text_content(),
            'url': box.cssselect('.tit a')[0].get('href')
        }
        scraperwiki.sqlite.save(['url'], evento)
def _slice(resp: str) -> IqdbResponse:
    utf8_parser = HTMLParser(encoding="utf-8")
    d = PyQuery(fromstring(resp, parser=utf8_parser))
    return IqdbResponse(d)
def _make_tree(self, fstring):
    root = etree.fromstring(
        fstring, parser=HTMLParser(encoding=get_encoding(fstring)))
    return root
def scrape_super_pacs(self):
    def reformat_date(datestring):
        dt = datetime.strptime(datestring, '%m/%d/%Y')
        return dt.strftime('%Y-%m-%d')

    def find_the_table(html_document):
        return html_document.xpath(
            '//*[@id="ctl00_ctl00_MainPanel"]/table')[0]

    def separate_name_and_address(cell):
        name = cell.text
        address = ', '.join([br.tail for br in cell.xpath('br')])
        return name, address

    def scrape_table(table_element):
        scraped_rows = []
        for row in table_element.xpath('tr'):
            _data = {}
            columns = row.xpath('td')
            if len(columns) == 5:
                _data['org_id'] = columns[0].text_content()
                _name, _address = separate_name_and_address(columns[1])
                _data['org_name'] = _name
                _data['org_address'] = _address
                _data['org_phone'] = columns[2].text_content()
                _data['org_begin_date'] = reformat_date(
                    columns[3].text_content())
                _data['org_end_date'] = reformat_date(
                    columns[4].text_content())
                scraped_rows.append(_data)
        return scraped_rows

    PAC_LIST_URL = "http://apps.azsos.gov/apps/election/cfs/search/SuperPACList.aspx"

    tmp, resp = self.urlretrieve(PAC_LIST_URL)
    html_document = etree.fromstring(resp.content, parser=HTMLParser())
    target_table = find_the_table(html_document)
    results = scrape_table(target_table)

    for result in results:
        _org = Organization(name=result['org_name'],
                            classification='political action committee',
                            founding_date=result['org_begin_date'],
                            dissolution_date=result['org_end_date'])
        _org.add_identifier(identifier=result['org_id'],
                            scheme='urn:az-state:committee')
        _org.add_contact_detail(type='address', value=result['org_address'])
        _org.add_contact_detail(type='voice', value=result['org_phone'])
        _org.add_source(url=PAC_LIST_URL)
        _org.source_identified = True
        yield _org
import codecs
import os.path
import re
from datetime import datetime

import js2py
import pyquery
import requests
from lxml.html import HTMLParser, fromstring
from tzlocal import get_localzone

from libs.comic_downloader import Host
from .ComicInfo import ComicInfo
from .StreamLine import *

UTF8_PARSER = HTMLParser(encoding='utf-8')


def reverse(queue):
    left = 0
    right = queue_length(queue) - 1
    while left < right:
        swap(queue, left, right)
        left += 1
        right -= 1


class DmedenHost(Host):
    LOCAL_TZ = get_localzone()
    BASE_URL = "http://www.dmeden.com/"
def append_html_table(self, html_string, start_row=1, start_col=1):
    html_string = document_fromstring(html_string,
                                      HTMLParser(encoding='utf8'))
    last_row = start_row - 1
    last_col = start_col
    for table_el in html_string.xpath('//table'):
        last_row += 1
        for row_i, row in enumerate(table_el.xpath('./tr'), start=last_row):
            for col_i, col in enumerate(row.xpath('./td|./th'),
                                        start=last_col):
                colspan = int(col.get('colspan', 0))
                rowspan = int(col.get('rowspan', 0))
                font_bold = False
                font_size = 11
                font_color = BLACK_COLOR
                if rowspan:
                    rowspan -= 1
                if colspan:
                    colspan -= 1
                col_data = col.text_content().encode("utf8")
                valign = 'center' if col_i == start_col and col.tag != 'th' else 'top'
                while (row_i, col_i) in self.list:
                    col_i += 1
                cell = self.worksheet.cell(row=row_i, column=col_i)
                if rowspan or colspan:
                    self.worksheet.merge_cells(start_row=row_i,
                                               end_row=row_i + rowspan,
                                               start_column=col_i,
                                               end_column=col_i + colspan)
                cell.value = col_data
                cell.alignment = Alignment(
                    horizontal=row.get('align', col.get('align')) or 'left',
                    vertical=row.get('valign', col.get('valign')) or valign,
                    shrink_to_fit=True,
                    wrap_text=True)
                bgcolor = row.get('bgcolor', col.get('bgcolor'))
                if bgcolor:
                    cell.fill = PatternFill(fill_type='solid',
                                            start_color=bgcolor,
                                            end_color=bgcolor)
                for el in col.iter():
                    if el.tag == 'font':
                        font_color = el.get('color')
                    elif el.tag == 'b':
                        font_bold = True
                    elif el.tag in _TEXT_SIZE:
                        font_bold = True
                        font_size = _TEXT_SIZE.get(el.tag)
                cell.font = Font(
                    color=font_color,
                    bold=font_bold,
                    size=font_size,
                )
                if col.tag == 'th':
                    cell.font = Font(bold=True)
                    cell.fill = PatternFill(fill_type='solid',
                                            start_color=TH_COLOR,
                                            end_color=TH_COLOR)
                for i in range(0, rowspan + 1, 1):
                    for j in range(0, colspan + 1, 1):
                        if i == rowspan:
                            last_row = row_i + i
                        self.list.append((row_i + i, col_i + j))
                        cell = self.worksheet.cell(row=row_i + i,
                                                   column=col_i + j)
                        cell.border = Border(
                            left=Side(border_style=_BORDER_STYLE.get(
                                table_el.get('border') or None),
                                color=BORDER_COLOR),
                            right=Side(border_style=_BORDER_STYLE.get(
                                table_el.get('border') or None),
                                color=BORDER_COLOR),
                            top=Side(border_style=_BORDER_STYLE.get(
                                table_el.get('border') or None),
                                color=BORDER_COLOR),
                            bottom=Side(border_style=_BORDER_STYLE.get(
                                table_el.get('border') or None),
                                color=BORDER_COLOR),
                        )
    return last_row, last_col
from .settings import MAX_FILE_SIZE, MIN_FILE_SIZE

LOGGER = logging.getLogger(__name__)

UNICODE_ALIASES = {'utf-8', 'utf_8'}

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
RETRY_STRATEGY = urllib3.util.Retry(
    total=3,
    connect=0,
    status_forcelist=[429, 500, 502, 503, 504],
)
HTTP_POOL = urllib3.PoolManager(retries=RETRY_STRATEGY)

HTML_PARSER = HTMLParser(collect_ids=False, default_doctype=False,
                         encoding='utf-8', remove_pis=True)


def isutf8(data):
    """Simple heuristic to determine if a bytestring uses standard unicode
    encoding"""
    try:
        data.decode('UTF-8')
    except UnicodeDecodeError:
        return False
    else:
        return True


def detect_encoding(bytesobject):
    """Read all input or first chunk and return a list of encodings"""
def make_etree(html, url):
    from lxml.html import HTMLParser, document_fromstring
    parser = HTMLParser(encoding="UTF-8")
    root = document_fromstring(html, parser=parser, base_url=url)
    return root
def _make_tree(self, fstring):
    root = etree.fromstring(fstring,
                            parser=HTMLParser(encoding=get_encoding(
                                fstring, guesses='utf-8', is_html=True)))
    return root
def parse(html,
          transport_encoding=None,
          namespace_elements=False,
          treebuilder='lxml',
          fallback_encoding=None,
          keep_doctype=True,
          maybe_xhtml=False,
          return_root=True,
          line_number_attr=None,
          sanitize_names=True,
          stack_size=16 * 1024):
    '''
    Parse the specified :attr:`html` and return the parsed representation.

    :param html: The HTML to be parsed. Can be either bytes or a unicode string.

    :param transport_encoding: If specified, assume the passed in bytes are in this encoding.
        Ignored if :attr:`html` is unicode.

    :param namespace_elements:
        Add XML namespaces when parsing so that the resulting tree is XHTML.

    :param treebuilder:
        The type of tree to return. Note that only the lxml treebuilder is fast, as all
        other treebuilders are implemented in python, not C. Supported values are:
          * `lxml <https://lxml.de>`_ -- the default, and fastest
          * `lxml_html <https://lxml.de>`_ -- tree of lxml.html.HtmlElement, same speed as lxml
          * etree (the python stdlib :mod:`xml.etree.ElementTree`)
          * dom (the python stdlib :mod:`xml.dom.minidom`)
          * `soup <https://www.crummy.com/software/BeautifulSoup>`_ -- BeautifulSoup,
            which must be installed or it will raise an :class:`ImportError`

    :param fallback_encoding: If no encoding could be detected, then use this encoding.
        Defaults to an encoding based on system locale.

    :param keep_doctype: Keep the <DOCTYPE> (if any).

    :param maybe_xhtml: Useful when it is unknown if the HTML to be parsed is
        actually XHTML. Changes the HTML 5 parsing algorithm to be more
        suitable for XHTML. In particular handles self-closed CDATA elements,
        so a ``<title/>`` or ``<style/>`` in the HTML will not completely break parsing.
        Also preserves namespaced tags and attributes even for namespaces not supported
        by HTML 5 (this works only with the ``lxml`` and ``lxml_html`` treebuilder).
        Note that setting this also implicitly sets ``namespace_elements``.

    :param return_root: If True, return the root node of the document, otherwise
        return the tree object for the document.

    :param line_number_attr: The optional name of an attribute used to store the line
        number of every element. If set, this attribute will be added to each element
        with the element's line number.

    :param sanitize_names: Ensure tag and attribute names contain only ASCII alphanumeric
        characters, underscores, hyphens and periods. This ensures that the resulting tree
        is also valid XML. Any characters outside this set are replaced by underscores.
        Note that this is not strictly HTML 5 spec compliant, so turn it off if you need
        strict spec compliance.

    :param stack_size: The initial size (number of items) in the stack. The default is
        sufficient to avoid memory allocations for all but the largest documents.
    '''
    data = as_utf8(html or b'', transport_encoding, fallback_encoding)
    treebuilder = normalize_treebuilder(treebuilder)
    if treebuilder == 'soup':
        from .soup import parse
        return parse(
            data, return_root=return_root, keep_doctype=keep_doctype,
            stack_size=stack_size)
    if treebuilder not in NAMESPACE_SUPPORTING_BUILDERS:
        namespace_elements = False

    capsule = html_parser.parse(
        data,
        namespace_elements=namespace_elements or maybe_xhtml,
        keep_doctype=keep_doctype,
        maybe_xhtml=maybe_xhtml,
        line_number_attr=line_number_attr,
        sanitize_names=sanitize_names,
        stack_size=stack_size)

    interpreter = None
    if treebuilder == 'lxml_html':
        from lxml.html import HTMLParser
        interpreter = HTMLParser()
    ans = etree.adopt_external_document(capsule, parser=interpreter)
    if treebuilder in ('lxml', 'lxml_html'):
        return ans.getroot() if return_root else ans
    m = importlib.import_module('html5_parser.' + treebuilder)
    return m.adapt(ans, return_root=return_root)
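# A short usage sketch based on the docstring above; it assumes the
# html5_parser package is importable, and the sample markup is made up
# purely for demonstration.
from html5_parser import parse

root = parse(b'<p>Hello <b>world</b>', treebuilder='lxml', keep_doctype=False)
print(root.tag)                    # 'html' -- fragments are wrapped in a full document
print(root.xpath('//b')[0].text)   # 'world'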
from contextlib import contextmanager

from lxml.etree import _Element
from lxml.html import (fromstring as _html_fromstring,
                       tostring as _html_tostring,
                       Element, HtmlElement, HTMLParser)

__all__ = [
    'DoNotWriteBack', 'make_html_element', 'html_fromstring',
    'html_tostring', 'ctx_edit_html'
]

_PLATFORM_IS_WINDOWS = system() == 'Windows'

_HTML_DOCTYPE = b'<!DOCTYPE html>'
_XHTML_DOCTYPE = (b'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" '
                  b'"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">')

_HTML_PARSER = HTMLParser(default_doctype=False)


class DoNotWriteBack(Exception):
    '''If changes do not require writing back to the file, you can raise
    this exception'''


def _ensure_bytes(o):
    'Ensure the return value is `bytes` type'
    if isinstance(o, bytes):
        return o
    elif isinstance(o, str):
        return bytes(o, encoding='utf-8')
    else:
        return bytes(o)
def get(self, url, html=False):
    self.browser.open(url)
    if html:
        return etree.fromstring(self.browser.contents, parser=HTMLParser())
    return self.browser.contents