def _get_version_filenames(self, session, chamber):
    '''All bills have "versions", but for those lacking html documents,
    the .wpd file is available via ftp. Create a dict of those links
    in advance; any bills lacking html versions will get version info
    from this dict.'''
    chamber_name = {'upper': 'senate', 'lower': 'House'}[chamber]
    ftp_url = 'ftp://www.legis.state.wv.us/publicdocs/%s/RS/%s/'
    ftp_url = ftp_url % (session, chamber_name)
    html = self.urlopen(ftp_url)
    dirs = [' '.join(x.split()[3:]) for x in html.splitlines()]

    split = re.compile(r'\s+').split
    matchwpd = re.compile(r'\.wpd$', re.I).search
    splitext = os.path.splitext
    version_filenames = collections.defaultdict(list)

    for d in dirs:
        url = ('%s%s/' % (ftp_url, d)).replace(' ', '%20')
        html = self.urlopen(url)
        filenames = [split(x, 3)[-1] for x in html.splitlines()]
        filenames = filter(matchwpd, filenames)

        for fn in filenames:
            fn, ext = splitext(fn)
            if ' ' in fn:
                bill_id, _ = fn.split(' ', 1)
            else:
                # One bill during 2011 had no spaces
                # in the filename. Probably a fluke.
                digits = re.search(r'\d+', fn)
                bill_id = fn[:digits.end()]
            version_filenames[bill_id.lower()].append((d, fn))

    self._version_filenames = version_filenames
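A standalone sketch of the filename-to-bill-id rule used above; the helper name and the filenames are hypothetical.

import os
import re

def bill_id_from_filename(filename):
    # Mirrors the extraction rules in the loop above.
    name, _ext = os.path.splitext(filename)
    if ' ' in name:
        bill_id, _ = name.split(' ', 1)
    else:
        # The 2011 no-space fluke: keep everything up to the end of the first digit run.
        digits = re.search(r'\d+', name)
        bill_id = name[:digits.end()]
    return bill_id.lower()

print(bill_id_from_filename('HB2001 SUB1 ENR.wpd'))  # -> 'hb2001'
print(bill_id_from_filename('SB337intr.wpd'))        # -> 'sb337'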
def _get_version_filenames(self, session, chamber):
    """All bills have "versions", but for those lacking html documents,
    the .wpd file is available via ftp. Create a dict of those links
    in advance; any bills lacking html versions will get version info
    from this dict."""
    chamber_name = {"upper": "senate", "lower": "House"}[chamber]
    ftp_url = "ftp://www.legis.state.wv.us/publicdocs/%s/RS/%s/"
    ftp_url = ftp_url % (session, chamber_name)
    html = self.urlopen(ftp_url).decode("iso-8859-1")
    dirs = [" ".join(x.split()[3:]) for x in html.splitlines()]

    split = re.compile(r"\s+").split
    matchwpd = re.compile(r"\.wpd$", re.I).search
    splitext = os.path.splitext
    version_filenames = collections.defaultdict(list)

    for d in dirs:
        url = ("%s%s/" % (ftp_url, d)).replace(" ", "%20")
        html = self.urlopen(url).decode("iso-8859-1")
        filenames = [split(x, 3)[-1] for x in html.splitlines()]
        filenames = filter(matchwpd, filenames)

        for fn in filenames:
            fn, ext = splitext(fn)
            bill_id, _ = fn.split(" ", 1)
            version_filenames[bill_id.lower()].append((d, fn))

    self._version_filenames = version_filenames
def get_cleaned_form_html(form, human_readable=True):
    """
    Return a cleaned up version of <form> HTML contents.
    If ``human_readable`` is True, HTML is cleaned to make the source code
    more readable for humans; otherwise it is cleaned to make the rendered
    form safer to render.
    """
    params = dict(
        forms=False,
        javascript=True,
        scripts=True,
        remove_unknown_tags=False,
    )

    if human_readable:
        params.update(
            style=True,
            allow_tags={'form', 'input', 'textarea', 'label', 'option',
                        'select', 'submit', 'a'},
        )
    else:
        params.update(style=False)

    cleaner = Cleaner(**params)
    raw_html = lxml.html.tostring(form, pretty_print=True, encoding="unicode")
    html = cleaner.clean_html(raw_html)

    if human_readable:
        lines = [line.strip() for line in html.splitlines(False) if line.strip()]
        html = "\n".join(lines)

    return html
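A minimal usage sketch; the form markup is made up, and it assumes the same lxml.html and lxml.html.clean.Cleaner imports the function itself relies on.

import lxml.html

doc = lxml.html.fromstring(
    '<div><form action="/login">'
    '<input name="user"><input type="password" name="pw">'
    '<input type="submit" value="Go"></form></div>'
)
form = doc.forms[0]
print(get_cleaned_form_html(form))                        # compact, readable source
print(get_cleaned_form_html(form, human_readable=False))  # markup meant for rendering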
def __parse(self, html):
    # remove xml decl and doctype, we will add the correct one before serializing
    # html = re.compile ('^.*<html ', re.I | re.S).sub ('<html ', html)
    # FIXME: do not remove doctype because we need it to load the dtd

    # remove xml declaration because of parser error: "Unicode
    # strings with encoding declaration are not supported. Please
    # use bytes input or XML fragments without declaration."
    re_xml_decl = re.compile(r'^.*?<\?xml.*?\?>', re.S | re.U)
    html = re_xml_decl.sub('', html)
    try:
        return etree.fromstring(
            html,
            lxml.html.XHTMLParser(huge_tree=True),
            base_url=self.attribs.url)
    except etree.ParseError as what:
        # cannot try HTML parser because we depend on correct xhtml namespace
        m = re.search(r"Entity '([^']+)'", str(what))
        if m:
            warning("Missing entity: '%s'" % m.group(1))
        else:
            error("Failed to parse file because: %s" % what)
        m = re.search(r'line\s(\d+),', str(what))
        if m:
            lineno = int(m.group(1))
            error("Line %d: %s" % (lineno, html.splitlines()[lineno - 1]))
        raise
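A standalone sketch of why the declaration-stripping step above is needed; the XHTML document here is made up.

import re
from lxml import etree
import lxml.html

xhtml = ('<?xml version="1.0" encoding="utf-8"?>\n'
         '<html xmlns="http://www.w3.org/1999/xhtml"><body><p>hi</p></body></html>')

# Feeding this str directly to etree.fromstring raises "Unicode strings with
# encoding declaration are not supported", so strip the declaration first.
re_xml_decl = re.compile(r'^.*?<\?xml.*?\?>', re.S | re.U)
stripped = re_xml_decl.sub('', xhtml)
root = etree.fromstring(stripped, lxml.html.XHTMLParser())
print(root.tag)  # -> '{http://www.w3.org/1999/xhtml}html'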
def html2plaintext(html, body_id=None, encoding='utf-8'):
    """ From an HTML text, convert the HTML to plain text.
    If @param body_id is provided then this is the tag where the
    body (not necessarily <body>) starts.
    """
    ## (c) Fry-IT, www.fry-it.com, 2007
    ## <*****@*****.**>
    ## download here: http://www.peterbe.com/plog/html2plaintext
    html = ustr(html)
    tree = etree.fromstring(html, parser=etree.HTMLParser())

    if body_id is not None:
        source = tree.xpath('//*[@id=%s]' % (body_id,))
    else:
        source = tree.xpath('//body')
    if len(source):
        tree = source[0]

    url_index = []
    i = 0
    for link in tree.findall('.//a'):
        url = link.get('href')
        if url:
            i += 1
            link.tag = 'span'
            link.text = '%s [%s]' % (link.text, i)
            url_index.append(url)

    html = ustr(etree.tostring(tree, encoding=encoding))

    # \r char is converted into &#13;, must remove it
    html = html.replace('&#13;', '')

    html = html.replace('<strong>', '*').replace('</strong>', '*')
    html = html.replace('<b>', '*').replace('</b>', '*')
    html = html.replace('<h3>', '*').replace('</h3>', '*')
    html = html.replace('<h2>', '**').replace('</h2>', '**')
    html = html.replace('<h1>', '**').replace('</h1>', '**')
    html = html.replace('<em>', '/').replace('</em>', '/')
    html = html.replace('<tr>', '\n')
    html = html.replace('</p>', '\n')
    html = re.sub(r'<br\s*/?>', '\n', html)
    html = re.sub('<.*?>', ' ', html)
    html = html.replace(' ' * 2, ' ')
    html = html.replace('&gt;', '>')
    html = html.replace('&lt;', '<')
    html = html.replace('&amp;', '&')

    # strip all lines
    html = '\n'.join([x.strip() for x in html.splitlines()])
    html = html.replace('\n' * 2, '\n')

    for i, url in enumerate(url_index):
        if i == 0:
            html += '\n\n'
        html += ustr('[%s] %s\n') % (i + 1, url)

    return html
def html2plaintext(html, body_id=None, encoding="utf-8"):
    """ From an HTML text, convert the HTML to plain text.
    If @param body_id is provided then this is the tag where the
    body (not necessarily <body>) starts.
    """
    ## (c) Fry-IT, www.fry-it.com, 2007
    ## <*****@*****.**>
    ## download here: http://www.peterbe.com/plog/html2plaintext
    html = ustr(html)
    tree = etree.fromstring(html, parser=etree.HTMLParser())

    if body_id is not None:
        source = tree.xpath("//*[@id=%s]" % (body_id,))
    else:
        source = tree.xpath("//body")
    if len(source):
        tree = source[0]

    url_index = []
    i = 0
    for link in tree.findall(".//a"):
        url = link.get("href")
        if url:
            i += 1
            link.tag = "span"
            link.text = "%s [%s]" % (link.text, i)
            url_index.append(url)

    html = ustr(etree.tostring(tree, encoding=encoding))

    # \r char is converted into &#13;, must remove it
    html = html.replace("&#13;", "")

    html = html.replace("<strong>", "*").replace("</strong>", "*")
    html = html.replace("<b>", "*").replace("</b>", "*")
    html = html.replace("<h3>", "*").replace("</h3>", "*")
    html = html.replace("<h2>", "**").replace("</h2>", "**")
    html = html.replace("<h1>", "**").replace("</h1>", "**")
    html = html.replace("<em>", "/").replace("</em>", "/")
    html = html.replace("<tr>", "\n")
    html = html.replace("</p>", "\n")
    html = re.sub(r"<br\s*/?>", "\n", html)
    html = re.sub("<.*?>", " ", html)
    html = html.replace(" " * 2, " ")
    html = html.replace("&gt;", ">")
    html = html.replace("&lt;", "<")
    html = html.replace("&amp;", "&")

    # strip all lines
    html = "\n".join([x.strip() for x in html.splitlines()])
    html = html.replace("\n" * 2, "\n")

    for i, url in enumerate(url_index):
        if i == 0:
            html += "\n\n"
        html += ustr("[%s] %s\n") % (i + 1, url)

    return html
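A rough usage sketch for the two html2plaintext variants above; it assumes the function and its ustr helper are importable, and the sample markup is made up.

sample = (
    "<html><body><h1>Report</h1>"
    "<p>Details on <a href='http://example.com/report'>the site</a>.</p>"
    "</body></html>"
)
text = html2plaintext(sample)
print(text)
# Roughly:
# **Report**
# Details on the site [1] .
#
# [1] http://example.com/report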
def step_HTML_matches_MD(context):
    with codecs.open(context.html_path, encoding='utf-8') as fi:
        html = context.html_text = normalize_html(fi.read())
    assert context.translated_html_text == html, '\nDifferences:\n' + '\n'.join(
        difflib.context_diff(
            [n.encode('ascii', 'replace') for n in context.translated_html_text.splitlines()],
            [n.encode('ascii', 'replace') for n in html.splitlines()],
            fromfile='Got', tofile='Expected'))
def _get_version_filenames(self, session, chamber):
    """All bills have "versions", but for those lacking html documents,
    the .wpd file is available via ftp. Create a dict of those links
    in advance; any bills lacking html versions will get version info
    from this dict."""
    chamber_name = {"upper": "senate", "lower": "House"}[chamber]
    ftp_url = "ftp://www.legis.state.wv.us/publicdocs/%s/RS/%s/"
    ftp_url = ftp_url % (session, chamber_name)

    try:
        html = self.urlopen(ftp_url)
    except scrapelib.FTPError:
        # The url doesn't exist. Just set _version_filenames
        # to an empty dict.
        self._version_filenames = {}
        return

    dirs = [" ".join(x.split()[3:]) for x in html.splitlines()]

    split = re.compile(r"\s+").split
    matchwpd = re.compile(r"\.wpd$", re.I).search
    splitext = os.path.splitext
    version_filenames = collections.defaultdict(list)

    for d in dirs:
        url = ("%s%s/" % (ftp_url, d)).replace(" ", "%20")
        html = self.urlopen(url)
        filenames = [split(x, 3)[-1] for x in html.splitlines()]
        filenames = filter(matchwpd, filenames)

        for fn in filenames:
            fn, ext = splitext(fn)
            if " " in fn:
                bill_id, _ = fn.split(" ", 1)
            else:
                # One bill during 2011 had no spaces
                # in the filename. Probably a fluke.
                digits = re.search(r"\d+", fn)
                bill_id = fn[: digits.end()]
            version_filenames[bill_id.lower()].append((d, fn))

    self._version_filenames = version_filenames
def decode_html(original_html):
    html = urllib2.urlopen(original_html).read()
    # Split the fetched HTML source into lines, because Sina Weibo compresses the page
    lines = html.splitlines()
    for line in lines:
        if line.startswith(
                '<script>STK && STK.pageletM && STK.pageletM.view({"pid":"pl_weibo_direct","js":["apps'):
            n = line.find('"html":"')
            if n > 0:
                decoded_html = line[n + 8:].encode("utf-8").decode(
                    'unicode_escape').encode("utf-8").replace("\\", "")
                return decoded_html
def print_form_html(form):
    """ Print a cleaned up version of <form> HTML contents """
    cleaner = Cleaner(
        forms=False,
        javascript=True,
        scripts=True,
        style=True,
        allow_tags={'form', 'input', 'textarea', 'label', 'option',
                    'select', 'submit', 'a'},
        remove_unknown_tags=False,
    )
    html = cleaner.clean_html(lxml.html.tostring(form, pretty_print=True))
    lines = [line.strip() for line in html.splitlines(False) if line.strip()]
    print("\n".join(lines))
def print_form_html(form):
    """ Print a cleaned up version of <form> HTML contents """
    cleaner = Cleaner(
        forms=False,
        javascript=True,
        scripts=True,
        style=True,
        allow_tags={'form', 'input', 'textarea', 'label', 'option',
                    'select', 'submit', 'a'},
        remove_unknown_tags=False,
    )
    raw_html = lxml.html.tostring(form, pretty_print=True, encoding="unicode")
    html = cleaner.clean_html(raw_html)
    lines = [line.strip() for line in html.splitlines(False) if line.strip()]
    print("\n".join(lines))
def get_form_hash(form, only_visible=True):
    """
    Return a string which is the same for duplicate forms, but different
    for forms which are not the same.

    If only_visible is True, hidden fields are not taken in account.
    """
    if isinstance(form, six.string_types):
        form = lxml.html.fromstring(form)
    else:
        form = deepcopy(form)

    if only_visible:
        remove_by_xpath(form, "input[@type='hidden']")

    html = lxml.html.tostring(form, pretty_print=True, encoding="unicode")
    lines = [line.strip() for line in html.splitlines(False) if line.strip()]

    # return the whole string as a hash, for easier debugging
    return "\n".join(lines)
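A hypothetical usage sketch, deduplicating forms on a page by their hash; it assumes the snippet's six, lxml, and remove_by_xpath dependencies are in scope and behave as their names suggest.

import lxml.html

page = lxml.html.fromstring(
    "<form><input name='q'></form>"
    "<form><input name='q'><input type='hidden' name='csrf' value='x'></form>"
)
seen, unique_forms = set(), []
for form in page.forms:
    key = get_form_hash(form)  # hidden fields ignored by default
    if key not in seen:
        seen.add(key)
        unique_forms.append(form)
print(len(unique_forms))       # -> 1; the two forms differ only in a hidden field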
def __parse (self, html):
    # remove xml decl and doctype, we will add the correct one before serializing
    # html = re.compile ('^.*<html ', re.I | re.S).sub ('<html ', html)
    # FIXME: do not remove doctype because we need it to load the dtd

    # remove xml declaration because of parser error: "Unicode
    # strings with encoding declaration are not supported. Please
    # use bytes input or XML fragments without declaration."
    re_xml_decl = re.compile (r'^<\?xml.*?\?>', re.S)
    html = re_xml_decl.sub ('', html)
    try:
        return etree.fromstring (
            html,
            lxml.html.XHTMLParser (),
            base_url = self.url)
    except etree.ParseError, what:
        # cannot try HTML parser because we depend on correct xhtml namespace
        error ("etree.fromstring says: %s" % what)
        m = re.search (r'line\s(\d+),', str (what))
        if m:
            lineno = int (m.group (1))
            error ("Line %d: %s" % (lineno, html.splitlines ()[lineno - 1]))
        raise
def _get_form_hash(form):
    # it just returns a full string as a hash, for easier debugging
    html = lxml.html.tostring(form, pretty_print=True, encoding="unicode")
    lines = [line.strip() for line in html.splitlines(False) if line.strip()]
    return "\n".join(lines)
def ct_session_info():
    html = scrapelib.urlopen("ftp://ftp.cga.ct.gov")
    sessions = [line.split()[-1] for line in html.splitlines()]
    sessions.pop()  # remove pub/
    return sessions, sessions[-1]
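A tiny standalone sketch of the parsing assumption above: an FTP LIST line ends with the entry name, so split()[-1] picks it out. The listing line is made up.

listing = "drwxr-xr-x   2 ftp      ftp          4096 Jan 01  2011 2011"
print(listing.split()[-1])  # -> '2011'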
nodes = sel(html)
print len(nodes)
for node in nodes:
    #print lxml.html.tostring(item)
    print node.get('href'), node.text

Web data 2: fetch a web file and put it into a data structure

import urllib2

url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/horse-colic/horse-colic.data'
res = urllib2.urlopen(url)
html = res.read()
res.close()
print len(html)

lines = html.splitlines()
data = []
for line in lines:
    data.append(line.split())
print len(data), len(data[0])
print data[0]

Web data 3: search the wiki for 'python' and print the http URLs

from urllib import urlopen

keyword = 'python'
resp = urlopen('https://www.google.com/search?q=' + keyword)
html = resp.read()
len(html)
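The "Web data 3" snippet stops before printing anything; a possible continuation is sketched below, assuming a plain regex over the raw result page is acceptable here.

import re

urls = re.findall(r'https?://[^"\'<> ]+', html)
for u in urls[:10]:
    print u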