def html_to_etree(in_str, remove_blank_text=True):
    """
    Parses a tree of possibly malformed HTML5, according to WHATWG HTML5 rules.

    Result is either:
    - parsed input, or;
    - if multiple fragments (> 1 top-level tags) are given: parsed input
      wrapped in either a `div` or `span`, or;
    - None for empty input.
    """
    if in_str is None:
        return None
    if not isinstance(in_str, basestring):
        raise ValueError('input must be a string')
    in_str = _nfc(in_str).strip()
    if not in_str:
        return None
    # Double-parse to remove (hopefully irrelevant) whitespace - some
    # not-so-irrelevant whitespace will most likely be removed too.
    etree = fromstring(in_str, parser=_html5Parser)
    # ATTENTION: tag/attribute namespace info is mangled here due to html5lib bugs.
    _etree_mutate_fix_localname(etree)
    if remove_blank_text:
        s = lxmltostring(etree)
        etree = parse(StringIO(s), parser=_xmlParser)
        etree = fromstring(lxmltostring(etree), parser=_html5Parser)
        # Namespaces spawn again after fromstring, so fix them again.
        _etree_mutate_fix_localname(etree)
    return etree.getroot() if hasattr(etree, 'getroot') else etree
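# Usage sketch for html_to_etree above (illustrative, not from the source
# module): it assumes the function and its module-level helpers (_nfc,
# _html5Parser, _xmlParser, _etree_mutate_fix_localname) are in scope.
# The expected outcomes follow the three cases listed in the docstring.
single = html_to_etree(u'<p>hello</p>')        # single fragment -> that element
wrapped = html_to_etree(u'<p>a</p><p>b</p>')   # multiple fragments -> wrapped in a div/span
nothing = html_to_etree(u'   ')                # whitespace-only input -> None
print single.tag, wrapped.tag, nothing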
def run(self, staffName=None):
    self.getstaffdata()
    if staffName:
        if not self.config.has_key(u'名前'):
            self.config[u'名前'] = {}
        print "Recording " + staffName
        self.config[u'名前']['ja'] = staffName
    else:
        mypath = self.filename
        if not self.filename.startswith('http'):
            mypath = os.path.join(self.ls.baseurl, self.filename)
        ifh = urlopen(mypath)
        page = ifh.read()
        self.root = html5parser.fromstring(page)
        self.multiscrape(u'名前')
        self.multiscrape(u'ふりがな')
        self.multiscrape(u'役職')
        self.multiscrape(u'所属')
        self.multiscrape(u'所属学会')
        self.multiscrape(u'専門分野')
        self.multiscrape(u'研究テーマ')
        self.multiscrape(u'教員からのメッセージ')
        self.multiscrape(u'略歴')
        self.multiscrape(u'主要業績')
        self.multiscrape(u'リンク')
    applyNamePatches(self.config, self.key)
    self.output()
def build_html():
    service = docs.setup_api_service()
    html = etree.fromstring(HTML_TEMPLATE)
    body = html.xpath("/x:html/x:body", namespaces=NSS)[0]

    the_google_docs = _collect_google_docs_inside_folder(service)
    first_doc_created_at = the_google_docs[0]['createdDate']
    last_doc_created_at = the_google_docs[-1]['createdDate']
    end_search_timestamp = _minutes_later(last_doc_created_at, num_minutes=5)
    tweet_search_results = tweets.hashtag_search_in_daterange(
        first_doc_created_at, end_search_timestamp)

    for i, child in enumerate(the_google_docs):
        log.info(u"Appending slide %s" % child['title'])
        slide = html5parser.fromstring(docs.export_file(service, child))
        created_time = child['createdDate']
        slide_body = slide.xpath("/x:html/x:body", namespaces=NSS)[0]
        the_slide_html_section = _generate_slide_html_section(
            slide_body, created_time, the_google_docs, i, tweet_search_results)
        body.append(the_slide_html_section)

    return html
def __init__(self, filename):
    StaffBase.__init__(self, filename)
    ConfigPaths.__init__(self)
    print "DOING ug: " + filename
    ifh = urlopen(self.ug.staffUrl(filename))
    page = ifh.read()
    self.root = html5parser.fromstring(page)
def handle(self, *args, **options):
    path = os.path.dirname(__file__)
    page = html5parser.fromstring(
        open(path + '/supported_coins.html', 'r').read())

    for coin_row in page.cssselect('.w-row.tdr'):
        try:
            icon = BASE_URL + coin_row.cssselect('.coin-logo')[0].get('src')
            name = coin_row.cssselect('.coin-link')[0].text
            code = coin_row.cssselect('.table-data')[1].text

            # Only register currencies that do not exist yet
            if not Currencies.objects.filter(code=code).exists():
                currency = Currencies()
                currency.name = name
                currency.code = code
                currency.save()

                # Upload the icon
                response = requests.get(icon, headers=HEADERS)
                if response.status_code == 200:
                    currency.icon.save(os.path.basename(icon),
                                       ContentFile(response.content),
                                       save=True)

                currency_gateway = CurrencyGateway()
                currency_gateway.currency = currency
                currency_gateway.gateway = 'coinpayments'
                currency_gateway.save()

                print('Registering currency {} -> {}'.format(name, code))
        except Exception as e:
            continue
def get(self, url, follow_redir=True):
    req = webapp2.Request.blank(url)
    self.res = req.get_response(main.app)

    if len(self.res.body) > 0 and self.res.headers['content-type'].split(';')[0].strip() == 'text/html':
        self.tree = html5parser.fromstring(self.res.body, parser=self.parser)

    if follow_redir and self.res.status_code in [301, 302, 303, 304, 307] and 'location' in self.res.headers:
        self.get(self.res.headers['location'][16:])
def test_widget_empty():
    f = I18nFormField(widget=I18nTextInput, required=False, localize=True)
    rendered = f.widget.render('foo', [])
    tree = html5parser.fromstring(rendered)
    assert tree[0].attrib == {'lang': 'de', 'name': 'foo_0', 'type': 'text'}
    assert tree[1].attrib == {'lang': 'en', 'name': 'foo_1', 'type': 'text'}
    assert tree[2].attrib == {'lang': 'fr', 'name': 'foo_2', 'type': 'text'}
def html_fromstring(s):
    """Parse an HTML tree from a string.

    Return None if the string can't be parsed.
    """
    try:
        return html5parser.fromstring(s, parser=_html5lib_parser())
    except Exception:
        pass
def get(self, url, follow_redir=True):
    req = webapp2.Request.blank(url)
    self.res = req.get_response(main.app)

    if len(self.res.body) > 0 and self.res.headers['content-type'].split(';')[0].strip() == 'text/html':
        self.tree = html5parser.fromstring(self.res.body, parser=self.parser)

    if follow_redir and self.res.status_code in [301, 302, 303, 304, 307] and 'location' in self.res.headers:
        self.get(self.res.headers['location'][16:], follow_redir=True)
def get(self, path, data={}, follow=False):
    self.res = super(Browser, self).get(path, data, follow)

    if len(self.res.content) > 0 and self.res['content-type'].split(';')[0].strip() == 'text/html':
        self.tree = html5parser.fromstring(self.res.content, parser=self.parser)

    if follow and self.res.status_code in [301, 302, 303, 304, 307] and 'location' in self.res:
        self.get(self.res['location'][16:])

    return self.res
def get_entries(self):
    r = requests.get(self.base_url, verify=False)
    assert r.status_code == 200
    tree = html5parser.fromstring(r.text)

    d = []
    for entry1, entry2, entry3 in group(tree.xpath(self.xpath_entry)[1:], 3):
        link = entry1.xpath(self.xpath_link)
        assert len(link) in (0, 1)
        if len(link) == 0:
            break

        title = entry1.xpath(self.xpath_title)
        assert len(title) == 1

        score = entry2.xpath(self.xpath_score)
        assert len(score) in (0, 1)
        if len(score) == 0:
            continue

        link = urljoin(self.base_url, link[0])
        title = title[0]
        score = int(score[0].split()[0])
        d.append((title, score, link))

    assert len(d) > 10
    return d
def infections_ons():
    res = requests.get(
        "https://www.ons.gov.uk/peoplepopulationandcommunity/healthandsocialcare/"
        "conditionsanddiseases/datasets/coronaviruscovid19infectionsurveydata")
    root = html5parser.fromstring(res.text)
    url = urljoin(
        "https://www.ons.gov.uk",
        root.find(
            './/{http://www.w3.org/1999/xhtml}a[@aria-label="Download Coronavirus'
            ' (COVID-19) Infection Survey: 2020 in xlsx format"]').get("href"),
    )
    df = pd.read_excel(requests.get(url).content,
                       sheet_name="2b", skiprows=6, skipfooter=10)
    df = df.drop(columns=[df.columns[4], df.columns[8]])
    df.columns = [
        "date",
        "incidence", "incidence_lower", "incidence_upper",
        "infections", "infections_lower", "infections_upper",
        "weekly", "weekly_lower", "weekly_upper",
    ]
    return df
def _get_content_preview(self):
    if self._content_preview is None:
        doc = html5parser.fromstring(self.content)
        desc = doc.xpath("string()").strip()
        desc = (desc[:38] + '..') if len(desc) > 40 else desc
        self._content_preview = desc.replace('\n', '')
    return self._content_preview
def getNameTitle():
    response, content = h.request(URL + "?dt=" + str(time.time()), 'POST')
    content = html5parser.fromstring(content)
    name = content.findall("*")[3].text
    title = content.findall("*")[5].text
    return name, title
async def test_hello_world(client):
    response = await client.get('/')
    assert_that(response.status, equal_to(200))
    text = await response.text()
    value = html5parser.fromstring(text).xpath(
        './/*[@id="value"]/text()').pop()
    assert_that(value, value_matcher)
def _validate(self, url, validator, status_code=200):
    req = webapp2.Request.blank(url)
    res = req.get_response(main.app)
    self.assertEqual(status_code, res.status_code)

    if validator == 'html':
        self.assertEqual('text/html', res.content_type)
        html5parser.fromstring(res.body, parser=self.parser)
    elif validator == 'xml':
        self.assertEqual('text/xml', res.content_type)
        lxml.etree.XML(res.body)
    elif validator == 'json':
        self.assertEqual('application/json', res.content_type)
        json.loads(res.body)
    elif validator == 'text':
        self.assertEqual('text/plain', res.content_type)
def fetch_json_ld(url):
    res = requests.get(url)
    res.raise_for_status()
    root = html5parser.fromstring(res.text)
    return json.loads(
        root.find(
            './/{http://www.w3.org/1999/xhtml}script[@type="application/ld+json"]'
        ).text)
def find_category_string(html_text):
    parser = html5parser.fromstring(html_text)
    try:
        category_link = parser.cssselect('.category')[0]
        category_span = category_link.cssselect('[itemprop=genre]')[0]
        return category_span.text
    except IndexError as e:
        __log__.exception('Cannot match category in HTML')
        return None
def test_empty_result(self):
    query = "togetanansweryoufirsthavetoknowthequestion"
    r = requests.get(self.app_url + "search", params={'q': query})

    # check response & redirects
    assert r.ok, "problematic status code: " + str(r.status_code)

    # check contents
    html = html5parser.fromstring(r.content)  # type: _Element
    assert html.cssselect("#result-failure")
    assert html.cssselect("#result-sidebar")
def _get_nhs_potential(title):
    url = (
        "https://digital.nhs.uk/data-and-information/publications/statistical"
        "/mi-potential-covid-19-symptoms-reported-through-nhs-pathways-and-111-online/latest"
    )
    text = requests.get(url).text
    et = html5parser.fromstring(text)
    el = et.find('.//{http://www.w3.org/1999/xhtml}a[@title="' + title + '"]')
    return el.get("href")
def test_widget_enabled_locales():
    f = I18nFormField(widget=I18nTextInput, required=False)
    f.widget.enabled_locales = ['de', 'fr']
    rendered = f.widget.render('foo', LazyI18nString({'de': 'Hallo', 'en': 'Hello'}))
    tree = html5parser.fromstring(rendered)
    assert tree[0].attrib == {
        'lang': 'de',
        'name': 'foo_0',
        'type': 'text',
        'value': 'Hallo'
    }
    assert tree[1].attrib == {
        'lang': 'fr',
        'name': 'foo_2',
        'type': 'text'
    }
def html_fromstring(s):
    """Parse an HTML tree from a string.

    Return None if the string can't be parsed or is too big.
    """
    if isinstance(s, six.text_type):
        s = s.encode('utf8')
    try:
        if html_too_big(s):
            return None
        return html5parser.fromstring(s, parser=_html5lib_parser())
    except Exception:
        pass
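# Hedged usage sketch for html_fromstring above (assumes html_fromstring,
# html_too_big and _html5lib_parser are importable from the same module):
# parseable markup yields an element tree, oversized or unparseable input
# yields None, so callers can simply branch on the return value.
tree = html_fromstring(u'<div><p>hi</p></div>')
if tree is not None:
    # Assuming _html5lib_parser keeps html5lib's default namespaceHTMLElements=True,
    # elements are XHTML-namespaced, hence the namespaced lookup.
    print(len(tree.findall('.//{http://www.w3.org/1999/xhtml}p')))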
def parse_icons_from_html(html):
    """Parse an HTML document and return a list of rel icon links."""
    icons = []
    htmlparsed = html5parser.fromstring(html)
    html_links = htmlparsed.xpath(
        '//h:link[@rel]',
        namespaces={'h': 'http://www.w3.org/1999/xhtml'})
    for html_link in html_links:
        attributes = html_link.attrib
        relvalues = attributes['rel'].lower()
        if relvalues in ('icon', 'apple-touch-icon', 'apple-touch-icon-precomposed'):
            icons.append(attributes)
    return icons
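# Hedged usage sketch for parse_icons_from_html above (the sample markup is
# illustrative): each matching <link rel="..."> contributes its attribute
# mapping, so href and sizes can be read from the returned items.
sample = '<html><head><link rel="icon" href="/favicon.ico" sizes="16x16"></head></html>'
for attrs in parse_icons_from_html(sample):
    print(attrs.get('href'), attrs.get('sizes'))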
def _lxml_parse_document(self, body, use_html5lib=False, use_BeautifulSoup=False):
    if use_html5lib:
        from lxml.html import html5parser
        return html5parser.fromstring(body)
    elif use_BeautifulSoup:
        from lxml.html import soupparser
        return soupparser.fromstring(body)
    else:
        # Try the strict XML parser first, then fall back to lxml's HTML parser.
        for parser in [etree.XML, etree.HTML]:
            try:
                return parser(body)
            except:
                pass
def update(self):
    r = requests.get(self.page_url, verify=False)
    assert r.status_code == 200
    tree = html5parser.fromstring(r.text)

    self.link_scores = {}
    for article in tree.xpath("//*[local-name()='article']"):
        link = article.xpath(".//*[local-name()='span' and @class='anonymous_reader']"
                             "/*[local-name()='a']/@href")
        assert len(link) == 1
        score = article.xpath(".//*[local-name()='figure' and @class='score']/text()")
        assert len(score) == 1
        self.link_scores[link[0]] = int(score[0])

    super().update()
def run(self):
    self.getstaffdata()
    ifh = urlopen(os.path.join(self.ug.baseurl, self.filename))
    page = ifh.read()
    self.root = html5parser.fromstring(page)
    self.multiscrape(u'名前')
    self.multiscrape(u'ふりがな')
    self.multiscrape(u'役職')
    self.multiscrape(u'専門分野')
    self.multiscrape(u'研究テーマ')
    self.multiscrape(u'学会')
    self.multiscrape(u'略歴')
    self.multiscrape(u'教員からのメッセージ')
    self.multiscrape(u'主要')
    applyNamePatches(self.config, self.key)
    self.output()
def html2text(html):
    """Render html as text, converting line breaks to spaces."""
    if not ishtml(html):
        return re.sub(r'\s+', ' ', html.strip())

    parser = html5parser.HTMLParser(namespaceHTMLElements=False)
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        html = html5parser.fromstring(html, parser=parser)

    # Pad block-level elements with spaces so their contents don't run together.
    for b in BLOCKTAGS:
        for e in html.xpath(f'//{b}'):
            e.text = ' ' + e.text if e.text else ''
            if len(e) > 0:
                lc = e[-1]
                lc.tail = (lc.tail if lc.tail else '') + ' '
            else:
                e.text = e.text + ' '

    text = tostring(html, method='text', encoding='utf-8')
    return re.sub(r'\s+', ' ', text.decode().strip())
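# Hedged examples of the contract described in html2text's docstring above.
# Expected outputs are inferred from the code, not taken from the source's
# tests, and assume the module-level helpers (ishtml, BLOCKTAGS, tostring)
# are in scope and that <p> is listed in BLOCKTAGS.
print(html2text('<p>Hello</p><p>world</p>'))  # block tags padded -> 'Hello world'
print(html2text('plain\n   text'))            # non-HTML branch -> 'plain text'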
def extract_next_links(rawDatas):
    global max_outlinks
    global max_outlink_url
    outputLinks = list()
    '''
    rawDatas is a list of objs -> [raw_content_obj1, raw_content_obj2, ....]
    Each obj is of type UrlResponse declared at L28-42 datamodel/search/datamodel.py
    The return of this function should be a list of urls in their absolute form.
    Validation of links via the is_valid function is done later (see line 42);
    the frontier takes care of that.

    Suggested library: lxml
    '''
    # cleaner = Cleaner(page_structure=False, links=False)  # clean (remove) scripts, special tags, css style annotations, etc.

    for raw_content_obj in rawDatas:
        if should_extract_urls(raw_content_obj):
            try:
                content = raw_content_obj.content
                # content = cleaner.clean_html(content)
                e = html5parser.fromstring(content)  # Parse html5 content into an element
                # Weird workaround: html5parser.fromstring and html.fromstring
                # return different objects, so round-trip through a string.
                doc = html.fromstring(html.tostring(e))
                doc.make_links_absolute(raw_content_obj.url, resolve_base_href=True)

                link_count = 0
                # Get (element, attribute, link, pos) for every link in doc
                for e, a, l, p in doc.iterlinks():
                    outputLinks.append(l)
                    link_count += 1
                    # print l
                if link_count > max_outlinks:
                    max_outlinks = link_count
                    max_outlink_url = raw_content_obj.url
            except etree.XMLSyntaxError as e:
                print "Error on url " + raw_content_obj.url + " " + str(e)
                raw_content_obj.bad_url = True

    return outputLinks
def grab_iconlink(html):
    """Parse an HTML document and return a list of rel icon links."""
    icons = []  # accumulates (rel, sizes, href) tuples
    htmlparsed = html5parser.fromstring(html)
    html_links = htmlparsed.xpath(
        '//h:link[@rel]',
        namespaces={'h': 'http://www.w3.org/1999/xhtml'})
    for html_link in html_links:
        attributes = html_link.attrib
        relvalues = attributes['rel'].lower()
        if 'icon' in relvalues:
            if 'href' in attributes:
                iconlink = attributes['href']
            else:
                iconlink = ''
            if 'sizes' in attributes:
                sizevalues = attributes['sizes']
            else:
                sizevalues = ''
            icons.append((relvalues, sizevalues, iconlink))
    return icons
def run(self):
    ifh = urlopen(self.ug.indexurl)
    page = ifh.read()
    ifh.close()

    doc = html5parser.fromstring(page)
    nodes = doc.xpath('//_:th/_:a[@href]', namespaces=NS)

    data = []
    for node in nodes:
        data.append({'href': node.attrib['href']})

    #if len(data) == 0:
    #    print page
    #    print len(nodes)
    #    print "*** Nothing found. Check the xpath against the source."

    for datum in data:
        if not datum['href'].endswith('.html'):
            continue
        staffscraper = Undergrad(datum['href'])
        staffscraper.run()
def add_post(request):
    if request.method == "POST":
        content = request.POST['content']
        heading = request.POST['heading']

        newpost = Post()
        newpost.writer = request.user
        newpost.heading = heading
        newpost.pub_date = timezone.now()

        # Use html5lib to convert the HTML fragment to plain text
        doc = html5parser.fromstring(content)
        newpost.content = doc.xpath("string()")

        newpost.save()
    else:
        return render(request, 'blogg/add_post.html')
    return render(request, 'blogg/index.html')
def clean_html(text, to_plaintext=False):
    if isinstance(text, str):
        text = unicode(text, 'utf-8')
    text = text.strip()
    if not len(text):
        return text

    import bleach
    # Copy the default whitelist instead of mutating bleach's module-level list.
    allowed_tags = list(bleach.ALLOWED_TAGS) + ['p']
    html = bleach.clean(text, tags=allowed_tags, strip=True)

    from lxml.html import html5parser
    doc = html5parser.fromstring(html)
    plaintext = doc.xpath("string()")
    if plaintext == text:
        return plaintext
    return html
def run(self):
    ifh = urlopen(self.ls.indexurl)
    page = ifh.read()
    ifh.close()

    doc = html5parser.fromstring(page)
    for blockid in self.staffType.keys():
        nodes = doc.xpath('//_:div[@id="%s"]//_:th//_:a' % blockid, namespaces=NS)
        data = []
        for node in nodes:
            data.append({
                'href': node.attrib['href'],
                'staffName': node.text,
                'staffType': self.staffType[blockid]
            })
        for datum in data:
            if datum['href'].endswith('.pdf'):
                staffscraper = Lawschool(datum)
                staffscraper.run(staffName=datum['staffName'])
            elif datum['href'].endswith('.html'):
                staffscraper = Lawschool(datum)
                staffscraper.run()
def test_query(self):
    query = "test"
    r = requests.get(self.app_url + "search", params={'q': query})

    # check response & redirects
    assert r.ok, "problematic status code: " + str(r.status_code)
    assert r.history[-1].status_code == 302, "you were not redirected"

    # check params
    params = parse_qs(urlparse(r.url).query)
    assert 'sid' in params, "no session-id was given!"
    assert 'step' in params, "the 'step' parameter is missing!"
    assert 'q' in params, "the query parameter is missing!"
    assert params['q'][0] == query

    # check contents
    html = html5parser.fromstring(r.content)  # type: _Element
    assert html.cssselect("#result-list")
    assert len(html.cssselect("#result-list .result-entry")) == 10
    assert html.cssselect("#search-input")[0].attrib['value'] == query
    assert len(html.cssselect("#topic-centroid-list .topic-item")) > 5
    assert len(html.cssselect("#suggestions-list .result-entry")) == 10

    js = list(html.iter('{*}script'))[0].text
    assert "const query" in js
    assert "const topicGraph" in js
def process_channel_message(self, source, target, msg):
    res = []
    c = httplib2.Http()
    for word in msg.split(" "):
        word = word.split("#")[0]
        try:
            match = REG_EXP.match(word)
            if match:
                url = match.groups()[0]
                process = False
                for domain in self.domains:
                    if fnmatch.fnmatch(urlparse.urlparse(url).netloc, domain):
                        process = True
                        break
                if process:
                    resp, content = c.request(url)
                    tree = html5parser.fromstring(str(content))
                    for element in tree.iter("{http://www.w3.org/1999/xhtml}title"):
                        res.append(self.privmsg_response(
                            target, "%s - \x0302\x1f%s\x0f" % (element.text, url)))
        except:
            pass
    return res
def httpfacts(url, now=None):
    if now is None:
        now = int(time.time())

    req = requests.request('TRACE', url)
    yield ('http_trace_status-code', req.status_code)

    req = requests.request('OPTIONS', url)
    yield ('http_options_status-code', req.status_code)

    req = requests.request('GET', url)
    yield ('http_header_status-code', req.status_code)
    for k, v in req.headers.iteritems():
        yield ('http_header_' + k, v)
        if k == 'date' or k == 'last-modified' or k == 'expires':
            ts = http_parse_date(v)
            yield ('http_header_' + k + '_timestamp', http_parse_date(v))
            yield ('http_header_' + k + '_delta', ts - now)

    for k in req.cookies:
        """
        Cookie(version=0, name='NID',
               value='67=Gx1dRLtNcYAkrXO8y3chpipEIQO4-PeWDuQ48QqjwZndKfJCQqvrdElss8vyDA_5wqohA1twy2te7xZqV9AvQVOmWElZJEcFu7VERLrr_tUlwlooIaDyb_UgqZy4zg-3',
               port=None, port_specified=False, domain='.google.co.jp',
               domain_specified=True, domain_initial_dot=True, path='/',
               path_specified=True, secure=False, expires=1393118966,
               discard=False, comment=None, comment_url=None,
               rest={'HttpOnly': None}, rfc2109=False)
        """
        prefix = 'http_cookie_' + k.name
        yield (prefix, k.value)
        yield (prefix + '_path', k.path)
        yield (prefix + '_domain', k.domain)
        yield (prefix + '_secure', k.secure)
        yield (prefix + '_expires', k.expires)
        yield (prefix + '_version', k.version)
        yield (prefix + '_rfc2109', k.rfc2109)
        httponly = k.has_nonstandard_attr('HttpOnly')
        yield (prefix + '_httponly', httponly)

    tree = html5parser.fromstring(req.text)
    #for el in tree.iterdescendants('meta'):

    XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"
    XHTML = "{%s}" % XHTML_NAMESPACE

    # really look for HTML/HEAD
    yield ('html_root_tag', str(tree.tag))
    #yield ('html_root1_tag', str(tree[0].tag))

    # find HEAD element: /html/head[0]
    head = None
    for child in tree.iter(XHTML + 'head'):
        head = child
        break
    if head is None:
        return

    for meta in head.iterchildren(XHTML + 'meta'):
        if meta.get('name') and meta.get('content') is not None:
            yield 'html_meta_name_' + meta.get('name'), meta.get('content')
        elif meta.get('name') and meta.get('value') is not None:
            yield 'html_meta_name_' + meta.get('name'), meta.get('value')
        elif meta.get('property') and meta.get('content') is not None:
            yield 'html_meta_property_' + meta.get('property'), meta.get('content')
        elif meta.get('http-equiv') and meta.get('content') is not None:
            yield 'html_meta_http-equiv_' + meta.get('http-equiv').lower(), meta.get('content')
        elif meta.get('charset') is not None:
            yield 'html_meta_charset', meta.get('charset').lower()
        else:
            yield 'html_meta_unknown', tostring(meta)
def get_page(url):
    r = requests.get(url)
    assert r.status_code == 200
    return html5parser.fromstring(r.text)
def collect_web_data(domain):
    """Collects information after web site analysis"""
    results = {}
    url = 'http://' + domain
    try:
        (status, headers, realurl, data) = fetch_page(url)
    except KeyboardInterrupt:  # except pycurl.error:
        return prepareError(ERROR_URLGETERROR, results)

    encoding = None  # Default encoding is UTF-8
    if 'content-type' in headers:
        ptypr, plist = parseContentType(headers['content-type'])
        if 'charset' in plist:
            results['page:enc:server'] = plist['charset']
            encoding = results['page:enc:server']

    resdata = {
        'crawler:processed': datetime.datetime.now().isoformat(),
        'page:data': zlib.compress(data, 9),
        'page:headers': zlib.compress(repr(headers).encode('utf8'), 9)
    }

    try:
        encoding = 'utf-8'
        # edata = decode_html(data)  #.decode(encoding, 'ignore')
    except KeyboardInterrupt:
        return prepareError(ERROR_ENCODING, results, resdata)

    try:
        p = fromstring(data)
    except KeyboardInterrupt:
        try:
            p = html5parser.fromstring(data)
        except KeyboardInterrupt:
            p = soupparser.fromstring(data)
            return prepareError(ERROR_PARSEERROR, results, resdata)

    # Setting basic properties
    results['site:url'] = url
    results['site:host'] = url_to_host(url)
    results['site:realurl'] = realurl
    results['site:realhost'] = url_to_host(realurl)
    results['page:status'] = status

    hders = []
    for k, v in list(dict(headers).items()):
        try:
            hders.append({'name': k, 'value': v})
        except KeyboardInterrupt:
            pass  # Do nothing
    results['web:page:headers'] = hders

    # Process blocks of tags
    results['page:scripts'] = tags_to_array(
        p, tagname='script', attrlist=['type', 'src'],
        filter='src', distinct='src')
    results['page:images'] = tags_to_array(
        p, tagname='img', attrlist=['alt', 'title', 'width', 'height', 'src'],
        filter='src', distinct='src')
    results['page:meta'] = tags_to_array(
        p, tagname='meta', attrlist=['http-equiv', 'content', 'name', 'property'],
        filter=None, distinct=None)
    results['page:headlinks'] = tags_to_array(
        p, tagname='link', attrlist=['rel', 'type', 'title', 'href', 'media'],
        filter=None, distinct=None)
    results['page:iframes'] = tags_to_array(
        p, tagname='iframe', attrlist=['name', 'src'],
        filter=None, distinct=None)
    results['page:embeds'] = tags_to_array(
        p, tagname='embed', attrlist=['src', 'pluginspage', 'type'],
        filter=None, distinct=None)
    results['page:objects'] = tags_to_array(
        p, tagname='object',
        attrlist=['codetype', 'classid', 'code', 'codebase', 'type', 'data'],
        filter=None, distinct=None)
    results['page:forms'] = forms_to_array(p)
    results['page:applets'] = tags_to_array(
        p, tagname='applet',
        attrlist=['code', 'codebase', 'src', 'alt', 'title', 'name'],
        filter=None, distinct=None)
    results['page:inscripts'] = innerscripts_to_array(p)
    results['page:links'] = links_to_array(p)

    # Processing header keys
    if SERVER_KEY in list(headers.keys()):
        server = parse_server_header(headers[SERVER_KEY])
        results['hdr:server'] = server
    if POWERED_BY_KEY in list(headers.keys()):
        s = parse_powered_by(headers[POWERED_BY_KEY])
        results['hdr:poweredby'] = s

    return (results, resdata)
def repair_html(html_str):
    parser = html5parser.HTMLParser(namespaceHTMLElements=False)
    parsed = html5parser.fromstring(html_str, guess_charset=False, parser=parser)
    return html.tostring(parsed, encoding='unicode')
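# Hedged round-trip sketch for repair_html above: html5lib's tree builder
# balances unclosed tags, so the serialized output comes back well-formed.
# Bytes input is used so the explicit guess_charset=False is valid; the exact
# wrapper markup in the comment is indicative only.
print(repair_html(b'<ul><li>one<li>two'))
# e.g. '<html><head></head><body><ul><li>one</li><li>two</li></ul></body></html>'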
def text_html5parse(text, encoding='unicode'):
    node = html5parser.fromstring(text, parser=_html5parser)
    etree = node.getroottree()
    parsed = tostring(etree, encoding=encoding)
    return parsed
def query(self, path):
    html = self.res.body
    p = html5parser.fromstring(html, parser=self.parser)
    xml = ET.fromstring(tostring(p))
    path = re.sub(r'/(\w+\d?)', r'/{http://www.w3.org/1999/xhtml}\1', path)
    return xml.findall(path)
def __init__(self, html):
    tree = html5parser.fromstring(html)
    namespace = {'html': 'http://www.w3.org/1999/xhtml'}

    # Extract company name (mandatory)
    xpath = './/html:span[@class="tag-name"]/text()'
    name_text = tree.xpath(xpath, namespaces=namespace)
    if name_text:
        self.name = name_text[0]
    else:
        raise ZS_ParserParseError("Unable to get name text (can't find '{0}')".format(xpath))

    # Extract company address (mandatory)
    xpath = './/html:address/text()'
    address_text = tree.xpath(xpath, namespaces=namespace)
    if address_text:
        self.address = html2text(' '.join(address_text))[:-2]
    else:
        raise ZS_ParserParseError("Unable to get address text (can't find '{0}')".format(xpath))

    # Extract company information (free)
    information_text = tree.xpath('.//html:p[@class="tag-description"]/text()',
                                  namespaces=namespace)
    if information_text:
        self.information = html2text(' '.join(information_text))[:-2]
    else:
        self.information = None

    # Extract categories (free)
    xpath = './/html:span[@class="tag-categories"]/text()'
    categories_text = tree.xpath(xpath, namespaces=namespace)
    if categories_text:
        self.categories = categories_text
    else:
        self.categories = None

    # Extract branches (free)
    branches_tree = tree.xpath('.//html:div[@class="row branch ptb20"]', namespaces=namespace)
    branches = []
    for branch_tree in branches_tree:
        id_json = branch_tree.xpath('.//html:a/@data-td', namespaces=namespace)
        try:
            branches.append(loads(id_json[0])['id'])
        except (IndexError, ValueError, KeyError):
            raise ZS_ParserParseError("Unable to get branch data-td json")
    if branches:
        self.branches = branches
    else:
        self.branches = None

    # Load other data (free except at least one main contact)
    self.contact_persons = None
    self.products = None
    self.marks = None
    self.services = None
    self.keywords = None
    self.special_offer = None
    self.payments = None

    sections = tree.xpath('.//html:section[@class="profile-body"]/html:section',
                          namespaces=namespace)
    for section in sections:
        try:
            attrib_class = section.attrib['class']
        except KeyError:
            continue

        # Main and other contacts
        if 'contacts' in attrib_class:
            try:
                xpath = 'html:h3/text()'
                heading = section.xpath(xpath, namespaces=namespace)[0]
            except IndexError:
                raise ZS_ParserParseError("Unable to get heading of profile-body contacts section text (can't find '{0}')".format(xpath))

            # Main contacts
            if heading == 'Kontaktné údaje':
                rows = section.xpath('.//html:li[@class="dt w100"]', namespaces=namespace)
                if rows:
                    for row in rows:
                        xpath = './/html:span[@class="title dtc w30"]/text()'
                        row_title_text = row.xpath(xpath, namespaces=namespace)
                        row_title = html2text(' '.join(row_title_text))[:-2]

                        xpath = './/html:span[@class="dtc w70"]//text()'
                        contact_text = row.xpath(xpath, namespaces=namespace)
                        if not contact_text:
                            raise ZS_ParserParseError("Unable to get contact from main contacts row '{0}' text (can't find '{1}')".format(row_title, xpath))

                        # self.contacts is only created once the first contact is found
                        if not getattr(self, 'contacts', None):
                            self.contacts = []
                        self.contacts.append((row_title, html2text(' '.join(contact_text))[:-2]))
                else:
                    raise ZS_ParserParseError("Unable to get main contacts rows tree (can't find '{0}')".format(xpath))

            # Other contacts
            if heading == 'Kontaktné osoby':
                rows = section.xpath('.//html:li[@class="dt w100"]', namespaces=namespace)
                if rows:
                    for row in rows:
                        xpath = './/html:span[@class="title dtc w50"]/text()'
                        row_title_text = row.xpath(xpath, namespaces=namespace)
                        row_title = html2text(' '.join(row_title_text))[:-2]

                        contacts = []
                        xpath = './/html:span[@class="dtc w50"]/html:span'
                        contacts_tree = row.xpath(xpath, namespaces=namespace)
                        if contacts_tree:
                            for contact_tree in contacts_tree:
                                contacts.append(html2text(' '.join(contact_tree.xpath('.//text()')))[:-2])
                        else:
                            raise ZS_ParserParseError("Unable to get contacts from contact persons row '{0}' text (can't find '{1}')".format(row_title, xpath))

                        if not self.contact_persons:
                            self.contact_persons = []
                        self.contact_persons.append((row_title, contacts))
                else:
                    raise ZS_ParserParseError("Unable to get contact persons rows tree (can't find '{0}')".format(xpath))

            if not hasattr(self, 'contacts'):
                raise ZS_ParserParseError("Unable to get at least one main contact from page")

        # Products
        if 'products' in attrib_class:
            xpath = './/html:li[@itemprop="name"]/text()'
            products_texts = section.xpath(xpath, namespaces=namespace)
            if products_texts:
                self.products = products_texts
            else:
                raise ZS_ParserParseError("Unable to get products texts (can't find '{0}')".format(xpath))

        # Marks
        if 'marks' in attrib_class:
            xpath = './/html:li[@itemprop="name"]/text()'
            marks_texts = section.xpath(xpath, namespaces=namespace)
            if marks_texts:
                self.marks = marks_texts
            else:
                raise ZS_ParserParseError("Unable to get marks texts (can't find '{0}')".format(xpath))

        # Services
        if 'services' in attrib_class:
            xpath = './/html:li[@itemprop="name"]/text()'
            services_texts = section.xpath(xpath, namespaces=namespace)
            if services_texts:
                self.services = services_texts
            else:
                raise ZS_ParserParseError("Unable to get services texts (can't find '{0}')".format(xpath))

        # Keywords
        if 'kw' in attrib_class:
            xpath = './/html:span[@itemprop="name"]/text()'
            keywords_texts = section.xpath(xpath, namespaces=namespace)
            if keywords_texts:
                self.keywords = keywords_texts
            else:
                raise ZS_ParserParseError("Unable to get keywords texts (can't find '{0}')".format(xpath))

        # Special offers
        if 'special-offer' in attrib_class:
            xpath = './/html:p/text()'
            special_offer_text = section.xpath(xpath, namespaces=namespace)
            if special_offer_text:
                self.special_offer = html2text(' '.join(special_offer_text))[:-2]
            else:
                raise ZS_ParserParseError("Unable to get special offer text (can't find '{0}')".format(xpath))

        # Payments
        if 'payments' in attrib_class:
            xpath = './/html:li'
            payment_headings_tree = section.xpath(xpath, namespaces=namespace)
            if payment_headings_tree:
                for payment_heading_tree in payment_headings_tree:
                    xpath = './/html:img/text()'
                    payment_heading_text = section.xpath(xpath, namespaces=namespace)
            else:
                raise ZS_ParserParseError("Unable to get payment headings tree (can't find '{0}')".format(xpath))

        # Gallery
        if 'gallery' in attrib_class:
            pass

        # Documents
        if 'documents' in attrib_class:
            pass

        # Certificates
        if 'certificates' in attrib_class:
            pass

        # Video
        if 'video' in attrib_class:
            pass

        # Virtual tour link
        if 'virtual-tour' in attrib_class:
            pass

        # Reviews
        if 'reviews-box' in attrib_class:
            pass
def call_it(self, *args, **kwargs):
    from lxml.html.html5parser import fromstring
    return fromstring(*args, **kwargs)
def lxml_etree_from_response(response):
    text = response.body_as_unicode()
    node = html5parser.fromstring(text, parser=_html5parser)
    etree = node.getroottree()
    return etree
# -*- coding: koi8-r -*-
import re
import urllib2


def separatewords(text):
    splitter = re.compile(' ')
    return [s.lower() for s in splitter.split(text) if s != '']


page = 'http://www.newizv.ru/culture/2012-09-26/170402-novym-hudrukom-masterskoj-petra-fomenko-stanet-evgenij-kamenkovich.html'
c = urllib2.urlopen(page)
content = c.read()
encoding = c.headers['content-type'].split('charset=')[-1]
unicontent = unicode(content, encoding)

from lxml.html import html5parser
doc = html5parser.fromstring(unicontent)
str = doc.xpath("string()")
print str

#from lxml import html
#doc = html.fromstring(content)

import codecs
f = codecs.open('text3.txt', encoding='utf-8', mode='w+')
sepa = separatewords(str)
print sepa

import unicodedata
#str2 = repr(str.decode('unicode-escape'))
#sepa2 = separatewords(str2)
#print sepa2

for elem in sepa: