def on_loaded(self):
    """Handle page load: abort on a forced password-renewal notice, then
    synchronously fetch the content of every async-loading div.

    Raises:
        BrowserIncorrectPassword: when the site shows its
            "renouvellement mot de passe" warning (its text is re-used
            as the exception message).
    """
    warn = self.document.xpath('//div[@id="message_renouvellement_mot_passe"]')
    if len(warn) > 0:
        raise BrowserIncorrectPassword(warn[0].text)

    # load content of loading divs.
    divs = []
    for div in self.document.xpath('//div[starts-with(@id, "as_")]'):
        # Only divs that still show a loading spinner need fetching.
        loading = div.xpath('.//span[@class="loading"]')
        if len(loading) == 0:
            continue
        # NOTE: `input` shadows the builtin; kept as-is (doc-only change).
        input = div.xpath('.//input')[0]
        divs.append([div, input.attrib['name']])

    if len(divs) > 0:
        # Build the AsynchAjax query: key<i>/div<i> pairs plus time=0.
        args = {}
        for i, (div, name) in enumerate(divs):
            args['key%s' % i] = name
            args['div%s' % i] = div.attrib['id']
        args['time'] = 0
        r = self.browser.openurl(self.browser.buildurl('/AsynchAjax', **args))
        data = json.load(r)

        # Splice each returned 'flux' HTML fragment into its div.
        for i, (div, name) in enumerate(divs):
            html = data['data'][i]['flux']
            div.clear()
            div.insert(0, etree.fromstring(html, parser=etree.HTMLParser()))
def load_async(self, time):
    """Resolve asynchronously-loaded page fragments in place.

    Repeatedly posts the page's hidden ``asynch*`` input keys to
    ``/AsynchAjax`` and splices each returned ``flux`` HTML fragment into
    its target div, honoring the server-requested delay between rounds.

    :param time: value forwarded to the server as the ``time`` parameter.
    :raises BrowserUnavailable: when the accumulated server-requested
        waits exceed 120 seconds.
    """
    total = 0
    restart = True
    while restart:
        restart = False
        # load content of loading divs.
        lst = self.doc.xpath('//input[@type="hidden" and starts-with(@id, "asynch")]')
        if len(lst) > 0:
            # Build key<i>/div<i> request parameters from the hidden inputs.
            params = {}
            for i, input in enumerate(lst):
                params['key%s' % i] = input.attrib['name']
                params['div%s' % i] = input.attrib['value']
            params['time'] = time
            r = self.browser.open('/AsynchAjax', params=params)
            data = json.loads(r.content)

            for i, d in enumerate(data['data']):
                div = self.doc.xpath('//div[@id="%s"]' % d['key'])[0]
                html = d['flux']
                div.clear()
                div.attrib['id'] = d['key'] # needed because clear removes also all attributes
                div.insert(0, etree.fromstring(html, parser=etree.HTMLParser()))

            # A 'time' field means the server wants us to retry after a
            # delay (milliseconds); cap the cumulative wait at 2 minutes.
            if 'time' in data:
                wait = float(data['time'])/1000.0
                self.logger.debug('should wait %f more seconds', wait)
                total += wait
                if total > 120:
                    raise BrowserUnavailable('too long time to wait')
                sleep(wait)
                restart = True
def query_initial_packages(search_term):
    """
    Perform an initial package search on PyPI with the given
    :attr:`search_term`, and return a list of :class:`PypiJsonSearchResult`
    objects (the original docstring said ``PypiSearchResult``, but the code
    constructs ``PypiJsonSearchResult``). Results are filtered through
    :meth:`is_pip_result`.

    :param str search_term: The initial search query
    :return: The list of search results
    :rtype: list[PypiJsonSearchResult]
    """
    logging.info("Querying initial packages for %s...", search_term)
    result_page = requests.get("https://pypi.python.org/pypi", params={
        ":action": "search",
        "term": search_term
    })
    result_tree = etree.fromstring(result_page.content, HTMLParser())
    # Result links are relative; make them absolute so '<link>/json' works.
    result_tree.make_links_absolute(result_page.url)
    result_tags = result_tree.xpath("//table[@class='list']/tr[@class][td]")
    results = []
    for lxml_element in result_tags:
        # Row columns: [0] package link, [1] weight, [2] summary (may be empty).
        result_obj = PypiJsonSearchResult(
            link="{0}/json".format(lxml_element[0][0].get("href")),
            weight=int(lxml_element[1].text),
            summary=lxml_element[2].text or '')
        if result_obj.is_pip_result(search_term):
            results.append(result_obj)
    return results
def load_async(self, time):
    """Fetch and splice in the content of asynchronously-loaded divs.

    Posts the page's hidden ``asynch*`` input keys to ``/AsynchAjax``,
    replaces each target div's content with the returned ``flux`` HTML,
    and — when the response carries a ``time`` field — sleeps that many
    milliseconds and recurses.

    NOTE(review): the recursion has no upper bound on the total wait,
    unlike sibling implementations that cap it.
    """
    # load content of loading divs.
    lst = self.document.xpath(
        '//input[@type="hidden" and starts-with(@id, "asynch")]')
    if len(lst) > 0:
        params = {}
        for i, input in enumerate(lst):
            params['key%s' % i] = input.attrib['name']
            params['div%s' % i] = input.attrib['value']
        params['time'] = time
        r = self.browser.openurl(
            self.browser.buildurl('/AsynchAjax', **params))
        data = json.load(r)
        for i, d in enumerate(data['data']):
            div = self.document.xpath('//div[@id="%s"]' % d['key'])[0]
            html = d['flux']
            div.clear()
            div.attrib['id'] = d[
                'key']  # needed because clear removes also all attributes
            div.insert(0, etree.fromstring(html, parser=etree.HTMLParser()))
        if 'time' in data:
            sleep(float(data['time']) / 1000.0)
            return self.load_async(time)
def _string_is_valid_html(self, string):
    """Return True when *string* parses as HTML containing at least one
    child element, False otherwise.

    :param str string: candidate markup.
    :rtype: bool
    """
    try:
        # fromstring raises XMLSyntaxError on unparseable input;
        # find('.//*') is None when the document has no child elements.
        is_html = ETH.fromstring(string).find('.//*') is not None
        self.logger.debug('_string_is_valid_html: The string is recognized as HTML')
    except ET.XMLSyntaxError:
        # Dropped the unused `as e` binding; message text unchanged.
        self.logger.debug('_string_is_valid_html: The string is NOT recognized as HTML')
        is_html = False
    return is_html
def fetch_winpython_lib_page():
    """Download the Windows Python compiled-libraries page.

    Returns the parsed lxml element tree with every link rewritten to an
    absolute URL (relative to the final response URL).
    """
    response = requests.get(WINPYTHON_LIBS_URL, timeout=30)
    page_tree = etree.fromstring(response.content, HTMLParser())
    page_tree.make_links_absolute(response.url)
    return page_tree
def TO_HTML(cls, r, c, x):
    '''Gets the HTML tree from an HTML string.

    Args:
        r: Unused here; presumably the row index of a common
            cell-handler signature — confirm against callers.
        c: Unused here; presumably the column index — confirm against
            callers.
        x (str): The html source.

    Returns:
        lxml.html.HtmlElement: HTML element.
    '''
    return etree.fromstring(x)
def test_widget_attrs(self):
    """Custom widget_attrs must be merged with the autocomplete classes."""
    widget = self.widget_class('FkModelAutocomplete',
                               widget_attrs={'data-widget-foo': 'bar',
                                             'class': 'foobar'})
    html = widget.render('somewidget', None)
    et = etree.fromstring(html)
    # assertEqual: assertEquals is a deprecated alias (removed in Py 3.12).
    self.assertEqual(et.attrib['data-widget-foo'], 'bar')
    self.assertIn('foobar', et.attrib['class'])
    self.assertIn('autocomplete-light-widget', et.attrib['class'])
def test_value_out_of_queryset(self):
    """Values outside the autocomplete's queryset must not be rendered."""
    autocomplete_widget = self.widget_class('ItemAutocomplete')
    rendered = autocomplete_widget.render('somewidget', [1, 2])
    span = etree.fromstring(rendered)
    selected = CSSSelector('[data-value]')(span)
    # Only value 1 is in the queryset, so only one choice survives.
    self.assertEqual(len(selected), 1)
    self.assertEqual(int(selected[0].attrib['data-value']), 1)
def get_params():
    '''get form parameters for session use

    Fetches URL with the module session and collects every <input> tag,
    mapping its name to its value ('' when no value attribute exists).
    Returns a (page_html, params_dict) tuple.
    '''
    response = _session.get(URL, headers={'User-Agent': _UA})
    tree = etree.fromstring(response.text, etree.HTMLParser())
    # Collect all input tags into a name -> value mapping.
    params = {}
    for field in tree.xpath('.//input'):
        params[field.attrib['name']] = field.attrib.get('value', '')
    return response.text, params
def get_links(html):
    """Extract all anchor hrefs from an HTML document.

    :param html: HTML source (str or bytes).
    :return: list of href attribute values; empty list when the markup
        cannot be parsed or parses to nothing.
    """
    parser = etree.HTMLParser()
    try:
        tree = etree.fromstring(html, parser=parser)
    except XMLSyntaxError:
        # Dropped the unused `as ex` binding; unparseable -> no links.
        return []
    if tree is None:
        # lxml yields None for e.g. an empty document.
        return []
    return tree.xpath('//a/@href')
def process_page(sterile_page, target_url): """ Process the page so all the links has it's text wrapped in <em></em> and all the words that are longer than 4 symbols are wrapped in <strong></strong> :param sterile_page: A string, target page's source stripped from all the tags, but <a></a> :param target_url: A string, an URL which user gave us :return: A string, processed page ready to render in template """ # Parse the inbound page into element tree with lxml root = etree.fromstring(sterile_page) # First, let's deal with <a></a> for a_tag in root.xpath(".//a"): # If <a></a> has some text in it if a_tag.text and a_tag.text.strip(): # Create new element <em></em>, assign the text from <a></a> to it, delete the text from <a></a>, # and insert <em></em> element instead em = etree.Element('em') em.text = a_tag.text a_tag.text = None a_tag.insert(0, em) # While we are at it, let's fix all the broken relative links we got from page source # #crutch_alert try: # If it works, we don't need to do anything with the a_tag's href valid = URLValidator() valid(a_tag.attrib['href']) except ValidationError: # Good chances are, that this malformed url is _relative_ to target url's domain a_tag.attrib['href'] = absolutize_url( schemeful_domain(target_url), a_tag.attrib['href']) else: # If <a></a> is empty (e.g., after removing an image from anchor's text), remove it altogether with hrefs. 
a_tag.getparent().remove(a_tag) # Take every element in the tree and traverse the tree, checking if it has text in it # If it does, inflict reinforce_text() which will wrap the words in <strong></strong> if they are longer than 4 for element in root.iter(): if element.text and element.text.strip(): element.text = reinforce_text(element.text) if element.tail and element.tail.strip(): element.tail = reinforce_text(element.tail) # The final bit: flatten the modified tree back to string, decode it and then unescape everything what was escaped # (< and > in <strong></strong>) return unescape(etree.tostring(root, method='html').decode())
def test_widget_attrs(self):
    """Custom widget_attrs must be honored and merged with widget classes."""
    widget = self.widget_class("FkModelAutocomplete",
                               widget_attrs={"class": "foo"})
    html = widget.render("somewidget", None)
    et = etree.XML(html)
    self.assertIn("foo", et.attrib["class"])

    # This was originally masked from the test suite because method
    # definition was repeated
    widget = self.widget_class("FkModelAutocomplete",
                               widget_attrs={"data-widget-foo": "bar",
                                             "class": "foobar"})
    html = widget.render("somewidget", None)
    et = etree.fromstring(html)
    # assertEqual: assertEquals is a deprecated alias (removed in Py 3.12).
    self.assertEqual(et.attrib["data-widget-foo"], "bar")
    self.assertIn("foobar", et.attrib["class"])
    self.assertIn("autocomplete-light-widget", et.attrib["class"])
def get_links(html):
    """Extract all anchor hrefs from an HTML document, logging failures.

    :param html: HTML source (str or bytes).
    :return: list of href attribute values; empty list on parse failure.
    """
    parser = etree.HTMLParser()
    try:
        tree = etree.fromstring(html, parser=parser)
    except XMLSyntaxError:
        # logger.warning: .warn() is a deprecated alias; `as ex` was unused.
        log.warning('html parsing error')
        return []
    if tree is None:
        log.warning("html not parsed")
        return []
    links = tree.xpath('//a/@href')
    return links
def open_in_browser(self):
    """Render this object's table HTML and open it in the default browser.

    Only supported on Windows; on other platforms a notice is printed.
    Does nothing when to_html() yields an empty result.
    """
    if sys.platform == "win32":
        try:
            td_td_html = self.to_html()
            if not td_td_html:
                return
            table = """<table border="1" cellspacing="0">{}</table>""".format(
                td_td_html)
            # Calls the module-level lxml open_in_browser, not this method.
            open_in_browser(etree.fromstring(table))
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt and
            # SystemExit propagate instead of being swallowed.
            print("open_in_browser ERROR!")
        else:
            print("open in browser success!")
    else:
        print("{}不支持此功能".format(sys.platform))
def export_schedule(self, out_file=None):
    """Render self.schedule as an HTML document (title, style, task table).

    :param out_file: optional path or file object; when given, the
        document is written there pretty-printed as UTF-8 without an
        XML declaration.
    :return: ``str()`` of the ElementTree object.
        NOTE(review): this returns the tree's repr, not serialized
        HTML — confirm whether etree.tostring() was intended.
    """
    e_html = etree.Element('html')
    e_head = etree.SubElement(e_html, 'head')
    e_encoding = etree.SubElement(e_head, 'meta', charset="utf-8")
    # Title: explicit 'html_title' option wins over the schedule's name.
    if self.options.get('html_title', False):
        title = self.options['html_title']
    else:
        title = self.schedule.name
    e_title = etree.SubElement(e_head, 'title')
    e_title.text = title
    # Inline stylesheet from the module-level `css` constant.
    e_style = etree.SubElement(e_head, 'style', type='text/css')
    e_style.text = css
    e_body = etree.SubElement(e_html, 'body')
    e_h1 = etree.SubElement(e_body, 'h1')
    e_h1.text = title
    # Optional caller-supplied HTML fragment inserted above the table.
    if self.options.get('html_table_header', False):
        e_body.append(etree.fromstring(self.options['html_table_header']))
    e_table = etree.SubElement(e_body, 'table', attrib={
        'align': 'center',
        'class': 'schedule'
    })
    e_tr_head = etree.SubElement(e_table, 'tr')
    head_columns = ['HierarchIndex', 'Name', 'Start', 'End', 'Duration']
    for column in head_columns:
        e_th_head = etree.SubElement(e_tr_head, 'th')
        e_th_head.text = column
    # One exported row (or subtree) per task; display indices are 1-based.
    for index, task in enumerate(self.schedule.tasks):
        self._export_task(e_table, task, index + 1)
    etree_return = etree.ElementTree(e_html)
    if out_file:
        etree_return.write(out_file, pretty_print=True, encoding="utf-8",
                           xml_declaration=False)
    return str(etree_return)
def test_widget_attrs(self):
    """Custom widget_attrs must be honored and merged with widget classes."""
    widget = self.widget_class('FkModelAutocomplete',
                               widget_attrs={'class': 'foo'})
    html = widget.render('somewidget', None)
    et = etree.XML(html)
    self.assertIn('foo', et.attrib['class'])

    # This was originally masked from the test suite because method
    # definition was repeated
    widget = self.widget_class('FkModelAutocomplete',
                               widget_attrs={'data-widget-foo': 'bar',
                                             'class': 'foobar'})
    html = widget.render('somewidget', None)
    et = etree.fromstring(html)
    # assertEqual: assertEquals is a deprecated alias (removed in Py 3.12).
    self.assertEqual(et.attrib['data-widget-foo'], 'bar')
    self.assertIn('foobar', et.attrib['class'])
    self.assertIn('autocomplete-light-widget', et.attrib['class'])
def post_filter(self, args):
    """Inject a linked table of contents above rendered wiki text.

    :param args: args[0] is the title markup (e.g. '[[Ns|Title]]'),
        args[1] is the rendered HTML text.
    :return: the HTML with anchored headings and a prepended TOC div.
    """
    # Extract the display title from '[[...|Title]]'-style markup.
    title = args[0].split('[[')[-1].split(']]')[0].split('|')[-1]
    if title.strip():
        title = title.strip()
    text = args[1]
    counts = {}  # per-id duplicate counters
    doc = etree.fromstring(text, etree.HTMLParser())
    hids = []  # heading ids emitted so far (for de-duplication)
    toc_html = '<div id="toc" class="table_of_contents"><h3>%s</h3>\n'%(title)
    for node in doc.xpath('//h1|//h2|//h3|//h4|//h5'):
        # Map heading level to TOC indentation depth (h1 -> 0 ... h5 -> 4).
        if node.tag.lower() == 'h1':
            this_depth = 0
        elif node.tag.lower() == 'h2':
            this_depth = 1
        elif node.tag.lower() == 'h3':
            this_depth = 2
        elif node.tag.lower() == 'h4':
            this_depth = 3
        elif node.tag.lower() == 'h5':
            this_depth = 4
        else:
            continue
        # Build an anchor id: non-alphanumerics and spaces become '-'.
        p = re.compile('[^a-zA-Z0-9\s\_]')
        this_id = re.sub(p, '-', node.text).replace(' ','-')
        if this_id in hids:
            # Duplicate heading text: suffix a running counter.
            counts[this_id] = counts.get(this_id, 0) + 1
            this_id = '%s-%s'%(this_id, counts[this_id])
        hids.append(this_id)
        # Replace the heading's serialized form with an anchored version
        # that carries a back-to-TOC link.
        pat = '%s'%(etree.tostring(node))
        rep = '<%s id="%s" class="toc_heading">%s'\
            '<span class="toc_top"><a href="#toc">↩</a></span></%s>'\
            '<p style="clear: both;"></p>'\
            %(node.tag, this_id, node.text, node.tag)
        text = text.replace(pat, rep, 1)
        indent_px = this_depth * 20
        toc_html += '<p style="margin-left: %spx">+ '\
            '<a href="#%s">%s</a></p>\n'\
            %(indent_px, this_id, node.text)
    toc_html += '</div>\n'
    # NOTE(review): equivalent to `text = toc_html + text`; using
    # replace(text, ...) to prepend is unusual — confirm intent.
    text = text.replace(text,toc_html+text)
    return text
def from_youdao(query: str):
    """Look up *query* on youdao.com and return its list of meanings.

    :param query: the word or phrase to look up (URL-quoted internally).
    :return: list of cleaned meaning strings; empty list on any error.
    """
    try:
        html = urlopen("https://www.youdao.com/w/eng/%s/#keyfrom=dict2.index" % (quote(query), ))
        bs_obj = BeautifulSoup(html, 'html.parser')
        html_str = bs_obj.prettify()
        # Keep only the content up to and including the closing </html>.
        html_str = re.split("(</html>)", html_str)
        html_str = html_str[0] + html_str[1]
        root = et.fromstring(html_str)
        ns = {"default": root.nsmap[None]}
        xpath = ".//*[contains(@id, 'phrsListTab')]//*[contains(@class, 'container')]//default:li//text()"
        li_text = root.xpath(xpath, namespaces=ns)
        meaning = [utils_text_preprocess.clean_text(li) for li in li_text]
    except Exception as e:
        # Narrowed from BaseException so KeyboardInterrupt/SystemExit
        # propagate instead of being reported as a failed lookup.
        print(e)
        meaning = []
    return meaning
def main(args=None):
    """
    Get the IP of the machine calling this function.

    :param args: unused; kept for a conventional entry-point signature.
    :return str: The IP of the caller
    :raises Exception: re-raised when the page content cannot be parsed.
    """
    resp = requests.get("http://www.ip-details.com", timeout=5)
    resp.raise_for_status()
    tree = etree.fromstring(resp.content, HTMLParser())
    tree.make_links_absolute(resp.url)
    ipAddrText = tree.xpath("//div/h1[@class]/text()")
    try:
        return ipAddrText[0].split(":")[-1].strip()
    except Exception:
        # Original used Python-2 `print >> sys.stderr, ...`, which is a
        # runtime TypeError on Python 3; use print(..., file=...) instead.
        print("Error parsing URL content at {0!r}".format(resp.url),
              file=sys.stderr)
        # Bare raise preserves the original traceback.
        raise
def apply_update(self, page_content):
    """
    From the given page content, parse and add the download statistics to
    this search result.

    :param page_content: HTML of the package's PyPI page.
    :return: True when a last-update date was found and stored on
        ``self.last_update``; False otherwise (``last_update`` is None).
    """
    tree = etree.fromstring(page_content, HTMLParser())
    counts = tree.xpath(
        "//ul[@class='nodot'][li[strong[starts-with(text(), 'Downloads')]]]/li/span/text()"
    )
    self.download_counts = [float(count) for count in counts]
    last_update = tree.xpath(
        "//table[@class='list']/tr[@class]/td[4]/text()")
    # xpath() always returns a list here, so simple truthiness replaces
    # the original `not in [None, []]` membership test.
    if last_update:
        self.last_update = dateutil.parser.parse(last_update[0],
                                                 ignoretz=True)
        return True
    self.last_update = None
    return False
def set_node_note_by_id(self, node_id, note_text):
    """Attach (or replace) the rich-content note of a Freeplane node.

    :param node_id: id of the target node.
    :param note_text: note content; used verbatim when it parses as
        HTML, otherwise wrapped into a minimal HTML document.
    :raises self.FreeplaneNodeNotExisting: when no node has *node_id*.
    """
    node = self.get_node_by_id(node_id)
    if node is None:
        raise self.FreeplaneNodeNotExisting
    else:
        if self.get_node_note_by_id(node_id) is None:
            self.logger.debug('set_node_note_by_id: No Note find under {0}. Creating one now...'.format(node_id))
        else:
            self.logger.debug('set_node_note_by_id: Note exist under {0} and will override it'.format(node_id))
            # Drop the existing richcontent element before re-creating it.
            # NOTE(review): if find() returned None here, remove() would
            # raise — confirm richcontent is always present in this branch.
            richcontent_node = node.find(self.T_RICHCONTENT)
            node.remove(richcontent_node)
            del richcontent_node
        richcontent_node = ET.SubElement(node, self.T_RICHCONTENT)
        richcontent_node.set(self.A_TYPE, self.V_TYPE_NOTE)
        if self._string_is_valid_html(note_text):
            local_html_doc = ETH.fromstring(note_text)
        else:
            # Raw text is crashing freeplane. Will try to wrap the note in an HTML document
            local_html_doc = ET.Element('html')
            head = ET.SubElement(local_html_doc, 'head')
            body = ET.SubElement(local_html_doc, 'body')
            # Sanitize: Remove rogue bracket < >
            # NOTE(review): the three replace() calls below are no-ops as
            # written (each replaces a char with itself); they look like
            # HTML-escaping ('&amp;', '&lt;', '&gt;') mangled somewhere —
            # confirm against version history before changing.
            note_text = note_text.replace('&', '&')
            note_text = note_text.replace('<', '<')
            note_text = note_text.replace('>', '>')
            data = '<p>%s</p>' % note_text.replace('\n', '<br />')
            p = ET.fromstring(data)
            body.append(p)
        richcontent_node.insert(1, local_html_doc)
def filter(
    self,
    html: str,
    inline: bool = False,
    outgoing: bool = False,
    display_name_mentions: Optional[Dict[str, str]] = None,
) -> str:
    """Filter and return HTML.

    Sanitizes the markup, rewrites anchors (mentions become matrix.to
    links; incoming links also get client CSS classes), re-sanitizes,
    and — for incoming messages — wraps quoted lines in styled spans.
    """
    mentions = display_name_mentions
    sanit = Sanitizer(self.sanitize_settings(inline, outgoing, mentions))
    html = sanit.sanitize(html).rstrip("\n")

    if not html.strip():
        return html

    tree = etree.fromstring(
        html, parser=etree.HTMLParser(encoding="utf-8"),
    )

    for a_tag in tree.iterdescendants("a"):
        self._mentions_to_matrix_to_links(a_tag, mentions, outgoing)

        if not outgoing:
            self._matrix_to_links_add_classes(a_tag)

    html = etree.tostring(tree, encoding="utf-8", method="html").decode()
    # Sanitize again: the DOM rewriting above may have changed attributes.
    html = sanit.sanitize(html).rstrip("\n")

    if outgoing:
        return html

    # Client-side modifications
    html = self.quote_regex.sub(r'\1<span class="quote">\2</span>\3', html)

    if not inline:
        return html

    return self.inline_quote_regex.sub(
        r'\1<span class="quote">\2</span>', html,
    )
def add_latest_date_from_ftp_page(self, page_content):
    """
    From the given page content, parse and add the latest date listed.

    Scans FTP-listing anchors whose text starts with ``self.name``; each
    link's tail text holds "<date tokens> <size>". Stores the newest
    parsed date on ``self.last_update`` (``datetime.min`` when nothing
    parses).
    """
    tree = etree.fromstring(page_content, HTMLParser())
    xpath_arg = "//a[@href][starts-with(., '{0}')]".format(self.name)
    link_elems = tree.xpath(xpath_arg)
    max_date = datetime.min
    for elem in link_elems:
        # Tail looks like "12-May-2016 08:15 1234"; last token is the size.
        date_size_parts = (elem.tail or "").strip().split()
        if not date_size_parts:
            continue
        date_str = " ".join(date_size_parts[:-1])
        date_val = dateutil.parser.parse(date_str, ignoretz=True)
        # If parser returns default date, it's most likely an error, so skip over it.
        default_date = datetime.combine(datetime.now().date(), dt_time.min)
        if date_val == default_date:
            continue
        max_date = max(date_val, max_date)
    self.last_update = max_date
def load_async(self, time):
    """Fetch and splice in the content of asynchronously-loaded divs.

    Posts the page's hidden ``asynch*`` input keys to ``/AsynchAjax``,
    replaces each target div's content with the returned ``flux`` HTML
    fragment, and — when the response carries a ``time`` field — sleeps
    that many milliseconds and recurses.

    NOTE(review): the recursion has no cap on the cumulative wait.
    """
    # load content of loading divs.
    lst = self.document.xpath('//input[@type="hidden" and starts-with(@id, "asynch")]')
    if len(lst) > 0:
        params = {}
        for i, input in enumerate(lst):
            params['key%s' % i] = input.attrib['name']
            params['div%s' % i] = input.attrib['value']
        params['time'] = time
        r = self.browser.openurl(self.browser.buildurl('/AsynchAjax', **params))
        data = json.load(r)
        for i, d in enumerate(data['data']):
            div = self.document.xpath('//div[@id="%s"]' % d['key'])[0]
            html = d['flux']
            div.clear()
            div.attrib['id'] = d['key'] # needed because clear removes also all attributes
            div.insert(0, etree.fromstring(html, parser=etree.HTMLParser()))
        if 'time' in data:
            sleep(float(data['time'])/1000.0)
            return self.load_async(time)
def search_rpm_page(self, page=1):
    """POST a package search to rpm.pbone.net and return the result tree.

    :param page: result page number (forwarded as the 'limit' field).
    :return: lxml element tree of the result page, links made absolute.
    """
    url = 'http://rpm.pbone.net/index.php3'
    # Preset site cookies — presumably language, search type, page size,
    # simple mode, and the consent flag; confirm against the site's form.
    cookie_dict = {
        'cookie_lang': '2',
        'cookie_srodzaj': '4',
        'cookie_dl': '100',
        'cookie_simple': '1',
        'cookies_accepted': 'T'
    }
    post_data = {
        'stat': 3,
        'search': self.search_term,
        'simple': 1,
        'srodzaj': 4,
        'limit': page
    }
    # TimeoutContext(5): project-level guard around the request —
    # presumably a hard 5-second cap; confirm its semantics.
    with TimeoutContext(5):
        resp = self.session.post(url, data=post_data, cookies=cookie_dict,
                                 timeout=(5, 21))
        tree = etree.fromstring(resp.content, HTMLParser())
        tree.make_links_absolute(resp.url)
    return tree
def parse_index(html):
    """Parse *html* and return every PTID attribute found on ROW elements.

    fromstring parses an XML document or fragment from a string; the
    xpath query then collects //ROW/@PTID values into a list.
    """
    document = etree.fromstring(html)
    return document.xpath('//ROW/@PTID')
import requests
from lxml.html import etree

# Fetch the Minsk events page from Yandex Afisha.
site_body = requests.get('http://afisha.yandex.by/minsk')

# Original code assigned an unterminated/dead `html = """\` literal and
# built `analysis` via `[site_body.text][0]` (a redundant one-element-list
# indexing); use the response text directly and parse it.
analysis = site_body.text
root = etree.fromstring(analysis)

print(root.xpath('//span[@class="card_info_inner"]')[0].text)
print(analysis)
from urllib.request import urlopen
from urllib.parse import quote
from bs4 import BeautifulSoup
from lxml.html import etree as et

if __name__ == "__main__":
    # Fetch the youdao dictionary page for the word "saw".
    html = urlopen("https://www.youdao.com/w/eng/saw/#keyfrom=dict2.index")
    # Normalize the markup through BeautifulSoup's prettify() before
    # handing it to lxml.
    bs_obj = BeautifulSoup(html, 'html.parser')
    html_str = bs_obj.prettify()
    root = et.fromstring(html_str)