def test_process_row(self):
    """process_row contract: a row lacking the expected cells raises
    InvalidRow on p1 and is ignored (None) by p2; a row whose uploader
    link is missing is attributed to "Anonymous" by both parsers.
    """
    # Bare row matching neither parser's expected column layout.
    fragment = html.fragment_fromstring("""
        <tr>
            <td>One</td>
            <td>Two</td>
            <td>Three</td>
        </tr>
    """)
    # Realistic torrent row, but without the uploader link that normally
    # follows "ULed by" — the parsers must fall back to "Anonymous".
    user_is_anonymous = html.fragment_fromstring("""
        <tr>
            <td class='vertTh'><center>some text</center></td>
            <td>
                <div class='detName'><a href='/torrent/some_torrent' class='detLink'>some torrent</a></div>
                <a href="..." title='Download this torrent'>...</a>
                <a href="..." title='Download this torrent using magnet'>...</a>
                <a href="/user/<some_user>" title="Browse <some_user>">some_user</a>
                <img src="..." />
                <font class='detDesc'>
                    Uploaded 08-10 15:57, Size 1.03 GiB, ULed by
                </font>
            </td>
            <td align='right'>123</td>
            <td align='right'>321</td>
        </tr>
    """)
    p1, p2 = self._create_parsers(fragment)
    # assertIsNone replaces the deprecated ``assert_(... == None)`` form.
    self.assertIsNone(p2.process_row(fragment))
    self.assertRaises(exceptions.InvalidRow, p1.process_row, fragment)
    self.assertEqual(p1.process_row(user_is_anonymous)["user"], "Anonymous")
    self.assertEqual(p2.process_row(user_is_anonymous)["user"], "Anonymous")
def transform_misused_divs_into_paragraphs(self):
    """Convert <div>s that contain no block-level children into <p>s, then
    promote stray text and tail text inside remaining <div>s into their own
    <p> children, dropping literal <br> separators along the way."""
    for elem in self.tags(self.html, 'div'):
        # transform <div>s that do not contain other block elements into
        # <p>s
        #FIXME: The current implementation ignores all descendants that
        # are not direct children of elem
        # This results in incorrect results in case there is an <img>
        # buried within an <a> for example
        # Serialize only the direct children; if none of them matches the
        # block-element regex the <div> is effectively a paragraph.
        if not REGEXES['divToPElementsRe'].search(
                str_(b''.join(map(tostring, list(elem))))):
            #log.debug("Altering %s to p" % (describe(elem)))
            elem.tag = "p"
            #print "Fixed element "+describe(elem)

    for elem in self.tags(self.html, 'div'):
        # Leading text directly inside the div becomes a leading <p> child.
        if elem.text and elem.text.strip():
            p = fragment_fromstring('<p/>')
            p.text = elem.text
            elem.text = None
            elem.insert(0, p)
            #print "Appended "+tounicode(p)+" to "+describe(elem)

        # Iterate in reverse so insert positions stay valid while mutating.
        for pos, child in reversed(list(enumerate(elem))):
            # Tail text after a child is wrapped into its own <p> sibling.
            if child.tail and child.tail.strip():
                p = fragment_fromstring('<p/>')
                p.text = child.tail
                child.tail = None
                elem.insert(pos + 1, p)
                #print "Inserted "+tounicode(p)+" to "+describe(elem)
            if child.tag == 'br':
                #print 'Dropped <br> at '+describe(elem)
                child.drop_tree()
def transform_misused_divs_into_paragraphs(self):
    """Convert <div>s with no block-level children into <p>s, then turn stray
    text/tail text inside remaining <div>s into dedicated <p> children and
    drop literal <br> separators.

    NOTE(review): this variant uses the Python 2 ``unicode`` builtin; it will
    not run on Python 3 as written.
    """
    for elem in self.tags(self.html, 'div'):
        # transform <div>s that do not contain other block elements into <p>s
        # Only direct children are serialized, so block elements nested
        # deeper (e.g. inside an <a>) are not seen by the regex.
        if not REGEXES['divToPElementsRe'].search(unicode(''.join(map(tostring, list(elem))))):
            #self.debug("Altering %s to p" % (describe(elem)))
            elem.tag = "p"
            #print "Fixed element "+describe(elem)

    for elem in self.tags(self.html, 'div'):
        # Leading text directly inside the div becomes a leading <p> child.
        if elem.text and elem.text.strip():
            p = fragment_fromstring('<p/>')
            p.text = elem.text
            elem.text = None
            elem.insert(0, p)
            #print "Appended "+tounicode(p)+" to "+describe(elem)

        # Reverse iteration keeps insert positions valid while mutating.
        for pos, child in reversed(list(enumerate(elem))):
            if child.tail and child.tail.strip():
                p = fragment_fromstring('<p/>')
                p.text = child.tail
                child.tail = None
                elem.insert(pos + 1, p)
                #print "Inserted "+tounicode(p)+" to "+describe(elem)
            if child.tag == 'br':
                #print 'Dropped <br> at '+describe(elem)
                child.drop_tree()
def get_lyrics(self): element = self.element # Replace <br> tags with \n (prepend it with \n and then remove all # occurrences of <br>) for br in element.cssselect('br'): br.tail = '\n' + br.tail if br.tail else '\n' etree.strip_elements(element, 'br', with_tail=False) # Remove unneeded tags bad_tags = element.cssselect('.rtMatcher') + \ element.cssselect('.lyricsbreak') for tag in bad_tags: tag.drop_tree() # Remove HTML comments real_string = etree.tostring(element, encoding=unicode) cleaned_html = clean_html(real_string) # -KMS Modification- # Add try/except block to prevent script from crashing when # run from applescript try: print u'{0}'.format( html.fragment_fromstring(cleaned_html).text_content() ).encode('utf-8').strip() except UnicodeError: print u'{0}'.format( html.fragment_fromstring(cleaned_html).text_content() ).encode('utf-8').strip() return 0
def transform_misused_divs_into_paragraphs(self):
    """
    Transforms <div> without other block elements into <p>,
    merges near-standing <p> together.
    """
    for elem in self.tags(self._html, 'div'):
        # transform <div>s that do not contain other block elements into
        # <p>s
        # FIXME: The current implementation ignores all descendants that are not direct children of elem
        # This results in incorrect results in case there is an <img> buried within an <a> for example
        # NOTE(review): the regex runs over tostring(elem), which includes
        # elem's own opening <div> tag — confirm divToPElementsRe cannot
        # match the wrapper tag itself.
        if not REGEXES['divToPElementsRe'].search(tostring(elem).decode()):
            elem.tag = "p"

    for elem in self.tags(self._html, 'div'):
        # Leading text directly inside the div becomes a leading <p> child.
        if elem.text and elem.text.strip():
            p = fragment_fromstring('<p/>')
            p.text = elem.text
            elem.text = None
            elem.insert(0, p)

        # Reverse iteration keeps insert positions valid while mutating.
        for pos, child in reversed(list(enumerate(elem))):
            if child.tail and child.tail.strip():
                p = fragment_fromstring('<p/>')
                p.text = child.tail
                child.tail = None
                elem.insert(pos + 1, p)
            if child.tag == 'br':
                child.drop_tree()
def test_class_hits():
    """A scored keyword in the class attribute yields the matching weight."""
    positive = fragment_fromstring('<div class="something post">Content</div>')
    assert get_class_weight(positive) == 25

    negative = fragment_fromstring('<div class="something comments">Content</div>')
    assert get_class_weight(negative) == -25
def test_is_unlikely(self):
    """Unlikely keywords in either class or id flag the node as unlikely."""
    for markup in ('<div class="something comments">Content</div>',
                   '<div id="comments">Content</div>'):
        self.assertTrue(is_unlikely_node(fragment_fromstring(markup)))
def test_not_unlikely(self):
    """Suck it double negatives."""
    for markup in ('<div id="post">Content</div>',
                   '<div class="something post">Content</div>'):
        self.assertFalse(is_unlikely_node(fragment_fromstring(markup)))
def test_class_hits(self):
    """A scored keyword in the class attribute yields the matching weight."""
    positive = fragment_fromstring('<div class="something post">Content</div>')
    self.assertEqual(get_class_weight(positive), 25)

    negative = fragment_fromstring('<div class="something comments">Content</div>')
    self.assertEqual(get_class_weight(negative), -25)
def _summary(self, enclose_with_html_tag=True):
    """Build a Summary of the parsed document.

    In multipage mode, extracts the first page, wraps it in an #article
    container and recursively appends following pages. Otherwise runs a
    single-page extraction with a re-parse fallback when the result looks
    broken (sentinel title or under 500 characters of HTML).

    :param enclose_with_html_tag: when True, wrap the article container via
        ``document_fromstring`` instead of a bare fragment.
    """
    # the first page parsed into a elementree element
    doc = self.html
    # the set of urls we've processed so far
    parsed_urls = set()
    url = self.options.get("url", None)
    if url is not None:
        parsed_urls.add(url)

    # check the current doc for a next page if requested
    if self.options.get("multipage", False):
        next_page_url = find_next_page_url(parsed_urls, url, doc)
        page_0 = get_article(doc, self.options)
        page_0_doc = fragment_fromstring(page_0.html)
        page_index = 0
        make_page_elem(page_index, page_0_doc)

        if enclose_with_html_tag:
            output = document_fromstring("<div/>")
            output.getchildren()[0].attrib["id"] = "article"
            output.getchildren()[0].append(page_0_doc)
        else:
            output = fragment_fromstring("<div/>")
            output.attrib["id"] = "article"
            output.append(page_0_doc)

        if next_page_url is not None:
            # Recursively fetch and append subsequent pages.
            append_next_page(parsed_urls, page_index + 1, next_page_url,
                             output, self.options)

        return Summary(
            tostring(output),
            page_0.confidence,
            short_title=shorten_title(output),
            title=get_title(output),
            description=get_description(output),
            keywords=get_keywords(output),
        )

    summary = get_article(doc, self.options,
                          enclose_with_html_tag=enclose_with_html_tag)
    # NOTE(review): leftover debug output — consider removing.
    print(len(summary.html), "============================")
    # Fallback: the extraction looks broken, re-parse the raw input with
    # unlikely candidates stripped and return that instead.
    if summary.title == "[something-wrong]" or len(summary.html) < 500:
        output = parse(self.input_doc, self.options.get("url"))
        remove_unlikely_candidates(output)
        # NOTE(review): debug dump to a fixed file name; handle is never
        # closed — confirm this is intentional.
        o = open("something-wrong.txt", "w")
        print("[something-wrong]", tostring(output), file=o)
        return Summary(
            get_clean_html(output),
            0,
            short_title=shorten_title(output),
            title=get_title(output),
            description=get_description(output),
            keywords=get_keywords(output),
        )
    else:
        return summary
def test_equal_hashes(self):
    """Hashing two identical DOMs — or None twice — is deterministic."""
    markup = "<div>ľščťžýáí</div>"
    first = generate_hash_id(fragment_fromstring(markup))
    second = generate_hash_id(fragment_fromstring(markup))
    self.assertEqual(first, second)

    # None input must also hash consistently.
    self.assertEqual(generate_hash_id(None), generate_hash_id(None))
def test_equal_hashes():
    """Hashing two identical DOMs — or None twice — is deterministic."""
    markup = "<div>ľščťžýáí</div>"
    first = generate_hash_id(fragment_fromstring(markup))
    second = generate_hash_id(fragment_fromstring(markup))
    assert first == second

    # None input must also hash consistently.
    assert generate_hash_id(None) == generate_hash_id(None)
def test_id_hits():
    """A scored keyword in the id attribute yields the matching weight."""
    positive = fragment_fromstring('<div id="post">Content</div>')
    assert get_class_weight(positive) == 25

    negative = fragment_fromstring('<div id="comments">Content</div>')
    assert get_class_weight(negative) == -25
def insert_into_last_element(html, element):
    """
    function to insert an html element into another html fragment
    example:
        html = '<p>paragraph1</p><p>paragraph2...</p>'
        element = '<a href="/read-more/">read more</a>'
        ---> '<p>paragraph1</p><p>paragraph2...<a href="/read-more/">read more</a></p>'

    NOTE(review): only the parse step is visible in this chunk; the actual
    insertion appears truncated here.
    """
    try:
        item = fragment_fromstring(element)
    # Bug fix: the old Python-2 form ``except ParserError, TypeError:``
    # bound the caught ParserError to the name ``TypeError`` and never
    # caught TypeError at all. The tuple form catches both, falling back
    # to an empty <span> for unparseable (or non-string) input.
    except (ParserError, TypeError):
        item = fragment_fromstring('<span></span>')
def test_scores_collide(self):
    """We might hit both positive and negative scores.

    Positive and negative scoring is done independently so it's possible
    to hit both positive and negative scores and cancel each other out.
    """
    # One positive hit (post) and one negative hit (comment) cancel out.
    cancelled = fragment_fromstring(
        '<div id="post" class="something comment">Content</div>')
    self.assertEqual(get_class_weight(cancelled), 0)

    # Two positive hits minus one negative hit leaves a net +25.
    net_positive = fragment_fromstring(
        '<div id="post" class="post comment">Content</div>')
    self.assertEqual(get_class_weight(net_positive), 25)
def format_comments(comments=None, article_id=None):
    """Render the comment table for an article as an HTML string.

    Rows are pulled from a rendered template file and stitched together by
    index: //tr[1] is a normal comment, //tr[2] a deleted comment, //tr[3]
    the trailing hidden comment, and //tfoot the table footer.

    :param comments: iterable of serialized comments; each deserializes to
        a (display_text, author_email, date) triple — presumably; verify
        against the writer of these records.
    :param article_id: id of the article the comments belong to.
    :returns: HTML string for the whole comments <div>.
    """
    template_data = {
        'user_activity': '',
        'article_id': article_id,}
    # NOTE(review): comment_box is built but never used below.
    comment_box = ('<form class="comment-form" name="comment-form" action="/comment-on?id=%s" method="post">'
                   '<textarea class="comment-text" name="comment-text" title="add your comment..."></textarea>'
                   '</form>' % article_id)
    #todo - build comment tree by replacing and adding.
    #todo - add report abuse.
    path = os.path.join(os.path.dirname(__file__), 'comment-table-template.html' )
    all_comments = '<div class="below-video comments">Comments:<table>'
    template_data.update({'comment_id': len(comments)})
    tree = fragment_fromstring(template.render(path, template_data), create_parent=False)
    all_comments += tostring(tree.xpath('//tfoot')[0])#needs better element addressing
    all_comments += '<tbody id="comment-table-' + str(article_id) + '">'
    comment_id = 0
    user = oAuthUsers.get_current_user()
    for comment in comments:
        # Author nickname is the local part of the stored email address.
        nickname = str(loads(str(comment))[1]).split('@',2)[0]
        dispNickname = nickname
        if user:
            #The display nickname will break the code to comment, leave as is
            #if the author actually matches up
            if user.isAuthor(comment=comment):
                dispNickname = nickname
            #Make it obvious who is the owner
            elif dispNickname != '':
                dispNickname = '['+nickname+']'
        template_data.update({
            'comment_id': str(comment_id),
            'comment_display': loads(str(comment))[0],
            'nickname': dispNickname,
            'comment_date': loads(str(comment))[2],
            'time_now': datetime.now(),
            'user_url': 'by-author?author='+urllib.quote(nickname),
        })
        tree = fragment_fromstring(template.render(path, template_data), create_parent=False)
        # Empty nickname marks a deleted comment; pick the matching row.
        if nickname != '':
            all_comments += tostring(tree.xpath('//tr')[1])
        else:
            all_comments += tostring(tree.xpath('//tr')[2]) #deleted comment tr
        comment_id += 1
    #place an empty hidden comment last
    template_data.update({'comment_id': len(comments)})
    tree = fragment_fromstring(template.render(path, template_data), create_parent=False)
    all_comments += tostring(tree.xpath('//tr')[3]) #hidden comment tr
    all_comments += '</tbody></table></div>'
    return all_comments
def adjust_dom(cls, root):
    """
    adjust paged dom.
    1. add id for navigationBar
    2. generate shadow node of navigationBar
    """
    i = 1
    for child in root.find_class("dnavb"):
        # Give each navigation bar a sequential id and mark it hidden.
        child.set("id", "%s_%d" % ("dnavb", i))
        child.set("class", child.get("class", "") + " dnavh")
        # Shadow copy shown in place of the real bar.
        shadow = p.fragment_fromstring('<div class="dnavg show"></div>')
        child.insert(0, shadow)
        # Copy at most the first three anchors into the shadow node.
        # (The slice is materialized before any append, so the anchors
        # added to the still-empty shadow are not re-found.)
        for anchor in child.findall(".//a")[:3]:
            shadow.append(copy.deepcopy(anchor))
        shadow.append(p.fragment_fromstring("<a>...</a>"))
        i += 1
def test_kwargs():
    """Custom parent_tag/css_class kwargs mark the matching parent <div>."""
    template = '''
        {% activeurl parent_tag='div' css_class='current' %}
            <div>
                <div>
                    <a href="/other_page/">other_page</a>
                </div>
                <div>
                    <a href="/page/">page</a>
                </div>
            </div>
        {% endactiveurl %}
    '''
    context = {'request': requests.get('/page/')}

    dom = fragment_fromstring(render(template, context))
    divs = dom.xpath('//div')

    # The <div> wrapping the active link gets the custom class...
    current = divs[-1]
    assert current.attrib.get('class', False)
    assert 'current' == current.attrib['class']

    # ...and every other <div> stays untouched.
    for untouched in divs[:-1]:
        assert not untouched.attrib.get('class', False)
def asHTML(self):
    """Render the staff calendar portlet as an HTML string."""
    # The network events portlet is different. Everything is different.
    portlet = fragment_fromstring('<div class="generic-portlet"/>')
    heading = SubElement(portlet, 'h3')
    heading.text = "Staff Calendar"

    # Now the entries
    if self.entries:
        ul = SubElement(portlet, 'ul', id='calendar_portlet')
        event_style = 'text-decoration:none'
        date_format = '%m/%d/%Y'  #'%A, %B %d, %Y %I:%M %p'
        for entry in self.entries:
            li = SubElement(ul, 'li')
            # Date span followed by the linked event title.
            date_span = SubElement(li, 'span')
            date_span.text = entry['startDate'].strftime(date_format)
            title_span = SubElement(li, 'span')
            title_span.set('class', 'event_title')
            link = SubElement(title_span, 'a', href=entry['href'],
                              style=event_style)
            link.text = entry['title']
    else:
        msg = SubElement(portlet, 'p')
        msg.text = "No entries found"

    # Close out with the more link
    more = SubElement(portlet, 'p')
    more.set('class', 'more')
    more_a = SubElement(more, 'a', href=self.href)
    more_a.text = 'MORE'
    return tostring(portlet, pretty_print=True)
def summary(self):
    """Extract the article summary; currently returns the first page only.

    NOTE(review): everything after the early ``return page_0`` only runs
    when ``page_0.html`` is empty/falsy — in which case
    ``fragment_fromstring(page_0.html)`` below looks like it would fail.
    Confirm whether the multipage path is dead code or mis-indented.
    """
    doc = self._html(True)
    parsed_urls = set()
    url = self.options['url']
    if url is not None:
        parsed_urls.add(url)
    page_0 = get_article(doc, self.options)
    if page_0.html:
        # we fetch page_0 only for now.
        return page_0
    next_page_url = find_next_page_url(parsed_urls, url, doc)
    page_0_doc = fragment_fromstring(page_0.html)
    page_index = 0
    make_page_elem(page_index, page_0_doc)
    # Wrap the first page in an #article container div.
    article_doc = B.DIV(page_0_doc)
    article_doc.attrib['id'] = 'article'
    if next_page_url is not None:
        # Recursively fetch and append subsequent pages.
        append_next_page(
            get_article, parsed_urls, page_index + 1, next_page_url,
            article_doc, self.options
        )
    return Summary(page_0.confidence, tostring(article_doc))
def citation2latex(s):
    """Parse citations in Markdown cells.

    This looks for HTML tags having a data attribute names `data-cite`
    and replaces it by the call to LaTeX cite command. The tranformation
    looks like this:

    `<cite data-cite="granger">(Granger, 2013)</cite>`

    Becomes

    `\\cite{granger}`

    Any HTML tag can be used, which allows the citations to be
    formatted in HTML in any manner.
    """
    try:
        from lxml import html
    except ImportError:
        # Without lxml the input is passed through unchanged.
        return s
    # Wrap in a <div> so bare text still yields a single root element.
    tree = html.fragment_fromstring(s, create_parent='div')
    _process_node_cite(tree)
    s = html.tostring(tree)
    # Strip the wrapper <div> added above.
    # NOTE(review): on Python 3 lxml's tostring returns bytes, so these
    # str comparisons would never match — presumably Python 2 code; verify.
    if s.endswith('</div>'):
        s = s[:-6]
    if s.startswith('<div>'):
        s = s[5:]
    return s
def asHTML(self): """Use lxml to generate a customizable via adapter representation""" portlet = fragment_fromstring('<div class="generic-portlet""/>') heading = SubElement(portlet, 'h3') heading.text = self.context.title # Now the entries entries = self.entries if entries: for entry in self.entries: item = SubElement(portlet, 'p') item_a = SubElement(item, 'a', href=entry['href']) item_a.text = entry['title'] else: msg = SubElement(portlet, 'p') msg.text = "No entries found" # Close out with the more link more = SubElement(portlet, 'p') more.set('class', 'more') more_a = SubElement(more, 'a', href=self.href) more_a.text = 'MORE ' + self.title return tostring(portlet, pretty_print=True)
def test_no_parent_submenu():
    """With parent_tag='self' the anchors themselves receive the class."""
    template = '''
        {% activeurl parent_tag='self' %}
            <div>
                <a href="/menu/">menu</a>
                <hr>
                <a href="/menu/submenu/">submenu</a>
                <hr>
                <a href="/menu/other_submenu/">other_submenu</a>
            </div>
        {% endactiveurl %}
    '''
    context = {'request': requests.get('/menu/submenu/')}

    dom = fragment_fromstring(render(template, context))
    anchors = dom.xpath('//a')

    # The menu prefix and the exact submenu are both active.
    for active in anchors[:2]:
        assert active.attrib.get('class', False)
        assert 'active' == active.attrib['class']

    # The sibling submenu stays untouched.
    assert not anchors[2].attrib.get('class', False)
def get_article(candidates, best_candidate):
    """Collect the best candidate and its related siblings into one <div>.

    Now that we have the top candidate, look through its siblings for
    content that might also be related — preambles, content split by ads
    that we removed, etc.

    :param candidates: mapping of element -> {'content_score': ..., ...}
    :param best_candidate: the top-scored candidate dict (keys 'elem',
        'content_score').
    :returns: a new <div> element containing the article content.
    """
    # Siblings must reach 20% of the winner's score (at least 10) to join.
    # (max(10, x) replaces the needless list-building max([10, x]).)
    sibling_score_threshold = max(10, best_candidate['content_score'] * 0.2)
    # create a new html document with a html->body->div
    output = fragment_fromstring('<div/>')
    best_elem = best_candidate['elem']
    for sibling in best_elem.getparent().getchildren():
        # in lxml there no concept of simple text
        # if isinstance(sibling, NavigableString): continue
        append = False
        if sibling is best_elem:
            append = True
        sibling_key = sibling  # HashableElement(sibling)
        if sibling_key in candidates and candidates[sibling_key]['content_score'] >= sibling_score_threshold:
            append = True

        if sibling.tag == "p":
            link_density = get_link_density(sibling)
            node_content = sibling.text or ""
            node_length = len(node_content)

            # Keep long low-link paragraphs, or short link-free ones that
            # end a sentence. (Raw string fixes the '\.' escape warning.)
            if node_length > 80 and link_density < 0.25:
                append = True
            elif node_length <= 80 and link_density == 0 and re.search(r'\.( |$)', node_content):
                append = True

        if append:
            # We don't want to append directly to output, but the div
            # in html->body->div
            output.append(sibling)
    #if output is not None:
    #    output.append(best_elem)
    return output
def test_non_active_root():
    """The site root link must not match a deeper active URL."""
    template = '''
        {% activeurl %}
            <ul>
                <li>
                    <a href="/">root</a>
                </li>
                <li>
                    <a href="/page/">page</a>
                </li>
            </ul>
        {% endactiveurl %}
    '''
    context = {'request': requests.get('/page/')}

    dom = fragment_fromstring(render(template, context))
    items = dom.xpath('//li')

    # Root entry is not marked active; the real page entry is.
    assert not items[0].attrib.get('class', False)
    assert 'active' == items[1].attrib['class']
def test_submenu_no_menu():
    """With menu='no' only the exact URL match is highlighted."""
    template = '''
        {% activeurl menu='no' %}
            <ul>
                <li>
                    <a href="/menu/submenu/">submenu</a>
                </li>
                <li>
                    <a href="/menu/other_submenu/">other_submenu</a>
                </li>
                <li>
                    <a href="/menu/">menu</a>
                </li>
            </ul>
        {% endactiveurl %}
    '''
    context = {'request': requests.get('/menu/submenu/')}

    dom = fragment_fromstring(render(template, context))
    items = dom.xpath('//li')

    # Only the exact submenu entry is active.
    exact = items[0]
    assert exact.attrib.get('class', False)
    assert 'active' == exact.attrib['class']

    # Neither the sibling submenu nor the parent menu entry is marked.
    for untouched in items[1:]:
        assert not untouched.attrib.get('class', False)
def test_kwargs_multiple_urls():
    """A parent <p> with several links is highlighted if any link matches."""
    template = '''
        {% activeurl parent_tag='p' css_class='highlight' %}
            <div>
                <p>
                    <a href="/other_page/">other_page</a>
                </p>
                <p>
                    <a href="/page/">page</a>
                    <br>
                    <a href="/other_page/">other_page</a>
                </p>
            </div>
        {% endactiveurl %}
    '''
    context = {'request': requests.get('/page/')}

    dom = fragment_fromstring(render(template, context))
    paragraphs = dom.xpath('//p')

    # The paragraph containing the matching link gets the custom class.
    highlighted = paragraphs[1]
    assert highlighted.attrib.get('class', False)
    assert 'highlight' == highlighted.attrib['class']

    # The other paragraph stays untouched.
    assert not paragraphs[0].attrib.get('class', False)
def test_basic_again_test_default_settings():
    """Default settings: the matching <li> gets 'active', others nothing."""
    template = '''
        {% activeurl %}
            <ul>
                <li>
                    <a href="/page/">page</a>
                </li>
                <li>
                    <a href="/other_page/">other_page</a>
                </li>
            </ul>
        {% endactiveurl %}
    '''
    context = {'request': requests.get('/page/')}

    dom = fragment_fromstring(render(template, context))
    items = dom.xpath('//li')

    matching = items[0]
    assert matching.attrib.get('class', False)
    assert 'active' == matching.attrib['class']

    assert not items[1].attrib.get('class', False)
def test_disabled_menu_root_path():
    """With menu='no' the root URL still matches the root link exactly."""
    template = '''
        {% activeurl menu='no' %}
            <ul>
                <li>
                    <a href="/">root</a>
                </li>
                <li>
                    <a href="/other_page/">other_page</a>
                </li>
            </ul>
        {% endactiveurl %}
    '''
    context = {'request': requests.get('/')}

    dom = fragment_fromstring(render(template, context))
    items = dom.xpath('//li')

    root_item = items[0]
    assert root_item.attrib.get('class', False)
    assert 'active' == root_item.attrib['class']

    assert not items[1].attrib.get('class', False)
def plain2(text):
    """Flatten an HTML fragment to plain text by dropping every tag."""
    root = fragment_fromstring(text, create_parent="div")
    # drop_tag removes the element but keeps its text/children in place,
    # so after the walk only the wrapper's text remains.
    for descendant in root.iterdescendants():
        descendant.drop_tag()
    return root.text
def html_to_lxml(raw):
    """Parse an HTML snippet into an XML element, degrading gracefully.

    Strategy: wrap in a <div>, declare the XHTML namespace and try a strict
    XML parse; on failure strip namespaced attributes and retry; as a last
    resort fall back to calibre's HTML4 parser.
    """
    raw = '<div>%s</div>' % raw
    root = html.fragment_fromstring(raw)
    root.set('xmlns', "http://www.w3.org/1999/xhtml")
    raw = etree.tostring(root, encoding=None)
    try:
        return safe_xml_fromstring(raw, recover=False)
    except:
        # Strict parse failed — attributes with ':' (undeclared namespace
        # prefixes) are a common culprit; drop them and retry.
        for x in root.iterdescendants():
            remove = []
            for attr in x.attrib:
                if ':' in attr:
                    remove.append(attr)
            for a in remove:
                del x.attrib[a]
        raw = etree.tostring(root, encoding=None)
        try:
            return safe_xml_fromstring(raw, recover=False)
        except:
            # Final fallback: lenient HTML4 parsing.
            from calibre.ebooks.oeb.parse_utils import _html4_parse
            return _html4_parse(raw)
def test_empty_css_class():
    """An empty class attribute is replaced, not appended to."""
    template = '''
        {% activeurl %}
            <ul>
                <li class="">
                    <a href="/page/">page</a>
                </li>
            </ul>
        {% endactiveurl %}
    '''
    context = {'request': requests.get('/page/')}

    dom = fragment_fromstring(render(template, context))
    item = dom.xpath('//li')[0]

    assert item.attrib.get('class', False)
    assert 'active' == item.attrib['class']
def read_homer_table(fn):
    """Read a HOMER results HTML file and return a trimmed table as a string.

    Rewrites relative 'homerResults' links to absolute paths, drops the last
    two cells of every row, and empties all rows past the top five.

    :param fn: path to the HOMER HTML report.
    :returns: serialized HTML <table> string.
    """
    par_dir = os.path.dirname(os.path.realpath(fn))
    with open(fn, 'r') as f:
        data = ''.join(f.readlines())
    soup = bs.BeautifulSoup(data, 'lxml')
    table = soup.find('table')
    # Make the homerResults links absolute so the table works standalone.
    homer_table = str(table).replace('homerResults',
                                     os.path.join(par_dir, 'homerResults'))
    html_table = html.fragment_fromstring(homer_table)
    # Keep only the first `top_row` rows with content (header included).
    top_row = 5
    row_counter = 0
    for row in html_table.iterchildren():
        row_counter += 1
        # Drop the last two cells of the row.
        row.remove(row.getchildren()[-1])
        row.remove(row.getchildren()[-1])
        if row_counter >= top_row:
            # Rows beyond the cutoff are emptied, not removed.
            row.clear()
    # tostring(..., encoding='unicode') already returns str; the extra
    # str() is a harmless no-op.
    html_table = str(
        html.tostring(html_table, encoding='unicode', with_tail=False))
    return html_table
def test_nested_submenu():
    """Nested parent <div>s: menu and matching submenu both become active."""
    template = '''
        {% activeurl parent_tag="div" %}
            <div>
                <div>
                    <a href="/menu/">menu</a>
                    <div>
                        <a href="/menu/submenu/">submenu</a>
                    </div>
                    <div>
                        <a href="/menu/other_submenu/">other_submenu</a>
                    </div>
                </div>
            </div>
        {% endactiveurl %}
    '''
    context = {'request': requests.get('/menu/submenu/')}

    dom = fragment_fromstring(render(template, context))
    divs = dom.xpath('//div')

    # The menu wrapper and the matching submenu wrapper are both active.
    for active in (divs[1], divs[2]):
        assert active.attrib.get('class', False)
        assert 'active' == active.attrib['class']

    # The sibling submenu wrapper and the outermost div stay untouched.
    assert not divs[3].attrib.get('class', False)
    assert not divs[0].attrib.get('class', False)
def asHTML(self):
    """Render the events portlet as an HTML string.

    (Commented-out table-based layout experiments removed.)
    """
    # The network events portlet is different. Everything is different.
    portlet = fragment_fromstring('<div class="generic-portlet"/>')
    heading = SubElement(portlet, 'h3')
    heading.text = self.context.title

    # Now the entries
    entries = self.entries
    if entries:
        ul = SubElement(portlet, 'ul', id='events_portlet')
        event_style = 'text-decoration:none'
        date_format = '%m/%d/%Y'  #'%A, %B %d, %Y %I:%M %p'
        for entry in self.entries:
            li = SubElement(ul, 'li')
            # Date span (client-side localized) then the linked title.
            span1 = SubElement(li, 'span')
            span1.text = entry['startDate'].strftime(date_format)
            span1.set('class', 'globalize-short-date')
            span2 = SubElement(li, 'span')
            span2.set('class', 'event_title')
            a = SubElement(span2, 'a', href=entry['href'], style=event_style)
            a.text = entry['title']
    else:
        msg = SubElement(portlet, 'p')
        msg.text = "No entries found"

    # Close out with the more link
    more = SubElement(portlet, 'p')
    more.set('class', 'more')
    more_a = SubElement(more, 'a', href=self.href)
    more_a.text = 'MORE'
    return tostring(portlet, pretty_print=True)
def __init__(self, id, title, url, author, summary, published, content):
    """Build an article record, sanitizing title and summary text.

    :param id: feed-supplied article identifier.
    :param title: article title; falls back to 'Unknown' when empty.
    :param url: article URL.
    :param author: author name (bytes are decoded as UTF-8).
    :param summary: HTML or plain-text summary; a plain-text version is
        derived into ``self.text_summary``.
    :param published: publication timestamp (parsed by ``dt_factory``).
    :param content: full article content.
    """
    from lxml import html
    self.downloaded = False
    self.id = id
    # Title: coerce to unicode, resolve entities, strip control chars.
    if not title or not isinstance(title, string_or_bytes):
        title = _('Unknown')
    title = force_unicode(title, 'utf-8')
    self._title = clean_xml_chars(title).strip()
    try:
        self._title = re.sub(r'&(\S+?);', entity_to_unicode, self._title)
    except:
        pass
    self._title = clean_ascii_chars(self._title)
    self.url = url
    self.author = author
    self.toc_thumbnail = None
    self.internal_toc_entries = ()
    if author and not isinstance(author, str):
        author = author.decode('utf-8', 'replace')
    if summary and not isinstance(summary, str):
        summary = summary.decode('utf-8', 'replace')
    summary = clean_xml_chars(summary) if summary else summary
    self.summary = summary
    # If the summary looks like HTML, flatten it to plain text; on parse
    # failure the summary is discarded rather than left malformed.
    if summary and '<' in summary:
        try:
            s = html.fragment_fromstring(summary, create_parent=True)
            summary = html.tostring(s, method='text', encoding='unicode')
        except:
            print('Failed to process article summary, deleting:')
            print(summary.encode('utf-8'))
            traceback.print_exc()
            summary = ''
    self.text_summary = clean_ascii_chars(summary)
    self.author = author
    self.content = content
    self.date = published
    # Keep both UTC and local timestamps; formatting is computed lazily.
    self.utctime = dt_factory(self.date, assume_utc=True, as_utc=True)
    self.localtime = self.utctime.astimezone(local_tz)
    self._formatted_date = None
def test_ignore_href_only_hash():
    """A bare '#' href never matches, even with query params on the URL."""
    template = '''
        {% activeurl %}
            <ul>
                <li>
                    <a href="#">page</a>
                </li>
                <li>
                    <a href="/other_page/">other_page</a>
                </li>
            </ul>
        {% endactiveurl %}
    '''
    context = {'request': requests.get('/page/?foo=bar&bar=foo')}

    dom = fragment_fromstring(render(template, context))

    # Neither entry may be marked active.
    for item in dom.xpath('//li'):
        assert not item.attrib.get('class', False)
def get_value(self, context):
    """Build a nested table-of-contents tree from the headings in the
    rendered HTML.

    :param context: template context passed to ``self.src``.
    :returns: list of {'level': int, 'title': str, 'children': [...]} nodes.
    """
    html = self.src(context)
    html_root = fragment_fromstring(html, create_parent=True)
    # 'h7' is harmless over-matching; browsers/HTML define h1-h6 only.
    selector = CSSSelector("h1,h2,h3,h4,h5,h6,h7")
    # Sentinel root with level 0 so every real heading nests beneath it.
    root = [{"level": 0, "children": []}]
    for h in selector(html_root):
        if not h.text:
            continue
        # Heading level is the digit after 'h' in the tag name.
        # NOTE(review): ``h.tag.decode`` implies bytes tags — Python 2 era
        # lxml; on Python 3 tags are str and this would fail. Verify.
        level = int(h.tag.decode("utf-8")[1:])
        title = h.text
        if not isinstance(title, text_type):
            title = title.decode("utf-8")
        # Descend to the deepest branch whose level is below this heading.
        depth = root
        while depth and level > depth[-1]["level"]:
            depth = depth[-1]["children"]
        depth.append({"level": level, "title": title, "children": []})
    return root[0]["children"]
def build_base_document(html, fragment=True):
    """Return a base document with the body as root.

    :param html: Parsed Element object
    :param fragment: Should we return a <div> doc fragment or a full <html>
                     doc.
    """
    # Locate (or become) the body element.
    if html.tag == 'body':
        html.tag = 'div'
        body = html
    else:
        body = html.find('.//body')

    if body is None:
        # No body anywhere: wrap the whole tree in a fresh container div.
        body = fragment_fromstring('<div/>')
        body.set('id', 'readabilityBody')
        body.append(html)
    else:
        # Re-tag the found body as the container div.
        body.tag = 'div'
        body.set('id', 'readabilityBody')

    if fragment:
        output = body
    else:
        # Full document requested: splice the container into BASE_DOC.
        output = fromstring(BASE_DOC)
        insert_point = output.find('.//body')
        insert_point.append(body)

    output.doctype = "<!DOCTYPE html>"
    return output
def test_html_simplest(self=None):
    """Index.html(): tags in the source parts are rendered into an indented
    HTML index containing the tag's link."""
    parts = {
        'index': trim("""
            My Document!

            It contains a #[Tag] and a %[Tag].

            ` Part One
            """),
        'part-one': trim("""
            Or an #[alias: Tag, subtag]?
            """),
    }
    outline = Outline(parts, default_counters())
    index = Index(outline)
    # Pre-seed the tag table so html() has a link target to render.
    index.tags = {'tag': {'subtag': {'1.1': ['LINK']}}}
    out = index.html()
    dom = html.fragment_fromstring(out, create_parent='body')[0]
    # One indented index entry, and the seeded link appears in the output.
    assert len(dom.cssselect('div.indent-first-line')) == 1
    assert 'LINK' in out
def fix_links(self,content, absolute_prefix):
    """
    Rewrite relative links to be absolute links based on certain URL.

    @param content: HTML snippet as a string
    @param absolute_prefix: base URL that relative src/href values are
        joined against.
    @return: serialized UTF-8 HTML (bytes) of the rewritten tree.

    NOTE(review): Python 2 code — ``str.decode`` and the ``urlparse``
    module; on Python 3 use ``urllib.parse`` and drop the decode.
    """
    if type(content) == str:
        content = content.decode("utf-8")

    parser = etree.HTMLParser()

    content = content.strip()

    tree = html.fragment_fromstring(content, create_parent=True)

    def join(base, url):
        """ Join relative URL """
        # Host-relative ("/...") and scheme-qualified ("://") URLs are
        # treated as already absolute and left untouched.
        if not (url.startswith("/") or "://" in url):
            return urlparse.urljoin(base, url)
        else:
            # Already absolute
            return url

    # Rewrite every element carrying a src attribute (img, script, ...).
    for node in tree.xpath('//*[@src]'):
        url = node.get('src')
        url = join(absolute_prefix, url)
        node.set('src', url)
    # Rewrite every element carrying an href attribute (a, link, ...).
    for node in tree.xpath('//*[@href]'):
        href = node.get('href')
        url = join(absolute_prefix, href)
        node.set('href', url)

    data = etree.tostring(tree, pretty_print=False, encoding="utf-8")

    return data
def test_kwargs_multiple_urls_nested_tags():
    """The class reaches the configured <tr> parent through nested <td>s."""
    template = '''
        {% activeurl parent_tag='tr' css_class='active_row' %}
            <div>
                <table>
                    <tr>
                        <td>
                            <a href="/page/">page</a>
                        </td>
                        <td>
                            <a href="/other_page/">other_page</a>
                        </td>
                    </tr>
                    <tr>
                        <td>
                            <a href="/other_page/">other_page</a>
                        </td>
                    </tr>
                </table>
            </div>
        {% endactiveurl %}
    '''
    context = {'request': requests.get('/page/')}

    dom = fragment_fromstring(render(template, context))
    rows = dom.xpath('//tr')

    # The row holding the matching link gets the custom class.
    marked = rows[0]
    assert marked.attrib.get('class', False)
    assert 'active_row' == marked.attrib['class']

    # The other row stays untouched.
    assert not rows[1].attrib.get('class', False)
def exportSolution(self, parent, solution):
    """:returns: An XML node with the details of an
    :obj:`euphorie.content.solution`.

    :param parent: XML element the new <solution> node is appended to.
    :param solution: the solution content object being exported.
    """
    node = etree.SubElement(parent, "solution")
    if getattr(solution, "external_id", None):
        node.attrib["external-id"] = solution.external_id
    etree.SubElement(node, "description").text = StripUnwanted(
        solution.description)
    stripped_action = StripUnwanted(solution.action)
    if ISolution.providedBy(solution) and self.is_etranslate_compatible:
        # eTranslate path: render the action's markdown to HTML and embed
        # it as a structured <action> element (second arg to
        # fragment_fromstring is the wrapper tag).
        solution_view = api.content.get_view(context=solution,
                                             name="nuplone-view",
                                             request=self.request)
        action_with_br = stripped_action.replace("\n", "<br/>")
        action_html = solution_view.render_md(action_with_br)
        fragment = html.fragment_fromstring(action_html, "action")
        node.append(fragment)
    else:
        # Plain path: action exported as raw text.
        etree.SubElement(node, "action").text = stripped_action
    if solution.requirements:
        etree.SubElement(node, "requirements").text = StripUnwanted(
            solution.requirements)
    return node
def append_next_page(get_article_func, parsed_urls, page_index, page_url,
                     doc, options):
    """Fetch ``page_url``, extract its article and append it to ``doc``,
    recursing into further "next page" links up to MAX_PAGES.

    :param get_article_func: callable extracting an article from a parsed doc.
    :param parsed_urls: set of URLs already processed (mutated by callees).
    :param page_index: index of the page being appended.
    :param page_url: URL of the page to fetch.
    :param doc: container element the page fragment is appended to.
    :param options: dict; must provide an 'urlfetch' fetcher.
    """
    # Lazy %-style logging args instead of eager string interpolation.
    logging.debug('appending next page: %s', page_url)
    if page_index >= MAX_PAGES:
        return
    fetcher = options['urlfetch']
    try:
        html = fetcher.urlread(page_url)
    except Exception:
        # Best effort: a failed fetch ends the chain without raising.
        logging.warning('exception fetching %s', page_url, exc_info=True)
        return
    orig_page_doc = parse(html, page_url)
    next_page_url = find_next_page_url(parsed_urls, page_url, orig_page_doc)
    page_article = get_article_func(orig_page_doc, options)
    page_doc = fragment_fromstring(page_article.html)
    make_page_elem(page_index, page_doc)
    # Skip pages whose content duplicates what we already collected.
    if not is_suspected_duplicate(doc, page_doc):
        doc.append(page_doc)
        if next_page_url is not None:
            append_next_page(get_article_func, parsed_urls, page_index + 1,
                             next_page_url, doc, options)
def get_article(self, candidates, best_candidate):
    """Collect the best candidate and its related siblings into one <div>.

    :param candidates: mapping of element -> scored candidate objects.
    :param best_candidate: the top-scored candidate (``.elem``, ``.score``).
    :returns: a new <div> element holding the article content.
    """
    # Now that we have the top candidate, look through its siblings for
    # content that might also be related.
    # Things like preambles, content split by ads that we removed, etc.
    # Siblings must reach 20% of the winner's score (at least 10) to join.
    sibling_score_threshold = max(10, best_candidate.score * 0.2)
    # create a new html document with a div
    output = fragment_fromstring("<div/>")
    parent = best_candidate.elem.getparent()
    siblings = parent.getchildren() if parent is not None else [
        best_candidate.elem
    ]
    for sibling in siblings:
        # in lxml there no concept of simple text
        # if isinstance(sibling, NavigableString): continue
        append = False
        # conditions
        if sibling == best_candidate.elem:
            append = True
        elif (sibling in candidates
              and candidates[sibling].score >= sibling_score_threshold):
            append = True
        elif sibling.tag == "p":
            # Keep long low-link paragraphs, or short link-free ones that
            # end a sentence.
            link_density = self.get_link_density(sibling)
            node_content = sibling.text or ""
            node_length = len(node_content)
            if node_length > 80 and link_density < 0.25:
                append = True
            elif (node_length <= 80 and link_density == 0
                  and re.search(r"\.( |$)", node_content)):
                append = True
        # append to the output div
        if append:
            output.append(sibling)
    #if output is not None:
    #    output.append(best_candidate.elem)
    return output
def parse(self, source, classname=DEFAULT_CLASS_NAME, use_cache=True, language=''):
    """Parses input HTML code into word chunks and organized code.

    Args:
      source: HTML code to be processed (unicode).
      classname: A class name of each word chunk in the HTML code (string).
      use_cache: Whether to use cache (boolean).
      language: A language used to parse text (string).

    Returns:
      A dictionary with the list of word chunks and organized HTML code.
    """
    if use_cache:
        # Fast path: return a previously-computed result keyed on
        # (source, classname).
        cache_shelve = shelve.open(CACHE_FILE_NAME)
        cache_key = self._get_cache_key(source, classname)
        result_value = cache_shelve.get(cache_key, None)
        cache_shelve.close()
        if result_value:
            return result_value
    source = self._preprocess(source)
    # Parse the HTML fragment and chunk its plain-text content.
    dom = html.fragment_fromstring(source, create_parent='body')
    input_text = dom.text_content()
    chunks = self._get_source_chunks(input_text, language)
    # Merge punctuation into neighbouring chunks, then join dependent
    # chunks (presumably a forward then a backward pass — TODO confirm
    # the meaning of the boolean flag against _concatenate_by_label).
    chunks = self._concatenate_punctuations(chunks)
    chunks = self._concatenate_by_label(chunks, True)
    chunks = self._concatenate_by_label(chunks, False)
    chunks = self._migrate_html(chunks, dom)
    html_code = self._spanize(chunks, classname)
    result_value = {'chunks': chunks, 'html_code': html_code}
    if use_cache:
        # cache_key was computed above; this branch only runs when
        # use_cache is True, so it is always defined here.
        cache_shelve = shelve.open(CACHE_FILE_NAME)
        cache_shelve[cache_key] = result_value
        cache_shelve.close()
    return result_value
def parse_angaben(engine, data):
    """Parse the 'angaben' (disclosure) HTML snippet and upsert one row per
    (section, client, service) into the 'angaben' table.

    Args:
        engine: database engine handed to the `sl` helper functions.
        data: dict with at least 'angaben' (HTML string) and 'source_url'.
    """
    if not data.get('angaben'):
        return
    # Wrap in a dummy root so lxml accepts the multi-element snippet.
    snippet = '<x>' + data['angaben'] + '</x>'
    doc = html.fragment_fromstring(snippet)
    table = sl.get_table(engine, 'angaben')
    data = {'source_url': data['source_url']}
    wrapped_name = False
    for el in doc:
        if el.tag == 'h3':
            # New section header, e.g. "1. Beruf" -> "Beruf".
            wrapped_name = False
            data['section'] = el.text.split('. ', 1)[-1]
        elif el.tag == 'strong' or not el.text or not el.get('class'):
            continue
        elif 'voa_abstand' in el.get('class') or wrapped_name:
            client = el.text
            if wrapped_name:
                # Client name was split across two elements; glue together.
                client = data['client'] + ' ' + client
            # BUG FIX: str.strip() returns a new string — the original code
            # discarded the result, so stray whitespace/commas survived into
            # data['client'] and the rsplit below.
            client = client.strip().strip(',')
            data['client'] = client
            els = client.rsplit(',', 2)
            if len(els) == 3:
                wrapped_name = False
                data['client_name'] = els[0].strip()
                data['client_city'] = els[1].strip()
            else:
                # Fewer than three parts: assume the name continues in the
                # next element.
                wrapped_name = True
                continue
        else:
            data['service'] = el.text
            # Map the free-text service description onto a known level.
            data['level'] = 'Stufe 0'
            for name in LEVELS:
                if name.lower() in data['service'].lower():
                    data['level'] = name
            sl.upsert(engine, table, data,
                      ['source_url', 'section', 'client', 'service'])
def data_from_table(table):
    """Extract stop names, start times and header metadata from a trip table.

    Returns a dict with 'stops', sorted 'start_times', and one key per
    colon-separated line found in the trip-group header.
    """
    inner_tables = table.xpath("table[@align='center']")
    stops_tbl = inner_tables[0]
    times_tbl = inner_tables[1]
    stops = [row[1].text_content() for row in stops_tbl.xpath("tr")]
    # "HH:MM" -> integer HHMM; skip empty cells.
    start_times = []
    for cell in times_tbl.xpath("tr/td"):
        digits = cell.text_content().replace(":", "")
        if digits != '':
            start_times.append(int(digits))
    header = table.xpath("div[@class='enTripGroupInfo']")[0]
    # Re-parse the header with <br> turned into newlines so each metadata
    # item lands on its own line, then undo unwanted wraps and dashes.
    raw = html.tostring(header).replace("\n", " ").replace("<br>", "\n")
    header_text = (html.fragment_fromstring(raw)
                   .text_content()
                   .replace('To\n', 'To')
                   .replace('-\n', '-')
                   .replace('-', ''))
    data = {'stops': stops, 'start_times': sorted(start_times)}
    for line in header_text.split('\n'):
        parts = line.strip().split(":")
        data[parts[0].strip()] = parts[1].strip()
    return data
def sanitize_richtext(text):
    """Sanitize rich-text HTML with lxml's Cleaner and/or bleach, according
    to the project's configured defaults. Returns the cleaned HTML string.
    """
    lxml_config = defaults.DJANGOCMS_BASEPLUGINS_LXML_CLEANER_CONFIG
    if lxml_config:
        if lxml_clean:
            cleaner = lxml_clean.Cleaner(**lxml_config)
            wrapped = fragment_fromstring("<div>" + text + "</div>")
            wrapped = cleaner.clean_html(wrapped)
            text = tostring(wrapped, encoding='unicode')
            # clean_html keeps the wrapper <div>; strip it back off.
            if text.startswith('<div>'):
                # still dont like lxml!
                text = text[len('<div>'):-len('</div>')]
        elif settings.DEBUG:
            print(
                "lxml is not installed, but should be, for sanitizing richtext content!"
            )
    bleach_config = defaults.DJANGOCMS_BASEPLUGINS_BLEACH_CONFIG
    if bleach_config:
        if bleach:
            text = bleach.clean(text, **bleach_config)
        elif settings.DEBUG:
            print(
                "bleach is not installed, but should be, for sanitizing richtext content!"
            )
    return text
def generate_markup_fragment(
        name_dict=None,
        langs=None,
        url='test-url',
        saveValueName='saveValue',
        postSave='postSave',
        containerClass='containerClass',
        iconClass='iconClass',
        readOnlyClass='readOnlyClass',
        disallow_edit='false'):
    """Render inline_edit_trans markup and parse it into an lxml fragment.

    BUG FIX: the original used mutable default arguments
    (``{'en': 'English Output'}`` and ``['en']``), which are shared across
    calls and can be mutated by the callee. Replaced with None sentinels;
    callers passing nothing still get the same defaults.

    Returns:
        The parsed lxml element for the generated markup.
    """
    if name_dict is None:
        name_dict = {'en': 'English Output'}
    if langs is None:
        langs = ['en']
    markup = inline_edit_trans(
        name_dict, langs, url, saveValueName, postSave,
        containerClass, iconClass, readOnlyClass, disallow_edit
    )
    return fragment_fromstring(markup)
def test_submenu():
    """A request to /menu/submenu/ marks both the parent menu entry and the
    exact submenu entry active; the sibling submenu stays unmarked."""
    template = '''
        {% activeurl %}
            <ul>
                <li>
                    <a href="/menu/">menu</a>
                </li>
                <li>
                    <a href="/menu/submenu/">submenu</a>
                </li>
                <li>
                    <a href="/menu/other_submenu/">other_submenu</a>
                </li>
            </ul>
        {% endactiveurl %}
    '''
    context = {'request': requests.get('/menu/submenu/')}
    rendered = render(template, context)
    tree = fragment_fromstring(rendered)
    li_elements = tree.xpath('//li')
    # Parent menu entry is marked active (prefix match).
    menu_li = li_elements[0]
    assert menu_li.attrib.get('class', False)
    assert 'active' == menu_li.attrib['class']
    # Exact submenu match is marked active too.
    submenu_li = li_elements[1]
    assert submenu_li.attrib.get('class', False)
    assert 'active' == submenu_li.attrib['class']
    # Unrelated submenu entry gets no class attribute at all.
    assert not li_elements[2].attrib.get('class', False)
def response(resp):
    """Build image-search results from the engine's XML response.

    The gallery embeds an HTML table inside the <searchresult> text node;
    each link in the second div of a cell is one image hit.
    """
    xmldom = etree.fromstring(resp.content)
    xmlsearchresult = eval_xpath_getindex(xmldom, '//searchresult', 0)
    dom = html.fragment_fromstring(xmlsearchresult.text, create_parent='div')
    results = []
    for link in eval_xpath_list(dom, '/div/table/tr/td/div[2]//a'):
        img = eval_xpath_getindex(link, './/img', 0)
        thumbnail_src = urljoin(gallery_url, img.attrib['src'])
        results.append({
            'url': urljoin(base_url, link.attrib.get('href')),
            'title': extract_text(link),
            'img_src': thumbnail_src,
            'content': '',
            'thumbnail_src': thumbnail_src,
            'template': 'images.html'
        })
    return results
def _build_datatable(self, response):
    """Collect the first table's plain <td> cells into rows of six strings.

    Cells containing <b>/<strong> text are treated as headers and skipped.
    Cells with a link render as "text: url"; plain cells are NFKD
    normalised. A trailing partial row (fewer than six cells) is dropped.
    """
    raw_cells = response.xpath('//table[1]/tbody//td').extract()
    rows = []
    current_row = []
    for raw in raw_cells:
        cell = html.fragment_fromstring(raw)
        cell_text = cell.text_content()
        # Header cells carry bold text; skip them entirely.
        if cell.xpath('//b/text()|//strong/text()'):
            continue
        hrefs = cell.xpath('//a/@href')
        if hrefs:
            current_row.append(
                '{name}: {url}'.format(name=cell_text, url=hrefs[0]))
        else:
            current_row.append('{text}'.format(
                text=unicodedata.normalize("NFKD", cell_text)))
        if len(current_row) == 6:
            rows.append(current_row)
            current_row = []
    return rows
def load_generic_data():
    """Scrape the fundamentus.com.br screener into {ticker: {header: value}}.

    The result table is cut out of the raw HTML with a regex, parsed with
    lxml, and each body row is zipped against the <thead> column names.
    """
    response = requests.get('http://www.fundamentus.com.br/resultado.php')
    pattern = re.compile('<table id="resultado".*</table>', re.DOTALL)
    [table_html] = re.findall(pattern, response.text)
    page = fragment_fromstring(table_html)
    [thead] = page.xpath('thead')
    [header_row] = thead.xpath('tr')
    headers = [th.text_content().strip() for th in header_row.xpath('th')]
    [tbody] = page.xpath('tbody')
    stock_info = {}
    for row in tbody.xpath('tr'):
        values = [
            _convert_data(td.text_content().strip()) for td in row.xpath('td')
        ]
        record = dict(zip(headers, values))
        # 'Papel' is the ticker column; use it as the outer key.
        stock_info[record['Papel']] = record
    return stock_info
def get_specific_data(stock):
    """Scrape the fundamentus.com.br detail page for one *stock* ticker.

    Cuts the first <table class="w728"> block out of the raw HTML, then
    walks every table row pairing a "label" cell with the following
    "data" cell, building a flat {label: value} dict.

    NOTE(review): td.get("class") is assumed to never be None here —
    a class-less <td> would raise AttributeError. Verify against the
    live page markup.
    """
    url = "http://www.fundamentus.com.br/detalhes.php?papel=" + stock
    cj = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(
        urllib.request.HTTPCookieProcessor(cj))
    # Spoof a browser user agent; the site blocks obvious bots.
    opener.addheaders = [
        ('User-agent',
         'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201'),
        ('Accept', 'text/html, text/plain, text/css, text/sgml, */*;q=0.01')
    ]
    # Get data from site
    link = opener.open(url, urllib.parse.urlencode({}).encode('UTF-8'))
    content = link.read().decode('ISO-8859-1')
    # Get all table instances
    pattern = re.compile('<table class="w728">.*</table>', re.DOTALL)
    reg = re.findall(pattern, content)[0]
    # Wrap in a <div> so the snippet parses as one fragment.
    reg = "<div>" + reg + "</div>"
    page = fragment_fromstring(reg)
    all_data = {}
    # There is 5 tables with tr, I will get all trs
    all_trs = []
    all_tables = page.xpath("table")
    for i in range(0, len(all_tables)):
        all_trs = all_trs + all_tables[i].findall("tr")
    # Run through all the trs and get the label and the
    # data for each line
    for tr_index in range(0, len(all_trs)):
        tr = all_trs[tr_index]
        # Get into td
        all_tds = tr.getchildren()
        for td_index in range(0, len(all_tds)):
            td = all_tds[td_index]
            label = ""
            data = ""
            # The page has tds with contents and some
            # other with not
            if (td.get("class").find("label") != -1):
                # We have a label
                for span in td.getchildren():
                    if (span.get("class").find("txt") != -1):
                        label = span.text
            # If we did find a label we have to look
            # for a value
            if (label and len(label) > 0):
                # Value lives in the cell immediately to the right.
                next_td = all_tds[td_index + 1]
                if (next_td.get("class").find("data") != -1):
                    # We have a data
                    for span in next_td.getchildren():
                        if (span.get("class").find("txt") != -1):
                            if (span.text):
                                data = span.text
                            else:
                                # If it is a link
                                span_children = span.getchildren()
                                if (span_children and len(span_children) > 0):
                                    data = span_children[0].text
                    # Include into dict
                    all_data[label] = data
                    # Erase it
                    label = ""
                    data = ""
    return all_data
def parse_html(text):
    """Parse *text* as an HTML fragment using the module-level parser."""
    fragment = html.fragment_fromstring(text, parser=_HTML_PARSER)
    return fragment
def build_candidates(length):
    """Build a one-element candidate list: a <p> holding *length* 'c's."""
    markup = "<p>%s</p>" % ("c" * length)
    return [fragment_fromstring(markup)]
def _markdown_fragment(target, image):
    """Render markdown *target* to an lxml fragment, rewriting user links
    and user-icon images.

    Args:
        target: markdown source text.
        image: inline-image budget — falsy means keep no images, an int is
            used as the budget, any other truthy value means 5.

    Returns:
        The sanitised lxml fragment (after add_user_links and defang).
    """
    if not image:
        images_left = 0
    elif type(image) is int:
        images_left = image
    else:
        images_left = 5
    rendered = _markdown(target)
    fragment = html.fragment_fromstring(rendered, create_parent=True)
    # Rewrite shorthand profile links ("user:", "da:", "ib:", "fa:", "sf:")
    # into real profile URLs on the respective sites.
    for link in fragment.findall(".//a"):
        href = link.attrib.get("href")
        if href:
            t, _, user = href.partition(":")
            if t == "user":
                link.attrib["href"] = u"/~{user}".format(user=login_name(user))
            elif t == "da":
                link.attrib["href"] = u"https://{user}.deviantart.com/".format(
                    user=_deviantart(user))
            elif t == "ib":
                link.attrib["href"] = u"https://inkbunny.net/{user}".format(
                    user=_inkbunny(user))
            elif t == "fa":
                link.attrib[
                    "href"] = u"https://www.furaffinity.net/user/{user}".format(
                        user=_furaffinity(user))
            elif t == "sf":
                link.attrib["href"] = u"https://{user}.sofurry.com/".format(
                    user=_sofurry(user))
            else:
                continue
            # Default the link text to the bare user name.
            if not link.text or link.text == href:
                link.text = user
    for parent in fragment.findall(".//*[img]"):
        for image in list(parent):
            if image.tag != "img":
                continue
            src = image.get("src")
            if src:
                t, _, user = src.partition(":")
                if t != "user":
                    # Ordinary image: keep it while the budget lasts,
                    # otherwise replace it with a plain link to its source.
                    if images_left:
                        images_left -= 1
                    else:
                        i = list(parent).index(image)
                        link = etree.Element(u"a")
                        link.tail = image.tail
                        src = image.get("src")
                        if src:
                            link.set(u"href", src)
                        link.text = image.attrib.get("alt", src)
                        parent[i] = link
                    continue
                # "user:" image: turn it into an avatar wrapped in a
                # profile link.
                image.set(u"src",
                          u"/~{user}/avatar".format(user=login_name(user)))
                link = etree.Element(u"a")
                link.set(u"href", u"/~{user}".format(user=login_name(user)))
                link.set(u"class", u"user-icon")
                parent.insert(list(parent).index(image), link)
                parent.remove(image)
                link.append(image)
                link.tail = image.tail
                if "alt" in image.attrib and image.attrib["alt"]:
                    # Show the alt text as a visible label next to the icon.
                    image.tail = u" "
                    label = etree.SubElement(link, u"span")
                    label.text = image.attrib["alt"]
                    del image.attrib["alt"]
                else:
                    image.tail = None
                    image.set(u"alt", user)
    add_user_links(fragment, None, True)
    defang(fragment)
    return fragment
def get_data(*args, **kwargs):
    """Download the full fundamentus.com.br screener table.

    Returns an OrderedDict mapping row index -> {ticker: {indicator: raw
    cell text}} with one entry per listed stock.
    """
    url = 'http://www.fundamentus.com.br/resultado.php'
    cookie_jar = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(
        urllib.request.HTTPCookieProcessor(cookie_jar))
    opener.addheaders = [
        ('User-agent',
         'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201'),
        ('Accept', 'text/html, text/plain, text/css, text/sgml, */*;q=0.01')
    ]
    # Aqui estão os parâmetros de busca das ações
    # Estão em branco para que retorne todas as disponíveis
    data = {
        'pl_min': '', 'pl_max': '',
        'pvp_min': '', 'pvp_max': '',
        'psr_min': '', 'psr_max': '',
        'divy_min': '', 'divy_max': '',
        'pativos_min': '', 'pativos_max': '',
        'pcapgiro_min': '', 'pcapgiro_max': '',
        'pebit_min': '', 'pebit_max': '',
        'fgrah_min': '', 'fgrah_max': '',
        'firma_ebit_min': '', 'firma_ebit_max': '',
        'margemebit_min': '', 'margemebit_max': '',
        'margemliq_min': '', 'margemliq_max': '',
        'liqcorr_min': '', 'liqcorr_max': '',
        'roic_min': '', 'roic_max': '',
        'roe_min': '', 'roe_max': '',
        'liq_min': '', 'liq_max': '',
        'patrim_min': '', 'patrim_max': '',
        'divbruta_min': '', 'divbruta_max': '',
        'tx_cresc_rec_min': '', 'tx_cresc_rec_max': '',
        'setor': '',
        'negociada': 'ON',
        'ordem': '1',
        'x': '28',
        'y': '16'
    }
    with opener.open(url,
                     urllib.parse.urlencode(data).encode('UTF-8')) as link:
        content = link.read().decode('ISO-8859-1')
    pattern = re.compile('<table id="resultado".*</table>', re.DOTALL)
    table_html = re.findall(pattern, content)[0]
    page = fragment_fromstring(table_html)
    # Indicator name for each data column, in table order (column 0 is the
    # ticker cell, handled separately).
    columns = [
        'cotacao', 'P/L', 'P/VP', 'PSR', 'DY', 'P/Ativo', 'P/Cap.Giro',
        'P/EBIT', 'P/Ativ.Circ.Liq.', 'EV/EBIT', 'EBITDA', 'Mrg. Ebit',
        'Mrg.Liq.', 'Liq.Corr.', 'ROIC', 'ROE', 'Liq.2m.', 'Pat.Liq',
        'Div.Brut/Pat.', 'Cresc.5a'
    ]
    lista = OrderedDict()
    rows = page.xpath('tbody')[0].findall("tr")
    for idx, row in enumerate(rows):
        cells = row.getchildren()
        ticker = cells[0][0].getchildren()[0].text
        lista[idx] = {
            ticker: {
                name: cells[pos].text
                for pos, name in enumerate(columns, start=1)
            }
        }
    return lista
def tree(self):
    """Parse ``self.html`` into an lxml element.

    NOTE(review): the original docstring said this returns a Scrapy
    ``Selector``, but ``html.fragment_fromstring`` returns an lxml
    ``HtmlElement`` — corrected here.

    Returns:
        lxml.html.HtmlElement: root element of the parsed fragment.
    """
    return html.fragment_fromstring(self.html)