def parse_item(self, item_node):
    """Implementation of a method called from NewsFeedParser.

    Extracts the fields of one feed item from its DOM node.

    Args:
        item_node: DOM element exposing ``getElementsByTagName``; the
            ``title``, ``content``, ``link`` (``href`` attribute) and
            ``updated`` children are read.

    Returns:
        dict with the keys ``title``, ``content``, ``image_banner``,
        ``image_thumbnail``, ``link`` and ``date_time``. A field whose
        source element is missing, empty or unparseable is ``''``.
    """

    def first_text(tag_name):
        """Return the value of the first child of the first ``tag_name``.

        Raises IndexError when the element or its first child is absent.
        A child that carries no value (e.g. a nested element) yields ''
        instead of None so callers always receive a string.
        """
        element = item_node.getElementsByTagName(tag_name)[0]
        return element.childNodes[0].nodeValue or ''

    parsed_item = {
        'title': '',
        'content': '',
        'image_banner': '',
        'image_thumbnail': '',
        'link': '',
        'date_time': '',
    }

    try:
        parsed_item['title'] = first_text('title')
    except IndexError:
        pass

    try:
        desc_html = html_unescape(first_text('content'))
    except IndexError:
        desc_html = ''

    # re.search instead of re.match with a leading '.*?': '.' does not
    # match newlines, so the anchored form only ever inspected the first
    # line of the description and missed anything after a line break.
    image = re.search(r'<img src="([^"]*)', desc_html)
    if image:
        # The same URL serves as both thumbnail and banner.
        parsed_item['image_thumbnail'] = image.group(1)
        parsed_item['image_banner'] = image.group(1)

    # Summary text: contents of the first <font> after the second <br>
    # inside the "lh" div, with bold markup stripped.
    summary = re.search(
        r'<div class="lh">.*?<br>.*?<br>.*?'
        r'<font[^>]*>(.*?)</font>', desc_html)
    if summary:
        parsed_item['content'] = re.sub(r'</?b>', '', summary.group(1))

    try:
        link_node = item_node.getElementsByTagName('link')[0]
        parsed_item['link'] = link_node.getAttribute('href')
    except IndexError:
        pass

    try:
        # Input looks like 2015-06-02T22:11:58Z; output like
        # Tue, 02 Jun 2015 22:11:58 GMT.
        updated = datetime.strptime(
            first_text('updated'), '%Y-%m-%dT%H:%M:%SZ')
        parsed_item['date_time'] = \
            updated.strftime('%a, %d %b %Y %H:%M:%S GMT')
    except (IndexError, ValueError):
        # ValueError: timestamp not in the expected format. Leave the
        # date empty rather than aborting the whole item.
        pass

    return parsed_item
def parse_item(self, node):
    """Implementation of abstract method defined in NewsFeedParser.

    Extracts the fields of one feed item from its DOM node.

    Args:
        node: DOM element exposing ``getElementsByTagName``; the
            ``title``, ``content``, ``link`` (``href`` attribute) and
            ``updated`` children are read.

    Returns:
        dict with the keys ``title``, ``content``, ``image_banner``,
        ``image_thumbnail``, ``link`` and ``date_time``. A field whose
        source element is missing, empty or unparseable is ``''``.
    """

    def first_text(tag_name):
        """Return the value of the first child of the first ``tag_name``.

        Raises IndexError when the element or its first child is absent.
        A child that carries no value (e.g. a nested element) yields ''
        instead of None so callers always receive a string.
        """
        element = node.getElementsByTagName(tag_name)[0]
        return element.childNodes[0].nodeValue or ''

    parsed_item = {
        'title': '',
        'content': '',
        'image_banner': '',
        'image_thumbnail': '',
        'link': '',
        'date_time': '',
    }

    try:
        parsed_item['title'] = first_text('title')
    except IndexError:
        pass

    try:
        desc_html = html_unescape(first_text('content'))
    except IndexError:
        desc_html = ''

    # re.search instead of re.match with a leading '.*?': '.' does not
    # match newlines, so the anchored form only ever inspected the first
    # line of the description and missed anything after a line break.
    image = re.search(r'<img src="([^"]*)', desc_html)
    if image:
        # The same URL serves as both thumbnail and banner.
        parsed_item['image_thumbnail'] = image.group(1)
        parsed_item['image_banner'] = image.group(1)

    # Summary text: contents of the first <font> after the second <br>
    # inside the "lh" div, with bold markup stripped.
    summary = re.search(
        r'<div class="lh">.*?<br>.*?<br>.*?'
        r'<font[^>]*>(.*?)</font>', desc_html)
    if summary:
        parsed_item['content'] = re.sub(r'</?b>', '', summary.group(1))

    try:
        link_node = node.getElementsByTagName('link')[0]
        parsed_item['link'] = link_node.getAttribute('href')
    except IndexError:
        pass

    try:
        # Input looks like 2015-06-02T22:11:58Z; output like
        # Tue, 02 Jun 2015 22:11:58 GMT.
        updated = datetime.strptime(
            first_text('updated'), '%Y-%m-%dT%H:%M:%SZ')
        parsed_item['date_time'] = \
            updated.strftime('%a, %d %b %Y %H:%M:%S GMT')
    except (IndexError, ValueError):
        # ValueError: timestamp not in the expected format. Leave the
        # date empty rather than aborting the whole item.
        pass

    return parsed_item
def parse_item(self, node):
    """Implementation of abstract method defined in NewsFeedParser.

    Extracts the fields of one feed item from its DOM node.

    Args:
        node: DOM element exposing ``getElementsByTagName``; the
            ``title``, ``link``, ``description`` and ``pubDate``
            children are read.

    Returns:
        dict with the keys ``title``, ``link``, ``content``,
        ``image_banner``, ``image_thumbnail`` and ``date_time``. A field
        whose source element is missing or empty is ``''``. ``date_time``
        is the ``pubDate`` text passed through unchanged
        (e.g. ``Tue, 02 Jun 2015 11:25:05 GMT``).
    """

    def first_text(tag_name):
        """Return the value of the first child of the first ``tag_name``.

        Raises IndexError when the element or its first child is absent.
        A child that carries no value (e.g. a nested element) yields ''
        instead of None so callers always receive a string.
        """
        element = node.getElementsByTagName(tag_name)[0]
        return element.childNodes[0].nodeValue or ''

    parsed_item = {
        'title': '',
        'link': '',
        'content': '',
        'image_banner': '',
        'image_thumbnail': '',
        'date_time': '',
    }

    # Fields copied verbatim from the text of a child element.
    plain_fields = (
        ('title', 'title'),
        ('link', 'link'),
        ('date_time', 'pubDate'),
    )
    for key, tag_name in plain_fields:
        try:
            parsed_item[key] = first_text(tag_name)
        except IndexError:
            pass

    try:
        desc_html = html_unescape(first_text('description'))
    except IndexError:
        desc_html = ''

    # re.search instead of re.match with a leading '.*?': '.' does not
    # match newlines, so the anchored form only ever inspected the first
    # line of the description and missed anything after a line break.
    image = re.search(r'<img src="([^"]*)', desc_html)
    if image:
        # The same URL serves as both thumbnail and banner.
        parsed_item['image_thumbnail'] = image.group(1)
        parsed_item['image_banner'] = image.group(1)

    # Summary text: contents of the first <font> after the second <br>
    # inside the "lh" div, with bold markup stripped.
    summary = re.search(
        r'<div class="lh">.*?<br>.*?<br>.*?'
        r'<font[^>]*>(.*?)</font>', desc_html)
    if summary:
        parsed_item['content'] = re.sub(r'</?b>', '', summary.group(1))

    return parsed_item
def parse_item(self, node):
    """Implementation of abstract method defined in NewsFeedParser.

    Extracts the fields of one feed item from its DOM node.

    Args:
        node: DOM element exposing ``getElementsByTagName``; the
            ``title``, ``link``, ``description`` and ``pubDate``
            children are read.

    Returns:
        dict with the keys ``title``, ``link``, ``content``,
        ``image_banner``, ``image_thumbnail`` and ``date_time``. A field
        whose source element is missing or empty is ``''``. ``date_time``
        is the ``pubDate`` text passed through unchanged
        (e.g. ``Tue, 02 Jun 2015 11:25:05 GMT``).
    """

    def first_text(tag_name):
        """Return the value of the first child of the first ``tag_name``.

        Raises IndexError when the element or its first child is absent.
        A child that carries no value (e.g. a nested element) yields ''
        instead of None so callers always receive a string.
        """
        element = node.getElementsByTagName(tag_name)[0]
        return element.childNodes[0].nodeValue or ''

    parsed_item = {
        'title': '',
        'link': '',
        'content': '',
        'image_banner': '',
        'image_thumbnail': '',
        'date_time': '',
    }

    # Fields copied verbatim from the text of a child element.
    plain_fields = (
        ('title', 'title'),
        ('link', 'link'),
        ('date_time', 'pubDate'),
    )
    for key, tag_name in plain_fields:
        try:
            parsed_item[key] = first_text(tag_name)
        except IndexError:
            pass

    try:
        desc_html = html_unescape(first_text('description'))
    except IndexError:
        desc_html = ''

    # re.search instead of re.match with a leading '.*?': '.' does not
    # match newlines, so the anchored form only ever inspected the first
    # line of the description and missed anything after a line break.
    image = re.search(r'<img src="([^"]*)', desc_html)
    if image:
        # The same URL serves as both thumbnail and banner.
        parsed_item['image_thumbnail'] = image.group(1)
        parsed_item['image_banner'] = image.group(1)

    # Summary text: contents of the first <font> after the second <br>
    # inside the "lh" div, with bold markup stripped.
    summary = re.search(r'<div class="lh">.*?<br>.*?<br>.*?'
                        r'<font[^>]*>(.*?)</font>', desc_html)
    if summary:
        parsed_item['content'] = re.sub(r'</?b>', '', summary.group(1))

    return parsed_item
def parse_item(self, node):
    """Implementation of abstract method defined in NewsFeedParser.

    Extracts the fields of one feed item from its DOM node.

    Args:
        node: DOM element exposing ``getElementsByTagName``; the
            ``title``, ``link``, ``description`` and ``pubDate``
            children are read.

    Returns:
        dict with the keys ``title``, ``link``, ``content``,
        ``image_banner``, ``image_thumbnail`` and ``date_time``. A field
        whose source element is missing or empty is ``""``. ``date_time``
        is the ``pubDate`` text passed through unchanged
        (e.g. ``Tue, 02 Jun 2015 11:25:05 GMT``).
    """

    def first_text(tag_name):
        """Return the value of the first child of the first ``tag_name``.

        Raises IndexError when the element or its first child is absent.
        A child that carries no value (e.g. a nested element) yields ""
        instead of None so callers always receive a string.
        """
        element = node.getElementsByTagName(tag_name)[0]
        return element.childNodes[0].nodeValue or ""

    parsed_item = {
        "title": "",
        "link": "",
        "content": "",
        "image_banner": "",
        "image_thumbnail": "",
        "date_time": "",
    }

    # Fields copied verbatim from the text of a child element.
    plain_fields = (
        ("title", "title"),
        ("link", "link"),
        ("date_time", "pubDate"),
    )
    for key, tag_name in plain_fields:
        try:
            parsed_item[key] = first_text(tag_name)
        except IndexError:
            pass

    try:
        desc_html = html_unescape(first_text("description"))
    except IndexError:
        desc_html = ""

    # re.search instead of re.match with a leading '.*?': '.' does not
    # match newlines, so the anchored form only ever inspected the first
    # line of the description and missed anything after a line break.
    image = re.search(r'<img src="([^"]*)', desc_html)
    if image:
        # The same URL serves as both thumbnail and banner.
        parsed_item["image_thumbnail"] = image.group(1)
        parsed_item["image_banner"] = image.group(1)

    # Summary text: contents of the first <font> after the second <br>
    # inside the "lh" div, with bold markup stripped.
    summary = re.search(r'<div class="lh">.*?<br>.*?<br>.*?'
                        r"<font[^>]*>(.*?)</font>", desc_html)
    if summary:
        parsed_item["content"] = re.sub(r"</?b>", "", summary.group(1))

    return parsed_item
def test_html_unescape_no_double_unescape(self):
    """html_unescape turns an escaped ``<br>`` tag into the literal tag."""
    escaped = "&lt;br&gt;"
    self.assertEqual("<br>", html_unescape(escaped))
def test_html_unescape(self):
    """html_unescape maps the escaped fixture onto the unescaped fixture."""
    expected = TestUtil.unescaped_html
    actual = html_unescape(TestUtil.escaped_html)
    self.assertEqual(expected, actual)