Example #1
0
    def parse_item(self, item_node):
        """Convert one feed entry DOM node into a flat dict of strings.

        Implementation of a method called from NewsFeedParser.

        Args:
            item_node: DOM element exposing ``getElementsByTagName``; its
                ``title``, ``content``, ``link`` (``href`` attribute) and
                ``updated`` children are read.

        Returns:
            dict with the keys ``title``, ``content``, ``image_banner``,
            ``image_thumbnail``, ``link`` and ``date_time``.  A field whose
            element is missing or empty is set to ``''``; ``date_time`` is
            also ``''`` when the timestamp cannot be parsed.
        """
        parsed_item = {}
        try:
            title_node = item_node.getElementsByTagName('title')[0]
            parsed_item['title'] = title_node.childNodes[0].nodeValue
        except IndexError:
            # No <title> element, or the element has no child node.
            parsed_item['title'] = ''

        # Defaults; overwritten below only when the content markup matches.
        parsed_item['content'] = ''
        parsed_item['image_banner'] = ''
        parsed_item['image_thumbnail'] = ''
        try:
            content_node = item_node.getElementsByTagName('content')[0]
            desc_html = html_unescape(content_node.childNodes[0].nodeValue)
            # The first <img src="..."> serves as both thumbnail and banner.
            match = re.match(r'.*?<img src="([^"]*).*', desc_html)
            if match:
                parsed_item['image_thumbnail'] = match.group(1)
                parsed_item['image_banner'] = match.group(1)
            # The text is taken from the <font> element that follows two
            # <br> tags inside <div class="lh">.
            match = re.match(
                r'.*?<div class="lh">.*?<br>.*?<br>.*?'
                r'<font[^>]*>(.*?)</font>.*', desc_html)
            if match:
                # Drop <b>/</b> markers from the extracted text.
                parsed_item['content'] = re.sub(r'</?b>', '', match.group(1))
        except IndexError:
            pass

        try:
            link_node = item_node.getElementsByTagName('link')[0]
            parsed_item['link'] = link_node.getAttribute('href')
        except IndexError:
            parsed_item['link'] = ''

        try:
            updated_node = item_node.getElementsByTagName('updated')[0]
            updated = updated_node.childNodes[0].nodeValue
            # Input looks like 2015-06-02T22:11:58Z and is re-rendered as
            # e.g. "Tue, 02 Jun 2015 22:11:58 GMT".
            parsed_item['date_time'] = \
                datetime(*strptime(updated, '%Y-%m-%dT%H:%M:%SZ')[0:6]) \
                    .strftime('%a, %d %b %Y %H:%M:%S GMT')
        except (IndexError, ValueError):
            # IndexError: element missing or empty.  ValueError: strptime
            # rejected a timestamp in another layout (numeric UTC offset,
            # fractional seconds, ...).  Both are treated as "no date"
            # instead of aborting the whole item.
            parsed_item['date_time'] = ''

        return parsed_item
    def parse_item(self, node):
        """Build a flat dict of strings from one feed entry DOM node.

        Implementation of the abstract method defined in NewsFeedParser.

        Args:
            node: DOM element exposing ``getElementsByTagName``.  The
                children ``title``, ``content``, ``link`` (its ``href``
                attribute) and ``updated`` are consulted.

        Returns:
            dict with the keys ``title``, ``content``, ``image_banner``,
            ``image_thumbnail``, ``link`` and ``date_time``.  Every value
            falls back to ``''`` when its element is absent or empty;
            ``date_time`` also falls back to ``''`` when the timestamp is
            not in the expected layout.
        """
        parsed_item = {}
        try:
            parsed_item['title'] = \
                node.getElementsByTagName('title')[0] \
                    .childNodes[0].nodeValue
        except IndexError:
            # <title> is missing or has no child node.
            parsed_item['title'] = ''

        # Start from empty values; the content block below fills them in
        # only when its patterns match.
        parsed_item['content'] = ''
        parsed_item['image_banner'] = ''
        parsed_item['image_thumbnail'] = ''
        try:
            desc_raw = \
                node.getElementsByTagName('content')[0] \
                    .childNodes[0].nodeValue
            desc_html = html_unescape(desc_raw)
            # First <img src="..."> is used for thumbnail and banner alike.
            match = re.match(r'.*?<img src="([^"]*).*', desc_html)
            if match:
                parsed_item['image_thumbnail'] = match.group(1)
                parsed_item['image_banner'] = match.group(1)
            # Text lives in the <font> element after two <br> tags inside
            # <div class="lh">.
            match = re.match(r'.*?<div class="lh">.*?<br>.*?<br>.*?'
                             r'<font[^>]*>(.*?)</font>.*', desc_html)
            if match:
                # Remove <b>/</b> tags from the captured text.
                parsed_item['content'] = re.sub(r'</?b>', '', match.group(1))
        except IndexError:
            pass

        try:
            parsed_item['link'] = \
                node.getElementsByTagName('link')[0].getAttribute('href')
        except IndexError:
            parsed_item['link'] = ''

        try:
            updated = \
                node.getElementsByTagName('updated')[0] \
                    .childNodes[0].nodeValue
            # 2015-06-02T22:11:58Z
            parsed_time = strptime(updated, '%Y-%m-%dT%H:%M:%SZ')
            parsed_item['date_time'] = \
                datetime(*parsed_time[0:6]) \
                    .strftime('%a, %d %b %Y %H:%M:%S GMT')
        except (IndexError, ValueError):
            # ValueError comes from strptime when the timestamp does not
            # match the layout above (e.g. fractional seconds).  It gets
            # the same fallback as a missing element so that one odd date
            # does not abort parsing of the item.
            parsed_item['date_time'] = ''

        return parsed_item
 def parse_item(self, node):
     """Implementation of abstract method defined in NewsFeedParser"""
     parsed_item = {}
     try:
         title_node = node.getElementsByTagName('title')[0]
         parsed_item['title'] = title_node.childNodes[0].nodeValue
     except IndexError:
         parsed_item['title'] = ''
     try:
         link_node = node.getElementsByTagName('link')[0]
         parsed_item['link'] = link_node.childNodes[0].nodeValue
     except IndexError:
         parsed_item['link'] = ''
     try:
         parsed_item['content'] = ''
         parsed_item['image_banner'] = ''
         parsed_item['image_thumbnail'] = ''
         description_node = node.getElementsByTagName('description')[0]
         desc_raw = description_node.childNodes[0].nodeValue
         desc_html = html_unescape(desc_raw)
         match = re.match(r'.*?<img src="([^"]*).*', desc_html)
         if match:
             parsed_item['image_thumbnail'] = match.group(1)
             parsed_item['image_banner'] = match.group(1)
         match = re.match(
             r'.*?<div class="lh">.*?<br>.*?<br>.*?'
             r'<font[^>]*>(.*?)</font>.*', desc_html)
         if match:
             parsed_item['content'] = re.sub(r'</?b>', '', match.group(1))
     except IndexError:
         pass
     try:
         pub_date_node = node.getElementsByTagName('pubDate')[0]
         parsed_item['date_time'] = pub_date_node.childNodes[0].nodeValue
         # date_time: Tue, 02 Jun 2015 11:25:05 GMT
     except IndexError:
         parsed_item['date_time'] = ''
     return parsed_item
 def parse_item(self, node):
     """Extract the fields of one feed item DOM node into a dict.

     Implementation of the abstract method defined in NewsFeedParser.
     Reads the ``title``, ``link``, ``description`` and ``pubDate``
     children of *node*; any of them that is missing or empty yields
     ``''`` for the corresponding key.
     """
     parsed_item = {}

     # Plain text fields: result key -> tag holding the text.
     for key, tag in (('title', 'title'), ('link', 'link')):
         try:
             value = node.getElementsByTagName(tag)[0].childNodes[0].nodeValue
         except IndexError:
             value = ''
         parsed_item[key] = value

     # Fields derived from the description markup start out empty.
     for key in ('content', 'image_banner', 'image_thumbnail'):
         parsed_item[key] = ''
     try:
         desc_raw = \
             node.getElementsByTagName('description')[0] \
                 .childNodes[0].nodeValue
         desc_html = html_unescape(desc_raw)
         img_match = re.match(r'.*?<img src="([^"]*).*', desc_html)
         if img_match:
             # One image URL serves as both thumbnail and banner.
             for key in ('image_thumbnail', 'image_banner'):
                 parsed_item[key] = img_match.group(1)
         text_match = re.match(
             r'.*?<div class="lh">.*?<br>.*?<br>.*?'
             r'<font[^>]*>(.*?)</font>.*',
             desc_html)
         if text_match:
             # Bold tags are removed from the captured text.
             without_bold = re.sub(r'</?b>', '', text_match.group(1))
             parsed_item['content'] = without_bold
     except IndexError:
         pass

     try:
         # date_time: Tue, 02 Jun 2015 11:25:05 GMT
         date_text = \
             node.getElementsByTagName('pubDate')[0].childNodes[0].nodeValue
     except IndexError:
         date_text = ''
     parsed_item['date_time'] = date_text

     return parsed_item
 def parse_item(self, node):
     """Map one feed item DOM node onto a dict of strings.

     Implementation of the abstract method defined in NewsFeedParser.
     Every key starts as ``""`` and is overwritten only when the matching
     element (``title``, ``link``, ``description``, ``pubDate``) is
     present and non-empty.
     """
     parsed_item = {
         "title": "",
         "link": "",
         "content": "",
         "image_banner": "",
         "image_thumbnail": "",
         "date_time": "",
     }

     try:
         title_element = node.getElementsByTagName("title")[0]
         parsed_item["title"] = title_element.childNodes[0].nodeValue
     except IndexError:
         pass  # keep the empty default

     try:
         link_element = node.getElementsByTagName("link")[0]
         parsed_item["link"] = link_element.childNodes[0].nodeValue
     except IndexError:
         pass  # keep the empty default

     try:
         description_element = node.getElementsByTagName("description")[0]
         desc_html = html_unescape(description_element.childNodes[0].nodeValue)

         # First <img src="..."> doubles as thumbnail and banner.
         found = re.match(r'.*?<img src="([^"]*).*', desc_html)
         if found:
             parsed_item["image_thumbnail"] = found.group(1)
             parsed_item["image_banner"] = found.group(1)

         # Text of the <font> element after two <br> tags in <div class="lh">.
         found = re.match(
             r'.*?<div class="lh">.*?<br>.*?<br>.*?' r"<font[^>]*>(.*?)</font>.*",
             desc_html,
         )
         if found:
             parsed_item["content"] = re.sub(r"</?b>", "", found.group(1))
     except IndexError:
         pass  # no usable description: content and images stay empty

     try:
         pub_date_element = node.getElementsByTagName("pubDate")[0]
         # date_time: Tue, 02 Jun 2015 11:25:05 GMT
         parsed_item["date_time"] = pub_date_element.childNodes[0].nodeValue
     except IndexError:
         pass  # keep the empty default

     return parsed_item
Example #6
0
 def test_html_unescape_no_double_unescape(self):
     """A doubly escaped ``<br>`` must be unescaped by one level only."""
     twice_escaped = "&amp;lt;br&amp;gt;"
     self.assertEqual("&lt;br&gt;", html_unescape(twice_escaped))
Example #7
0
 def test_html_unescape(self):
     """Unescaping the escaped fixture must give the unescaped fixture."""
     actual = html_unescape(TestUtil.escaped_html)
     expected = TestUtil.unescaped_html
     self.assertEqual(expected, actual)
Example #8
0
 def test_html_unescape_no_double_unescape(self):
     """``&amp;lt;`` must come back as ``&lt;``, not as a bare ``<``."""
     unescaped_once = html_unescape("&amp;lt;br&amp;gt;")
     # Only the outer layer of escaping may be removed.
     self.assertEqual("&lt;br&gt;", unescaped_once)
Example #9
0
 def test_html_unescape(self):
     """html_unescape maps TestUtil.escaped_html to TestUtil.unescaped_html."""
     result = html_unescape(TestUtil.escaped_html)
     self.assertEqual(TestUtil.unescaped_html, result)