def rss_file_true(self, file): try: if file.endswith('.rss'): dprint(file, "RSS file:") self.file_list.append(file) except Exception as e: print("[RSS](rss_file_true) Error: %s" % e)
def get_entries(self): try: self.list_rss_file() dprint( "RESUME FROM ALL NEWS ITEMS IN RSS MERGED TO MAKE SLIDESHOW") dprint("%s" % self.news_dict) return self.news_dict except Exception as e: print("[RSS](__init__)Error: %s" % e) return {} #def get_entries
def parse_rss_list(self, file_list_to_parse): try: #print (file_list_to_parse) if len(file_list_to_parse) == 0: dprint("Can't find rss file, please check it.") else: for file in file_list_to_parse: dprint( "\n\n[parse_rss_list] Analizing: {file_name}".format( file_name=file)) news_extracted = self.parse_rss_file(file) self.merge_news_dict(news_extracted) except Exception as e: print("[RSS][parse_rss_list]Error: %s" % e)
def list_rss_file(self, directory=None): '''Test if path is a file or directoy and execute rss_file_true for every files''' try: self.file_list = [] news_extract = {} url = "http://localhost/news/rss/" rss_decoded = tempfile.NamedTemporaryFile().name try: urllib.request.urlretrieve(url, rss_decoded) del self.file_list[:] self.news_dict.clear() with open(rss_decoded, 'rt', encoding='utf-8') as f: if 'rss' in f.read(): self.file_list.append(rss_decoded) except Exception as e: print("[RSS](list_rss_file)Can't get RSS from URL: %s" % e) dprint("List RSS files: %s" % self.file_list, "[RSS](rss_file_true)") self.parse_rss_list(self.file_list) except Exception as e: print("[RSS](list_rss_file)Error: %s" % e)
def parse_rss_file(self, rssfile): try: rss = feedparser.parse(rssfile) feed_entries = rss.entries dprint_values = { 'title': rss.feed.title, 'link': rss.feed.link, 'length': len(feed_entries) } dprint(''' ######################################## Detail from RSS Title: {title} Link: {link} News Item Number: {length} ########################################'''.format(**dprint_values)) entry_element = 0 element = {} for entry in feed_entries: dprint("\n\nNEWS ITEM {entry_number}".format( entry_number=entry_element + 1)) dprint(entry) dprint("------------------------------") article_subtitle = entry.subtitle if 'subtitle' in entry else "None" if 'content' in entry: content = entry.summary content_type = entry.content.type if "type" in entry.content else "Null" content_base = entry.content.base if "base" in entry.content else "Null" content_value = entry.content.value if "value" in entry.content else "Null" else: content = "None" dprint(''' Title: {0.title} - Link:[{0.link}] Subtitle: {article_subtitle} Id: {0.id} Author: {0.author} Published at: {0.published} Updated: {0.updated} Content: {content} Content Type: {content_type} Content Base: {content_base} Content Value: {content_value} Content XML: {content} Description: {0.description} '''.format(entry, article_subtitle=article_subtitle, content=content, content_type=content_type, content_base=content_base, content_value=content_value)) # Generate dictionary with content of news # parser content variable, because is a string dprint( "------------------------------\nRESUME FROM THIS NEWS ITEM:" ) element[entry_element] = {} soup = BeautifulSoup(content, "html.parser") element[entry_element]['title'] = entry.title element[entry_element]['type'] = "None" html = False for item in soup: if 'kg-card-begin: html' in item or 'kg-card-begin: markdown' in item: html = True break if html: element[entry_element]['text'] = '' for item in soup: if 'kg-card-begin: html' not in item and 'kg-card-end: html' not in item and 'kg-card-begin: markdown' not in item and 'kg-card-end: markdown' not in item: try: element[entry_element]['text'] = element[ entry_element]['text'] + '%s' % (item) except Exception as e: print( "[RSS](parse_rss_file)(Class Beautiful)Error: %s" % e) element[entry_element]['type'] = 'html' dprint("Notice HTML: {}".format( element[entry_element]['text'])) else: #Twitter is a text in rss news for item in soup.find_all("p"): try: if soup.find_all("p"): element[entry_element]['text'] = '%s' % ( item.text.strip()) element[entry_element]['type'] = 'text' dprint("Notice Text: {}".format( item.text.strip())) else: element[entry_element]['text'] = 'Not detected' dprint("Notice Text: Not detected") except Exception as e: print( "[RSS](parse_rss_file)(Class Beautiful)Error: %s" % e) # two checks are necessary to confirm or not type if soup.find_all("video"): if soup.find_all("video"): for item in soup.find_all("video"): try: element[entry_element]['video'] = item['src'] element[entry_element]['type'] = 'video' dprint("Notice Video: {}".format(item['src'])) except Exception as e: dprint( "[RSS](parse_rss_file)(Class Beautiful)Error: %s" % e) else: element[entry_element]['video'] = 'Not detected' dprint("Notice Video: Not detected") # two checks are necessary to confirm or not type # support to YouTube and Vimeo if soup.find_all("iframe"): if soup.find_all("iframe"): for item in soup.find_all("iframe"): try: if 'youtube' in item['src']: element[entry_element]['type'] = 'youtube' element[entry_element]['src'] = item['src'] if 'embed/' in item['src']: element[entry_element]['id'] = item[ 'src'].split("embed/")[1].split( "?")[0] else: element[entry_element]['id'] = None element[entry_element]['width'] = item[ 'width'] element[entry_element]['height'] = item[ 'height'] else: element[entry_element]['iframe'] = item element[entry_element]['type'] = 'iframe' dprint( "Element Entry YouTube/iframe: {}".format( element[entry_element])) dprint("Notice Iframe: {}".format(item)) except Exception as e: dprint( "[RSS](parse_rss_file)(Class Beautiful)Error: %s" % e) else: element[entry_element]['iframe'] = 'Not detected' dprint("Notice Iframe: Not detected") if soup.find_all("figure"): if soup.find_all("img"): # List of images add support to images gallery element[entry_element]['image'] = [] element[entry_element]['type'] = 'image' for item in soup.find_all("img"): try: element[entry_element]['image'].append( item['src']) except Exception as e: dprint( "[RSS](parse_rss_file)(Class Beautiful)Error: %s" % e) dprint("Notice Image: %s" % (element[entry_element])) else: element[entry_element]['image'] = 'Not detected' if element[entry_element]['type'] == 'iframe': pass elif element[entry_element]['type'] == 'text': pass else: dprint("Notice Image: Not detected") entry_element = entry_element + 1 dprint('\n\n') return element except Exception as e: print("[RSS](parse_rss_file)Error: %s" % e)