def go(self): soup = self.get(self.url) content = soup.find("section", {"id": "content"}) for article in content.findAll("article"): rec = { "article_id": article['id'], "image_url": "", "timestamp_publish": parser.parse(article.time["datetime"]), "site": "berniesanders.com", "lang": "en", "article_type": "DemocracyDaily", "excerpt_html": str(article.find( "div", {"class": "excerpt"}).p), "excerpt": self.html.unescape( article.find( "div", {"class": "excerpt"}).p.text), "title": self.html.unescape(article.h2.text), "article_category": self.html.unescape(article.h1.string.strip()), "url": article.h2.a["href"] } if article.img is not None: rec["image_url"] = article.img["src"] text, html, image = self.retrieve_article(rec["url"]) if text and not html: rec["body"], rec["body_html"] = text, text rec['article_type'] = "ExternalLink" rec["body_html_nostyle"] = "" elif text and html: rec["body"], rec["body_html"] = text, html no_style = self.remove_style(BeautifulSoup(html)) rec["body_html_nostyle"] = "".join([str(p) for p in no_style.findAll("p")]) try: article["image_url"] except KeyError: article["image_url"] = image rec['body_markdown'] = convert_markdown (rec['body_html']) msg = "" if self.article_provider.exists_by_article_id(rec["article_id"]): print "found" else: print "not found" msg = "Inserting '{0}', created {1}" result = self.article_provider.create(rec) self.push_provider.create_by_foreign_model(result) logging.info(msg.format( rec["title"].encode("utf8"), str(rec["timestamp_publish"]) ))
def retrieve(self, record):
    """Fetch the full article page for *record* and fill in its body fields.

    Sets record["image_url"], record["body"] (plain text, one line per
    string node / <br>), and record["body_markdown"]. Returns the record.
    """
    soup = self.get(record["url"])
    # Pull the lead image from the Open Graph meta tag, e.g.
    # <meta property="og:image" content="https://berniesanders.com/wp-content/uploads/2015/07/072615_Bernie_NewOrleans-4382.jpg"/>
    meta_image = soup.findAll(attrs={"property": "og:image"})
    # BUG FIX: guard against pages without an og:image tag, which made
    # meta_image[0] raise IndexError and abort the whole scrape.
    if meta_image:
        record["image_url"] = meta_image[0]["content"].encode('utf8')
    # Reset soup to the sanitized main content section.
    soup = self.sanitize_soup(soup.find("section", {"id": "content"}))
    # Drop every inline <style> element from the article body.
    while soup.article.style is not None:
        soup.article.style.extract()
    text = []
    for elem in soup.article.recursiveChildGenerator():
        if isinstance(elem, types.StringTypes):
            text.append(self.html.unescape(elem.strip()))
        elif elem.name == 'br':
            # Preserve explicit line breaks as empty lines.
            text.append("")
    record["body"] = "\n".join(text)
    record['body_markdown'] = convert_markdown(str(soup.article))
    return record
def retrieve(self, record):
    """Download record["url"] and populate image, body and markdown fields."""
    page = self.get(record["url"])
    # The lead image lives in the Open Graph meta tag:
    # <meta property="og:image" content="https://berniesanders.com/wp-content/uploads/2015/07/072615_Bernie_NewOrleans-4382.jpg"/>
    og_tags = page.findAll(attrs={"property": "og:image"})
    record["image_url"] = og_tags[0]["content"].encode('utf8')
    # Narrow the soup down to the sanitized main content section.
    body = self.sanitize_soup(page.find("section", {"id": "content"}))
    # Remove every inline <style> element from the article.
    while body.article.style is not None:
        body.article.style.extract()
    lines = []
    for node in body.article.recursiveChildGenerator():
        if isinstance(node, types.StringTypes):
            lines.append(self.html.unescape(node.strip()))
        elif node.name == 'br':
            # An explicit <br> becomes an empty line in the plain text.
            lines.append("")
    record["body"] = "\n".join(lines)
    record['body_markdown'] = convert_markdown(str(body.article))
    return record
def go(self): soup = self.get(self.url) content = soup.find("section", {"id": "content"}) for article in content.findAll("article"): rec = { "article_id": article['id'], "image_url": "", "body": "", "timestamp_publish": self.choose_publish_date(article.time["datetime"]), "site": "berniesanders.com", "lang": "en", "article_type": "DemocracyDaily", "excerpt": self.html.unescape( article.find("div", { "class": "excerpt" }).p.text), "title": self.html.unescape(article.h2.text), "article_category": self.html.unescape(article.h1.string.strip()), "url": article.h2.a["href"] } if article.img is not None: rec["image_url"] = article.img["src"] text, html, image = self.retrieve_article(rec["url"]) if text and not html: rec["body"] = text rec["body_markdown"] = text rec['article_type'] = "ExternalLink" elif text and html: rec["body"] = text rec['body_markdown'] = convert_markdown(html) exit(0) try: article["image_url"] except KeyError: article["image_url"] = image msg = "" if self.article_provider.exists_by_article_id(rec["article_id"]): print "found" else: print "not found" msg = "Inserting '{0}', created {1}" result = self.article_provider.create(rec) self.push_provider.create_by_foreign_model(result) logging.info( msg.format(rec["title"].encode("utf8"), str(rec["timestamp_publish"])))
def go(self): soup = self.get(self.url) content = soup.find("section", {"id": "content"}) for article in content.findAll("article"): rec = { "news_id": article['id'], "image_url": "", "timestamp_publish": parser.parse(article.time["datetime"]), "site": "berniesanders.com", "lang": "en", "title": self.html.unescape(article.h2.text), "news_category": self.html.unescape(article.h1.string.strip()), "url": article.h2.a["href"] } if article.img is not None: rec["image_url"] = article.img["src"] # Pull excerpt if available try: rec["excerpt_html"] = str(article.p) rec["excerpt"] = self.html.unescape(article.p.text) except AttributeError: rec["excerpt"], rec["excerpt_html"] = "", "" # Determine Type if rec['news_category'].lower() in ["on the road", "news"]: rec['news_type'] = "News" elif rec['news_category'].lower() == "press release": rec['news_type'] = "PressRelease" else: rec['news_type'] = "Unknown" text, html, image = self.retrieve_article(rec["url"]) if text and not html: rec["body"], rec["body_html"] = text, text rec['news_type'] = "ExternalLink" rec["body_html_nostyle"] = "" elif text and html: rec["body"], rec["body_html"] = text, html no_style = self.remove_style(BeautifulSoup(html)) rec["body_html_nostyle"] = "".join([str(p) for p in no_style.findAll("p")]) try: article["image_url"] except KeyError: article["image_url"] = image rec['body_markdown'] = convert_markdown (rec['body_html']) msg = "" if self.news_provider.exists_by_news_id(rec["news_id"]): print "found" else: print "not found" msg = "Inserting '{0}', created {1}" result = self.news_provider.create(rec) self.push_provider.create_by_foreign_model(result) logging.info(msg.format( rec["title"].encode("utf8"), str(rec["timestamp_publish"]) ))
def go(self): soup = self.get(self.url) try: lang = soup.html['lang'] except KeyError as e: lang = 'en' content = soup.find("section", {"id": "content"}) for article in content.findAll("article"): rec = { "news_id": article['id'], "body": "", "image_url": "", "timestamp_publish": self.choose_publish_date(article.time["datetime"]), "site": "berniesanders.com", "lang": lang, "title": self.html.unescape(article.h2.text), "news_category": self.html.unescape(article.h1.string.strip()), "url": article.h2.a["href"] } if article.img is not None: rec["image_url"] = article.img["src"] # Pull excerpt if available try: rec["excerpt"] = self.html.unescape(article.p.text) except AttributeError: rec["excerpt"] = "" # Determine Type if rec['news_category'].lower() in ["on the road", "news"]: rec['news_type'] = "News" elif rec['news_category'].lower() in [ "press release", "comunicados de prensa" ]: rec['news_type'] = "PressRelease" else: rec['news_type'] = "Unknown" text, html, image = self.retrieve_article(rec["url"]) if text and not html: rec["body"], rec["body_markdown"] = text, text rec['news_type'] = "ExternalLink" elif text and html: rec["body"] = text rec['body_markdown'] = convert_markdown(html) try: article["image_url"] except KeyError: article["image_url"] = image msg = "" if self.news_provider.exists_by_news_id(rec["news_id"]): print "found" else: print "not found" msg = "Inserting '{0}', created {1}" result = self.news_provider.create(rec) self.push_provider.create_by_foreign_model(result) logging.info( msg.format(rec["title"].encode("utf8"), str(rec["timestamp_publish"])))