def __init__(self):
    """Point the scraper at the news index and set up its helpers."""
    Scraper.__init__(self)
    # HTML entity decoder and persistence layer, then the target URL.
    self.html = HTMLParser()
    self.news_provider = NewsProvider()
    self.url = "https://berniesanders.com/news/"
class NewsScraper(Scraper): def __init__(self): Scraper.__init__(self) self.url = "https://berniesanders.com/news/" self.html = HTMLParser() self.news_provider = NewsProvider() def retrieve_article(self, url): for x in range(3): r = requests.get(url) if "https://berniesanders.com" not in r.url: return r.url, False, False if r.status_code == 200: soup = BeautifulSoup(r.text) soup = self.sanitize_soup(soup) image = soup.find('meta', {'property': 'og:image'})['content'] content = soup.article paragraphs = [self.html.unescape(self.replace_with_newlines(p)) for p in content.findAll("p")] text = "\n\n".join(paragraphs) html = "".join([str(p) for p in content.findAll("p")]) return text, html, image return False, False, False def go(self): soup = self.get(self.url) content = soup.find("section", {"id": "content"}) for article in content.findAll("article"): rec = { "news_id": article['id'], "image_url": "", "timestamp_publish": parser.parse(article.time["datetime"]), "site": "berniesanders.com", "lang": "en", "title": self.html.unescape(article.h2.text), "news_category": self.html.unescape(article.h1.string.strip()), "url": article.h2.a["href"] } if article.img is not None: rec["image_url"] = article.img["src"] # Pull excerpt if available try: rec["excerpt_html"] = str(article.p) rec["excerpt"] = self.html.unescape(article.p.text) except AttributeError: rec["excerpt"], rec["excerpt_html"] = "", "" # Determine Type if rec['news_category'].lower() in ["on the road", "news"]: rec['news_type'] = "News" elif rec['news_category'].lower() == "press release": rec['news_type'] = "PressRelease" else: rec['news_type'] = "Unknown" text, html, image = self.retrieve_article(rec["url"]) if text and not html: rec["body"], rec["body_html"] = text, text rec['news_type'] = "ExternalLink" rec["body_html_nostyle"] = "" elif text and html: rec["body"], rec["body_html"] = text, html no_style = self.remove_style(BeautifulSoup(html)) rec["body_html_nostyle"] = "".join([str(p) for p in 
no_style.findAll("p")]) try: article["image_url"] except KeyError: article["image_url"] = image msg = "" if self.news_provider.exists_by_news_id(rec["news_id"]): print "found" else: print "not found" msg = "Inserting '{0}', created {1}" self.news_provider.create(rec) logging.info(msg.format( rec["title"].encode("utf8"), str(rec["timestamp_publish"]) ))
def __init__(self, url):
    """Initialize scraper state for the given feed ``url``."""
    Scraper.__init__(self)
    # Decoder first, then providers, then the caller-supplied target.
    self.html = HTMLParser()
    self.news_provider = NewsProvider()
    self.push_provider = PushProvider()
    self.url = url
@app.route('/issue/<uuid:issue_uuid>', methods=['GET', 'POST'])
@auth.login_required
def issue_detail(issue_uuid):
    """Render an issue's detail page; on POST, apply the submitted update."""
    issue = issue_provider.read(issue_uuid)
    updated = False
    if request.method == 'POST' and issue_provider.update(issue, request):
        updated = True
    return render_template('issue.html', issue=issue, updated=updated)


if __name__ == '__main__':
    configfile = '/opt/bernie/config.yml'
    try:
        with open(configfile, 'r') as f:
            # safe_load: yaml.load on a config file is unsafe (arbitrary
            # object construction) and deprecated without a Loader.
            conf = yaml.safe_load(f)['flask']
    except IOError:
        # BUG FIX: the original logged `self.configfile`, but `self` does not
        # exist at module level — that NameError masked the real failure.
        logging.info("Could not open config file: {0}".format(configfile))
        raise
    else:
        event_provider = EventProvider()
        issue_provider = IssueProvider()
        video_provider = VideoProvider()
        article_provider = ArticleProvider()
        news_provider = NewsProvider()
        push_provider = PushProvider()
        users = {conf['httpauth_username']: conf['httpauth_password']}
        # BUG FIX: register Parse credentials BEFORE app.run(); app.run()
        # blocks, so the original register() call never executed while the
        # server was up.
        register(conf['parse_application_id'],
                 conf['parse_rest_api_key'],
                 conf['parse_master_key'])
        # Push.message("Good morning", channels=["Mike Testing"])
        app.run(host=conf['host'], debug=conf['debug'])