def __init__(self):
    Scraper.__init__(self)
    c = self.config["bsd"]
    self.html = HTMLParser()
    self.call_path = "/page/api/event/search_events"
    self.params = {
        "api_ver": "2",
        "api_id": c["api_id"],
        "api_ts": str(int(time.time()))
    }
    self.signed_params = self.sign_params(c["api_secret"])
    self.url = "".join(
        [c["endpoint"], self.call_path, "?", self.signed_params])
    self.map = {"event_id": "original_id", "start_dt": "start_time"}
    self.event_provider = EventProvider()
    self.push_provider = PushProvider()
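# sign_params() is defined elsewhere on the scraper and is not shown here.
# A minimal sketch of a BSD-style signer follows for reference; it assumes
# the Blue State Digital HMAC-SHA1 scheme (api_id, timestamp, call path and
# query string joined by newlines). The helper name, argument list, and the
# exact string construction are assumptions, not the project's code.
import hashlib
import hmac


def sign_params_sketch(api_id, api_secret, api_ts, call_path, params):
    # Build the query string in a fixed order so the signature is stable.
    unsigned = "&".join(["%s=%s" % (k, params[k])
                         for k in ("api_ver", "api_id", "api_ts")])
    signing_string = "\n".join([api_id, api_ts, call_path, unsigned])
    api_mac = hmac.new(api_secret, signing_string, hashlib.sha1).hexdigest()
    return unsigned + "&api_mac=" + api_mac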
def __init__(self, url):
    Scraper.__init__(self)
    self.url = url
    self.html = HTMLParser()
    self.issue_provider = IssueProvider()
    self.push_provider = PushProvider()
class Bernie2016VideosScraper(Scraper):

    def __init__(self):
        Scraper.__init__(self)
        api_key = self.config["youtube"]["api_key"]
        self.url = "https://www.googleapis.com/youtube/v3/search"
        self.params = {
            "order": "date",
            "maxResults": 10,
            "channelId": "UCH1dpzjCEiGAt8CXkryhkZg",
            "key": api_key,
            "type": "upload",
            "part": "snippet"
        }
        self.details = Bernie2016VideoDetailScraper()
        self.video_provider = VideoProvider()
        self.push_provider = PushProvider()

    def translate(self, json):
        idJson = json["id"]
        snippetJson = json["snippet"]
        record = {
            "site": "youtube.com",
            "video_id": idJson["videoId"],
            "url": "https://www.youtube.com/watch?v=" + idJson["videoId"],
            "title": snippetJson["title"],
            "snippet": snippetJson["description"],
            "thumbnail_url": snippetJson["thumbnails"]["high"]["url"],
            "timestamp_publish": snippetJson["publishedAt"]
        }
        return record

    def go(self):
        r = self.get(self.url, params=self.params, result_format="json")
        for item in r["items"]:
            if item["id"]["kind"] != 'youtube#video':
                continue
            record = self.translate(item)
            record["description"] = self.fetch_full_description(
                record["video_id"])
            record["title"] = record["title"].replace(" | Bernie Sanders", "")
            if self.video_provider.exists_by_video_id(record["video_id"]):
                print "found"
            else:
                print "not found"
                msg = "Inserting record for '{0}'."
                logging.info(msg.format(record["title"].encode("utf8")))
                record["timestamp_creation"] = datetime.now()
                video = self.video_provider.create(record)
                # Add push record for possible notification pushing
                push_record = {
                    "object_type": "video",
                    "object_uuid": video.uuid,
                    "title": video.title + " - new video posted by Bernie Sanders",
                    "body": "See this new video now",
                    "url": video.url
                }
                push = self.push_provider.create(push_record)

    def fetch_full_description(self, video_id):
        self.details.params = {
            "key": self.config["youtube"]["api_key"],
            "part": "snippet,contentDetails",
            "id": video_id
        }
        r = self.details.get(self.details.url, params=self.details.params,
                             result_format="json")
        return r["items"][0]["snippet"]["description"]
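# Example usage (hypothetical): a single pass of the videos scraper. It
# assumes the base Scraper has already loaded the YouTube API key and that
# the providers are backed by a configured database, as in the class above.
# go() fetches the ten most recent uploads from the campaign channel, skips
# any video already stored, and creates a push record for each new one.
scraper = Bernie2016VideosScraper()
scraper.go()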
class ArticlesScraper(Scraper):

    def __init__(self):
        Scraper.__init__(self)
        self.url = "https://berniesanders.com/daily/"
        self.html = HTMLParser()
        self.article_provider = ArticleProvider()
        self.push_provider = PushProvider()

    def retrieve_article(self, url):
        # Try up to three times; bail out early for off-site links.
        for x in range(3):
            r = requests.get(url)
            if "https://berniesanders.com" not in r.url:
                return r.url, False, False
            if r.status_code == 200:
                soup = BeautifulSoup(r.text)
                soup = self.sanitize_soup(soup)
                image = soup.find('meta', {'property': 'og:image'})['content']
                content = soup.article
                paragraphs = [
                    self.html.unescape(self.replace_with_newlines(p))
                    for p in content.findAll("p")
                ]
                text = "\n\n".join(paragraphs)
                html = "".join([str(p) for p in content.findAll("p")])
                return text, html, image
        return False, False, False

    def go(self):
        soup = self.get(self.url)
        content = soup.find("section", {"id": "content"})
        for article in content.findAll("article"):
            rec = {
                "article_id": article['id'],
                "image_url": "",
                "body": "",
                "timestamp_publish":
                    self.choose_publish_date(article.time["datetime"]),
                "site": "berniesanders.com",
                "lang": "en",
                "article_type": "DemocracyDaily",
                "excerpt": self.html.unescape(
                    article.find("div", {"class": "excerpt"}).p.text),
                "title": self.html.unescape(article.h2.text),
                "article_category": self.html.unescape(article.h1.string.strip()),
                "url": article.h2.a["href"]
            }
            if article.img is not None:
                rec["image_url"] = article.img["src"]

            text, html, image = self.retrieve_article(rec["url"])
            if text and not html:
                # Off-site link: store the plain text only.
                rec["body"] = text
                rec["body_markdown"] = text
                rec['article_type'] = "ExternalLink"
            elif text and html:
                rec["body"] = text
                rec['body_markdown'] = convert_markdown(html)
            # Fall back to the og:image from the article page if the
            # listing itself had no image.
            if not rec["image_url"] and image:
                rec["image_url"] = image

            msg = ""
            if self.article_provider.exists_by_article_id(rec["article_id"]):
                print "found"
            else:
                print "not found"
                msg = "Inserting '{0}', created {1}"
                result = self.article_provider.create(rec)
                self.push_provider.create_by_foreign_model(result)
            logging.info(
                msg.format(rec["title"].encode("utf8"),
                           str(rec["timestamp_publish"])))
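# convert_markdown() is imported from elsewhere in the project and is not
# shown here. A plausible stand-in (an assumption, not the project's code)
# wraps the html2text package to turn the collected article HTML into
# Markdown for storage alongside the plain-text body:
import html2text


def convert_markdown_sketch(html):
    h = html2text.HTML2Text()
    h.body_width = 0  # do not hard-wrap the generated Markdown
    return h.handle(html)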
class NewsScraper(Scraper):

    def __init__(self, url):
        Scraper.__init__(self)
        self.url = url
        self.html = HTMLParser()
        self.news_provider = NewsProvider()
        self.push_provider = PushProvider()

    def retrieve_article(self, url):
        # Try up to three times; bail out early for off-site links.
        for x in range(3):
            r = requests.get(url)
            if "https://berniesanders.com" not in r.url:
                return r.url, False, False
            if r.status_code == 200:
                soup = BeautifulSoup(r.text)
                soup = self.sanitize_soup(soup)
                image = soup.find('meta', {'property': 'og:image'})['content']
                content = soup.article
                paragraphs = [
                    self.html.unescape(self.replace_with_newlines(p))
                    for p in content.findAll("p")
                ]
                text = "\n\n".join(paragraphs)
                html = "".join([str(p) for p in content.findAll("p")])
                return text, html, image
        return False, False, False

    def go(self):
        soup = self.get(self.url)
        # Default to English if the page does not declare a language.
        try:
            lang = soup.html['lang']
        except KeyError:
            lang = 'en'
        content = soup.find("section", {"id": "content"})
        for article in content.findAll("article"):
            rec = {
                "news_id": article['id'],
                "body": "",
                "image_url": "",
                "timestamp_publish":
                    self.choose_publish_date(article.time["datetime"]),
                "site": "berniesanders.com",
                "lang": lang,
                "title": self.html.unescape(article.h2.text),
                "news_category": self.html.unescape(article.h1.string.strip()),
                "url": article.h2.a["href"]
            }
            if article.img is not None:
                rec["image_url"] = article.img["src"]

            # Pull excerpt if available
            try:
                rec["excerpt"] = self.html.unescape(article.p.text)
            except AttributeError:
                rec["excerpt"] = ""

            # Determine type
            if rec['news_category'].lower() in ["on the road", "news"]:
                rec['news_type'] = "News"
            elif rec['news_category'].lower() in ["press release",
                                                  "comunicados de prensa"]:
                rec['news_type'] = "PressRelease"
            else:
                rec['news_type'] = "Unknown"

            text, html, image = self.retrieve_article(rec["url"])
            if text and not html:
                rec["body"], rec["body_markdown"] = text, text
                rec['news_type'] = "ExternalLink"
            elif text and html:
                rec["body"] = text
                rec['body_markdown'] = convert_markdown(html)
            # Fall back to the og:image from the article page if the
            # listing itself had no image.
            if not rec["image_url"] and image:
                rec["image_url"] = image

            msg = ""
            if self.news_provider.exists_by_news_id(rec["news_id"]):
                print "found"
            else:
                print "not found"
                msg = "Inserting '{0}', created {1}"
                result = self.news_provider.create(rec)
                self.push_provider.create_by_foreign_model(result)
            logging.info(
                msg.format(rec["title"].encode("utf8"),
                           str(rec["timestamp_publish"])))
class NewsScraper(Scraper):

    def __init__(self):
        Scraper.__init__(self)
        self.url = "https://berniesanders.com/news/"
        self.html = HTMLParser()
        self.news_provider = NewsProvider()
        self.push_provider = PushProvider()

    def retrieve_article(self, url):
        # Try up to three times; bail out early for off-site links.
        for x in range(3):
            r = requests.get(url)
            if "https://berniesanders.com" not in r.url:
                return r.url, False, False
            if r.status_code == 200:
                soup = BeautifulSoup(r.text)
                soup = self.sanitize_soup(soup)
                image = soup.find('meta', {'property': 'og:image'})['content']
                content = soup.article
                paragraphs = [
                    self.html.unescape(self.replace_with_newlines(p))
                    for p in content.findAll("p")
                ]
                text = "\n\n".join(paragraphs)
                html = "".join([str(p) for p in content.findAll("p")])
                return text, html, image
        return False, False, False

    def go(self):
        soup = self.get(self.url)
        content = soup.find("section", {"id": "content"})
        for article in content.findAll("article"):
            rec = {
                "news_id": article['id'],
                "image_url": "",
                "timestamp_publish": parser.parse(article.time["datetime"]),
                "site": "berniesanders.com",
                "lang": "en",
                "title": self.html.unescape(article.h2.text),
                "news_category": self.html.unescape(article.h1.string.strip()),
                "url": article.h2.a["href"]
            }
            if article.img is not None:
                rec["image_url"] = article.img["src"]

            # Pull excerpt if available
            try:
                rec["excerpt_html"] = str(article.p)
                rec["excerpt"] = self.html.unescape(article.p.text)
            except AttributeError:
                rec["excerpt"], rec["excerpt_html"] = "", ""

            # Determine type
            if rec['news_category'].lower() in ["on the road", "news"]:
                rec['news_type'] = "News"
            elif rec['news_category'].lower() == "press release":
                rec['news_type'] = "PressRelease"
            else:
                rec['news_type'] = "Unknown"

            text, html, image = self.retrieve_article(rec["url"])
            if text and not html:
                rec["body"], rec["body_html"] = text, text
                rec['news_type'] = "ExternalLink"
                rec["body_html_nostyle"] = ""
            elif text and html:
                rec["body"], rec["body_html"] = text, html
                no_style = self.remove_style(BeautifulSoup(html))
                rec["body_html_nostyle"] = "".join(
                    [str(p) for p in no_style.findAll("p")])
            # Fall back to the og:image from the article page if the
            # listing itself had no image.
            if not rec["image_url"] and image:
                rec["image_url"] = image

            rec['body_markdown'] = convert_markdown(rec['body_html'])

            msg = ""
            if self.news_provider.exists_by_news_id(rec["news_id"]):
                print "found"
            else:
                print "not found"
                msg = "Inserting '{0}', created {1}"
                result = self.news_provider.create(rec)
                self.push_provider.create_by_foreign_model(result)
            logging.info(
                msg.format(rec["title"].encode("utf8"),
                           str(rec["timestamp_publish"])))
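# remove_style() and sanitize_soup() come from the Scraper base class and are
# not shown here. A rough equivalent of remove_style() (an assumption, not
# the project's code) strips <style> tags and inline style attributes so the
# "nostyle" HTML variant carries no presentation markup:
def remove_style_sketch(soup):
    for tag in soup.findAll("style"):
        tag.extract()
    for tag in soup.findAll(True):
        if tag.get("style") is not None:
            del tag["style"]
    return soup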
@app.route('/issue/<uuid:issue_uuid>', methods=['GET', 'POST'])
@auth.login_required
def issue_detail(issue_uuid):
    issue = issue_provider.read(issue_uuid)
    updated = False
    if request.method == 'POST' and issue_provider.update(issue, request):
        updated = True
    return render_template('issue.html', issue=issue, updated=updated)


if __name__ == '__main__':
    config_path = '/opt/bernie/config.yml'
    try:
        with open(config_path, 'r') as f:
            conf = yaml.load(f)['flask']
    except IOError:
        msg = "Could not open config file: {0}"
        logging.info(msg.format(config_path))
        raise
    else:
        event_provider = EventProvider()
        issue_provider = IssueProvider()
        video_provider = VideoProvider()
        article_provider = ArticleProvider()
        news_provider = NewsProvider()
        push_provider = PushProvider()
        users = {conf['httpauth_username']: conf['httpauth_password']}
        # Register with Parse before starting the server; app.run() blocks.
        register(conf['parse_application_id'],
                 conf['parse_rest_api_key'],
                 conf['parse_master_key'])
        # Push.message("Good morning", channels=["Mike Testing"])
        app.run(host=conf['host'], debug=conf['debug'])
class IssuesScraper(Scraper):

    def __init__(self):
        Scraper.__init__(self)
        self.url = "https://berniesanders.com/issues/feed/"
        self.html = HTMLParser()
        self.issue_provider = IssueProvider()
        self.push_provider = PushProvider()

    def collect_urls(self):
        records = []
        items = self.get(self.url).findAll("item")
        for item in items:
            record = {
                "title": self.html.unescape(item.title.text),
                "timestamp_publish": parser.parse(item.pubdate.text),
                "site": "berniesanders.com",
                "lang": "en",
                "description_html": item.description.text,
                "description": self.html.unescape(
                    BeautifulSoup(item.description.text).p.text),
                "url": item.link.nextSibling
            }
            records.append(record)
        return records

    def retrieve(self, record):
        soup = self.get(record["url"])

        # Retrieve image from the og:image meta tag, e.g.
        # <meta property="og:image" content="https://berniesanders.com/wp-content/uploads/2015/07/072615_Bernie_NewOrleans-4382.jpg"/>
        meta_image = soup.findAll(attrs={"property": "og:image"})
        record["image_url"] = meta_image[0]["content"].encode('utf8')

        # Reset soup to content
        soup = self.sanitize_soup(soup.find("section", {"id": "content"}))
        while soup.article.style is not None:
            soup.article.style.extract()
        record["body_html"] = str(soup.article)

        text = []
        for elem in soup.article.recursiveChildGenerator():
            if isinstance(elem, types.StringTypes):
                text.append(self.html.unescape(elem.strip()))
            elif elem.name == 'br':
                text.append("")
        record["body"] = "\n".join(text)
        record['body_markdown'] = convert_markdown(record['body_html'])
        return record

    def go(self):
        urls = self.collect_urls()
        if not urls:
            logging.critical("Could not retrieve issues.")
            sys.exit(1)
        for url in urls:
            record = self.retrieve(url)
            if self.issue_provider.exists_by_url(record["url"]):
                print "found"
            else:
                msg = "Inserting record for '{0}'."
                logging.info(msg.format(record["title"].encode("utf8")))
                record["timestamp_creation"] = datetime.now()
                result = self.issue_provider.create(record)
                self.push_provider.create_by_foreign_model(result)