Code Example #1
    def __init__(self):
        Scraper.__init__(self)
        api_key = self.config["youtube"]["api_key"]
        self.url = "https://www.googleapis.com/youtube/v3/search"
        self.params = {
            "order": "date",
            "maxResults": 10,
            "channelId": "UCH1dpzjCEiGAt8CXkryhkZg",
            "key": api_key,
            "type": "upload",
            "part": "snippet"
        }
        self.details = Bernie2016VideoDetailScraper()
        self.video_provider = VideoProvider()
        self.push_provider = PushProvider()
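This constructor only prepares the request. For context, here is a standalone sketch of the same YouTube Data API v3 search call; the API key placeholder is ours, everything else is copied from the example above:

import requests

params = {
    "order": "date",
    "maxResults": 10,
    "channelId": "UCH1dpzjCEiGAt8CXkryhkZg",
    "key": "YOUR_API_KEY",  # placeholder; the scraper reads this from self.config
    "type": "upload",
    "part": "snippet",
}
r = requests.get("https://www.googleapis.com/youtube/v3/search", params=params)
items = r.json().get("items", [])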
Code Example #2
File: bernie_2016.py Project: nymd/movement-cms
    def __init__(self):
        Scraper.__init__(self)
        api_key = self.config["youtube"]["api_key"]
        self.url = "https://www.googleapis.com/youtube/v3/search"
        self.params = {
            "order": "date",
            "maxResults": 10,
            "channelId": "UCH1dpzjCEiGAt8CXkryhkZg",
            "key": api_key,
            "type": "upload",
            "part": "snippet",
        }
        self.details = Bernie2016VideoDetailScraper()
        self.video_provider = VideoProvider()
        self.push_provider = PushProvider()
Code Example #3
File: events.py Project: prejr-dev/Connect-Sharknado
    def __init__(self):
        Scraper.__init__(self)
        c = self.config["bsd"]
        self.html = HTMLParser()
        self.call_path = "/page/api/event/search_events"
        self.params = {
            "api_ver": "2",
            "api_id": c["api_id"],
            "api_ts": str(int(time.time()))
        }
        self.signed_params = self.sign_params(c["api_secret"])
        self.url = "".join(
            [c["endpoint"], self.call_path, "?", self.signed_params])
        self.map = {"event_id": "original_id", "start_dt": "start_time"}
        self.event_provider = EventProvider()
        self.push_provider = PushProvider()
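sign_params itself is not shown in any of these examples. Blue State Digital's API authenticates calls with an HMAC over the call path and query parameters; the sketch below is an assumption about what such a helper could look like, not the project's actual implementation:

import hashlib
import hmac

def sign_params(api_id, api_secret, api_ts, call_path, query_string):
    # Assumed scheme: HMAC-SHA1 over a newline-separated signing string of
    # api_id, timestamp, call path, and query string. Verify against the
    # BSD API documentation before relying on this.
    signing_string = "\n".join([api_id, api_ts, call_path, query_string])
    api_mac = hmac.new(api_secret.encode("utf8"),
                       signing_string.encode("utf8"),
                       hashlib.sha1).hexdigest()
    return query_string + "&api_mac=" + api_mac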
Code Example #4
    def __init__(self, url):
        Scraper.__init__(self)
        self.url = url
        self.html = HTMLParser()
        self.issue_provider = IssueProvider()
        self.push_provider = PushProvider()
Code Example #5
class Bernie2016VideosScraper(Scraper):
    def __init__(self):
        Scraper.__init__(self)
        api_key = self.config["youtube"]["api_key"]
        self.url = "https://www.googleapis.com/youtube/v3/search"
        self.params = {
            "order": "date",
            "maxResults": 10,
            "channelId": "UCH1dpzjCEiGAt8CXkryhkZg",
            "key": api_key,
            "type": "upload",
            "part": "snippet"
        }
        self.details = Bernie2016VideoDetailScraper()
        self.video_provider = VideoProvider()
        self.push_provider = PushProvider()

    def translate(self, json):
        idJson = json["id"]
        snippetJson = json["snippet"]

        record = {
            "site": "youtube.com",
            "video_id": idJson["videoId"],
            "url": "https://www.youtube.com/watch?v=" + idJson["videoId"],
            "title": snippetJson["title"],
            "snippet": snippetJson["description"],
            "thumbnail_url": snippetJson["thumbnails"]["high"]["url"],
            "timestamp_publish": snippetJson["publishedAt"]
        }
        return record

    def go(self):
        r = self.get(self.url, params=self.params, result_format="json")
        for item in r["items"]:
            if item["id"]["kind"] != 'youtube#video':
                continue
            record = self.translate(item)
            record["description"] = self.fetch_full_description(
                record["video_id"])
            record["title"] = record["title"].replace(" | Bernie Sanders", "")

            if self.video_provider.exists_by_video_id(record["video_id"]):
                print "found"
            else:
                print "not found"
                msg = "Inserting record for '{0}'."
                logging.info(msg.format(record["title"].encode("utf8")))
                record["timestamp_creation"] = datetime.now()
                video = self.video_provider.create(record)

                # Add push record for possible notification pushing
                push_record = {
                    "object_type": "video",
                    "object_uuid": video.uuid,
                    "title": video.title + " - new video posted by Bernie Sanders",
                    "body": "See this new video now",
                    "url": video.url
                }
                push = self.push_provider.create(push_record)

    def fetch_full_description(self, video_id):
        self.details.params = {
            "key": self.config["youtube"]["api_key"],
            "part": "snippet,contentDetails",
            "id": video_id
        }
        r = self.details.get(self.details.url,
                             params=self.details.params,
                             result_format="json")
        return r["items"][0]["snippet"]["description"]
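A minimal driver for the class above, assuming Scraper.__init__ loads self.config and the providers can reach their datastore:

if __name__ == "__main__":
    scraper = Bernie2016VideosScraper()
    scraper.go()  # fetch recent uploads, insert new ones, queue push records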
Code Example #6
File: articles.py Project: jgusta/movement
    def __init__(self):
        Scraper.__init__(self)
        self.url = "https://berniesanders.com/daily/"
        self.html = HTMLParser()
        self.article_provider = ArticleProvider()
        self.push_provider = PushProvider()
Code Example #7
File: articles.py Project: jgusta/movement
class ArticlesScraper(Scraper):
    def __init__(self):
        Scraper.__init__(self)
        self.url = "https://berniesanders.com/daily/"
        self.html = HTMLParser()
        self.article_provider = ArticleProvider()
        self.push_provider = PushProvider()

    def retrieve_article(self, url):
        for x in range(3):
            r = requests.get(url)
            if "https://berniesanders.com" not in r.url:
                return r.url, False, False
            if r.status_code == 200:
                soup = BeautifulSoup(r.text)
                soup = self.sanitize_soup(soup)
                image = soup.find('meta', {'property': 'og:image'})['content']
                content = soup.article
                paragraphs = [
                    self.html.unescape(self.replace_with_newlines(p))
                    for p in content.findAll("p")
                ]
                text = "\n\n".join(paragraphs)
                html = "".join([str(p) for p in content.findAll("p")])
                return text, html, image
        return False, False, False

    def go(self):
        soup = self.get(self.url)
        content = soup.find("section", {"id": "content"})
        for article in content.findAll("article"):

            rec = {
                "article_id": article['id'],
                "image_url": "",
                "body": "",
                "timestamp_publish": self.choose_publish_date(article.time["datetime"]),
                "site": "berniesanders.com",
                "lang": "en",
                "article_type": "DemocracyDaily",
                "excerpt": self.html.unescape(
                    article.find("div", {"class": "excerpt"}).p.text),
                "title": self.html.unescape(article.h2.text),
                "article_category": self.html.unescape(article.h1.string.strip()),
                "url": article.h2.a["href"]
            }
            if article.img is not None:
                rec["image_url"] = article.img["src"]

            text, html, image = self.retrieve_article(rec["url"])
            if text and not html:
                rec["body"] = text
                rec["body_markdown"] = text
                rec['article_type'] = "ExternalLink"
            elif text and html:
                rec["body"] = text
                rec['body_markdown'] = convert_markdown(html)
                # Fall back to the og:image from the article page when the
                # listing provided no image.
                if not rec["image_url"]:
                    rec["image_url"] = image

            msg = ""
            if self.article_provider.exists_by_article_id(rec["article_id"]):
                print "found"
            else:
                print "not found"
                msg = "Inserting '{0}', created {1}"
                result = self.article_provider.create(rec)
                self.push_provider.create_by_foreign_model(result)

            logging.info(
                msg.format(rec["title"].encode("utf8"),
                           str(rec["timestamp_publish"])))
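retrieve_article leans on two helpers defined elsewhere in the project, sanitize_soup and replace_with_newlines. A hypothetical reconstruction of the latter, matching how it is applied to <p> tags above (Python 2, like the rest of these snippets):

import types

def replace_with_newlines(element):
    # Flatten a tag to plain text, rendering <br> tags as newlines.
    # This is a guess at the helper's behavior, not the project's code.
    text = []
    for child in element.recursiveChildGenerator():
        if isinstance(child, types.StringTypes):
            text.append(child)
        elif child.name == "br":
            text.append("\n")
    return "".join(text)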
Code Example #8
class NewsScraper(Scraper):
    def __init__(self, url):
        Scraper.__init__(self)
        self.url = url
        self.html = HTMLParser()
        self.news_provider = NewsProvider()
        self.push_provider = PushProvider()

    def retrieve_article(self, url):
        for x in range(3):
            r = requests.get(url)
            if "https://berniesanders.com" not in r.url:
                return r.url, False, False
            if r.status_code == 200:
                soup = BeautifulSoup(r.text)
                soup = self.sanitize_soup(soup)
                image = soup.find('meta', {'property': 'og:image'})['content']
                content = soup.article
                paragraphs = [
                    self.html.unescape(self.replace_with_newlines(p))
                    for p in content.findAll("p")
                ]
                text = "\n\n".join(paragraphs)
                html = "".join([str(p) for p in content.findAll("p")])
                return text, html, image
        return False, False, False

    def go(self):
        soup = self.get(self.url)
        try:
            lang = soup.html['lang']
        except KeyError as e:
            lang = 'en'
        content = soup.find("section", {"id": "content"})
        for article in content.findAll("article"):
            rec = {
                "news_id": article['id'],
                "body": "",
                "image_url": "",
                "timestamp_publish": self.choose_publish_date(article.time["datetime"]),
                "site": "berniesanders.com",
                "lang": lang,
                "title": self.html.unescape(article.h2.text),
                "news_category": self.html.unescape(article.h1.string.strip()),
                "url": article.h2.a["href"]
            }
            if article.img is not None:
                rec["image_url"] = article.img["src"]

            # Pull excerpt if available
            try:
                rec["excerpt"] = self.html.unescape(article.p.text)
            except AttributeError:
                rec["excerpt"] = ""

            # Determine Type
            if rec['news_category'].lower() in ["on the road", "news"]:
                rec['news_type'] = "News"
            elif rec['news_category'].lower() in [
                    "press release", "comunicados de prensa"
            ]:
                rec['news_type'] = "PressRelease"
            else:
                rec['news_type'] = "Unknown"

            text, html, image = self.retrieve_article(rec["url"])
            if text and not html:
                rec["body"], rec["body_markdown"] = text, text
                rec['news_type'] = "ExternalLink"
            elif text and html:
                rec["body"] = text
                rec['body_markdown'] = convert_markdown(html)
                # Fall back to the og:image from the article page when the
                # listing provided no image.
                if not rec["image_url"]:
                    rec["image_url"] = image

            msg = ""
            if self.news_provider.exists_by_news_id(rec["news_id"]):
                print "found"
            else:
                print "not found"
                msg = "Inserting '{0}', created {1}"
                result = self.news_provider.create(rec)
                self.push_provider.create_by_foreign_model(result)

            logging.info(
                msg.format(rec["title"].encode("utf8"),
                           str(rec["timestamp_publish"])))
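Unlike the fixed-URL scrapers, this variant takes its listing page as a constructor argument, so one class can cover both the English and Spanish news pages (note the "comunicados de prensa" category above). Hypothetical usage, with the URL taken from a sibling example in this collection:

scraper = NewsScraper("https://berniesanders.com/news/")
scraper.go()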
Code Example #9
class ArticlesScraper(Scraper):

    def __init__(self):
        Scraper.__init__(self)
        self.url = "https://berniesanders.com/daily/"
        self.html = HTMLParser()
        self.article_provider = ArticleProvider()
        self.push_provider = PushProvider()

    def retrieve_article(self, url):
        for x in range(3):
            r = requests.get(url)
            if "https://berniesanders.com" not in r.url:
                return r.url, False, False
            if r.status_code == 200:
                soup = BeautifulSoup(r.text)
                soup = self.sanitize_soup(soup)
                image = soup.find('meta', {'property': 'og:image'})['content']
                content = soup.article
                paragraphs = [self.html.unescape(self.replace_with_newlines(p))
                              for p in content.findAll("p")]
                text = "\n\n".join(paragraphs)
                html = "".join([str(p) for p in content.findAll("p")])
                return text, html, image
        return False, False, False

    def go(self):
        soup = self.get(self.url)
        content = soup.find("section", {"id": "content"})
        for article in content.findAll("article"):

            rec = {
            	"article_id": article['id'],
                "image_url": "",
                "body": "",
                "timestamp_publish": self.choose_publish_date(article.time["datetime"]),
                "site": "berniesanders.com",
                "lang": "en",
                "article_type": "DemocracyDaily",
                "excerpt": self.html.unescape(
                    article.find(
                        "div", {"class": "excerpt"}).p.text),
                "title": self.html.unescape(article.h2.text),
                "article_category": self.html.unescape(article.h1.string.strip()),
                "url": article.h2.a["href"]
            }
            if article.img is not None:
                rec["image_url"] = article.img["src"]

            text, html, image = self.retrieve_article(rec["url"])
            if text and not html:
                rec["body"] = text
                rec["body_markdown"] = text
                rec['article_type'] = "ExternalLink"
            elif text and html:
                rec["body"] = text
                rec['body_markdown'] = convert_markdown(html)
                # Fall back to the og:image from the article page when the
                # listing provided no image.
                if not rec["image_url"]:
                    rec["image_url"] = image

            msg = ""
            if self.article_provider.exists_by_article_id(rec["article_id"]):
                print "found"
            else:
                print "not found"
                msg = "Inserting '{0}', created {1}"
                result = self.article_provider.create(rec)
                self.push_provider.create_by_foreign_model(result)

            logging.info(msg.format(
                rec["title"].encode("utf8"),
                str(rec["timestamp_publish"])
            ))
Code Example #10
File: news.py Project: chrismshelton/movement
    def __init__(self):
        Scraper.__init__(self)
        self.url = "https://berniesanders.com/news/"
        self.html = HTMLParser()
        self.news_provider = NewsProvider()
        self.push_provider = PushProvider()
Code Example #11
File: news.py Project: chrismshelton/movement
class NewsScraper(Scraper):

    def __init__(self):
        Scraper.__init__(self)
        self.url = "https://berniesanders.com/news/"
        self.html = HTMLParser()
        self.news_provider = NewsProvider()
        self.push_provider = PushProvider()

    def retrieve_article(self, url):
        for x in range(3):
            r = requests.get(url)
            if "https://berniesanders.com" not in r.url:
                return r.url, False, False
            if r.status_code == 200:
                soup = BeautifulSoup(r.text)
                soup = self.sanitize_soup(soup)
                image = soup.find('meta', {'property': 'og:image'})['content']
                content = soup.article
                paragraphs = [self.html.unescape(self.replace_with_newlines(p))
                              for p in content.findAll("p")]
                text = "\n\n".join(paragraphs)
                html = "".join([str(p) for p in content.findAll("p")])
                return text, html, image
        return False, False, False

    def go(self):
        soup = self.get(self.url)
        content = soup.find("section", {"id": "content"})
        for article in content.findAll("article"):
            rec = {
                "news_id": article['id'],
                "image_url": "",
                "timestamp_publish": parser.parse(article.time["datetime"]),
                "site": "berniesanders.com",
                "lang": "en",
                "title": self.html.unescape(article.h2.text),
                "news_category": self.html.unescape(article.h1.string.strip()),
                "url": article.h2.a["href"]
            }
            if article.img is not None:
                rec["image_url"] = article.img["src"]

            # Pull excerpt if available
            try:
                rec["excerpt_html"] = str(article.p)
                rec["excerpt"] = self.html.unescape(article.p.text)
            except AttributeError:
                rec["excerpt"], rec["excerpt_html"] = "", ""

            # Determine Type
            if rec['news_category'].lower() in ["on the road", "news"]:
                rec['news_type'] = "News"
            elif rec['news_category'].lower() == "press release":
                rec['news_type'] = "PressRelease"
            else:
                rec['news_type'] = "Unknown"

            text, html, image = self.retrieve_article(rec["url"])
            if text and not html:
                rec["body"], rec["body_html"] = text, text
                rec['news_type'] = "ExternalLink"
                rec["body_html_nostyle"] = ""
            elif text and html:
                rec["body"], rec["body_html"] = text, html

                no_style = self.remove_style(BeautifulSoup(html))
                rec["body_html_nostyle"] = "".join([str(p) for p in no_style.findAll("p")])

                # Fall back to the og:image from the article page when the
                # listing provided no image.
                if not rec["image_url"]:
                    rec["image_url"] = image

            # Guard against a failed fetch: retrieve_article can return all
            # False, in which case body_html was never set.
            rec['body_markdown'] = convert_markdown(rec.get('body_html', ""))

            msg = ""
            if self.news_provider.exists_by_news_id(rec["news_id"]):
                print "found"
            else:
                print "not found"
                msg = "Inserting '{0}', created {1}"
                result = self.news_provider.create(rec)
                self.push_provider.create_by_foreign_model(result)

            logging.info(msg.format(
                rec["title"].encode("utf8"),
                str(rec["timestamp_publish"])
            ))
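convert_markdown is imported from elsewhere in the project and never shown in these examples. A plausible stand-in (an assumption, not the project's code) built on the html2text package:

import html2text

def convert_markdown(html):
    # Convert an HTML fragment to Markdown.
    h = html2text.HTML2Text()
    h.body_width = 0  # keep paragraphs on single lines instead of hard-wrapping
    return h.handle(html)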
Code Example #12
@app.route('/issue/<uuid:issue_uuid>', methods=['GET', 'POST'])
@auth.login_required
def issue_detail(issue_uuid):
    issue = issue_provider.read(issue_uuid)
    updated = False
    if request.method == 'POST' and issue_provider.update(issue, request):
        updated = True
    return render_template('issue.html', issue=issue, updated=updated)


if __name__ == '__main__':
    configfile = '/opt/bernie/config.yml'
    try:
        with open(configfile, 'r') as f:
            conf = yaml.safe_load(f)['flask']
    except IOError:
        msg = "Could not open config file: {0}"
        logging.info(msg.format(configfile))
        raise
    else:
        event_provider = EventProvider()
        issue_provider = IssueProvider()
        video_provider = VideoProvider()
        article_provider = ArticleProvider()
        news_provider = NewsProvider()
        push_provider = PushProvider()
        users = {conf['httpauth_username']: conf['httpauth_password']}
        register(conf['parse_application_id'], conf['parse_rest_api_key'],
                 conf['parse_master_key'])
        # app.run blocks, so Parse registration must happen before it
        app.run(host=conf['host'], debug=conf['debug'])
        #Push.message("Good morning", channels=["Mike Testing"])
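A hedged sketch of exercising the issue_detail route from a client, assuming @auth.login_required means HTTP Basic auth with the credentials from config.yml; the host, credentials, and UUID below are placeholders:

import requests

url = "http://localhost:5000/issue/123e4567-e89b-12d3-a456-426655440000"
r = requests.get(url, auth=("httpauth_username", "httpauth_password"))
print(r.status_code)  # 200 with the rendered issue.html on success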
Code Example #13
File: bernie_2016.py Project: nymd/movement-cms
class Bernie2016VideosScraper(Scraper):
    def __init__(self):
        Scraper.__init__(self)
        api_key = self.config["youtube"]["api_key"]
        self.url = "https://www.googleapis.com/youtube/v3/search"
        self.params = {
            "order": "date",
            "maxResults": 10,
            "channelId": "UCH1dpzjCEiGAt8CXkryhkZg",
            "key": api_key,
            "type": "upload",
            "part": "snippet",
        }
        self.details = Bernie2016VideoDetailScraper()
        self.video_provider = VideoProvider()
        self.push_provider = PushProvider()

    def translate(self, json):
        idJson = json["id"]
        snippetJson = json["snippet"]

        record = {
            "site": "youtube.com",
            "video_id": idJson["videoId"],
            "url": "https://www.youtube.com/watch?v=" + idJson["videoId"],
            "title": snippetJson["title"],
            "snippet": snippetJson["description"],
            "thumbnail_url": snippetJson["thumbnails"]["high"]["url"],
            "timestamp_publish": snippetJson["publishedAt"],
        }
        return record

    def go(self):
        r = self.get(self.url, params=self.params, result_format="json")
        for item in r["items"]:
            if item["id"]["kind"] != "youtube#video":
                continue
            record = self.translate(item)
            record["description"] = self.fetch_full_description(record["video_id"])
            record["title"] = record["title"].replace(" | Bernie Sanders", "")

            if self.video_provider.exists_by_video_id(record["video_id"]):
                print "found"
            else:
                print "not found"
                msg = "Inserting record for '{0}'."
                logging.info(msg.format(record["title"].encode("utf8")))
                record["timestamp_creation"] = datetime.now()
                video = self.video_provider.create(record)

                # Add push record for possible notification pushing
                push_record = {
                    "object_type": "video",
                    "object_uuid": video.uuid,
                    "title": video.title + " - new video posted by Bernie Sanders",
                    "body": "See this new video now",
                    "url": video.url,
                }
                push = self.push_provider.create(push_record)

    def fetch_full_description(self, video_id):
        self.details.params = {
            "key": self.config["youtube"]["api_key"],
            "part": "snippet,contentDetails",
            "id": video_id,
        }
        r = self.details.get(self.details.url, params=self.details.params, result_format="json")
        return r["items"][0]["snippet"]["description"]
Code Example #14
    def __init__(self, url):
        Scraper.__init__(self)
        self.url = url
        self.html = HTMLParser()
        self.news_provider = NewsProvider()
        self.push_provider = PushProvider()
Code Example #15
File: issues.py Project: chrismshelton/movement
class IssuesScraper(Scraper):

    def __init__(self):
        Scraper.__init__(self)
        self.url = "https://berniesanders.com/issues/feed/"
        self.html = HTMLParser()
        self.issue_provider = IssueProvider()
        self.push_provider = PushProvider()

    def collect_urls(self):
        records = []
        items = self.get(self.url).findAll("item")
        for item in items:
            record = {
                "title": self.html.unescape(item.title.text),
                "timestamp_publish": parser.parse(item.pubdate.text),
                "site": "berniesanders.com",
                "lang": "en",
                "description_html": item.description.text,
                "description": self.html.unescape(
                    BeautifulSoup(item.description.text).p.text),
                "url": item.link.nextSibling
            }
            records.append(record)
        return records

    def retrieve(self, record):

        soup = self.get(record["url"])

        # retrieve the article image from the og:image meta tag, e.g.
        # <meta property="og:image" content="https://berniesanders.com/wp-content/uploads/2015/07/072615_Bernie_NewOrleans-4382.jpg"/>
        meta_image = soup.findAll(attrs={"property": "og:image"})
        record["image_url"] = meta_image[0]["content"].encode('utf8')

        # reset soup to content
        soup = self.sanitize_soup(soup.find("section", {"id": "content"}))
        while soup.article.style is not None:
            soup.article.style.extract()
        record["body_html"] = str(soup.article)
        text = []
        for elem in soup.article.recursiveChildGenerator():
            if isinstance(elem, types.StringTypes):
                text.append(self.html.unescape(elem.strip()))
            elif elem.name == 'br':
                text.append("")
        record["body"] = "\n".join(text)
        record['body_markdown'] = convert_markdown(record['body_html'])

        return record

    def go(self):
        records = self.collect_urls()
        if not records:
            logging.critical("Could not retrieve issues.")
            sys.exit(1)
        for record in records:
            record = self.retrieve(record)
            if self.issue_provider.exists_by_url(record["url"]):
                print "found"
            else:
                msg = "Inserting record for '{0}'."
                logging.info(msg.format(record["title"].encode("utf8")))
                record["timestamp_creation"] = datetime.now()
                result = self.issue_provider.create(record)
                self.push_provider.create_by_foreign_model(result)
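Every scraper in this collection follows the same shape: build config-driven state in __init__, fetch and normalize in go(), check a provider for an existing record, then insert and queue a push record. A hypothetical driver that runs them together (the ordering and scheduling are assumptions):

for scraper in [IssuesScraper(), ArticlesScraper(),
                NewsScraper("https://berniesanders.com/news/"),
                Bernie2016VideosScraper()]:
    scraper.go()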
Code Example #16
    def __init__(self):
        Scraper.__init__(self)
        self.url = "https://berniesanders.com/daily/"
        self.html = HTMLParser()
        self.article_provider = ArticleProvider()
        self.push_provider = PushProvider()
Code Example #17
File: issues.py Project: chrismshelton/movement
    def __init__(self):
        Scraper.__init__(self)
        self.url = "https://berniesanders.com/issues/feed/"
        self.html = HTMLParser()
        self.issue_provider = IssueProvider()
        self.push_provider = PushProvider()
Code Example #18
File: news.py Project: Bernie-2016/Connect-Sharknado
    def __init__(self, url):
        Scraper.__init__(self)
        self.url = url
        self.html = HTMLParser()
        self.news_provider = NewsProvider()
        self.push_provider = PushProvider()