Example #1
 def __init__(self):
     """Set up the scraper for the berniesanders.com "Democracy Daily" feed.

     Initializes the base Scraper, the index URL to crawl, an HTMLParser
     used to unescape HTML entities, and the ArticleProvider that
     persists scraped articles.
     """
     Scraper.__init__(self)
     self.url = "https://berniesanders.com/daily/"
     self.html = HTMLParser()
     self.article_provider = ArticleProvider()
Example #2
class ArticlesScraper(Scraper):
    def __init__(self):
        Scraper.__init__(self)
        self.url = "https://berniesanders.com/daily/"
        self.html = HTMLParser()
        self.article_provider = ArticleProvider()

    def retrieve_article(self, url):
        for x in range(3):
            r = requests.get(url)
            if "https://berniesanders.com" not in r.url:
                return r.url, False, False
            if r.status_code == 200:
                soup = BeautifulSoup(r.text)
                soup = self.sanitize_soup(soup)
                image = soup.find('meta', {'property': 'og:image'})['content']
                content = soup.article
                paragraphs = [
                    self.html.unescape(self.replace_with_newlines(p))
                    for p in content.findAll("p")
                ]
                text = "\n\n".join(paragraphs)
                html = "".join([str(p) for p in content.findAll("p")])
                return text, html, image
        return False, False, False

    def go(self):
        soup = self.get(self.url)
        content = soup.find("section", {"id": "content"})
        for article in content.findAll("article"):

            rec = {
                "article_id":
                article['id'],
                "image_url":
                "",
                "timestamp_publish":
                parser.parse(article.time["datetime"]),
                "site":
                "berniesanders.com",
                "lang":
                "en",
                "article_type":
                "DemocracyDaily",
                "excerpt_html":
                str(article.find("div", {
                    "class": "excerpt"
                }).p),
                "excerpt":
                self.html.unescape(
                    article.find("div", {
                        "class": "excerpt"
                    }).p.text),
                "title":
                self.html.unescape(article.h2.text),
                "article_category":
                self.html.unescape(article.h1.string.strip()),
                "url":
                article.h2.a["href"]
            }
            if article.img is not None:
                rec["image_url"] = article.img["src"]

            text, html, image = self.retrieve_article(rec["url"])
            if text and not html:
                rec["body"], rec["body_html"] = text, text
                rec['article_type'] = "ExternalLink"
                rec["body_html_nostyle"] = ""
            elif text and html:
                rec["body"], rec["body_html"] = text, html

                no_style = self.remove_style(BeautifulSoup(html))
                rec["body_html_nostyle"] = "".join(
                    [str(p) for p in no_style.findAll("p")])

                try:
                    article["image_url"]
                except KeyError:
                    article["image_url"] = image

            msg = ""
            if self.article_provider.exists_by_article_id(rec["article_id"]):
                print "found"
            else:
                print "not found"
                msg = "Inserting '{0}', created {1}"
                self.article_provider.create(rec)

            logging.info(
                msg.format(rec["title"].encode("utf8"),
                           str(rec["timestamp_publish"])))
Example #3
class ArticlesScraper(Scraper):

    def __init__(self):
        Scraper.__init__(self)
        self.url = "https://berniesanders.com/daily/"
        self.html = HTMLParser()
        self.article_provider = ArticleProvider()

    def retrieve_article(self, url):
        for x in range(3):
            r = requests.get(url)
            if "https://berniesanders.com" not in r.url:
                return r.url, False, False
            if r.status_code == 200:
                soup = BeautifulSoup(r.text)
                soup = self.sanitize_soup(soup)
                image = soup.find('meta', {'property': 'og:image'})['content']
                content = soup.article
                paragraphs = [self.html.unescape(self.replace_with_newlines(p))
                              for p in content.findAll("p")]
                text = "\n\n".join(paragraphs)
                html = "".join([str(p) for p in content.findAll("p")])
                return text, html, image
        return False, False, False

    def go(self):
        soup = self.get(self.url)
        content = soup.find("section", {"id": "content"})
        for article in content.findAll("article"):
            
            rec = {
            	"article_id": article['id'],
                "image_url": "",
                "timestamp_publish": parser.parse(article.time["datetime"]),
                "site": "berniesanders.com",
                "lang": "en",
                "article_type": "DemocracyDaily",
                "excerpt_html": str(article.find(
                    "div", {"class": "excerpt"}).p),
                "excerpt": self.html.unescape(
                    article.find(
                        "div", {"class": "excerpt"}).p.text),
                "title": self.html.unescape(article.h2.text),
                "article_category": self.html.unescape(article.h1.string.strip()),
                "url": article.h2.a["href"]
            }
            if article.img is not None:
                rec["image_url"] = article.img["src"]

            text, html, image = self.retrieve_article(rec["url"])
            if text and not html:
                rec["body"], rec["body_html"] = text, text
                rec['article_type'] = "ExternalLink"
                rec["body_html_nostyle"] = ""
            elif text and html:
                rec["body"], rec["body_html"] = text, html

                no_style = self.remove_style(BeautifulSoup(html))
                rec["body_html_nostyle"] = "".join([str(p) for p in no_style.findAll("p")])

                try:
                    article["image_url"]
                except KeyError:
                    article["image_url"] = image

            msg = ""
            if self.article_provider.exists_by_article_id(rec["article_id"]):
                print "found"
            else:
                print "not found"
                msg = "Inserting '{0}', created {1}"
                self.article_provider.create(rec)

            logging.info(msg.format(
                rec["title"].encode("utf8"),
                str(rec["timestamp_publish"])
            ))
Example #4
 def __init__(self):
     """Set up the scraper for the berniesanders.com "Democracy Daily" feed.

     Initializes the base Scraper, the index URL to crawl, an HTMLParser
     used to unescape HTML entities, and the ArticleProvider that
     persists scraped articles.
     """
     Scraper.__init__(self)
     self.url = "https://berniesanders.com/daily/"
     self.html = HTMLParser()
     self.article_provider = ArticleProvider()
Example #5
@app.route('/issue/<uuid:issue_uuid>', methods=['GET', 'POST'])
@auth.login_required
def issue_detail(issue_uuid):
    """Render a single issue; on POST, apply the submitted update first.

    The template receives ``updated`` so it can show whether the POST
    actually changed anything.
    """
    issue = issue_provider.read(issue_uuid)
    # Only a POST that the provider accepts counts as an update.
    updated = bool(
        request.method == 'POST' and issue_provider.update(issue, request)
    )
    return render_template('issue.html', issue=issue, updated=updated)


if __name__ == '__main__':
    # Deployment config; the 'flask' section carries host/debug settings,
    # HTTP auth credentials, and Parse API keys.
    configfile = '/opt/bernie/config.yml'
    try:
        with open(configfile, 'r') as f:
            # NOTE(review): yaml.load without an explicit Loader can execute
            # arbitrary tags; prefer yaml.safe_load if the config is plain
            # data — confirm before changing.
            conf = yaml.load(f)['flask']
    except IOError:
        # BUG FIX: the original referenced self.configfile, which does not
        # exist at module level, so a missing config raised NameError
        # instead of logging the intended message.
        msg = "Could not open config file: {0}"
        logging.info(msg.format(configfile))
        raise
    else:
        event_provider = EventProvider()
        issue_provider = IssueProvider()
        video_provider = VideoProvider()
        article_provider = ArticleProvider()
        news_provider = NewsProvider()
        push_provider = PushProvider()
        users = {conf['httpauth_username']: conf['httpauth_password']}
        # BUG FIX: register() must run before app.run(); app.run() blocks
        # until the server exits, so the original never registered the
        # Parse credentials while serving requests.
        register(conf['parse_application_id'], conf['parse_rest_api_key'],
                 conf['parse_master_key'])
        app.run(host=conf['host'], debug=conf['debug'])
        #Push.message("Good morning", channels=["Mike Testing"])