Beispiel #1
0
    def go(self):
        """Scrape the listing page at ``self.url`` and store unseen articles.

        Every ``<article>`` inside ``<section id="content">`` is turned into a
        record dict, its body is fetched through ``self.retrieve_article``,
        and the record is created through ``self.article_provider`` (then
        handed to ``self.push_provider.create_by_foreign_model``) unless a
        record with the same ``article_id`` already exists.
        """
        soup = self.get(self.url)
        content = soup.find("section", {"id": "content"})
        for article in content.findAll("article"):
            excerpt = article.find("div", {"class": "excerpt"}).p
            rec = {
                "article_id": article['id'],
                "image_url": "",
                # Body fields default to empty so the record is complete even
                # when the article page yields no text.
                "body": "",
                "body_html": "",
                "body_html_nostyle": "",
                "timestamp_publish": parser.parse(article.time["datetime"]),
                "site": "berniesanders.com",
                "lang": "en",
                "article_type": "DemocracyDaily",
                "excerpt_html": str(excerpt),
                "excerpt": self.html.unescape(excerpt.text),
                "title": self.html.unescape(article.h2.text),
                "article_category": self.html.unescape(article.h1.string.strip()),
                "url": article.h2.a["href"],
            }
            if article.img is not None:
                rec["image_url"] = article.img["src"]

            text, html, image = self.retrieve_article(rec["url"])
            if text and not html:
                # Text without markup is stored as an external link.
                rec["body"], rec["body_html"] = text, text
                rec['article_type'] = "ExternalLink"
            elif text and html:
                rec["body"], rec["body_html"] = text, html

                no_style = self.remove_style(BeautifulSoup(html))
                rec["body_html_nostyle"] = "".join(
                    [str(p) for p in no_style.findAll("p")])

                # rec always carries an "image_url" key, so test for an empty
                # value (not for KeyError) and store the fallback on the
                # record rather than on the <article> tag.
                if not rec["image_url"] and image:
                    rec["image_url"] = image

            # Nothing to convert when no body markup was retrieved.
            rec['body_markdown'] = (
                convert_markdown(rec['body_html']) if rec['body_html'] else "")

            if self.article_provider.exists_by_article_id(rec["article_id"]):
                print("found")
                continue

            print("not found")
            result = self.article_provider.create(rec)
            self.push_provider.create_by_foreign_model(result)
            # Log only on insert; there is nothing to report for known ids.
            logging.info("Inserting '{0}', created {1}".format(
                rec["title"].encode("utf8"),
                str(rec["timestamp_publish"])
            ))
    def retrieve(self, record):
        """Fetch ``record["url"]`` and fill in the record's image and body.

        Sets ``record["image_url"]`` from the page's ``og:image`` meta tag
        when one is present, ``record["body"]`` to the newline-joined string
        nodes of the article (an empty entry per ``<br>``), and
        ``record["body_markdown"]`` to ``convert_markdown`` of the article
        markup.

        Returns the same ``record`` dict, mutated in place.
        """
        soup = self.get(record["url"])

        # The lead image is advertised as
        # <meta property="og:image" content="https://.../image.jpg"/>.
        # A page without that tag (or without its content attribute) leaves
        # record["image_url"] untouched instead of raising.
        meta_image = soup.findAll(attrs={"property": "og:image"})
        if meta_image:
            image_url = meta_image[0].get("content")
            if image_url:
                record["image_url"] = image_url.encode('utf8')

        # Narrow the soup to the content section and drop <style> elements.
        soup = self.sanitize_soup(soup.find("section", {"id": "content"}))
        while soup.article.style is not None:
            soup.article.style.extract()

        text = []
        for elem in soup.article.recursiveChildGenerator():
            if isinstance(elem, types.StringTypes):
                text.append(self.html.unescape(elem.strip()))
            elif elem.name == 'br':
                # A <br> contributes an empty entry, i.e. an extra newline.
                text.append("")
        record["body"] = "\n".join(text)
        record['body_markdown'] = convert_markdown(str(soup.article))

        return record
Beispiel #3
0
    def retrieve(self, record):
        """Download the page behind ``record["url"]`` and populate the record.

        Fields written:

        * ``image_url`` -- the ``content`` of the ``og:image`` meta tag, when
          the page has one.
        * ``body`` -- the article's string nodes joined with newlines, with
          an empty entry for each ``<br>``.
        * ``body_markdown`` -- ``convert_markdown`` applied to the article
          markup.

        The ``record`` dict is modified in place and returned.
        """
        soup = self.get(record["url"])

        # Expected form:
        # <meta property="og:image" content="https://.../image.jpg"/>
        # If the tag or its content attribute is missing, image_url is left
        # as it was rather than failing with IndexError/KeyError.
        og_images = soup.findAll(attrs={"property": "og:image"})
        og_content = og_images[0].get("content") if og_images else None
        if og_content:
            record["image_url"] = og_content.encode('utf8')

        # From here on only the content section is of interest.
        soup = self.sanitize_soup(soup.find("section", {"id": "content"}))
        while soup.article.style is not None:
            soup.article.style.extract()

        lines = []
        for node in soup.article.recursiveChildGenerator():
            if isinstance(node, types.StringTypes):
                lines.append(self.html.unescape(node.strip()))
            elif node.name == 'br':
                # Line breaks become empty entries in the joined text.
                lines.append("")
        record["body"] = "\n".join(lines)
        record['body_markdown'] = convert_markdown(str(soup.article))

        return record
Beispiel #4
0
    def go(self):
        """Scrape the listing page at ``self.url`` and store unseen articles.

        Every ``<article>`` inside ``<section id="content">`` becomes a record
        dict whose body is fetched through ``self.retrieve_article``.  Records
        whose ``article_id`` is not yet known to ``self.article_provider`` are
        created and passed to ``self.push_provider.create_by_foreign_model``.
        """
        soup = self.get(self.url)
        content = soup.find("section", {"id": "content"})
        for article in content.findAll("article"):
            excerpt = article.find("div", {"class": "excerpt"}).p
            rec = {
                "article_id": article['id'],
                "image_url": "",
                "body": "",
                "timestamp_publish":
                    self.choose_publish_date(article.time["datetime"]),
                "site": "berniesanders.com",
                "lang": "en",
                "article_type": "DemocracyDaily",
                "excerpt": self.html.unescape(excerpt.text),
                "title": self.html.unescape(article.h2.text),
                "article_category":
                    self.html.unescape(article.h1.string.strip()),
                "url": article.h2.a["href"],
            }
            if article.img is not None:
                rec["image_url"] = article.img["src"]

            text, html, image = self.retrieve_article(rec["url"])
            if text and not html:
                # Text without markup is stored as an external link.
                rec["body"] = text
                rec["body_markdown"] = text
                rec['article_type'] = "ExternalLink"
            elif text and html:
                rec["body"] = text
                rec['body_markdown'] = convert_markdown(html)
                # No exit() here: the loop must continue so that the record
                # is stored and the remaining articles are processed.
                #
                # rec always carries an "image_url" key, so test for an empty
                # value (not for KeyError) and store the fallback on the
                # record rather than on the <article> tag.
                if not rec["image_url"] and image:
                    rec["image_url"] = image

            if self.article_provider.exists_by_article_id(rec["article_id"]):
                print("found")
                continue

            print("not found")
            result = self.article_provider.create(rec)
            self.push_provider.create_by_foreign_model(result)
            # Log only on insert; there is nothing to report for known ids.
            logging.info(
                "Inserting '{0}', created {1}".format(
                    rec["title"].encode("utf8"),
                    str(rec["timestamp_publish"])))
Beispiel #5
0
    def go(self):
        """Scrape the listing page at ``self.url`` and store unseen news items.

        Every ``<article>`` inside ``<section id="content">`` becomes a record
        dict; its type is derived from the category heading and its body is
        fetched through ``self.retrieve_article``.  Records whose ``news_id``
        is not yet known to ``self.news_provider`` are created and passed to
        ``self.push_provider.create_by_foreign_model``.
        """
        soup = self.get(self.url)
        content = soup.find("section", {"id": "content"})
        for article in content.findAll("article"):
            rec = {
                "news_id": article['id'],
                "image_url": "",
                # Body fields default to empty so the record is complete even
                # when the article page yields no text.
                "body": "",
                "body_html": "",
                "body_html_nostyle": "",
                "timestamp_publish": parser.parse(article.time["datetime"]),
                "site": "berniesanders.com",
                "lang": "en",
                "title": self.html.unescape(article.h2.text),
                "news_category": self.html.unescape(article.h1.string.strip()),
                "url": article.h2.a["href"],
            }
            if article.img is not None:
                rec["image_url"] = article.img["src"]

            # Pull excerpt if available (article.p is None when there is no
            # paragraph, which makes the .text access raise AttributeError).
            try:
                rec["excerpt_html"] = str(article.p)
                rec["excerpt"] = self.html.unescape(article.p.text)
            except AttributeError:
                rec["excerpt"], rec["excerpt_html"] = "", ""

            # Determine type from the category heading.
            category = rec['news_category'].lower()
            if category in ["on the road", "news"]:
                rec['news_type'] = "News"
            elif category == "press release":
                rec['news_type'] = "PressRelease"
            else:
                rec['news_type'] = "Unknown"

            text, html, image = self.retrieve_article(rec["url"])
            if text and not html:
                # Text without markup is stored as an external link.
                rec["body"], rec["body_html"] = text, text
                rec['news_type'] = "ExternalLink"
            elif text and html:
                rec["body"], rec["body_html"] = text, html

                no_style = self.remove_style(BeautifulSoup(html))
                rec["body_html_nostyle"] = "".join(
                    [str(p) for p in no_style.findAll("p")])

                # rec always carries an "image_url" key, so test for an empty
                # value (not for KeyError) and store the fallback on the
                # record rather than on the <article> tag.
                if not rec["image_url"] and image:
                    rec["image_url"] = image

            # Nothing to convert when no body markup was retrieved.
            rec['body_markdown'] = (
                convert_markdown(rec['body_html']) if rec['body_html'] else "")

            if self.news_provider.exists_by_news_id(rec["news_id"]):
                print("found")
                continue

            print("not found")
            result = self.news_provider.create(rec)
            self.push_provider.create_by_foreign_model(result)
            # Log only on insert; there is nothing to report for known ids.
            logging.info("Inserting '{0}', created {1}".format(
                rec["title"].encode("utf8"),
                str(rec["timestamp_publish"])
            ))
Beispiel #6
0
    def go(self):
        """Scrape the listing page at ``self.url`` and store unseen news items.

        The record language is read from the ``lang`` attribute of the page's
        ``<html>`` element, falling back to ``'en'`` when the attribute is
        absent.  Every ``<article>`` inside ``<section id="content">`` becomes
        a record dict; its type is derived from the category heading and its
        body is fetched through ``self.retrieve_article``.  Records whose
        ``news_id`` is not yet known to ``self.news_provider`` are created and
        passed to ``self.push_provider.create_by_foreign_model``.
        """
        soup = self.get(self.url)
        try:
            lang = soup.html['lang']
        except KeyError:
            lang = 'en'
        content = soup.find("section", {"id": "content"})
        for article in content.findAll("article"):
            rec = {
                "news_id": article['id'],
                "body": "",
                "image_url": "",
                "timestamp_publish":
                    self.choose_publish_date(article.time["datetime"]),
                "site": "berniesanders.com",
                "lang": lang,
                "title": self.html.unescape(article.h2.text),
                "news_category":
                    self.html.unescape(article.h1.string.strip()),
                "url": article.h2.a["href"],
            }
            if article.img is not None:
                rec["image_url"] = article.img["src"]

            # Pull excerpt if available (article.p is None when there is no
            # paragraph, which makes the .text access raise AttributeError).
            try:
                rec["excerpt"] = self.html.unescape(article.p.text)
            except AttributeError:
                rec["excerpt"] = ""

            # Determine type from the category heading.
            category = rec['news_category'].lower()
            if category in ["on the road", "news"]:
                rec['news_type'] = "News"
            elif category in ["press release", "comunicados de prensa"]:
                rec['news_type'] = "PressRelease"
            else:
                rec['news_type'] = "Unknown"

            text, html, image = self.retrieve_article(rec["url"])
            if text and not html:
                # Text without markup is stored as an external link.
                rec["body"], rec["body_markdown"] = text, text
                rec['news_type'] = "ExternalLink"
            elif text and html:
                rec["body"] = text
                rec['body_markdown'] = convert_markdown(html)
                # rec always carries an "image_url" key, so test for an empty
                # value (not for KeyError) and store the fallback on the
                # record rather than on the <article> tag.
                if not rec["image_url"] and image:
                    rec["image_url"] = image

            if self.news_provider.exists_by_news_id(rec["news_id"]):
                print("found")
                continue

            print("not found")
            result = self.news_provider.create(rec)
            self.push_provider.create_by_foreign_model(result)
            # Log only on insert; there is nothing to report for known ids.
            logging.info(
                "Inserting '{0}', created {1}".format(
                    rec["title"].encode("utf8"),
                    str(rec["timestamp_publish"])))