Example #1
def get_news_by_url(url):
    news = News()
    try:
        soup = BeautifulSoup(urllib2.urlopen(url))

        #title
        title = soup.find("div", "pg-story-head md").find("h2").text
        news.set_title(title)

        #postTime
        author_posttime = soup.find("p", "dateline").text.replace("\n","").lower().replace("\t","").split("/")
        post_time = author_posttime[1].replace("pm", "").replace("am", "").strip()
        
        t_format = "%d %b %Y, %I:%M"
        post_time = datetime.strptime(post_time, t_format).isoformat()
        news.set_posttime(post_time)

        #author
        author = author_posttime[0]
        news.set_author(author)

        #url
        news.set_url(url)

        #date
        date = datetime.utcnow().isoformat()
        news.set_date(date)

        #source
        source = 'elfinancierocr'
        news.set_source(source)

        #content
        paragraphs = soup.find("div", "pg-story-body mce").find_all('p')
        content = " ".join([unicode(p.text) for p in paragraphs])
        news.set_content(content)

        #encoding
        encoding = 'utf-8'
        news.set_encoding(encoding)

        news.news = message.add_embers_ids(news.news)

        return news.news
    except Exception:
        log.exception("Exception when extracting %s %s" % (url, sys.exc_info()[0]))
        return None
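
A minimal, self-contained sketch of the dateline handling above, using a made-up dateline string (the actual elfinancierocr markup may differ):

from datetime import datetime

# hypothetical dateline text as it might appear in <p class="dateline">
dateline = "Maria Perez / 12 Mar 2014, 10:30 pm"

# same normalization as above: strip whitespace noise, lowercase, split on "/"
author_posttime = dateline.replace("\n", "").lower().replace("\t", "").split("/")
author = author_posttime[0].strip()                                    # "maria perez"
post_time = author_posttime[1].replace("pm", "").replace("am", "").strip()

t_format = "%d %b %Y, %I:%M"
print(datetime.strptime(post_time, t_format).isoformat())              # 2014-03-12T10:30:00

Note that the am/pm marker is stripped and the format string has no %p, so afternoon times are stored with their 12-hour value; any strptime failure falls into the broad except in the scraper and the function returns None.
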
Example #2
def get_news_by_url(url):
    news = News()
    try:
        soup = BeautifulSoup(urllib2.urlopen(url))

        # title
        title = soup.find_all("h1")[0].text
        news.set_title(title)

        # postTime
        post_time = soup.select('meta[name="REVISION_DATE"]')[0]["content"]
        t_format = "%a %b %d %H:%M:%S %Z %Y"
        post_time = datetime.strptime(post_time, t_format).isoformat()
        news.set_posttime(post_time)

        # author
        author = soup.select('meta[name="Author"]')[0]["content"]
        news.set_author(author)

        # url
        news.set_url(url)

        # date
        date = datetime.utcnow().isoformat()
        news.set_date(date)

        # source
        source = "lta_reuters"
        news.set_source(source)

        # content
        paragraphs = soup.find(id="resizeableText").find_all("p")
        content = " ".join([unicode(p.text) for p in paragraphs])
        news.set_content(content)

        # encoding
        encoding = "utf-8"
        news.set_encoding(encoding)

        news.news = message.add_embers_ids(news.news)

        return news.news
    except Exception:
        log.exception("Exception when extracting %s %s" % (url, sys.exc_info()[0]))
        return None
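
A minimal, self-contained sketch of the REVISION_DATE handling above, run against a made-up meta tag (the real lta_reuters markup may differ):

from datetime import datetime
from bs4 import BeautifulSoup

html = '<head><meta name="REVISION_DATE" content="Tue Mar 11 14:02:03 GMT 2014"></head>'
soup = BeautifulSoup(html, "html.parser")

# CSS attribute selector pulls the meta tag, then its content attribute
post_time = soup.select('meta[name="REVISION_DATE"]')[0]["content"]
t_format = "%a %b %d %H:%M:%S %Z %Y"
print(datetime.strptime(post_time, t_format).isoformat())   # 2014-03-11T14:02:03

One caveat with this format string: strptime's %Z only reliably matches "UTC", "GMT", and the local timezone abbreviations, and it does not attach tzinfo; any other zone name raises ValueError, which the scraper's except block turns into a None return.
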