# Example 1
sys.path.insert(0, os.path.join("..", "..", ".."))

from pattern.web import Newsfeed, plaintext, URL
from pattern.table import date

# This example reads a given RSS or Atom newsfeed channel.
# Some sample newsfeeds to try out:
NATURE = "http://www.nature.com/nature/current_issue/rss/index.html"
SCIENCE = "http://www.sciencemag.org/rss/podcast.xml"
HERALD = "http://www.iht.com/rss/frontpage.xml"
TIME = "http://feeds.feedburner.com/time/topstories"
CNN = "http://rss.cnn.com/rss/edition.rss"

engine = Newsfeed()

for result in engine.search(CNN, cached=True):
    print result.title.upper()
    print plaintext(result.description)  # Remove HTML formatting.
    print result.url
    print result.date
    print

# Newsfeed item URL's lead to the page with the full article.
# Since this page can have any kind of formatting, there is no default way to read it,
# but we can simply download the source HTML and convert it to plain text:
#html = URL(result.url).download()
#print plaintext(html)

# The resulting text can contain a lot of garbage.
# An better way to do this is to use a DOM parser and select the HTML elements we want.
# This is demonstrated in the next example.
# Example 2
import os, sys; sys.path.insert(0, os.path.join("..", "..", ".."))

from pattern.web   import Newsfeed, plaintext, URL
from pattern.table import date

# This example reads a given RSS or Atom newsfeed channel.
# Some sample newsfeeds to try out:
NATURE  = "http://www.nature.com/nature/current_issue/rss/index.html"
SCIENCE = "http://www.sciencemag.org/rss/podcast.xml"
HERALD  = "http://www.iht.com/rss/frontpage.xml"
TIME    = "http://feeds.feedburner.com/time/topstories"
CNN     = "http://rss.cnn.com/rss/edition.rss"

engine = Newsfeed()

for result in engine.search(CNN, cached=True):
    print result.title.upper()
    print plaintext(result.description) # Remove HTML formatting.
    print result.url
    print result.date
    print

# Newsfeed item URL's lead to the page with the full article.
# Since this page can have any kind of formatting, there is no default way to read it,
# but we can simply download the source HTML and convert it to plain text:
#html = URL(result.url).download()
#print plaintext(html)

# The resulting text can contain a lot of garbage.
# An better way to do this is to use a DOM parser and select the HTML elements we want.
# This is demonstrated in the next example.
# Example 3
# Pull the latest 10 items from each feed and run AlchemyAPI entity
# extraction on the article text, collecting (text, dbpedia_uri) pairs.
alchemyapi = AlchemyAPI()

RSS_LIST = [
  (u"Lifehacker", "http://feeds.gawker.com/lifehacker/vip"),
  (u"The Verge", "http://www.theverge.com/rss/index.xml"),
  (u"Naukas", "http://naukas.com/feed/"),
  (u"Zen Habits", "http://feeds.feedburner.com/zenhabits?format=xml"),
  (u"Yuri", "http://www.lapizarradeyuri.com/feed/"),
  (u"Menéame", "http://www.meneame.net/rss")
]

items = []

for feed in RSS_LIST:
  feedlist = []
  for result in reader.search(feed[1])[:10]:
    clean_text = plaintext(result.text)
    # NOTE(review): the raw result.text (not clean_text) is sent to the
    # entity extractor -- confirm this is intended.
    response = alchemyapi.entities("text", result.text)

    entities = []
    # .get() guards against an error response that lacks "entities".
    for entity in response.get("entities", []):
      # `in` replaces the deprecated dict.has_key(); .get() tolerates a
      # disambiguated entity without a "dbpedia" link.
      if "disambiguated" in entity:
        dbpedia_uri = entity["disambiguated"].get("dbpedia")
      else:
        dbpedia_uri = None
      entities.append((entity["text"], dbpedia_uri))

    feedlist.append(dict(title=result.title, url=result.url, text=clean_text, entities=entities))
  items.append(dict(site=feed[0], feedlist=feedlist))
@app.route('/')
# Example 4
    'https://ejbron.wordpress.com/feed/'
}

PATH = pd('news.csv')

# Resume from the previously collected articles, or start a fresh sheet.
try:
    csv = Datasheet.load(PATH)
    seen = set(csv.columns[-2])  # use url as id
# Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit still
# propagate; any load failure (e.g. missing file) falls back to empty state.
except Exception:
    csv = Datasheet()
    seen = set()

# Poll every configured feed; `label`/`name` come from the keys of the
# (not shown here) `sources` mapping, `url` is the feed address.
for (label, name), url in sources.items():
    try:
        f = Newsfeed()
        f = f.search(url, cached=False)
    # NOTE(review): bare except silently skips a feed on ANY error
    # (including KeyboardInterrupt) -- consider `except Exception`.
    except:
        continue

    for r in f:

        # 1) Download source & parse the HTML tree:
        try:
            src = URL(r.url).download(cached=True)
            dom = DOM(src)
        except Exception as e:
            # Unreachable/unparsable article page: move on to the next item.
            continue

        # 2) Find article text w/ CSS selectors:
        # (selector list is truncated here -- continues beyond this excerpt)
        for selector in (
                "article[class*='node-article']",  # The Hill
import os, sys; sys.path.append(os.path.join("..", "..", ".."))

from pattern.web   import Newsfeed, plaintext, URL
from pattern.table import date

wsj    = "http://online.wsj.com/xml/rss/3_7014.xml"

engine = Newsfeed()

for result in engine.search(wsj, cached=True):
    print result.title.upper()
    print plaintext(result.description) # Remove HTML formatting.
    print result.url
    print result.date
    print