def feeds_to_trends(feeds):
    """Mine each newsfeed and print the top keywords ("trends") per day.

    feeds: an iterable of dicts, each having a 'feed_url' key.
    For every feed, stories are grouped per day, each day is turned into one
    Document, and the tf-idf keywords of each day are printed.
    Feeds whose URL raises HTTP404NotFound are reported and skipped.
    """
    # NOTE: single-argument print(x) works identically under Python 2 and 3.
    for url in feeds:
        url = url['feed_url']
        news = {}
        try:
            for story in Newsfeed().search(url, cached=False):
                # datetext() is assumed to return (date-string, plain-text
                # description) — defined elsewhere in the project; verify.
                d, s = datetext(story.date, story.description)
                # Each key in the news dictionary is a date: news is grouped per day.
                # Each value is a dictionary of id => story items.
                # We use hash(story.description) as a unique id to avoid
                # duplicate content.
                news.setdefault(d, {})[hash(s)] = s
            m = Model()
            for date, stories in news.items():
                s = stories.values()
                s = ' '.join(s).lower()
                # Each day of news is a single document.
                # By adding all documents to a model we can calculate tf-idf.
                m.append(Document(s, stemmer=LEMMA, exclude=['news', 'day'], name=date))
            for document in m:
                print(document.name)
                print(document.keywords(top=10))
        except HTTP404NotFound:
            # Best-effort: report the broken feed URL and move on.
            print(url)
            pass
def article_titles(feeds):
    """Return a dict mapping each key in *feeds* to its list of article titles.

    feeds: a dict of key => newsfeed URL.
    Each feed URL is queried with Newsfeed().search(); the .title of every
    result is collected in order.
    """
    titles = {}
    for key in feeds:
        # One search per feed; a comprehension replaces the manual append loop.
        titles[key] = [result.title for result in Newsfeed().search(feeds[key])]
    return titles
import os, sys
sys.path.insert(0, os.path.join("..", "..", ".."))

from pattern.web import Newsfeed, plaintext, URL
from pattern.table import date

# This example reads a given RSS or Atom newsfeed channel.
# Some sample newsfeeds to try out:
NATURE  = "http://www.nature.com/nature/current_issue/rss/index.html"
SCIENCE = "http://www.sciencemag.org/rss/podcast.xml"
HERALD  = "http://www.iht.com/rss/frontpage.xml"
TIME    = "http://feeds.feedburner.com/time/topstories"
CNN     = "http://rss.cnn.com/rss/edition.rss"

# Print title, description, URL and date of each item in the CNN feed.
# Single-argument print(x) behaves identically under Python 2 and 3.
engine = Newsfeed()
for result in engine.search(CNN, cached=True):
    print(result.title.upper())
    print(plaintext(result.description))  # Remove HTML formatting.
    print(result.url)
    print(result.date)
    print("")

# Newsfeed item URL's lead to the page with the full article.
# Since this page can have any kind of formatting, there is no default way to read it,
# but we can simply download the source HTML and convert it to plain text:
#html = URL(result.url).download()
#print(plaintext(html))
# The resulting text can contain a lot of garbage.
# A better way to do this is to use a DOM parser and select the HTML elements we want.
import os, sys; sys.path.insert(0, os.path.join("..", "..", ".."))
from pattern.web import Newsfeed, plaintext, URL
from pattern.table import date

# This example reads a given RSS or Atom newsfeed channel.
# Some sample newsfeeds to try out:
NATURE  = "http://www.nature.com/nature/current_issue/rss/index.html"
SCIENCE = "http://www.sciencemag.org/rss/podcast.xml"
HERALD  = "http://www.iht.com/rss/frontpage.xml"
TIME    = "http://feeds.feedburner.com/time/topstories"
CNN     = "http://rss.cnn.com/rss/edition.rss"

# Print title, description, URL and date of each item in the CNN feed.
# Single-argument print(x) behaves identically under Python 2 and 3.
engine = Newsfeed()
for result in engine.search(CNN, cached=True):
    print(result.title.upper())
    print(plaintext(result.description))  # Remove HTML formatting.
    print(result.url)
    print(result.date)
    print("")

# Newsfeed item URL's lead to the page with the full article.
# Since this page can have any kind of formatting, there is no default way to read it,
# but we can simply download the source HTML and convert it to plain text:
#html = URL(result.url).download()
#print(plaintext(html))
# The resulting text can contain a lot of garbage.
# A better way to do this is to use a DOM parser and select the HTML elements we want.
# This is demonstrated in the next example.
# NOTE(review): truncated, whitespace-mangled fragment of a news-scraping script:
# the tail of a genre => feed-URL dict, a Datasheet CSV cache (seen URLs in
# column 0), and a per-<p> article-text extraction loop that is cut off after
# `s = s.strip()`. Both ends of the enclosing structures are outside this view,
# so the code is left byte-identical; restore the original line breaks from the
# upstream source before editing.
'geeky': 'http://feeds.feedburner.com/daily-star-Tech', 'dubious': 'http://feeds.feedburner.com/daily-star-Weird-News', 'vulgar': 'http://feeds.feedburner.com/daily-star-Love-Sex', } PATH = pd('..', 'data', 'news2.csv') # pd = parent directory of this script try: csv = Datasheet.load(PATH) seen = set(csv.columns[0]) except: csv = Datasheet() seen = set() for genre, url in feeds.items(): for r in Newsfeed().search(url, cached=False): if r.url not in seen: print r.title print try: src = URL(r.url).download(cached=True) dom = DOM(src) txt = [] # Daily Star has untidy HTML markup. # Collect the article <p> by <p>. for p in dom('.story-content p'): if p.parent.tag == 'blockquote': continue s = plaintext(p) s = s.strip()
# NOTE(review): truncated, whitespace-mangled fragment of a labeled-sources
# scraper: the tail of a (label, name) => feed-URL dict, a Datasheet CSV cache
# (column -2 used as URL id), and a per-feed download/parse loop that is cut
# off mid `for selector in (`. Both ends of the enclosing structures are
# outside this view, so the code is left byte-identical; restore the original
# line breaks from the upstream source before editing.
(-1, 'ejbron.wordpress.com'): 'https://ejbron.wordpress.com/feed/' } PATH = pd('news.csv') try: csv = Datasheet.load(PATH) seen = set(csv.columns[-2]) # use url as id except: csv = Datasheet() seen = set() for (label, name), url in sources.items(): try: f = Newsfeed() f = f.search(url, cached=False) except: continue for r in f: # 1) Download source & parse the HTML tree: try: src = URL(r.url).download(cached=True) dom = DOM(src) except Exception as e: continue # 2) Find article text w/ CSS selectors: for selector in (
# NOTE(review): truncated, whitespace-mangled fragment of a Flask app that
# reads several RSS feeds and sends each item's text to AlchemyAPI for entity
# extraction. The script is cut off inside `for entity in response["entities"]:`
# (loop body missing), so the code is left byte-identical; restore the original
# line breaks from the upstream source before editing.
#!/usr/bin/env python # -*- coding: utf-8 -*- from flask import Flask from flask import render_template from pattern.web import Newsfeed, plaintext from alchemyapi import AlchemyAPI app = Flask(__name__) reader = Newsfeed() alchemyapi = AlchemyAPI() RSS_LIST = [ (u"Lifehacker", "http://feeds.gawker.com/lifehacker/vip"), (u"The Verge", "http://www.theverge.com/rss/index.xml"), (u"Naukas", "http://naukas.com/feed/"), (u"Zen Habits", "http://feeds.feedburner.com/zenhabits?format=xml"), (u"Yuri", "http://www.lapizarradeyuri.com/feed/"), (u"Menéame", "http://www.meneame.net/rss") ] items = [] for feed in RSS_LIST: feedlist = [] for result in reader.search(feed[1])[:10]: clean_text = plaintext(result.text) response = alchemyapi.entities("text", result.text) entities = [] for entity in response["entities"]:
# NOTE(review): truncated, whitespace-mangled fragment of a Google News trend
# miner: stories are fetched from the Google News RSS feed, grouped per day
# keyed on hash(description) to drop duplicates, and fed into a pattern.vector
# Model for tf-idf. The fragment is cut off inside
# `for date, stories in news.items():` (loop body missing), so the code is left
# byte-identical; restore the original line breaks from the upstream source
# before editing.
# techniques like Artificial Intelligence (AI), Machine Learning (ML), mathematical functions, and # statistical algorithms. # Pattern is a web mining module for the Python programming language. # It has tools for data mining (Google, Twitter and Wikipedia API, a web crawler, a HTML DOM parser), natural # language processing (part-of-speech taggers, n-gram search, sentiment analysis, WordNet), machine learning # (vector space model, clustering, SVM), network analysis and <canvas> visualization. # Web mining. # A simple web mining technique. from pattern.web import Newsfeed, plaintext from pattern.db import date from pattern.vector import Model, Document, LEMMA news, url = {}, 'http://news.google.com/news?output=rss' for story in Newsfeed().search(url, cached=False): d = str(date(story.date, format='%Y-%m-%d')) s = plaintext(story.description) # Each key in the news dictionary is a date: news is grouped per day. # Each value is a dictionary of id => story items. # We use hash(story.description) as a unique id to avoid duplicate content. news.setdefault(d, {})[hash(s)] = s # Your code will probably have some preprocessing steps to save and load the mined news updates. m = Model() for date, stories in news.items():
import os, sys; sys.path.append(os.path.join("..", "..", ".."))
from pattern.web import Newsfeed, plaintext, URL
from pattern.table import date

# Wall Street Journal newsfeed channel.
wsj = "http://online.wsj.com/xml/rss/3_7014.xml"

# Print title, description, URL and date of each item in the feed.
# Single-argument print(x) behaves identically under Python 2 and 3.
engine = Newsfeed()
for result in engine.search(wsj, cached=True):
    print(result.title.upper())
    print(plaintext(result.description))  # Remove HTML formatting.
    print(result.url)
    print(result.date)
    print("")