Esempio n. 1
0
from util import console, text
from yos.boss import ysearch
from yos.yql import db

# Yahoo news results for the query, requested with the "news.ranking"
# option set to "date"; the "headline" column is renamed to "title".
ynews_data = ysearch.search_v1(
  "google android",
  vertical="news",
  count=100,
  more={"news.ranking": "date"}
)
ynews = db.create(name="ynews", data=ynews_data)
ynews.rename(before="headline", after="title")

# Summize search results; the "text" column is renamed to "title".
sm = db.create(
  name="sm",
  url="http://summize.com/search.json?q=google+android&rpp=60&lang=en"
)
sm.rename(before="text", after="title")

def ytf(r):
  """Reduce a YouTube feed row to its title and integer favorite count."""
  title = r["title"]["value"]
  favorites = int(r["statistics"]["favoriteCount"])
  return {"title": title, "favorites": favorites}

yt = db.select(
  name="yt",
  udf=ytf,
  url="http://gdata.youtube.com/feeds/api/videos?vq=google+android&lr=en&orderby=published"
)

def diggf(r):
  """Reduce a digg feed row to its title and integer digg count."""
  title = r["title"]["value"]
  diggs = int(r["diggCount"]["value"])
  return {"title": title, "diggs": diggs}

# The table is named "dg", which is the prefix used for its columns later
# on (e.g. "dg$diggs").
digg = db.select(
  name="dg",
  udf=diggf,
  url="http://digg.com/rss_search?search=google+android&area=dig&type=both&section=news"
)

def overlap_predicate(r1, r2):
  """Return whether text.overlap of the two rows' "title" values exceeds 2."""
  shared = text.overlap(r1["title"], r2["title"])
  return shared > 2

# Join the four source tables, handing db.join overlap_predicate as the
# row-matching callback (presumably the join condition -- confirm in db).
tb = db.join(
  overlap_predicate,
  [ynews, sm, digg, yt]
)

def socialf(row):
  """Add a "social" entry equal to the row's digg count plus favorites.

  The row is updated in place and the same object is returned.
  """
  total = row["dg$diggs"] + row["yt$favorites"]
  row.update({"social": total})
  return row

# Run socialf over every joined row so each one carries a "social" value.
tb = db.select(udf=socialf, table=tb)

# Group by news title and combine the "social" values by addition.
# "as" is a reserved word from Python 2.6 on, so writing as="rank" directly
# is a SyntaxError there; ** unpacking hands db.group the very same keyword
# argument on every Python version.
tb = db.group(by=["ynews$title"], key="social", reducer=lambda d1, d2: d1 + d2,
              table=tb, norm=text.norm, **{"as": "rank"})
tb = db.sort(key="rank", table=tb)

# One entry per row: tweet timestamp, news title, tweet text and rank.
for r in tb.rows:
  console.write( "\n%s\n[y] %s\n[t] %s\n[sr] %d\n" % (r["sm$created_at"], r["ynews$title"], r["sm$title"], r["rank"]) )
Esempio n. 2
0
"""
Sort by the overlap sizes
This could potentially be a new freshness model, based on the idea that Wikipedia is updated for recent significance
"""

__author__ = "BOSS Team"

from util import console, text
from yos.boss import ysearch
from yos.yql import db

# News-bucket search results for the query, loaded as table "yn".
yn_results = ysearch.search("iphone sdk", bucket="news", count=50)
yn = db.create(name="yn", data=yn_results)

# Atom feed of the Wikipedia page history, loaded as table "wiki".
wiki = db.create(
  name="wiki",
  url="http://en.wikipedia.org/w/index.php?title=IPhone_OS&feed=atom&action=history"
)

# Combine the two tables with db.cross.
tb = db.cross([yn, wiki])

def rankf(row):
  """Store text.overlap of the news abstract and wiki summary under "rank".

  The row is updated in place and the same object is returned.
  """
  score = text.overlap(row["yn$abstract"], row["wiki$summary"]["value"])
  row.update({"rank": score})
  return row

# Run rankf over every crossed row so each one carries a "rank" value.
tb = db.select(udf=rankf, table=tb)

# Group by news title and combine the "rank" values by addition.
# "as" is a reserved word from Python 2.6 on, so writing as="total" directly
# is a SyntaxError there; ** unpacking hands db.group the very same keyword
# argument on every Python version.
tb = db.group(by=["yn$title"], key="rank", reducer=lambda d1, d2: d1 + d2,
              table=tb, norm=text.norm, **{"as": "total"})
tb = db.sort(key="total", table=tb)

# print(...) with a single parenthesized string emits the same text whether
# print is a statement (Python 2) or a function (Python 3).
print("Before\n")
for r in yn.rows:
  console.write( "[news] %s\n" % r["yn$title"] )

print("After\n")
for r in tb.rows:
  console.write( "[news] %s\n[source] %s\t[rank] %d\n" % (r["yn$title"], r["yn$source"], r["total"]) )

Esempio n. 3
0

"""
Search Yahoo News and Twitter for "facebook"
Combine the results with the Techmeme feed, matching rows whose titles overlap by at least 2 terms
Print the results to stdout
"""

__author__ = "BOSS Team"

from util import console, text
from yos.yql import db, udfs
from yos.boss import ysearch

# Yahoo news results for the query; "headline" is renamed to "title".
gn_results = ysearch.search_v1("facebook", vertical="news", count=40)
gn = db.create(name="gn", data=gn_results)
gn.rename("headline", "title")

# Twitter search results; "text" is renamed to "title".
sm = db.create(
  name="sm",
  url="http://search.twitter.com/search.json?q=facebook&rpp=40"
)
sm.rename("text", "title")

# Techmeme firehose feed, with udfs.unnest_value applied to each row.
tm = db.select(
  name="tm",
  udf=udfs.unnest_value,
  url="http://techmeme.com/firehose.xml"
)

def overlap(r1, r2):
  """Return whether text.overlap of the two rows' "title" values exceeds 1."""
  shared = text.overlap(r1["title"], r2["title"])
  return shared > 1

# Join the three tables with overlap as the row-matching callback, then
# sort the result on the "sm$id" column.
j = db.join(
  overlap,
  [gn, sm, tm]
)
j = db.sort(key="sm$id", table=j)

# One entry per joined row: tweet timestamp followed by the three titles.
for r in j.rows:
  fields = (r["sm$created_at"], r["gn$title"], r["sm$title"], r["tm$title"])
  console.write( "\n%s\n[yahoo] %s\n[twitter] %s\n[techmeme] %s\n" % fields )
Esempio n. 4
0
  for m in ["jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sept", "oct", "nov", "dec"]:
    if s.startswith(m):
      return m

def parse_month(s):
  """Return text.norm of the first month found in s, or None.

  month_lookup is applied to every value of text.uniques(s); None results
  are dropped and the first remaining value is normalized and returned.
  """
  found = [m for m in map(month_lookup, text.uniques(s)) if m is not None]
  if not found:
    return None
  return text.norm(found[0])

def parse_year(s):
  """Return text.norm of the first year-like token in s, or None.

  A token of text.uniques(s) counts as a year when it is exactly four
  characters long and starts with "19" or "200".
  """
  # The length check must guard both prefixes. The previous condition,
  # `len(t) == 4 and t.startswith("19") or t.startswith("200")`, parsed as
  # `(len(t) == 4 and starts-with-19) or starts-with-200`, so any token
  # beginning with "200" was accepted whatever its length.
  # A list (rather than filter) keeps the emptiness test valid on Python 3 too.
  years = [t for t in text.uniques(s)
           if len(t) == 4 and t.startswith(("19", "200"))]
  if not years:
    return None
  return text.norm(years[0])

def date_udf(r):
  """Build a {"year", "month", "count"} record from a row's abstract.

  "year" and "month" come from parse_year / parse_month applied to
  r["abstract"]; "count" is always 1.
  """
  year = parse_year(r["abstract"])
  month = parse_month(r["abstract"])
  return {"year": year, "month": month, "count": 1}

# The maximum fetch size in v1 is 50, so make two calls -- the second one
# with start=50 -- to collect the first 100 results.
i1 = db.select(name="i1", udf=date_udf, data=ysearch.search("when was jfk assasinated", count=50))
i2 = db.select(name="i2", udf=date_udf, data=ysearch.search("when was jfk assasinated", start=50, count=50))

# The union is named "iraq"; its columns are read back below with the
# "iraq$" prefix.
iraq = db.union(name="iraq", tables=[i1, i2])

# Group by (year, month) and combine the per-row counts by addition.
# "as" is a reserved word from Python 2.6 on, so writing as="total" directly
# is a SyntaxError there; ** unpacking hands db.group the very same keyword
# argument on every Python version.
dates = db.group(by=["iraq$year", "iraq$month"], key="iraq$count", reducer=lambda d1, d2: d1 + d2,
                 table=iraq, **{"as": "total"})
dates = db.sort(key="total", table=dates)

for row in dates.rows:
  month = row["iraq$month"]
  year = row["iraq$year"]
  # Only report groups where both a month and a year were parsed.
  if month is not None and year is not None:
    console.write( "Month: %s\tYear: %s\tTotal: %d\n" % (month, year, row["total"]) )