def profData(name):
    dm_data = ysearch.search("Professor " + name, count=3)
    dmTb = db.create(name="prof", data=dm_data)
    ## Select the proper domain name from the given DATABASE keys(): the most-hit URL
    hitdic = {}
    for k in DATABASE.keys():
        hitdic[k] = 0
    #print "\nURL's returned: "
    for r in dmTb.rows:
        url = r["prof$url"]
        #print url
        ## for every domain saved,
        for k in DATABASE.keys():
            lt = k.split(".")
            urllt = url.split(".")
            for e in lt[1:2]:
                if e in urllt:
                    hitdic[k] += 1
    ## Report the domain with the maximum hits
    domain = findMax(hitdic)
    return [name, domain]

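# findMax and DATABASE are used by profData but are not defined in this snippet. A minimal
# sketch of what they are assumed to look like (names and sample values are hypothetical):
# DATABASE maps known university domains to stored metadata, and findMax returns the key
# with the highest hit count.
def findMax(hitdic):
    # return the domain with the most URL hits, or None for an empty dict
    best = None
    for k, v in hitdic.items():
        if best is None or v > hitdic[best]:
            best = k
    return best

# Example (assumed) usage:
# DATABASE = {"www.stanford.edu": {}, "www.mit.edu": {}}
# name, domain = profData("Andrew Ng")
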
def get(self):
    query = console.strfix(self.request.get("query"))
    count = int(console.strfix(self.request.get("count")))
    offset = int(console.strfix(self.request.get("offset")))
    data = ysearch.search(query, vertical="news", count=count, start=offset)
    images = db.create(data=data)
    serialized = simplejson.dumps(images.rows)
    self.response.out.write(serialized)

def whensearch(q):
    wt = db.select(udf=date_udf, data=ysearch.search(q, count=50))
    dates = db.group(by=["year", "month"], key="count", reducer=lambda x, y: x + y, aas="total", table=wt)
    dates = db.where(lambda r: r["month"] is not None and r["year"] is not None, table=dates)
    dates = db.sort(key="total", table=dates)
    if len(dates.rows) > 0:
        top = dates.rows[0]
        return top["month"], top["year"]
    return "None", "None"

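# A minimal usage sketch for whensearch, assuming date_udf (see the month/year parsing
# helpers later in this collection) is defined in the same module and ysearch is configured
# with a valid app id:
if __name__ == "__main__":
    month, year = whensearch("when was the berlin wall built")
    print "top answer: %s %s" % (month, year)
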
def get(self):
    query = console.strfix(self.request.get("query"))
    count = int(console.strfix(self.request.get("count")))
    offset = int(console.strfix(self.request.get("offset")))
    includeDelicious = console.strfix(self.request.get("includeDelicious"))
    search_results = ysearch.search(query, count=count, start=offset)
    web = db.create(data=search_results)
    if len(includeDelicious) == 4:  # presumably checking for the literal string "true" (length 4)
        dl = db.select(udfs.unnest_value, name="dl", url=u"http://feeds.delicious.com/rss/popular/%s" % query)
        web = db.join(overlap_predicate, [web, dl])
    serialized = simplejson.dumps(web.rows)
    self.response.out.write(serialized)

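# overlap_predicate is referenced above but not defined in this snippet. A plausible sketch,
# modeled on the title-overlap join in the techmeme example later in this collection; the
# two-term threshold and the use of util.text are assumptions:
def overlap_predicate(r1, r2):
    # keep a pair of rows when their titles share at least two terms
    return text.overlap(r1["title"], r2["title"]) > 1
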
def _download(self, start, count, sort=True):
    rows = []
    length = 0
    for i in xrange(0, count, 50):
        # xrange already steps in increments of 50, so the offset is i itself;
        # the original "offset = i * 50" would skip past the requested range.
        offset = i
        pos = start + offset
        num_results = min(count - offset, 50)
        data = ysearch.search(self.term, start=pos, count=num_results)
        length = data['ysearchresponse']['totalhits']
        rows = rows + db.create(data=data).rows
    self.length = int(length)
    self._cache(start, rows, sort=sort)
    return rows

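# The same 50-results-per-call paging pattern as a free-standing sketch (the helper name is
# ours, not part of the original code; it assumes ysearch and db are imported as above):
def fetch_pages(term, start, count):
    rows = []
    for offset in xrange(0, count, 50):
        data = ysearch.search(term, start=start + offset, count=min(count - offset, 50))
        rows += db.create(data=data).rows
    return rows
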
def get(self):
    rows = list()
    try:
        data = ysearch.search('', count=50, more={'sites': 'kajian.net,ilmoe.com,radiorodja.com,radiomuslim.com,salafiyunpad.wordpress.com,problemamuslim.wordpress.com', 'style': 'raw'})
        results = db.create(data=data)
        results = db.sort(key="date", table=results)
        rows = results.rows
    except DownloadError:
        pass
    template_values = {
        'rows': rows,
    }
    path = os.path.join(os.path.dirname(__file__), 'newest.html')
    self.response.out.write(template.render(path, template_values))

def get(self, site="", terms="", subject=""):
    tn = db.create("tn", data=ysearch.search(terms + " " + subject + " site:" + site, count=4, more={"filter": "-p**n", "type": "html"}))
    results = []
    for row in tn.rows:
        url = row["tn$url"]
        match = re.match("http://([^/]*)", url)
        if match is not None:
            title = un_unicode_string(row["tn$title"])
            abstract = un_unicode_string(row["tn$abstract"])
            result = Result(title, abstract, match.group(1), url)
            results.append(result)
    encoder = ResultJsonEncoder()
    self.response.out.write(encoder.encode(results))

def PubSearch(name, domain):
    pub = ysearch.search(name + " publications site:" + domain, count=2, more={'type': 'html', 'abstract': 'long'})
    pub = db.create(name="pub", data=pub)
    ## for each publication page extracted, count mentions per year
    yrs = ['2005', '2006', '2007', '2008', '2009', '2010', '2011']
    dic = {}
    for y in yrs:
        dic[y] = 0
    for r in pub.rows:
        url = r["pub$url"]
        print
        dic = reportPubsYrs(url, dic)
    return dic

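# reportPubsYrs is referenced above but not defined in this snippet. A rough sketch of what
# it is assumed to do: fetch the publications page and bump the counter for every year string
# found in it (the urllib2 fetch and plain substring counting are assumptions):
import urllib2

def reportPubsYrs(url, dic):
    try:
        html = urllib2.urlopen(url).read()
    except Exception:
        return dic
    for y in dic:
        dic[y] += html.count(y)
    return dic
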
def wwwsearch(q):
    qm = QueryMiner(q)
    def phrases_udf(r):
        r.update({"phrases": qm.extract(r)})
        return r
    pc = defaultdict(lambda: 1)
    def pc_udf(r):
        for p in r["phrases"]:
            pc[p] += 1
    w = db.select(udf=phrases_udf, data=ysearch.search(q, count=50))
    db.select(udf=pc_udf, table=w)
    if len(pc) > 0:
        return sorted(pc.iteritems(), key=itemgetter(1), reverse=True)[0][0]
    return "None"

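# The ranking step above simply counts how often each extracted phrase occurs across the 50
# results and returns the most frequent one. The same idea in isolation, on made-up sample
# data (QueryMiner itself is not shown in this collection):
from collections import defaultdict
from operator import itemgetter

phrase_rows = [["alexander graham bell"],
               ["alexander graham bell", "antonio meucci"],
               ["antonio meucci"],
               ["alexander graham bell"]]
counts = defaultdict(lambda: 1)
for phrases in phrase_rows:
    for p in phrases:
        counts[p] += 1
top = sorted(counts.items(), key=itemgetter(1), reverse=True)[0][0]
# top == "alexander graham bell"
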
def getHomePage(name, domain):
    ## Truncate the domain to drop the leading "www."
    dm = domain.split(".")
    domainsplt = dm[1]
    for dstr in dm[2:]:
        domainsplt += "." + dstr
    domain = domainsplt
    hm = ysearch.search("professor " + name + " site:" + domain, count=1, more={'type': 'html'})
    hm = db.create(name="hm", data=hm)
    ## assume a single home page
    for r in hm.rows:
        url = r["hm$url"]
        return url
    return None

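# profData, getHomePage and PubSearch appear to form a small pipeline; a sketch of how they
# would be chained (the function name and the example professor are ours):
def profPublicationSummary(name):
    name, domain = profData(name)          # most-hit known domain for this professor
    homepage = getHomePage(name, domain)   # best-guess home page on that domain (may be None)
    years = PubSearch(name, domain)        # per-year publication mention counts
    return homepage, years

# homepage, years = profPublicationSummary("Andrew Ng")
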
def get(self):
    query = self.request.get('q')
    if query == '':
        query = "nikah"
    page = self.request.get('page')
    if page == '':
        page = 1
    rows = list()
    prev = None
    next = None
    if query != '':
        try:
            data = ysearch.search(query, count=self.count, start=(int(page) - 1) * 10, more={'sites': 'kajian.net,ilmoe.com,radiorodja.com,radiomuslim.com,salafiyunpad.wordpress.com,problemamuslim.wordpress.com'})
            results = db.create(data=data)
            rows = results.rows
            try:
                prev = data['ysearchresponse']['prevpage']
                prev = int(page) - 1
            except KeyError:
                pass
            try:
                next = data['ysearchresponse']['nextpage']
                next = int(page) + 1
            except KeyError:
                pass
        except DownloadError:
            pass
    template_values = {
        'rows': rows,
        'query': query,
        'prev': prev,
        'next': next
    }
    path = os.path.join(os.path.dirname(__file__), 'index.html')
    self.response.out.write(template.render(path, template_values))

def get(self):
    q = self.request.get("q")
    m = self.request.get("m")
    if q:
        start = self.request.get("p")
        query = q
        if m:
            query = m
        if start:
            result = search(query, count=10, start=int(start))
            images = search(query, vertical="images", count=1, start=int(start), filter="yes")
        else:
            result = search(query, count=10)
            images = search(query, vertical="images", count=1, filter="yes")
        resultset_glue = glue(q)
        ysr = result['ysearchresponse']
        if ysr.has_key('resultset_web'):
            results = ysr['resultset_web']
            template_values = {
                'query': q,
                'totalhits': int(ysr['totalhits']) + int(ysr['deephits']),
                'results': results,
                'stats': memcache.get_stats()
            }
            if images:
                image_response = images['ysearchresponse']
                if int(image_response['count']) > 0:
                    template_values['image'] = image_response['resultset_images'][0]
            if resultset_glue:
                categories = []
                if resultset_glue.has_key('glue') and resultset_glue['glue'].has_key('navbar'):
                    navbars = resultset_glue['glue']['navbar']
                    if navbars:
                        for navbar in navbars:
                            if isinstance(navbar, DictType):
                                if navbar.has_key('navEntry'):
                                    if navbar['type'] == 'disambiguation':
                                        navEntries = navbar['navEntry']
                                        if isinstance(navEntries, DictType):
                                            categories.append(navEntries)
                                        else:
                                            for navEntry in navEntries:
                                                categories.append(navEntry)
                template_values['categories'] = categories
            if m:
                template_values['category'] = m.replace(" ", "%20")
            if start and int(start) != 0:
                template_values['start'] = start
                template_values['prev'] = int(start) - 10
                template_values['next'] = int(start) + 10
            else:
                template_values['next'] = 10
            path = os.path.join(os.path.dirname(__file__), "search.html")
            self.response.out.write(template.render(path, template_values))
        else:
            template_values = {
                'query': q,
            }
            path = os.path.join(os.path.dirname(__file__), "empty.html")
            self.response.out.write(template.render(path, template_values))
    else:
        self.redirect("/")

def wwwsearch(q):
    # ans = memcache.get(q)
    # if ans:
    #     return ans
    q = q.replace("'", "")
    query = db.GqlQuery("SELECT * FROM QuestionAnswers WHERE question = '" + q + "'")
    result = query.get()
    if result:
        return result.answer
    qm = QueryMiner(q)
    def phrases_udf(r):
        r.update({"phrases": qm.extract(r)})
        return r
    pc = defaultdict(lambda: 1)
    def pc_udf(r):
        for p in r["phrases"]:
            pc[p] += 1
    w = yql.db.select(udf=phrases_udf, data=ysearch.search(q, count=50))
    yql.db.select(udf=pc_udf, table=w)
    if len(pc) <= 0:
        return "Not Found"
    items = sorted(pc.iteritems(), key=itemgetter(1), reverse=True)
    if len(pc) > 0:
        ans = str((sorted(pc.iteritems(), key=itemgetter(1), reverse=True)[0][0]).encode('latin-1', 'ignore'))
    else:
        ans = "Not found"
    # memcache.add(key=q, value=ans)
    answer = QuestionAnswers(question=q, answer=ans)
    answer.put()
    return ans
    ##################### DONE ################
    # NOTE: everything below the return above is unreachable (an older ranking experiment).
    index = -1
    topresults = []
    for item in items:
        index = index + 1
        inner = -1
        if index > 1:
            break
        count = item[1]
        base = str((item[0]).encode('latin-1', 'ignore'))
        for phr in items:
            inner = inner + 1
            if inner != index:
                text = str((phr[0]).encode('latin-1', 'ignore'))
                # print "text:" + text + " base: " + base
                if text.rfind(base) > -1:
                    # print "Found" + base
                    count = count + phr[1]
        topresults.append(count)
    # print topresults
    indexc = 0
    max = 0
    maxi = 0
    for result in topresults:
        if result > max:
            max = result
            maxi = indexc
            # print str(result) + " " + str(index)
        indexc = indexc + 1
    # print "yes"
    # print str(items[maxi][0]) + " oldcount: " + str(items[maxi][1]) + "newcount: " + str(max) + "index" + str(maxi)
    indexv = 0
    max1 = 0
    maxi1 = 0
    for result in topresults:
        if indexv != maxi:
            if result > max1:
                max1 = result
                maxi1 = indexv
        indexv = indexv + 1
    # print "yes"
    # print str(items[maxi1][0]) + " oldcount: " + str(items[maxi1][1]) + "newcount: " + str(max1) + "index" + str(maxi1)
    ans = str((items[maxi][0]).encode('latin-1', 'ignore'))  # + ", " + str(repr(items[maxi1][0]))
    # if len(pc) > 0:
    #     ans = str(sorted(pc.iteritems(), key=itemgetter(1), reverse=True)[0][0])
    # else:
    #     ans = "Not found"
    # memcache.add(key=q, value=ans)
    answer = Answers(question=q, answer=ans)
    answer.put()
    return ans

def imgsearch(q, start=0, ques=""):
    image_results = []
    image_results1 = []
    image_results2 = []
    count = 0
    count1 = 0
    count2 = 0
    if ques != "":
        q2 = text.mynorm(ques)
        if q2:
            ques = q2
        images2 = ysearch.search(ques, vertical="images", count=18, start=(int(start) / 3))
        if images2:
            image_response2 = images2['bossresponse']
            count2 = int(image_response2['images']['totalresults'])  # + int(image_response['deephits'])
            if count2 > 0:
                image_results2 = image_response2['images']['results']
        images1 = ysearch.search(q, vertical="images", count=18, start=(int(start) * 2 / 3))
        if images1:
            image_response1 = images1['bossresponse']
            count1 = int(image_response1['images']['totalresults'])  # + int(image_response['deephits'])
            if count1 > 0:
                image_results1 = image_response1['images']['results']
        count = count1 + count2
        c1 = len(image_results1)
        c2 = len(image_results2)
        if c1 >= 12 and c2 >= 6:
            # both have many results
            image_results = image_results2[:6] + image_results1[:12]
        else:
            if c1 >= 12 and c2 > 0:
                # more from the first query
                newc1 = 18 - c2
                image_results = image_results1[:newc1] + image_results2[:c2]
            elif c2 >= 6 and c1 > 0:
                # more from the second query
                newc2 = 18 - c1
                image_results = image_results1[:c1] + image_results2[:newc2]
            elif c1 <= 12 and c2 <= 6 and c1 > 0 and c2 > 0:
                # both have few results
                image_results = image_results2[:6] + image_results1[:12]
            elif c1 > 0:
                newstart = int(start) - count2
                if int(newstart) < 0:
                    newstart = 0
                images1 = ysearch.search(q, vertical="images", count=18, start=int(newstart))
                if images1:
                    image_response1 = images1['bossresponse']
                    count = int(image_response1['images']['totalresults'])  # + int(image_response['deephits'])
                    if count > 0:
                        image_results = image_response1['images']['results']
            elif c2 > 0:
                newstart = int(start) - count1
                if int(newstart) < 0:
                    newstart = 0
                images1 = ysearch.search(ques, vertical="images", count=18, start=int(newstart))
                if images1:
                    image_response1 = images1['bossresponse']
                    count = int(image_response1['images']['totalresults'])  # + int(image_response['deephits'])
                    if count > 0:
                        image_results = image_response1['images']['results']
    else:
        if q != "":
            q2 = text.mynorm(q)
            if q2:
                q = q2
            images1 = ysearch.search(q, vertical="images", count=18, start=int(start))
            if images1:
                image_response1 = images1['bossresponse']
                count = int(image_response1['images']['totalresults'])  # + int(image_response['deephits'])
                if count > 0:
                    image_results = image_response1['images']['results']
    random.seed(2)
    random.shuffle(image_results)
    if image_results:
        # image_results = image_response1['resultset_images'] + image_response2['resultset_images']
        image = "<table width=\"1000\" border=0 ><tr><font size='2'>"
        if int(count) > 0:
            i = 0
            size = 0
            for images in image_results:
                if i > 0:
                    if i % 6 == 0:
                        image = image + "</tr><tr>"
                i += 1
                intSize = 0
                try:
                    intSize = float(images['size'])
                except ValueError:
                    size = images['size']
                if intSize > 1024:
                    intSize = intSize / 1024
                    size = str(int(intSize)) + "K"
                elif intSize > 0:
                    size = str(int(intSize)) + "B"
                name = images['title']
                if len(name) > 17:
                    name = name[:15] + "..."
                domain = images['refererurl']
                if len(domain) > 20:
                    domain = domain[:18] + "..."
                image = image + "<td><table><tr><td height='160px' style='vertical-align:bottom;'><a href='" + images['refererclickurl'] + "'><img title='" + images['refererurl'] + "' src='" + images['thumbnailurl'] + "' style='max-height:150px;max-width:150px;'></a></td></tr><tr><td><center><small>" + name + "</small></center></td></tr><tr><td><center><font color='#444444'><small>" + images['width'] + " X " + images['height'] + " | " + size + "</font></small></center></td></tr><tr><td><center><small><font color='#003399'>" + domain + "</font></small></center></td></tr></table></td> "
            image = image + "</font></tr></table>"
            return image, count
        else:
            return "<br>No results found", 0
    return "<br>No results found", 0

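# The thumbnail size formatting above (numeric sizes get a "K"/"B" suffix, non-numeric values
# are passed through) in isolation, as a small helper sketch (the helper name is ours):
def format_size(raw):
    try:
        n = float(raw)
    except ValueError:
        return raw  # fall back to the raw value when it is not numeric
    if n > 1024:
        return str(int(n / 1024)) + "K"
    return str(int(n)) + "B"
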
# See accompanying LICENSE file or http://www.opensource.org/licenses/BSD-3-Clause for the specific language governing permissions and limitations under the License.
"""
Search Yahoo News and Twitter for "facebook".
Combine the results with Techmeme feeds based on titles having at least a two-term overlap.
Print the results to stdout.
"""
__author__ = "BOSS Team"

from util import console, text
from yos.yql import db, udfs
from yos.boss import ysearch

gn = db.create(name="gn", data=ysearch.search("facebook", bucket="news", count=40))
gn.rename("headline", "title")

sm = db.create(name="sm", url="http://search.twitter.com/search.json?q=facebook&rpp=40")
sm.rename("text", "title")

tm = db.select(name="tm", udf=udfs.unnest_value, url="http://techmeme.com/firehose.xml")

def overlap(r1, r2):
    return text.overlap(r1["title"], r2["title"]) > 1

j = db.join(overlap, [gn, sm, tm])
j = db.sort(key="sm$id", table=j)

for r in j.rows:
    console.write("\n%s\n[yahoo] %s\n[twitter] %s\n[techmeme] %s\n" % (r["sm$created_at"], r["gn$title"], r["sm$title"], r["tm$title"]))

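# text.overlap is assumed to return the number of terms two strings have in common; an
# equivalent plain-Python version of the join predicate above, for reference:
def share_two_terms(a, b):
    return len(set(a.lower().split()) & set(b.lower().split())) >= 2
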
""" Search 'iphone' on yahoo news and sort by date Get the wikipedia edits for the iphone page Rank the news results based on their title/text overlap with the wikipedia entries Sort by the overlap sizes This could potentially be a new freshness model, based on the idea that wikipedia is updated for recent significance """ __author__ = "BOSS Team" from util import console, text from yos.boss import ysearch from yos.yql import db yn = db.create(name="yn", data=ysearch.search("iphone sdk", bucket="news", count=50)) wiki = db.create(name="wiki", url="http://en.wikipedia.org/w/index.php?title=IPhone_OS&feed=atom&action=history") tb = db.cross([yn, wiki]) def rankf(row): row.update( {"rank": text.overlap(row["yn$abstract"], row["wiki$summary"]["value"])} ) ; return row tb = db.select(udf=rankf, table=tb) tb = db.group(by=["yn$title"], key="rank", reducer=lambda d1,d2: d1+d2, as="total", table=tb, norm=text.norm) tb = db.sort(key="total", table=tb) print "Before\n" for r in yn.rows: console.write( "[news] %s\n" % r["yn$title"] )
def get(self, topic="", subject=""):
    # Use the query string parameters if present.
    topic = self.request.get("topic", topic)
    formatter = Url.UrlFormat()
    topic = formatter.removeXSS(topic)
    if topic == "" or topic is None:
        topic = "Paul Kinlan"
    topic = urllib.unquote_plus(topic)
    subject = self.request.get("subject", subject)
    if subject == "" or subject is None:
        subject = topic
    subject = urllib.unquote_plus(subject)
    topicSearch = Model.TopicSearch.gql("WHERE topic = :1", topic.lower()).get()
    if topicSearch is None:
        topicSearch = Model.TopicSearch(topic=topic.lower())
    topicSearch.topicCount = topicSearch.topicCount + 1
    topicSearch.put()
    subjectSearch = Model.SubjectSearch.gql("WHERE subject = :1", subject.lower()).get()
    if subjectSearch is None:
        subjectSearch = Model.SubjectSearch(subject=subject.lower())
    if subjectSearch.sitesData is None:
        tn = db.create(name="tn", data=ysearch.search(subject, vertical="web", count=10, more={"filter": "-p**n", "type": "html"}))
        subjectSearch.sitesData = tn.dumps()
    else:
        tn = db.create(name="tn", data=db.simplejson.loads(subjectSearch.sitesData))
    subjectSearch.subjectCount = subjectSearch.subjectCount + 1
    subjectSearch.put()
    subjectTopicSearch = Model.TopicSubjectSearch.gql("WHERE topic = :1 AND subject = :2", topicSearch, subjectSearch).get()
    if subjectTopicSearch is None:
        subjectTopicSearch = Model.TopicSubjectSearch(topic=topicSearch, subject=subjectSearch)
    subjectTopicSearch.count = subjectTopicSearch.count + 1
    subjectTopicSearch.put()
    results = {0: [], 1: []}
    urls = {}
    column = 0
    for row in tn.rows:
        url = row["tn$url"]
        match = re.match("http://([^/]*)", url)
        if match is not None:
            domain = match.group(1)
            title = un_unicode_string(row["tn$title"])
            abstract = un_unicode_string(row["tn$abstract"])
            result = Result(title, abstract, domain, url)
            urls[domain] = result
    for result in urls:
        results[column % 2].append(urls[result])
        column = column + 1
    path = os.path.join(os.path.dirname(__file__), 'results.tmpl')
    self.response.out.write(template.render(path, {'decoded_query': topic, "decoded_subject": subject, 'type': topicSearch.type, 'query': topic, 'subject': subject, 'urls1': results[0], 'urls2': results[1]}))

for m in ["jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sept", "oct", "nov", "dec"]: if s.startswith(m): return m def parse_month(s): months = filter(lambda m: m is not None, map(month_lookup, text.uniques(s))) if len(months) > 0: return text.norm(months[0]) def parse_year(s): years = filter(lambda t: len(t) == 4 and t.startswith("19") or t.startswith("200"), text.uniques(s)) if len(years) > 0: return text.norm(years[0]) def date_udf(r): return {"year": parse_year(r["abstract"]), "month": parse_month(r["abstract"]), "count": 1} # since max fetch size in v1 is 50, let's do two calls and increment start to get the first 100 results i1 = db.select(name="i1", udf=date_udf, data=ysearch.search("when was jfk assasinated", count=50)) i2 = db.select(name="i2", udf=date_udf, data=ysearch.search("when was jfk assasinated", start=50, count=50)) iraq = db.union(name="iraq", tables=[i1, i2]) dates = db.group(by=["iraq$year", "iraq$month"], key="iraq$count", reducer=lambda d1,d2: d1+d2, as="total", table=iraq) dates = db.sort(key="total", table=dates) for row in dates.rows: month = row["iraq$month"] year = row["iraq$year"] if month is not None and year is not None: console.write( "Month: %s\tYear: %s\tTotal: %d\n" % (month, year, row["total"]) )
def yahoo_search(vars):
    from yos.boss import ysearch
    from yos.yql import db
    data = ysearch.search(vars['query'], count=10)
    table = db.create(data=data)
    return table.rows

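# Usage sketch for yahoo_search (the query value and the 'title'/'url' field names are
# assumptions; other snippets in this collection read named tables via prefixed keys
# such as "tn$url"):
for row in yahoo_search({'query': 'yahoo boss api'}):
    print row.get('title'), row.get('url')
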