def queryYahoo(self, query):
    """Yield the URL of every Yahoo web-search hit for *query*.

    The search is created with ``config.yahoo_appid`` and asks for
    ``self.count`` results.  This is a generator, so nothing is imported or
    requested until the first item is pulled from it.
    """
    # Imported lazily so the yahoo package is only needed when a search runs.
    from yahoo.search.web import WebSearch

    search = WebSearch(config.yahoo_appid, query=query, results=self.count)
    for hit in search.parse_results(search.get_results()):
        yield hit.Url
class WebCorpus(object):
    '''
    Builds a corpus from Yahoo snippets given several keywords.

    A single ``WebSearch`` object is reused for every query, and
    ``total_counts`` records how many searches were issued through it.
    '''

    def __init__(self):
        '''
        Create the underlying ``WebSearch`` and reset the search counter.
        '''
        self.s = WebSearch(YAHOO_API)
        # Number of searches completed through search() so far.
        self.total_counts = 0

    def search(self):
        '''
        Run the currently configured search and return its parsed results.

        ``total_counts`` is incremented only after ``get_results()`` has
        returned, so a search that raises is not counted.
        '''
        dom = self.s.get_results()
        self.total_counts += 1
        return self.s.parse_results(dom)

    def get_results(self, query, start=START, pages=PAGES):
        '''
        Configure the search object and run it.

        query -- assigned to the search object's ``query`` attribute
        start -- assigned to the search object's ``start`` attribute
        pages -- assigned to the search object's ``results`` attribute

        Returns whatever ``search()`` returns.
        '''
        self.s.query = query
        self.s.start = start
        self.s.results = pages
        return self.search()

    def get_count(self, results):
        '''
        Return ``results.total_results_available``.
        '''
        return results.total_results_available

    def get_snippets(self, results):
        '''
        Return one ``(MimeType, Url, Summary)`` tuple per entry in *results*.
        '''
        return [(res['MimeType'], res.Url, res['Summary']) for res in results]

    def get_summaries(self, query, start=START, pages=PAGES):
        '''
        Search for *query* and return the list of result summaries.

        The summaries are the third field of the tuples produced by
        ``get_snippets``; parameters are forwarded to ``get_results``.
        '''
        results = self.get_results(query, start=start, pages=pages)
        # get_snippets is a method, so it must be reached through self; the
        # summary is the third element of each individual snippet tuple.
        return [snippet[2] for snippet in self.get_snippets(results)]
# NOTE(review): the leading `return n` below appears to be the tail of a
# definition that starts outside this view; it is left exactly as found.
#
# printSortedDict(adict): sorts the keys of `adict` and, for each key, prints
# the key followed by the 'Title', 'Url' and 'Summary' items of its value and
# a " " separator line.
#
# Script part: configures a WebSearch (language='en') for the query "Radisson"
# with `results` set to 50, parses what get_results() returns, removes the
# literal 'Radisson' from each result's summary, scores the remainder with
# ncd_probe(xbytes, cx, ...) and stores the result in `ranked` under the key
# 'NCD: ' + str(distance) before handing `ranked` to printSortedDict.
# NOTE(review): the keys are strings, so the sort order is lexicographic, and
# two results with the same distance string share one key (the later one
# wins) -- confirm that this is acceptable.
# NOTE(review): ncd_probe, xbytes and cx are not defined in this view.
return n def printSortedDict(adict): keys = adict.keys() keys.sort() for k in keys: print k print adict[k]['Title'] print adict[k]['Url'] print adict[k]['Summary'] print " " app_id = "NCD-Probe-Demo" srch = WebSearch(app_id, language='en') srch.query = "Radisson" srch.results = 50 dom = srch.get_results() results = srch.parse_results(dom) ranked = {} for res in results: # strip out search word from summary summary = str(res['Summary']) stripped_summary = summary.replace('Radisson', '') distance = ncd_probe(xbytes, cx, stripped_summary) dstr = 'NCD: ' + str(distance) ranked[dstr] = res printSortedDict(ranked)
def printSortedDict(adict):
    """Print every entry of *adict* in ascending key order.

    For each key the key itself is printed, followed by the 'Title', 'Url'
    and 'Summary' items of the associated value and a " " separator line.
    """
    for key in sorted(adict.keys()):
        entry = adict[key]
        # Single-argument print with parentheses behaves the same as the
        # print statement.
        print(key)
        print(entry['Title'])
        print(entry['Url'])
        print(entry['Summary'])
        print(" ")


# --- Demo script: rank search hits for "Radisson" by their NCD score. ---
app_id = "NCD-Probe-Demo"
srch = WebSearch(app_id, language='en')
srch.query = "Radisson"
srch.results = 50
dom = srch.get_results()
results = srch.parse_results(dom)

# Maps 'NCD: <distance>' to the search result that produced that distance.
# NOTE(review): keys are strings, so printSortedDict orders them
# lexicographically, and results with an identical distance string overwrite
# each other -- confirm this is intended.
ranked = {}
for res in results:
    # strip out search word from summary
    summary = str(res['Summary'])
    stripped_summary = summary.replace('Radisson', '')
    # ncd_probe, xbytes and cx come from elsewhere in the file.
    distance = ncd_probe(xbytes, cx, stripped_summary)
    dstr = 'NCD: ' + str(distance)
    ranked[dstr] = res

printSortedDict(ranked)