def CheckPopularity(ip): # DUH cache_path = os.path.join(CACHE_DIR, ip) + '.pickle.pickle' if os.path.exists(cache_path): f = open(cache_path) results = pickle.load(f) else: print "miss: %s" % ip try: query = '"%s" %s' % (ip, QUERY_MODIFIERS) srch = WebSearch(APP_ID, query=query, results=50) results = srch.parse_results() pf = open(cache_path, 'w') pickle.dump(results.results, pf) pf.close() except yahoo.search.SearchError: print "%s failed" % (ip) return [] use_results = [] for result in results: reject = False for regexp in BANNED_URL_KEYWORDS: if re.search(regexp, result['Url'], re.I): reject = True if not reject: use_results.append(result) return use_results
def CheckPopularity(ip): # DUH cache_path = os.path.join(CACHE_DIR, ip) + '.pickle.pickle' if os.path.exists(cache_path): f = open(cache_path) results = pickle.load(f) else: print "miss: %s" % ip try: query = '"%s" %s' % (ip, QUERY_MODIFIERS) srch = WebSearch(APP_ID, query=query, results=50) results = srch.parse_results() pf = open(cache_path, 'w') pickle.dump(results.results, pf) pf.close() except yahoo.search.SearchError: print "%s failed" % (ip) return [] use_results = [] for result in results: reject = False for regexp in BANNED_URL_KEYWORDS: if re.search(regexp, result['Url'], re.I): reject = True if not reject: use_results.append(result) return use_results
def queryYahoo(self, query):
    """Yield the URL of each Yahoo web-search hit for *query*.

    At most self.count results are requested from the API.
    """
    from yahoo.search.web import WebSearch
    search = WebSearch(config.yahoo_appid, query=query, results=self.count)
    response = search.get_results()
    for hit in search.parse_results(response):
        yield hit.Url
def queryYahoo(self, query):
    """Generator over the result URLs Yahoo returns for *query*
    (up to self.count of them)."""
    from yahoo.search.web import WebSearch
    engine = WebSearch(config.yahoo_appid, query=query, results=self.count)
    parsed = engine.parse_results(engine.get_results())
    for entry in parsed:
        result_url = entry.Url
        yield result_url
def CheckPopularity(ip): cache_path = os.path.join(CACHE_DIR, ip) + '.pickle' if os.path.exists(cache_path): f = open(cache_path) return pickle.load(f) else: try: query = '"%s" %s' % (ip, QUERY_MODIFIERS) srch = WebSearch(APP_ID, query=query, results=50) results = srch.parse_results() pf = open(cache_path + '.pickle', 'w') pickle.dump(results.results, pf) pf.close() return results except yahoo.search.SearchError: print "%s failed" % (ip) return []
def CheckPopularity(ip): cache_path = os.path.join(CACHE_DIR, ip) + '.pickle' if os.path.exists(cache_path): f = open(cache_path) return pickle.load(f) else: try: query = '"%s" %s' % (ip, QUERY_MODIFIERS) srch = WebSearch(APP_ID, query=query, results=50) results = srch.parse_results() pf = open(cache_path + '.pickle', 'w') pickle.dump(results.results, pf) pf.close() return results except yahoo.search.SearchError: print "%s failed" % (ip) return []
class WebCorpus(object):
    '''Builds a corpus from Yahoo snippets given several keywords.'''

    def __init__(self):
        '''Create the underlying Yahoo WebSearch client.'''
        self.s = WebSearch(YAHOO_API)
        # number of queries issued so far
        self.total_counts = 0

    def search(self):
        '''Run the currently configured query and return its parsed results.'''
        dom = self.s.get_results()
        self.total_counts = self.total_counts + 1
        return self.s.parse_results(dom)

    def get_results(self, query, start=START, pages=PAGES):
        '''Configure the client for *query* and return the parsed results.'''
        self.s.query = query
        self.s.start = start
        self.s.results = pages
        return self.search()

    def get_count(self, results):
        '''Total number of hits Yahoo reports for a result set.'''
        return results.total_results_available

    def get_snippets(self, results):
        '''Return a (mime_type, url, summary) tuple for each result.'''
        snippets = [(res['MimeType'], res.Url, res['Summary'])
                    for res in results]
        return snippets

    def get_summaries(self, query, start=START, pages=PAGES):
        '''Return just the summary strings of the results for *query*.'''
        results = self.get_results(query, start=start, pages=pages)
        # BUG FIX: was a bare get_snippets(...) call (NameError — missing
        # self.) and indexed snippets[2] instead of snippet[2], which would
        # have repeated a single element len(snippets) times.
        snippets = self.get_snippets(results)
        summaries = [snippet[2] for snippet in snippets]
        return summaries
def main(): if (len(sys.argv) != 4): usage() sys.exit(2) srch = WebSearch(app_id=sys.argv[1]) srch.query = sys.argv[2] # only the first 100 results are queryable srch.results = sys.argv[3] # Disable content filter to get all available results srch.adult_ok = 1 LinkIdx = 0 for res in srch.parse_results(): LinkIdx = LinkIdx + 1 print res.Url
# NOTE(review): this line is a whitespace-mangled chunk containing three
# units fused together:
#   1. the tail of an NCD (normalized compression distance) function whose
#      header is outside this view: compresses the concatenated bytes and
#      computes n = (len(cxy) - len(cy)) / len(cx) — TODO confirm against
#      the full definition;
#   2. printSortedDict(adict): debug helper that prints Title/Url/Summary
#      for each key in sorted order;
#   3. a demo script: searches Yahoo for "Radisson", strips the search word
#      from each summary, and ranks results by ncd_probe distance (keying
#      the dict on the 'NCD: <x>' string — collisions overwrite entries).
cxy = bz2.compress(xybytes) n = (len(cxy) - len(cy)) / float(len(cx)) return n def printSortedDict(adict): keys = adict.keys() keys.sort() for k in keys: print k print adict[k]['Title'] print adict[k]['Url'] print adict[k]['Summary'] print " " app_id = "NCD-Probe-Demo" srch = WebSearch(app_id, language='en') srch.query = "Radisson" srch.results = 50 dom = srch.get_results() results = srch.parse_results(dom) ranked = {} for res in results: # strip out search word from summary summary = str(res['Summary']) stripped_summary = summary.replace('Radisson', '') distance = ncd_probe(xbytes, cx, stripped_summary) dstr = 'NCD: ' + str(distance) ranked[dstr] = res
def __init__(self):
    '''Set up the Yahoo search client and reset the query counter.'''
    self.total_counts = 0
    self.s = WebSearch(YAHOO_API)
# NOTE(review): whitespace-mangled chunk; it begins with an orphan
# 'return n' belonging to a function whose body starts outside this view
# (presumably the NCD computation — TODO confirm), followed by
# printSortedDict (prints each entry's Title/Url/Summary in key order)
# and a demo script that searches Yahoo for "Radisson" and ranks the
# results by ncd_probe distance.  Keying 'ranked' on the 'NCD: <x>'
# string silently drops results with equal distances.
return n def printSortedDict(adict): keys = adict.keys() keys.sort() for k in keys: print k print adict[k]['Title'] print adict[k]['Url'] print adict[k]['Summary'] print " " app_id = "NCD-Probe-Demo" srch = WebSearch(app_id, language='en') srch.query = "Radisson" srch.results = 50 dom = srch.get_results() results = srch.parse_results(dom) ranked = {} for res in results: # strip out search word from summary summary = str(res['Summary']) stripped_summary = summary.replace('Radisson', '') distance = ncd_probe(xbytes, cx, stripped_summary) dstr = 'NCD: ' + str(distance) ranked[dstr] = res