def CheckPopularity(ip):
  # DUH
  cache_path = os.path.join(CACHE_DIR, ip) + '.pickle.pickle'
  if os.path.exists(cache_path):
    f = open(cache_path)
    results = pickle.load(f)
  else:
    print "miss: %s" % ip
    try:
      query = '"%s" %s' % (ip, QUERY_MODIFIERS)
      srch = WebSearch(APP_ID, query=query, results=50)
      results = srch.parse_results()
      pf = open(cache_path, 'w')
      pickle.dump(results.results, pf)
      pf.close()
    except yahoo.search.SearchError:
      print "%s failed" % (ip)
      return []

  use_results = []
  for result in results:
    reject = False
    for regexp in BANNED_URL_KEYWORDS:
      if re.search(regexp, result['Url'], re.I):
        reject = True
    if not reject:
      use_results.append(result)
  return use_results
def CheckPopularity(ip):
    # DUH
    cache_path = os.path.join(CACHE_DIR, ip) + '.pickle.pickle'
    if os.path.exists(cache_path):
        f = open(cache_path)
        results = pickle.load(f)
    else:
        print "miss: %s" % ip
        try:
            query = '"%s" %s' % (ip, QUERY_MODIFIERS)
            srch = WebSearch(APP_ID, query=query, results=50)
            results = srch.parse_results()
            pf = open(cache_path, 'w')
            pickle.dump(results.results, pf)
            pf.close()
        except yahoo.search.SearchError:
            print "%s failed" % (ip)
            return []

    use_results = []
    for result in results:
        reject = False
        for regexp in BANNED_URL_KEYWORDS:
            if re.search(regexp, result['Url'], re.I):
                reject = True
        if not reject:
            use_results.append(result)
    return use_results
# Esempio n. 3 (Italian: "Example no. 3") -- scraper artifact separating pasted snippets
# 0
 def queryYahoo(self, query):
     """Run a Yahoo web search for *query* and yield each result URL.

     The result count is taken from self.count; the app id from the
     module-level config object.
     """
     from yahoo.search.web import WebSearch
     search = WebSearch(config.yahoo_appid, query=query, results=self.count)
     parsed = search.parse_results(search.get_results())
     for hit in parsed:
         yield hit.Url
# Esempio n. 4 (Italian: "Example no. 4") -- scraper artifact separating pasted snippets
# 0
 def queryYahoo(self, query):
     # Generator: run a Yahoo web search for *query* (up to self.count
     # results, app id from config) and yield the URL of each hit.
     # NOTE(review): duplicate of an identical queryYahoo earlier in this
     # file -- likely a scraping artifact.
     from yahoo.search.web import WebSearch
     srch = WebSearch(config.yahoo_appid, query=query, results=self.count)
     dom = srch.get_results()
     results = srch.parse_results(dom)
     for res in results:
         url = res.Url
         yield url
def CheckPopularity(ip):
  cache_path = os.path.join(CACHE_DIR, ip) + '.pickle'
  if os.path.exists(cache_path):
    f = open(cache_path)
    return pickle.load(f)
  else:
    try:
      query = '"%s" %s' % (ip, QUERY_MODIFIERS)      
      srch = WebSearch(APP_ID, query=query, results=50)
      results = srch.parse_results()
      pf = open(cache_path + '.pickle', 'w')
      pickle.dump(results.results, pf)
      pf.close()
      return results
    except yahoo.search.SearchError:
      print "%s failed" % (ip)
      return []
def CheckPopularity(ip):
    cache_path = os.path.join(CACHE_DIR, ip) + '.pickle'
    if os.path.exists(cache_path):
        f = open(cache_path)
        return pickle.load(f)
    else:
        try:
            query = '"%s" %s' % (ip, QUERY_MODIFIERS)
            srch = WebSearch(APP_ID, query=query, results=50)
            results = srch.parse_results()
            pf = open(cache_path + '.pickle', 'w')
            pickle.dump(results.results, pf)
            pf.close()
            return results
        except yahoo.search.SearchError:
            print "%s failed" % (ip)
            return []
# Esempio n. 7 (Italian: "Example no. 7") -- scraper artifact separating pasted snippets
# 0
class WebCorpus(object):
    '''
    Builds a corpus from Yahoo snippets given several
    keywords.
    '''

    def __init__(self):
        '''Create the shared WebSearch client and reset the search counter.'''
        self.s = WebSearch(YAHOO_API)
        # Number of searches issued so far (incremented by search()).
        self.total_counts = 0

    def search(self):
        '''Execute the currently configured query; return parsed results.'''
        dom = self.s.get_results()
        self.total_counts = self.total_counts + 1
        return self.s.parse_results(dom)

    def get_results(self, query, start=START, pages=PAGES):
        '''Configure the client for *query* and run the search.'''
        self.s.query = query
        self.s.start = start
        self.s.results = pages
        return self.search()

    def get_count(self, results):
        '''Total number of results Yahoo reports as available.'''
        return results.total_results_available

    def get_snippets(self, results):
        '''Return (mime_type, url, summary) tuples, one per result.'''
        snippets = [(res['MimeType'], res.Url, res['Summary']) for res in results]
        return snippets

    def get_summaries(self, query, start=START, pages=PAGES):
        '''Run *query* and return just the summary text of each hit.

        BUG FIX: the original called get_snippets as a bare function
        (NameError: missing self.) and indexed the snippet *list*
        (snippets[2]) instead of each snippet tuple in the comprehension.
        '''
        results = self.get_results(query, start=start, pages=pages)
        snippets = self.get_snippets(results)
        summaries = [snippet[2] for snippet in snippets]
        return summaries
# Esempio n. 8 (Italian: "Example no. 8") -- scraper artifact separating pasted snippets
# 0
def main():
    """CLI entry point: argv = <app_id> <query> <num_results>.

    Runs a Yahoo web search (content filter disabled) and prints the URL
    of every result.
    """

    # Require exactly three user arguments besides the program name.
    if (len(sys.argv) != 4):
    	usage()
    	sys.exit(2)
    	
    srch = WebSearch(app_id=sys.argv[1])
    srch.query = sys.argv[2]
    # only the first 100 results are queryable
    srch.results = sys.argv[3]
    # Disable content filter to get all available results
    srch.adult_ok = 1
    LinkIdx = 0
	
    for res in srch.parse_results():
        LinkIdx = LinkIdx + 1
        print res.Url
    # NOTE(review): the three lines below reference names never defined in
    # this function (xybytes, cy, cx, bz2) and read like a fragment of an
    # NCD (compression-distance) helper pasted in by accident -- confirm
    # against the original source before relying on this function.
    cxy = bz2.compress(xybytes)
    n = (len(cxy) - len(cy)) / float(len(cx))
    return n

def printSortedDict(adict):
    keys = adict.keys()
    keys.sort()
    for k in keys:
        print k
        print adict[k]['Title']
        print adict[k]['Url']
        print adict[k]['Summary']
        print " "

# Demo: search Yahoo for "Radisson" and rank each result's summary by its
# NCD (normalized compression distance) against a reference text.
app_id = "NCD-Probe-Demo"
srch = WebSearch(app_id, language='en')
srch.query = "Radisson"
srch.results = 50

dom = srch.get_results()
results = srch.parse_results(dom)

# Map formatted distance label -> result record.
# NOTE(review): keying by the distance string means results with equal
# distances silently overwrite each other -- confirm this is intended.
ranked = {}
for res in results:
    # strip out search word from summary
    summary = str(res['Summary'])
    stripped_summary = summary.replace('Radisson', '')
    # ncd_probe, xbytes and cx are presumably defined elsewhere in the
    # original script -- not visible in this chunk.
    distance = ncd_probe(xbytes, cx, stripped_summary)
    dstr = 'NCD: ' + str(distance)
    ranked[dstr] = res
 
# Esempio n. 10 (Italian: "Example no. 10") -- scraper artifact separating pasted snippets
# 0
 def __init__(self):
     '''
     Constructor
     '''
     # Shared Yahoo search client and a counter of searches issued.
     self.s = WebSearch(YAHOO_API)
     self.total_counts = 0
     # NOTE(review): the 'return n' below is mis-indented relative to the
     # body above and references an undefined name -- it looks like a paste
     # artifact from another function; confirm against the original source.
    return n


def printSortedDict(adict):
    keys = adict.keys()
    keys.sort()
    for k in keys:
        print k
        print adict[k]['Title']
        print adict[k]['Url']
        print adict[k]['Summary']
        print " "


# Demo: search Yahoo for "Radisson" and rank each result's summary by its
# NCD (normalized compression distance) against a reference text.
# NOTE(review): duplicate of the identical demo script earlier in this file.
app_id = "NCD-Probe-Demo"
srch = WebSearch(app_id, language='en')
srch.query = "Radisson"
srch.results = 50

dom = srch.get_results()
results = srch.parse_results(dom)

# Map formatted distance label -> result record; equal distances collide.
ranked = {}
for res in results:
    # strip out search word from summary
    summary = str(res['Summary'])
    stripped_summary = summary.replace('Radisson', '')
    # ncd_probe, xbytes and cx are presumably defined elsewhere in the
    # original script -- not visible in this chunk.
    distance = ncd_probe(xbytes, cx, stripped_summary)
    dstr = 'NCD: ' + str(distance)
    ranked[dstr] = res