Esempio n. 1
0
 def queryYahoo(self, query):
     """Yield the URL of every Yahoo web-search result for ``query``.

     The search is built with ``config.yahoo_appid`` as the application id
     and ``self.count`` passed as the ``results`` argument.
     """
     # Function-local import: resolved when the generator first runs.
     from yahoo.search.web import WebSearch
     search = WebSearch(config.yahoo_appid, query=query, results=self.count)
     for hit in search.parse_results(search.get_results()):
         yield hit.Url
Esempio n. 2
0
 def queryYahoo(self, query):
     """Yield the URL of every Yahoo web-search result for ``query``.

     The search is built with ``config.yahoo_appid`` as the application id
     and ``self.count`` passed as the ``results`` argument.
     """
     # Function-local import: resolved when the generator first runs.
     from yahoo.search.web import WebSearch
     search = WebSearch(config.yahoo_appid, query=query, results=self.count)
     for hit in search.parse_results(search.get_results()):
         yield hit.Url
Esempio n. 3
0
class WebCorpus(object):
    '''
    Builds a corpus from Yahoo snippets given several
    keywords.

    Wraps a single ``WebSearch`` instance and counts how many searches
    have been issued through it.
    '''
    def __init__(self):
        '''
        Create the underlying ``WebSearch`` client from ``YAHOO_API`` and
        reset the search counter.
        '''
        self.s = WebSearch(YAHOO_API)
        # Number of searches completed through search().
        self.total_counts = 0

    def search(self):
        '''
        Run the currently configured search and return its parsed results.

        ``total_counts`` is incremented once the results were fetched.
        '''
        dom = self.s.get_results()
        self.total_counts += 1
        return self.s.parse_results(dom)

    def get_results(self, query, start=START, pages=PAGES):
        '''
        Configure the search and run it.

        query -- assigned to the search's ``query`` attribute
        start -- assigned to the search's ``start`` attribute
        pages -- assigned to the search's ``results`` attribute

        Returns whatever search() returns.
        '''
        self.s.query = query
        self.s.start = start
        self.s.results = pages
        return self.search()

    def get_count(self, results):
        '''
        Return the ``total_results_available`` attribute of ``results``.
        '''
        return results.total_results_available

    def get_snippets(self, results):
        '''
        Return a list of ``(MimeType, Url, Summary)`` tuples, one per
        result in ``results``.
        '''
        # All three fields use item access; the original mixed in
        # attribute access for Url only.
        return [(res['MimeType'], res['Url'], res['Summary'])
                for res in results]

    def get_summaries(self, query, start=START, pages=PAGES):
        '''
        Search for ``query`` and return only the summary text of each
        result, in result order.

        Takes the same arguments as get_results().
        '''
        results = self.get_results(query, start=start, pages=pages)
        # Must go through self: get_snippets is a method, not a global.
        snippets = self.get_snippets(results)
        # Pick the summary (third field) of each snippet tuple rather
        # than indexing into the list of snippets itself.
        return [snippet[2] for snippet in snippets]
    
    
        
        
        
    return n

def printSortedDict(adict):
    """Print the entries of ``adict`` in ascending key order.

    For each key, prints the key itself, then the ``'Title'``, ``'Url'``
    and ``'Summary'`` items of its value, then a line holding a single
    space as a separator.
    """
    # sorted() works on any mapping; dict.keys() only has .sort() on
    # Python 2, where it is a list.
    for k in sorted(adict):
        entry = adict[k]
        # A parenthesised single-argument print produces the same output
        # whether print is a statement (Python 2) or a function (Python 3).
        print(k)
        print(entry['Title'])
        print(entry['Url'])
        print(entry['Summary'])
        print(" ")

# Demo script: run a web search for "Radisson", score each result's summary
# with ncd_probe, and print the results ordered by their score label.
app_id = "NCD-Probe-Demo"
srch = WebSearch(app_id, language='en')
srch.query = "Radisson"
srch.results = 50

dom = srch.get_results()
results = srch.parse_results(dom)

# Maps the label 'NCD: <distance>' to the search result that produced it.
# NOTE(review): results with an identical distance share a label, so a later
# result overwrites an earlier one -- confirm this is acceptable.
ranked = {}
for res in results:
    # strip out search word from summary
    summary = str(res['Summary'])
    stripped_summary = summary.replace('Radisson', '')
    # xbytes and cx are defined outside this section; presumably the
    # reference data the summary is compared against -- verify.
    distance = ncd_probe(xbytes, cx, stripped_summary)
    dstr = 'NCD: ' + str(distance)
    ranked[dstr] = res
 
# The keys are strings, so printSortedDict orders them lexicographically
# by label text rather than by numeric distance.
printSortedDict(ranked)
def printSortedDict(adict):
    """Print the entries of ``adict`` in ascending key order.

    For each key, prints the key itself, then the ``'Title'``, ``'Url'``
    and ``'Summary'`` items of its value, then a line holding a single
    space as a separator.
    """
    # sorted() works on any mapping; dict.keys() only has .sort() on
    # Python 2, where it is a list.
    for k in sorted(adict):
        entry = adict[k]
        # A parenthesised single-argument print produces the same output
        # whether print is a statement (Python 2) or a function (Python 3).
        print(k)
        print(entry['Title'])
        print(entry['Url'])
        print(entry['Summary'])
        print(" ")


# Demo script: run a web search for "Radisson", score each result's summary
# with ncd_probe, and print the results ordered by their score label.
app_id = "NCD-Probe-Demo"
srch = WebSearch(app_id, language='en')
srch.query = "Radisson"
srch.results = 50

dom = srch.get_results()
results = srch.parse_results(dom)

# Maps the label 'NCD: <distance>' to the search result that produced it.
# NOTE(review): results with an identical distance share a label, so a later
# result overwrites an earlier one -- confirm this is acceptable.
ranked = {}
for res in results:
    # strip out search word from summary
    summary = str(res['Summary'])
    stripped_summary = summary.replace('Radisson', '')
    # xbytes and cx are defined outside this section; presumably the
    # reference data the summary is compared against -- verify.
    distance = ncd_probe(xbytes, cx, stripped_summary)
    dstr = 'NCD: ' + str(distance)
    ranked[dstr] = res

# The keys are strings, so printSortedDict orders them lexicographically
# by label text rather than by numeric distance.
printSortedDict(ranked)