def printResults(newRankList, uniqueList, googleResults, bingResults, googleURLs, bingURLs, ul, filename):
    """Build ranked result objects for each unique URL and write them to `filename`.

    newRankList -- list of struc objects; appended to in place, then sorted by rank.
    uniqueList  -- de-duplicated URLs; `ul` holds the rank for each (parallel lists).
    Google metadata is preferred when a URL appears in both engines' results.
    Output format per line: "<position> <title> <description>".
    """
    for i in range(0, len(uniqueList)):
        result = struc()
        result.url = uniqueList[i]
        result.rank = ul[i]
        if uniqueList[i] in googleURLs:
            ind = googleURLs.index(uniqueList[i])
            result.title = googleResults[ind].title
            result.description = googleResults[ind].description
        else:
            # Not found in Google, so it must have come from Bing.
            ind = bingURLs.index(uniqueList[i])
            result.title = bingResults[ind].title
            result.description = bingResults[ind].description
        newRankList.append(result)
    newRankList.sort(key=operator.attrgetter('rank'))
    # FIX: context manager closes the file even on error (original leaked the
    # handle) and avoids shadowing the builtin `file`.
    count = 0
    with open(filename, "w") as outfile:
        for result in newRankList:
            count = count + 1
            outfile.write(
                str(count) + ' ' + str(result.title) + ' ' +
                str(result.description) + '\n')
    return
def bing_scrape(query):
    """Scrape the first `numresults` Bing results for `query`.

    Writes "<rank> <title> <description>" lines to <query>_Bing.txt and
    returns a list of struc objects (rank/title/description/url).
    """
    bingResults = []
    # FIX: URL-encode the query so multi-word queries produce a valid URL
    # (google_scrape already does this with quote_plus; Bing/Yahoo did not).
    address = "https://www.bing.com/search?q=" + urllib.quote_plus(
        query) + "&count=" + str(numresults)
    request = Request(
        address, None, {
            'User-Agent':
            'Mosilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11'
        })
    urlfile = urlopen(request)
    try:
        page = urlfile.read()
    finally:
        # FIX: the original never closed the network handle.
        urlfile.close()
    soup = BeautifulSoup(page, 'html5lib')
    titles = []
    descriptions = []
    urls = []
    # Each organic result is an <li class="b_algo"> containing an <a>
    # (title + link) and a <p> (snippet); skip malformed entries.
    for header in soup.findAll('li', attrs={'class': 'b_algo'}):
        anchor = header.find('a')
        desc = header.find('p')
        if anchor is not None and desc is not None:
            urls.append(anchor['href'])
            titles.append(anchor.contents[0].encode("utf-8"))
            descriptions.append(desc.text.encode("utf-8"))
    filename = query + '_Bing.txt'
    # FIX: context manager so the output file is always closed
    # (original leaked the handle and shadowed the builtin `file`).
    with open(filename, "w") as outfile:
        for i in range(0, len(titles)):
            result = struc()
            result.rank = i + 1
            result.title = titles[i]
            result.description = descriptions[i]
            result.url = urls[i]
            outfile.write(
                str(i + 1) + ' ' + str(titles[i]) + ' ' +
                str(descriptions[i]) + '\n')
            bingResults.append(result)
    return bingResults
def yahoo_scrape(query):
    """Scrape the first `numresults` Yahoo results for `query`.

    Writes "<rank> <title> <description>" lines to <query>_Yahoo.txt and
    returns a list of struc objects (rank/title/description/url).
    """
    yahooResults = []
    # FIX: URL-encode the query so multi-word queries produce a valid URL
    # (matches what google_scrape already does with quote_plus).
    address = "https://search.yahoo.com/search?p=" + urllib.quote_plus(
        query) + "&n=" + str(numresults)
    request = urllib2.Request(
        address, None, {
            'User-Agent':
            'Mosilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11'
        })
    urlfile = urllib2.urlopen(request)
    try:
        page = urlfile.read()
    finally:
        # FIX: the original never closed the network handle.
        urlfile.close()
    soup = BeautifulSoup(page)
    titles = []
    descriptions = []
    urls = []
    # Titles/links and snippets live in separate element classes; the two
    # findAll passes are assumed to yield parallel lists — verify against
    # Yahoo's current markup.
    for header in soup.findAll('div', 'compTitle'):
        titles.append(header.a.text.encode('utf-8'))
        urls.append(header.a.get('href'))
    for desc in soup.findAll('div', 'compText aAbs'):
        descriptions.append(desc.text.encode('utf-8'))
    filename = query + '_Yahoo.txt'
    # FIX: context manager so the output file is always closed
    # (original leaked the handle and shadowed the builtin `file`).
    with open(filename, "w") as outfile:
        for i in range(0, len(titles)):
            result = struc()
            result.rank = i + 1
            result.title = titles[i]
            # FIX: guard against fewer snippets than titles (the original
            # raised IndexError when a result had no compText block).
            result.description = descriptions[i] if i < len(descriptions) else ''
            result.url = urls[i]
            outfile.write(
                str(i + 1) + ' ' + titles[i] + ' ' + result.description + '\n')
            yahooResults.append(result)
    return yahooResults
def aggregate(googleResults, yahooResults): l1 = [] l2 = [] uniques = struc() for each in googleResults: l1.append(each.url) for each in yahooResults: l2.append(each.url) uniqueList = union(l1, l2) # Common Results #print(intersect(l1,l2)) print str(len(l1)) + ' Google Results, ' + str( len(l2)) + ' Yahoo results, ' + str( len(uniqueList)) + ' Unique Results' googleURLs = [] yahooURLs = [] for result in googleResults: googleURLs.append(result.url) for result in yahooResults: yahooURLs.append(result.url) # Writing Aggregated Documents filename = 'UniqueDocuments.txt' file = open(filename, "w") count = 0 for link in uniqueList: if link in googleURLs: count = count + 1 i = googleURLs.index(link) file.write( str(count) + ' ' + googleResults[i].title + ' ' + googleResults[i].description + '\n') elif link in yahooURLs: count = count + 1 i = yahooURLs.index(link) file.write( str(count) + ' ' + yahooResults[i].title + ' ' + yahooResults[i].description + '\n') return uniqueList
def google_scrape(query):
    """Scrape roughly `numresults` Google results for `query`.

    Writes "<rank> <title> <description>" lines to <query>_Google.txt and
    returns a list of struc objects (rank/title/description/url).
    """
    googleResults = []
    # +2 extra results requested, presumably to compensate for entries the
    # parser drops — TODO confirm against current Google markup.
    address = "http://www.google.com/search?q=" + urllib.quote_plus(
        query) + "&num=" + str(numresults + 2) + "&hl=en&start=0"
    request = urllib2.Request(
        address, None, {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11'
        })
    urlfile = urllib2.urlopen(request)
    try:
        page = urlfile.read()
    finally:
        # FIX: the original never closed the network handle.
        urlfile.close()
    soup = BeautifulSoup(page)
    titles = []
    descriptions = []
    urls = []
    # Result titles/links and snippets come from two independent findAll
    # passes assumed to be parallel — verify against Google's current markup.
    for header in soup.findAll('div', 'rc'):
        titles.append(header.a.string.encode('utf-8'))
        urls.append(header.a.get('href'))
    for desc in soup.findAll('span', 'st'):
        descriptions.append(desc.text.encode('utf-8'))
    filename = query + '_Google.txt'
    # FIX: context manager so the output file is always closed
    # (original leaked the handle and shadowed the builtin `file`).
    with open(filename, "w") as outfile:
        for i in range(0, len(titles)):
            result = struc()
            result.rank = i + 1
            result.title = titles[i]
            # FIX: guard against fewer snippets than titles (the original
            # raised IndexError when a result had no <span class="st">).
            result.description = descriptions[i] if i < len(descriptions) else ''
            result.url = urls[i]
            outfile.write(
                str(i + 1) + ' ' + titles[i] + ' ' + result.description + '\n')
            googleResults.append(result)
    return googleResults
def yahoo_scrape(query):
    """Scrape the first `numresults` Yahoo results for `query`.

    Writes "<rank> <title> <description>" lines to <query>_Yahoo.txt and
    returns a list of struc objects (rank/title/description/url).

    NOTE(review): this is a duplicate definition of yahoo_scrape; an earlier
    copy exists in this file and the later definition wins at import time —
    consider deleting one.
    """
    yahooResults = []
    # FIX: URL-encode the query so multi-word queries produce a valid URL
    # (matches what google_scrape already does with quote_plus).
    address = "https://search.yahoo.com/search?p=" + urllib.quote_plus(
        query) + "&n=" + str(numresults)
    request = urllib2.Request(
        address, None, {
            'User-Agent':
            'Mosilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11'
        })
    urlfile = urllib2.urlopen(request)
    try:
        page = urlfile.read()
    finally:
        # FIX: the original never closed the network handle.
        urlfile.close()
    soup = BeautifulSoup(page)
    titles = []
    descriptions = []
    urls = []
    # Titles/links and snippets come from two independent findAll passes
    # assumed to be parallel — verify against Yahoo's current markup.
    for header in soup.findAll('div', 'compTitle'):
        titles.append(header.a.text.encode('utf-8'))
        urls.append(header.a.get('href'))
    for desc in soup.findAll('div', 'compText aAbs'):
        descriptions.append(desc.text.encode('utf-8'))
    filename = query + '_Yahoo.txt'
    # FIX: context manager so the output file is always closed
    # (original leaked the handle and shadowed the builtin `file`).
    with open(filename, "w") as outfile:
        for i in range(0, len(titles)):
            result = struc()
            result.rank = i + 1
            result.title = titles[i]
            # FIX: guard against fewer snippets than titles (the original
            # raised IndexError when a result had no compText block).
            result.description = descriptions[i] if i < len(descriptions) else ''
            result.url = urls[i]
            outfile.write(
                str(i + 1) + ' ' + titles[i] + ' ' + result.description + '\n')
            yahooResults.append(result)
    return yahooResults
def aggregate(googleResults,yahooResults): l1 = [] l2 = [] uniques = struc() for each in googleResults: l1.append(each.url) for each in yahooResults: l2.append(each.url) uniqueList = union(l1,l2) # Common Results #print(intersect(l1,l2)) print str(len(l1)) + ' Google Results, ' + str(len(l2)) + ' Yahoo results, ' + str(len(uniqueList)) + ' Unique Results' googleURLs = [] yahooURLs = [] for result in googleResults: googleURLs.append(result.url) for result in yahooResults: yahooURLs.append(result.url) # Writing Aggregated Documents filename = 'UniqueDocuments.txt' file = open(filename,"w") count = 0 for link in uniqueList: if link in googleURLs: count = count + 1 i = googleURLs.index(link) file.write(str(count) + ' ' + googleResults[i].title + ' ' + googleResults[i].description + '\n') elif link in yahooURLs: count = count + 1 i = yahooURLs.index(link) file.write(str(count) + ' ' + yahooResults[i].title + ' ' + yahooResults[i].description + '\n') return uniqueList
def printResults(newRankList, uniqueList, googleResults, yahooResults, googleURLs, yahooURLs, ul, filename):
    """Build ranked result objects for each unique URL and write them to `filename`.

    newRankList -- list of struc objects; appended to in place, then sorted by rank.
    uniqueList  -- de-duplicated URLs; `ul` holds the rank for each (parallel lists).
    Google metadata is preferred when a URL appears in both engines' results.
    Output format per line: "<position> <title> <description>".

    NOTE(review): a near-identical printResults (Bing variant) exists earlier
    in this file; the later definition wins at import time.
    """
    for i in range(0, len(uniqueList)):
        result = struc()
        result.url = uniqueList[i]
        result.rank = ul[i]
        if uniqueList[i] in googleURLs:
            ind = googleURLs.index(uniqueList[i])
            result.title = googleResults[ind].title
            result.description = googleResults[ind].description
        else:
            # Not found in Google, so it must have come from Yahoo.
            ind = yahooURLs.index(uniqueList[i])
            result.title = yahooResults[ind].title
            result.description = yahooResults[ind].description
        newRankList.append(result)
    newRankList.sort(key=operator.attrgetter('rank'))
    # FIX: context manager closes the file even on error (original leaked the
    # handle) and avoids shadowing the builtin `file`.
    count = 0
    with open(filename, "w") as outfile:
        for result in newRankList:
            count = count + 1
            outfile.write(
                str(count) + ' ' + result.title + ' ' + result.description +
                '\n')
    return
def google_scrape(query):
    """Scrape roughly `numresults` Google results for `query`.

    Writes "<rank> <title> <description>" lines to <query>_Google.txt and
    returns a list of struc objects (rank/title/description/url).

    NOTE(review): this is a duplicate definition of google_scrape; an earlier
    copy exists in this file and the later definition wins at import time —
    consider deleting one.
    """
    googleResults = []
    # +2 extra results requested, presumably to compensate for entries the
    # parser drops — TODO confirm against current Google markup.
    address = "http://www.google.com/search?q=" + urllib.quote_plus(
        query) + "&num=" + str(numresults + 2) + "&hl=en&start=0"
    request = urllib2.Request(
        address, None, {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11'
        })
    urlfile = urllib2.urlopen(request)
    try:
        page = urlfile.read()
    finally:
        # FIX: the original never closed the network handle.
        urlfile.close()
    soup = BeautifulSoup(page)
    titles = []
    descriptions = []
    urls = []
    # Result titles/links and snippets come from two independent findAll
    # passes assumed to be parallel — verify against Google's current markup.
    for header in soup.findAll('div', 'rc'):
        titles.append(header.a.string.encode('utf-8'))
        urls.append(header.a.get('href'))
    for desc in soup.findAll('span', 'st'):
        descriptions.append(desc.text.encode('utf-8'))
    filename = query + '_Google.txt'
    # FIX: context manager so the output file is always closed
    # (original leaked the handle and shadowed the builtin `file`).
    with open(filename, "w") as outfile:
        for i in range(0, len(titles)):
            result = struc()
            result.rank = i + 1
            result.title = titles[i]
            # FIX: guard against fewer snippets than titles (the original
            # raised IndexError when a result had no <span class="st">).
            result.description = descriptions[i] if i < len(descriptions) else ''
            result.url = urls[i]
            outfile.write(
                str(i + 1) + ' ' + titles[i] + ' ' + result.description + '\n')
            googleResults.append(result)
    return googleResults