import urllib

def search(keyword, max_results=10):
    # Quote the keyword so multi-word queries survive URL interpolation.
    base_url = "http://www.imdb.com/find?s=all&q=%s" % urllib.quote_plus(keyword)
    responses = helper.parallel_fetch([base_url])
    search_page = responses.values()[0]

    # IMDB groups hits into exact, popular and partial match sections; the
    # get_*_matches objects appear to be precompiled regexes over that page.
    exact_titles = get_exact_matches.search(search_page)
    popular_titles = get_popular_matches.search(search_page)
    partial_titles = get_partial_matches.search(search_page)

    titles = []
    if popular_titles:
        titles += [popular_titles.group(0)]
    if exact_titles:
        titles += [exact_titles.group(0)]
    if partial_titles:
        titles += [partial_titles.group(0)]
    titles = '\n'.join(titles).strip()
    if not titles:
        return []

    # Pull the first five title ids and fetch their user-comment pages.
    urls = ['http://www.imdb.com/title/%s/usercomments' % link
            for link in get_movies.findall(titles)[:5]]
    responses = helper.parallel_fetch(urls)
    results = []
    for url in responses:
        results += process_url(url, responses[url])
    return results
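# Every fragment here leans on helper.parallel_fetch, whose implementation is
# not shown. Judging by the call sites, the assumed contract is: take an
# iterable of URLs, fetch them (concurrently), and return a dict mapping each
# requested URL to its response body. A minimal serial sketch of that
# assumption -- the name and fallback behaviour are hypothetical, not the
# real helper:
def _parallel_fetch_sketch(urls, replace_redirects=False):
    import urllib
    responses = {}
    for url in urls:
        # urllib.urlopen already follows redirects, which is presumably what
        # the replace_redirects flag asks the real helper to do.
        responses[url] = urllib.urlopen(url).read()
    return responses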
import urllib

def search(keyword, max_results=5):
    base_url = ("http://www99.epinions.com/search/?search_string=%s"
                % urllib.quote_plus(keyword))
    responses = helper.parallel_fetch([base_url])

    # First hop: product pages for the top five hits on the results page.
    urls = set(['http://www99.epinions.com/reviews/%s' % link
                for link in get_movies.findall(responses.values()[0])[:5]])
    responses = helper.parallel_fetch(urls)

    # Second hop: every individual review linked from those product pages,
    # deduplicated before fetching.
    reviews = []
    for url in responses:
        reviews += ["http://www99.epinions.com/review/%s" % review
                    for review in get_review.findall(responses[url])]
    responses = helper.parallel_fetch(set(reviews))

    results = []
    for url in responses:
        results += process_url(url, responses[url])
    return results
import urllib
from BeautifulSoup import BeautifulSoup

def search(keyword):
    print 'searching amazon'
    url = ("http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps"
           "&field-keywords=" + urllib.quote_plus(keyword) + "&x=0&y=0")
    soup = BeautifulSoup(urllib.urlopen(url).read())
    # Top five product links from the result listing.
    urls = [x.a['href'] for x in soup.findAll('div', {"class": "productTitle"})][:5]
    responses = helper.parallel_fetch(urls)
    results = []
    for url in responses:
        results += parse_url(url, responses[url])
    print 'returning amazon'
    return results
import os
import multiprocessing

def fetch(self, requests, suffix):
    misses = 0
    active_processes = []
    googleRawResults = helper.parallel_fetch(requests.values())
    logger.info(len(self.sitelist))
    for site in self.sitelist:
        data = googleRawResults[requests[site]]
        urls = get_urls(data, site)
        logger.info(site)
        logger.info(len(urls))
        if not urls:
            logger.warning('GOOGLE NOT RETURNING ANYTHING')
            logger.warning(requests[site])
            # Don't cache a throttle (CAPTCHA) page or an empty response.
            if os.path.isfile('cache/%s.cache' % hash(requests[site])) and \
                    ('solving the above CAPTCHA' in data or not data):
                os.remove('cache/%s.cache' % hash(requests[site]))
                logger.error('Throttled: Cache cleared')
            # Count the miss so the caller can sleep, keeping bad searches
            # from DOSing Google.
            misses += 1
            continue
        # Positional Process args: group=None, target, name=site, args.
        t = multiprocessing.Process(
            None, parseBlogs.parseBlogs, site,
            (site, self.curlLock, urls,
             '%s/%s.%s' % (self.directory, site.replace('/', '_'), suffix),
             requests[site]))
        t.url = requests[site]
        t.start()
        active_processes += [t]
        # Cap concurrency: once max_processes are in flight, reap the oldest.
        if self.max_processes <= len(active_processes):
            t = active_processes.pop(0)
            t.join(460)
            if t.is_alive():
                t.terminate()
                logger.error('Terminated Process %s' % t.url)
    # Give stragglers ten minutes each, then kill whatever is left.
    for process in active_processes:
        process.join(600)
        process.terminate()
    return misses
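# The concurrency cap above reduces to one pattern: start a worker per site,
# and once self.max_processes are running, block on the oldest before
# starting another. A stripped-down, self-contained sketch of just that
# pattern (function and parameter names here are illustrative, not from the
# original code):
def _run_bounded(target, arg_list, max_processes=4, timeout=460):
    import multiprocessing
    active = []
    for args in arg_list:
        p = multiprocessing.Process(target=target, args=args)
        p.start()
        active.append(p)
        if len(active) >= max_processes:
            # Pool is full: wait on the oldest worker, kill it if it hangs.
            oldest = active.pop(0)
            oldest.join(timeout)
            if oldest.is_alive():
                oldest.terminate()
    for p in active:
        p.join(timeout)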
def search(keywords, max_results=10):
    """
    Searches Buzzillions for the given keywords. Note: Buzzillions seems to
    throttle aggressively when more than 3 links are opened at once.

    Parameters:
        keywords - A string with all the keywords to search for

    Output:
        A list of dictionaries, each with the following keys:
            title         - The title of this review
            title_section - The entire section containing the title of this review
            content       - The content of this review
            link          - The link that led to this review
    """
    base_url = ("http://www.buzzillions.com/x/s?N=4294811422&D=x&cat="
                "&extra=all-product&Ntt=%s" % keywords)
    responses = helper.parallel_fetch([base_url])
    # Top five review pages; strip any #fragment so duplicates collapse.
    urls = set([('http://www.buzzillions.com/reviews/%s' % link).split('#')[0]
                for link in get_movies.findall(responses.values()[0])[:5]])
    responses = helper.parallel_fetch(urls)
    results = []
    for url in responses:
        results += process_url(url, responses[url])
    return results
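# A hypothetical smoke test for the Buzzillions search above. The 'title',
# 'link' and 'content' keys are taken from the docstring, not from
# inspecting process_url itself:
if __name__ == '__main__':
    for review in search('noise cancelling headphones'):
        print review['title'], '->', review['link']
        print review['content'][:80]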
def search(keyword): base_url = "http://reviews.cnet.com/1770-5_7-0.html?query=%s&tag=srch" % (keyword) output = urllib.urlopen(base_url).read() soup = BeautifulSoup(output) urls = [ ("http://reviews.cnet.com" + x.find("a", {"class": "resultName"})["href"] + "?tag=contentMain;contentBody;1r") for x in soup.findAll("div", {"class": "resultInfo"}) ] urls = [x for x in urls if x.find("http://", 1) == -1] responses = helper.parallel_fetch(urls) results = [] for url in responses: results += [parse(url, responses[url])] return [x for x in results if x]
import urllib
import simplejson

def search(keyword):
    # Scope the Google AJAX Search API query to Wired's review section.
    query = urllib.urlencode(
        {'q': 'site:http://www.wired.com/reviews ' + keyword})
    url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&%s' % query
    search_results = urllib.urlopen(url)
    json = simplejson.loads(search_results.read())
    urls = [x['url'] for x in json['responseData']['results']]
    responses = helper.parallel_fetch(urls)
    results = []
    for url in responses:
        results += [parse(url, responses[url])]
    return [x for x in results if x]
def search(keyword): base_url = "http://reviews.cnet.com/1770-5_7-0.html?query=%s&tag=srch" % ( keyword) output = urllib.urlopen(base_url).read() soup = BeautifulSoup(output) urls = [("http://reviews.cnet.com" + x.find("a", {"class": "resultName"})['href'] + "?tag=contentMain;contentBody;1r") for x in soup.findAll("div", {"class": "resultInfo"})] urls = [x for x in urls if x.find("http://", 1) == -1] responses = helper.parallel_fetch(urls) results = [] for url in responses: results += [parse(url, responses[url])] return [x for x in results if x]
def parse_all(self, articles):
    data = helper.parallel_fetch(articles, replace_redirects=True)
    content = self.contentExtractor.parse_all(data)
    comments = self.commentExtractor.parse_all(data)
    results = {}
    for url in content:
        result_content = content[url]
        # Fall back to error placeholders when no comment page was found.
        comment_url = "Error Url Not Found"
        result_comments = [error_comment]
        try:
            comment_url = self.commentExtractor.url_next[url]
            result_comments = comments[url]
        except KeyError:
            pass
        results[url] = Result(result_content, comment_url, result_comments)
    return results
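# Result and error_comment come from elsewhere in the project; from the call
# above, Result only needs to carry (content, comment_url, comments), and
# error_comment is some placeholder comment value. Plausible stand-ins,
# assuming nothing beyond that:
import collections
Result = collections.namedtuple('Result', ['content', 'comment_url', 'comments'])
error_comment = 'Error: Comments Not Found'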
import time

def parse_all(self, url_site):
    self.url_site = url_site
    self.set_template_urls()
    self.url_data = {url: [] for url in url_site}
    self.found = set(url_site.keys())
    iteration = 1
    for iteration in xrange(1, self.max_iterations + 1):
        url_next = self.getNextUrls(iteration)
        if iteration == 1:
            self.url_next = url_next
        if not url_next:
            break
        if iteration > 1:
            # Back off between pagination rounds to avoid hammering the site.
            time.sleep(10)
        nexturl_site = helper.parallel_fetch(url_next.values(),
                                             replace_redirects=True)
        # Re-key the fetched pages by their original article URL.
        self.url_site = {url: nexturl_site[newurl]
                         for url, newurl in url_next.iteritems()}
        self.mapping = {url: newurl for url, newurl in url_next.iteritems()}
        self.found = set()
        self.process_urls()
        logger.info('iteration %s complete', iteration)
        logger.info('%s uncompleted', len(self.found))
    if self.max_iterations > 1 and iteration == self.max_iterations:
        logger.error('Iterated %s times, possible infinite loop', iteration)
    return self.url_data
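# getNextUrls is defined elsewhere; from the loop above, its assumed contract
# is a dict of {original_url: url_of_the_next_comment_page}, empty once every
# article's pagination is exhausted. A toy sketch of that assumption, using a
# hypothetical ?page=N URL scheme rather than the project's real template
# logic:
def _get_next_urls_sketch(base_urls, iteration, last_page=5):
    if iteration >= last_page:
        return {}  # nothing left to fetch; the caller breaks out of its loop
    return {url: '%s?page=%d' % (url, iteration + 1) for url in base_urls}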
import urlparse

def get_extension(url):
    # Treat anything after the last '.' in the path as the extension; if it
    # still contains a '/', the path had no real extension.
    parsed = urlparse.urlparse(url)
    ext = parsed.path.split('.')[-1]
    if '/' in ext:
        return 'None'
    return ext

## Filter urls
toFetch = [url for url in url_extractor.findall(raw_json)
           if get_extension(url) not in
           frozenset(['css', 'gif', 'ico', 'jpg', 'png', 'swf', 'woff', 'xml'])]

## Search interesting urls
with open('debug/%s.%s' % (parsed_url.netloc, i), 'a') as f:
    f.write('\n' + '\n'.join(toFetch))
results = helper.parallel_fetch(toFetch)
with open('text/%s.%s' % (parsed_url.netloc, i), 'w') as f:
    for url in results:
        f.write(url)
        f.write('\n')
        f.write(results[url])
        f.write('\n-------\n')
# Keep only responses that mention at least one of the known comment strings.
correct_urls = [url for url in results
                if sum([s in str(results[url]) for s in comments])]
with open('results/%s.%s' % (parsed_url.netloc, i), 'w') as f:
    # Yahoo was the only one returned with multiple urls,
    # but both urls had all the data soooo....
    if not correct_urls: