import threading
import socket
import random

import socks  # socksipy/PySocks-style module, used below to route sockets through a proxy

# Note: SearchEngineScrape, the parser classes (GoogleParser, YandexParser,
# BingParser, YahooParser, BaiduParser, DuckduckgoParser), Config, get_cached,
# cache_results, out and the module-level logger are expected to be provided
# by the enclosing package; they are referenced but not defined in this file.


class HttpScrape(SearchEngineScrape, threading.Timer):
    """Offers a fast way to query any search engine using raw HTTP requests.

    Overrides the run() method of the superclass threading.Timer.
    Each thread represents a crawl for one search engine SERP page.
    Inheriting from threading.Timer allows the deriving class to delay
    execution of the run() method.

    This is a base class; any supported search engine needs to subclass
    HttpScrape to implement this specific scrape type.

    Attributes:
        results: Returns the found results.
    """

    # Several different User-Agents to diversify the requests.
    # Keep the User-Agents updated. Last update: 13th November 2014
    # Get them here: http://techblog.willshouse.com/2012/01/03/most-common-user-agents/
    USER_AGENTS = [
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10) AppleWebKit/600.1.25 (KHTML, like Gecko) Version/8.0 Safari/600.1.25',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.104 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/600.1.17 (KHTML, like Gecko) Version/7.1 Safari/537.85.10',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:32.0) Gecko/20100101 Firefox/32.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.104 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.104 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.104 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 8_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B411 Safari/600.1.4',
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:33.0) Gecko/20100101 Firefox/33.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:32.0) Gecko/20100101 Firefox/32.0',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36'
    ]

    def __init__(self, *args, time_offset=0.0, **kwargs):
        """Initialize an HttpScrape object to scrape over blocking HTTP.

        HttpScrape inherits from SearchEngineScrape and from threading.Timer.
        """
        threading.Timer.__init__(self, time_offset, self.search)
        SearchEngineScrape.__init__(self, *args, **kwargs)

        # Bind the requests module to this instance such that each
        # instance may have its own proxy.
        self.requests = __import__('requests')

        # Initialize the GET parameters for the search request.
        self.search_params = {}

        # Initialize the HTTP headers of the search request to some base
        # values that Mozilla uses with requests. The Host and User-Agent
        # fields need to be set additionally.
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        }

        # the mode
        self.scrapemethod = 'http'

    def set_proxy(self):
        """Setup a socks connection for the socks module bound to this instance.

        The proxy to use is taken from self.proxy, a namedtuple with the
        fields proto, host and port.
        """
        def create_connection(address, timeout=None, source_address=None):
            # timeout and source_address are accepted for API compatibility
            # with socket.create_connection but are not used here.
            sock = socks.socksocket()
            sock.connect(address)
            return sock

        pmapping = {
            'socks4': 1,
            'socks5': 2,
            'http': 3
        }
        # Patch the socket module.
        # rdns is True by default. Never use rdns=False with TOR, otherwise you are screwed!
        socks.setdefaultproxy(pmapping.get(self.proxy.proto), self.proxy.host, int(self.proxy.port), rdns=True)
        socks.wrap_module(socket)
        socket.create_connection = create_connection

    def switch_proxy(self, proxy):
        super().switch_proxy(proxy)

    def handle_request_denied(self, status_code):
        super().handle_request_denied(status_code)
        raise Exception('Request not allowed')

    def build_search(self):
        """Build the headers and params for the search request for the search engine."""
        self.search_params = {}

        # Don't set the offset parameter explicitly if the default search (no offset) is correct.
        start_search_position = None if self.current_page == 1 else str(
            int(self.num_results_per_page) * int(self.current_page))

        if self.search_engine == 'google':
            self.parser = GoogleParser(searchtype=self.search_type)
            self.search_params['q'] = self.current_keyword
            self.search_params['num'] = str(self.num_results_per_page)
            self.search_params['start'] = start_search_position

            if self.search_type == 'image':
                self.search_params.update({
                    'oq': self.current_keyword,
                    'site': 'imghp',
                    'tbm': 'isch',
                    'source': 'hp',
                    # 'sa': 'X',
                    'biw': 1920,
                    'bih': 881
                })
            elif self.search_type == 'video':
                self.search_params.update({
                    'tbm': 'vid',
                    'source': 'lnms',
                    'sa': 'X',
                    'biw': 1920,
                    'bih': 881
                })
            elif self.search_type == 'news':
                self.search_params.update({
                    'tbm': 'nws',
                    'source': 'lnms',
                    'sa': 'X'
                })

        elif self.search_engine == 'yandex':
            self.parser = YandexParser(searchtype=self.search_type)
            self.search_params['text'] = self.current_keyword
            self.search_params['p'] = start_search_position

            if self.search_type == 'image':
                self.base_search_url = 'http://yandex.ru/images/search?'

        elif self.search_engine == 'bing':
            self.parser = BingParser(searchtype=self.search_type)
            self.search_params['q'] = self.current_keyword
            self.search_params['first'] = start_search_position

        elif self.search_engine == 'yahoo':
            self.parser = YahooParser(searchtype=self.search_type)
            self.search_params['p'] = self.current_keyword
            self.search_params['b'] = start_search_position
            self.search_params['ei'] = 'UTF-8'

        elif self.search_engine == 'baidu':
            self.parser = BaiduParser(searchtype=self.search_type)
            self.search_params['wd'] = self.current_keyword
            self.search_params['pn'] = start_search_position
            self.search_params['ie'] = 'utf-8'

        elif self.search_engine == 'duckduckgo':
            self.parser = DuckduckgoParser(searchtype=self.search_type)
            self.search_params['q'] = self.current_keyword

    def search(self, *args, rand=False, **kwargs):
        """The actual search for the search engine."""
        self.build_search()

        if rand:
            self.headers['User-Agent'] = random.choice(self.USER_AGENTS)

        html = get_cached(self.current_keyword, self.search_engine, 'http')

        if not html:
            try:
                if Config['GLOBAL'].getint('verbosity', 0) > 1:
                    logger.info('[HTTP] Base_url: {base_url}, headers={headers}, params={params}'.format(
                        base_url=self.base_search_url,
                        headers=self.headers,
                        params=self.search_params))

                request = self.requests.get(self.base_search_url,
                                            headers=self.headers,
                                            params=self.search_params,
                                            timeout=3.0)
            except self.requests.ConnectionError as ce:
                logger.error('Network problem occurred {}'.format(ce))
                raise ce
            except self.requests.Timeout as te:
                logger.error('Connection timeout {}'.format(te))
                raise te

            if not request.ok:
                logger.error('HTTP Error: {}'.format(request.status_code))
                self.handle_request_denied(request.status_code)
                return False

            html = request.text

            # cache fresh results
            with self.cache_lock:
                cache_results(html, self.current_keyword, self.search_engine, self.scrapemethod)

        self.parser.parse(html)
        self.store()
        out(str(self.parser), lvl=2)

    def run(self):
        args = []
        kwargs = {}
        kwargs['rand'] = False
        SearchEngineScrape.blocking_search(self, self.search, *args, **kwargs)
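
# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original module: it shows how the
# parameter dict assembled by build_search() for a Google query ends up in the
# final request URL once requests.get(..., params=...) serializes it. Only the
# standard library is used; the keyword, page numbers and the base URL are
# example assumptions (the real base_search_url is set by SearchEngineScrape).
# ---------------------------------------------------------------------------
def _example_serialized_google_url(keyword='example keyword',
                                   num_results_per_page=10,
                                   current_page=2):
    from urllib.parse import urlencode

    # Mirror build_search(): no offset parameter for the first page.
    start = None if current_page == 1 else str(num_results_per_page * current_page)
    params = {'q': keyword, 'num': str(num_results_per_page), 'start': start}

    # requests skips parameters whose value is None, so filter them the same way.
    query = urlencode({k: v for k, v in params.items() if v is not None})
    return 'http://www.google.com/search' + '?' + query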
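
# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original module: it restates the socket
# patching done in set_proxy() with a concrete proxy. The host and port below
# are placeholder assumptions; the named PROXY_TYPE_* constants of the
# socksipy-style socks module are assumed to match the integer codes used in
# pmapping above (socks4 -> 1, socks5 -> 2, http -> 3).
# ---------------------------------------------------------------------------
def _example_socks_patch():
    proxy_types = {
        'socks4': socks.PROXY_TYPE_SOCKS4,
        'socks5': socks.PROXY_TYPE_SOCKS5,
        'http': socks.PROXY_TYPE_HTTP,
    }
    # Route every socket created by the socket module through a hypothetical
    # local SOCKS5 proxy; rdns=True keeps DNS resolution on the proxy side.
    socks.setdefaultproxy(proxy_types['socks5'], '127.0.0.1', 1080, rdns=True)
    socks.wrap_module(socket)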