def __init__(self, config, *args, time_offset=0.0, **kwargs):
    """Initialize an HttpScrape object to scrape over blocking http.

    HttpScrape inherits from SearchEngineScrape and from threading.Timer.

    Args:
        config: Configuration object, forwarded to SearchEngineScrape.
        *args: Extra positional arguments forwarded to SearchEngineScrape.
        time_offset: Interval handed to threading.Timer; ``self.search`` is
            the function the timer runs once that interval has elapsed.
        **kwargs: Extra keyword arguments forwarded to SearchEngineScrape.
    """
    threading.Timer.__init__(self, time_offset, self.search)
    SearchEngineScrape.__init__(self, config, *args, **kwargs)

    # Keep a handle on the requests module on this instance.
    # NOTE(review): __import__ returns the module object cached in
    # sys.modules, so every instance shares the very same module; it does
    # not give each instance a private copy. Per-instance proxies must be
    # applied elsewhere -- confirm against the code that sets the proxy.
    self.requests = __import__('requests')

    # GET parameters of the search request, filled in later.
    self.search_params = {}

    # Base HTTP headers of the search request; the Host and User-Agent
    # fields need to be set additionally. Take a shallow copy so that
    # fields set on this instance do not leak into the module-level
    # `headers` object shared by all other instances.
    # (assumes `headers` is a mapping of header names to values -- TODO confirm)
    self.headers = dict(headers)

    # the mode
    self.scrape_method = 'http'

    # Resolve the base search url for the selected search engine.
    # self.config and self.search_engine_name are presumably set by
    # SearchEngineScrape.__init__ (not visible here).
    self.base_search_url = get_base_search_url_by_search_engine(
        self.config, self.search_engine_name, self.scrape_method)

    super().instance_creation_info(self.__class__.__name__)

    # blekko cannot be scraped in http mode: log it and flag the instance
    # as not startable.
    if self.search_engine_name == 'blekko':
        logger.critical('blekko doesnt support http mode.')
        self.startable = False
def __init__(self, *args, captcha_lock=None, browser_num=1, **kwargs):
    """Set up a new SelScraper thread instance.

    Args:
        *args: Positional arguments forwarded to SearchEngineScrape.
        captcha_lock: Used to synchronize captcha solving (stdin).
        browser_num: A unique, semantic number for each thread.
        **kwargs: Keyword arguments forwarded to SearchEngineScrape.
            (An optional ``proxy`` is presumably passed this way, since it
            is not a named parameter here -- verify against callers.)
    """
    self.search_input = None

    # Initialise both parents explicitly, thread machinery first.
    threading.Thread.__init__(self)
    SearchEngineScrape.__init__(self, *args, **kwargs)

    # Everything selenium specific is read from the SELENIUM config section.
    selenium_settings = Config['SELENIUM']
    chosen_browser = selenium_settings.get('sel_browser', 'chrome')

    self.browser_type = chosen_browser.lower()
    self.browser_num = browser_num
    self.captcha_lock = captcha_lock
    self.scrape_method = 'selenium'
    self.xvfb_display = selenium_settings.get('xvfb_display', None)
    self.search_param_values = self._get_search_param_values()

    # The base search url depends on the search engine and the scrape mode.
    engine_name, mode = self.search_engine_name, self.scrape_method
    self.base_search_url = get_base_search_url_by_search_engine(engine_name, mode)

    own_class_name = self.__class__.__name__
    super().instance_creation_info(own_class_name)
def __init__(self, config, *args, time_offset=0.0, **kwargs):
    """Initialize an HttpScrape object to scrape over blocking http.

    HttpScrape inherits from SearchEngineScrape and from threading.Timer.

    Args:
        config: Configuration object, forwarded to SearchEngineScrape.
        *args: Extra positional arguments forwarded to SearchEngineScrape.
        time_offset: Interval handed to threading.Timer; ``self.search`` is
            the function the timer runs once that interval has elapsed.
        **kwargs: Extra keyword arguments forwarded to SearchEngineScrape.
    """
    threading.Timer.__init__(self, time_offset, self.search)
    SearchEngineScrape.__init__(self, config, *args, **kwargs)

    # Keep a handle on the requests module on this instance.
    # NOTE(review): __import__ returns the module object cached in
    # sys.modules, so every instance shares the very same module; it does
    # not give each instance a private copy. Per-instance proxies must be
    # applied elsewhere -- confirm against the code that sets the proxy.
    self.requests = __import__('requests')

    # GET parameters of the search request, filled in later.
    self.search_params = {}

    # Base HTTP headers of the search request; the Host and User-Agent
    # fields need to be set additionally. Take a shallow copy so that
    # fields set on this instance do not leak into the module-level
    # `headers` object shared by all other instances.
    # (assumes `headers` is a mapping of header names to values -- TODO confirm)
    self.headers = dict(headers)

    # the mode
    self.scrape_method = 'http'

    # Resolve the base search url for the selected search engine.
    # self.config and self.search_engine_name are presumably set by
    # SearchEngineScrape.__init__ (not visible here).
    self.base_search_url = get_base_search_url_by_search_engine(
        self.config, self.search_engine_name, self.scrape_method)

    super().instance_creation_info(self.__class__.__name__)

    # blekko cannot be scraped in http mode: log it and flag the instance
    # as not startable.
    if self.search_engine_name == 'blekko':
        logger.critical('blekko does not support http mode.')
        self.startable = False