def clear_cookies(self):
    self.client = http.Client(scraper=self)
    return self
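# Usage sketch (hypothetical, assuming this method lives on scrapex's Scraper
# class and that load() fetches a URL): clear_cookies() replaces the HTTP
# client, dropping any accumulated cookies, and returns self so it chains
# with the next request, which then starts from an empty cookie jar.
#
#     s = Scraper(use_cookie=True)
#     s.load('https://example.com/login')
#     s.clear_cookies().load('https://example.com/')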
def __init__(self, **options):
    # When run as a frozen executable, anchor the project dir to the
    # executable; otherwise anchor it to the invoking script.
    _dir = os.path.dirname(sys.executable) if 'python' not in sys.executable.lower() \
        else os.path.dirname(os.path.join(os.getcwd(), sys.argv[0]))

    self.config = dict(
        dir=_dir,
        use_cache=True,
        cache_path="cache",
        use_proxy=True,
        use_cookie=True,
        timeout=45,
        delay=0.1,
        retries=0,
        parse_log=True,
        show_status_message=True,
        max_redirects=3,
        debug=True,
        log_file='log.txt',
        one_proxy=False)

    self.config.update(options)

    # expose important attributes
    self.dir = self.config.get('dir')
    if not os.path.exists(self.dir):
        os.makedirs(self.dir)

    # load settings from local settings.txt
    if os.path.exists(self.join_path('settings.txt')):
        self.config.update(
            json.loads(common.get_file(self.join_path('settings.txt'))))

    if self.config['use_cache']:
        cache_path = os.path.join(self.dir, self.config['cache_path'])
        self.cache = Cache(cache_path)
    else:
        self.cache = Cache('')

    # logging settings
    _log_file_path = self.join_path(self.config['log_file']) if self.config['log_file'] is not None else None

    if self.config.get('use_logging_config') is not False:
        if os.path.exists(self.join_path('logging.config')):
            # use custom logging config
            logging.config.dictConfig(
                json.loads(common.get_file(self.join_path('logging.config'))))
        else:
            # use default logging config
            default_log_settings = logging_config.default_settings.copy()
            if _log_file_path:
                default_log_settings['handlers']['file_handler']['filename'] = _log_file_path
            else:
                # when log_file is set to None, disable the file handler
                del default_log_settings['handlers']['file_handler']
                del default_log_settings['loggers']['requests.packages.urllib3.connectionpool']
                default_log_settings['root']['handlers'] = ['console']

            # if self.config.get('debug') is True:
            #     default_log_settings['handlers']['console']['level'] = 'DEBUG'

            logging.config.dictConfig(default_log_settings)

    # clear the log
    if not self.config.get('preserve_log'):
        if _log_file_path is not None:
            self.put_file(_log_file_path, '')

    self.logger = logging.getLogger(__name__)

    if self.config['show_status_message']:
        self.logger.info('start')

    atexit.register(self.__del__)

    if self.config.get('one_proxy') is True:
        self.proxy_manager = http.ProxyManager(
            proxy_file=self.join_path(self.config.get('proxy_file')) if self.config.get('proxy_file') else None,
            proxy_auth=self.config.get('proxy_auth'),
            one_proxy=True)
        self.logger.info('Selected proxy -> ' + str(self.proxy_manager.proxies))
    else:
        self.proxy_manager = http.ProxyManager(
            proxy_file=self.join_path(self.config.get('proxy_file')) if self.config.get('proxy_file') else None,
            proxy_auth=self.config.get('proxy_auth'))

    self.client = http.Client(scraper=self)

    # create an async downloader for this scraper
    self.downloader = Downloader(scraper=self, cc=3)

    # set flags
    self.writingflag = False

    # init the output db
    self.outdb = {}

    self._time_start = time.time()
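# Usage sketch (hypothetical option values): keyword options override the
# defaults built above, and a JSON settings.txt in the project directory is
# merged in last, so it overrides both.
#
#     s = Scraper(
#         use_proxy=False,   # skip proxying entirely
#         timeout=30,
#         retries=2,
#         log_file=None,     # console-only logging; the file handler is removed
#     )
#
# Example settings.txt placed next to the script:
#
#     {"delay": 1.0, "use_cache": false}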