Example #1
0
 def clear_cookies(self):
     """Drop all stored cookies by swapping in a brand-new HTTP client.

     Returns:
         The scraper itself, so calls can be chained.
     """
     fresh_client = http.Client(scraper=self)
     self.client = fresh_client
     return self
Example #2
0
    def __init__(self, **options):
        """Set up the scraper: resolve the working directory, merge the
        configuration, create the cache, configure logging, and build the
        proxy manager and HTTP client.

        Args:
            **options: overrides for the defaults below; a local
                ``settings.txt`` (JSON), if present, overrides both.
        """

        # Frozen executable -> anchor to the executable's directory;
        # plain interpreter run -> anchor to the invoked script's directory.
        _dir = os.path.dirname(
            sys.executable) if 'python' not in sys.executable.lower(
            ) else os.path.dirname(os.path.join(os.getcwd(), sys.argv[0]))

        self.config = dict(dir=_dir,
                           use_cache=True,
                           cache_path="cache",
                           use_proxy=True,
                           use_cookie=True,
                           timeout=45,
                           delay=0.1,
                           retries=0,
                           parse_log=True,
                           show_status_message=True,
                           max_redirects=3,
                           use_default_logging=True,
                           log_file='log.txt',
                           log_post=False,
                           log_headers=False)

        self.config.update(options)

        #expose important attributes
        self.dir = self.config.get('dir')
        if not os.path.exists(self.dir): os.makedirs(self.dir)

        #load settings from local settings.txt
        if os.path.exists(self.join_path('settings.txt')):
            self.config.update(
                json.loads(common.get_file(self.join_path('settings.txt'))))

        #create cache object
        # NOTE(review): the 'use_cache' default is never consulted here --
        # the cache is created unconditionally; confirm the flag is honored
        # elsewhere before relying on it.
        cache_path = os.path.join(self.dir, self.config['cache_path'])
        self.cache = Cache(cache_path)
        """ logging settings """

        if self.config['use_default_logging']:
            _log_file_path = self.join_path(
                self.config['log_file']
            ) if self.config['log_file'] is not None else None

            logging_config.set_default(log_file=_log_file_path, preserve=False)

        self.logger = logging.getLogger('scrapex')

        if self.config['show_status_message']:
            # BUGFIX: was a bare `logger.info('start')`, which relies on a
            # module-level `logger` that this block never defines (likely a
            # NameError); use the instance logger created just above, matching
            # the other __init__ variant in this file.
            self.logger.info('start')

        # Ensure cleanup also runs at interpreter exit, not only on GC.
        atexit.register(self.__del__)

        self.proxy_manager = http.ProxyManager(
            proxy_file=self.join_path(self.config.get('proxy_file'))
            if self.config.get('proxy_file') else None,
            proxy_auth=self.config.get('proxy_auth'))

        self.client = http.Client(scraper=self)

        #set flags
        self.writingflag = False

        #init the output db
        self.outdb = {}

        self._time_start = time.time()
Example #3
0
    def __init__(self, **options):
        """Set up the scraper: resolve the working directory, merge the
        configuration, create the cache, configure logging, and build the
        proxy manager, HTTP client and async downloader.

        Args:
            **options: overrides for the defaults below; a local
                ``settings.txt`` (JSON), if present, overrides both.
        """

        # Frozen executable -> anchor to the executable's directory;
        # plain interpreter run -> anchor to the invoked script's directory.
        _dir = os.path.dirname(
            sys.executable) if 'python' not in sys.executable.lower(
            ) else os.path.dirname(os.path.join(os.getcwd(), sys.argv[0]))

        self.config = dict(dir=_dir,
                           use_cache=True,
                           cache_path="cache",
                           use_proxy=True,
                           use_cookie=True,
                           timeout=45,
                           delay=0.1,
                           retries=0,
                           parse_log=True,
                           show_status_message=True,
                           max_redirects=3,
                           debug=True,
                           log_file='log.txt',
                           one_proxy=False)

        self.config.update(options)

        #expose important attributes
        self.dir = self.config.get('dir')
        if not os.path.exists(self.dir): os.makedirs(self.dir)

        #load settings from local settings.txt
        if os.path.exists(self.join_path('settings.txt')):
            self.config.update(
                json.loads(common.get_file(self.join_path('settings.txt'))))

        # Create the cache object; with caching disabled an empty path is
        # passed in (presumably Cache('') is a no-op cache -- verify).
        if self.config['use_cache']:
            cache_path = os.path.join(self.dir, self.config['cache_path'])

            self.cache = Cache(cache_path)
        else:
            self.cache = Cache('')
        """ logging settings """
        _log_file_path = self.join_path(
            self.config['log_file']
        ) if self.config['log_file'] is not None else None

        # Logging is configured unless the caller explicitly passes
        # use_logging_config=False (absent/None still configures it).
        if self.config.get('use_logging_config') is not False:

            if os.path.exists(self.join_path('logging.config')):
                #use custom logging config (JSON in dictConfig schema)
                logging.config.dictConfig(
                    json.loads(
                        common.get_file(self.join_path('logging.config'))))

            else:
                #use default logging config

                default_log_settings = logging_config.default_settings.copy()

                if _log_file_path:
                    default_log_settings['handlers']['file_handler'][
                        'filename'] = _log_file_path

                else:
                    # When log_file is None, disable the file_handler and
                    # the logger that references it; log to console only.
                    del default_log_settings['handlers']['file_handler']
                    del default_log_settings['loggers'][
                        'requests.packages.urllib3.connectionpool']

                    default_log_settings['root']['handlers'] = ['console']

                # if self.config.get('debug') is True:
                # 	default_log_settings['handlers']['console']['level'] = 'DEBUG'

                logging.config.dictConfig(default_log_settings)

            # Truncate the previous run's log unless preserve_log is set.
            if not self.config.get('preserve_log'):
                if _log_file_path is not None:
                    self.put_file(_log_file_path, '')

        self.logger = logging.getLogger(__name__)

        if self.config['show_status_message']:

            self.logger.info('start')

        # Ensure cleanup also runs at interpreter exit, not only on GC.
        atexit.register(self.__del__)

        # one_proxy=True pins a single proxy for the whole session and
        # logs which one was selected.
        if (self.config.get('one_proxy') is True):
            self.proxy_manager = http.ProxyManager(
                proxy_file=self.join_path(self.config.get('proxy_file'))
                if self.config.get('proxy_file') else None,
                proxy_auth=self.config.get('proxy_auth'),
                one_proxy=True)
            self.logger.info('Selected proxy -> ' +
                             str(self.proxy_manager.proxies))
        else:
            self.proxy_manager = http.ProxyManager(
                proxy_file=self.join_path(self.config.get('proxy_file'))
                if self.config.get('proxy_file') else None,
                proxy_auth=self.config.get('proxy_auth'))

        self.client = http.Client(scraper=self)

        #create an async downloader for this scraper
        self.downloader = Downloader(scraper=self, cc=3)

        #set flags
        self.writingflag = False

        #init the output db
        self.outdb = {}

        self._time_start = time.time()