def read(self, url='', post='', file_name=None):
    key = file_name if file_name else self.make_key(url, post)
    return common.get_file(os.path.join(self.location, key))
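# --- Usage sketch (not part of the original source) ---
# Assumes the Cache class this read() belongs to is in scope (its import path
# is not shown in this excerpt); the directory and URL are illustrative. The
# cache is constructed with a directory path, as Scraper.__init__ below does.
cache = Cache('cache')
cached_body = cache.read(url='http://example.com/page', post='')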
def __init__(self, **options):
    # Resolve the project directory: next to the frozen executable, or next to
    # the running script when invoked through a normal Python interpreter.
    _dir = (os.path.dirname(sys.executable)
            if 'python' not in sys.executable.lower()
            else os.path.dirname(os.path.join(os.getcwd(), sys.argv[0])))

    self.config = dict(
        dir=_dir,
        use_cache=True,
        cache_path="cache",
        use_proxy=True,
        use_cookie=True,
        timeout=45,
        delay=0.1,
        retries=0,
        parse_log=True,
        show_status_message=True,
        max_redirects=3,
        use_default_logging=True,
        log_file='log.txt',
        log_post=False,
        log_headers=False)
    self.config.update(options)

    # expose important attributes
    self.dir = self.config.get('dir')
    if not os.path.exists(self.dir):
        os.makedirs(self.dir)

    # load settings from local settings.txt
    if os.path.exists(self.join_path('settings.txt')):
        self.config.update(json.loads(common.get_file(self.join_path('settings.txt'))))

    # create cache object
    cache_path = os.path.join(self.dir, self.config['cache_path'])
    self.cache = Cache(cache_path)

    # logging settings
    if self.config['use_default_logging']:
        _log_file_path = self.join_path(self.config['log_file']) if self.config['log_file'] is not None else None
        if _log_file_path:
            logging_config.set_default(log_file=_log_file_path, preserve=False)

    self.logger = logging.getLogger('scrapex')

    if self.config['show_status_message']:
        self.logger.info('start')

    atexit.register(self.__del__)

    self.proxy_manager = http.ProxyManager(
        proxy_file=self.join_path(self.config.get('proxy_file')) if self.config.get('proxy_file') else None,
        proxy_auth=self.config.get('proxy_auth'))

    self.client = http.Client(scraper=self)

    # create an async downloader for this scraper
    # self.downloader = Downloader(scraper=self, cc=3)

    # set flags
    self.writingflag = False

    # init the output db
    self.outdb = {}

    self._time_start = time.time()
import sys, logging, time, json

import MySQLdb

from scrapex import Scraper, common

s = Scraper(show_status_message=False, use_logging_config=False, use_cache=True)

logger = logging.getLogger(__name__)

config = json.loads(common.get_file(s.join_path('config.txt')))


class DB(object):
    """responsible for db operations"""

    def __init__(self):
        self.conn = None
        self.connect()

    def connect(self):
        try:
            self.conn = MySQLdb.connect(
                host=config['db']['host'],
                port=config['db']['port'],
                user=config['db']['user'],
                passwd=config['db']['password'],
                db=config['db']['dbname'])
            self.conn.autocommit(True)
        except Exception as e:
            logger.exception(e)
            raise e

    def execute(self, sql, params=None, retryonfail=True):
        """execute a sql without fetching data"""
        # The original body is missing from this excerpt; below is a minimal
        # sketch that runs the statement and, as the retryonfail flag suggests,
        # reconnects and retries once on failure.
        try:
            cur = self.conn.cursor()
            cur.execute(sql, params)
            cur.close()
        except Exception as e:
            logger.exception(e)
            if retryonfail:
                self.connect()
                self.execute(sql, params, retryonfail=False)
            else:
                raise
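# --- Usage sketch (not part of the original source) ---
# The table, columns and values are illustrative; connection details come from
# the config.txt loaded above. MySQLdb uses %s placeholders for parameters.
db = DB()
db.execute("INSERT INTO pages (url, status) VALUES (%s, %s)",
           ('http://example.com/page', 'done'))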
def __init__(self, **options):
    # Resolve the project directory: next to the frozen executable, or next to
    # the running script when invoked through a normal Python interpreter.
    _dir = (os.path.dirname(sys.executable)
            if 'python' not in sys.executable.lower()
            else os.path.dirname(os.path.join(os.getcwd(), sys.argv[0])))

    self.config = dict(
        dir=_dir,
        use_cache=True,
        cache_path="cache",
        use_proxy=True,
        use_cookie=True,
        timeout=45,
        delay=0.1,
        retries=0,
        parse_log=True,
        show_status_message=True,
        max_redirects=3,
        debug=True,
        log_file='log.txt',
        one_proxy=False)
    self.config.update(options)

    # expose important attributes
    self.dir = self.config.get('dir')
    if not os.path.exists(self.dir):
        os.makedirs(self.dir)

    # load settings from local settings.txt
    if os.path.exists(self.join_path('settings.txt')):
        self.config.update(json.loads(common.get_file(self.join_path('settings.txt'))))

    if self.config['use_cache']:
        cache_path = os.path.join(self.dir, self.config['cache_path'])
        self.cache = Cache(cache_path)
    else:
        self.cache = Cache('')

    # logging settings
    _log_file_path = self.join_path(self.config['log_file']) if self.config['log_file'] is not None else None

    if self.config.get('use_logging_config') is not False:
        if os.path.exists(self.join_path('logging.config')):
            # use custom logging config
            logging.config.dictConfig(json.loads(common.get_file(self.join_path('logging.config'))))
        else:
            # use default logging config
            default_log_settings = logging_config.default_settings.copy()
            if _log_file_path:
                default_log_settings['handlers']['file_handler']['filename'] = _log_file_path
            else:
                # when log_file is set to None, disable file_handler
                del default_log_settings['handlers']['file_handler']
                del default_log_settings['loggers']['requests.packages.urllib3.connectionpool']
                default_log_settings['root']['handlers'] = ['console']

            # if self.config.get('debug') is True:
            #     default_log_settings['handlers']['console']['level'] = 'DEBUG'

            logging.config.dictConfig(default_log_settings)

    # clear the log
    if not self.config.get('preserve_log'):
        if _log_file_path is not None:
            self.put_file(_log_file_path, '')

    self.logger = logging.getLogger(__name__)

    if self.config['show_status_message']:
        self.logger.info('start')

    atexit.register(self.__del__)

    if self.config.get('one_proxy') is True:
        self.proxy_manager = http.ProxyManager(
            proxy_file=self.join_path(self.config.get('proxy_file')) if self.config.get('proxy_file') else None,
            proxy_auth=self.config.get('proxy_auth'),
            one_proxy=True)
        self.logger.info('Selected proxy -> ' + str(self.proxy_manager.proxies))
    else:
        self.proxy_manager = http.ProxyManager(
            proxy_file=self.join_path(self.config.get('proxy_file')) if self.config.get('proxy_file') else None,
            proxy_auth=self.config.get('proxy_auth'))

    self.client = http.Client(scraper=self)

    # create an async downloader for this scraper
    self.downloader = Downloader(scraper=self, cc=3)

    # set flags
    self.writingflag = False

    # init the output db
    self.outdb = {}

    self._time_start = time.time()
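# --- Usage sketch (not part of the original source) ---
# Assumes `from scrapex import Scraper` as in the script above; option values
# are illustrative. Keyword options override the defaults set in __init__, and
# a settings.txt file in the project directory, if present, overrides both.
s = Scraper(use_cache=True, delay=0.5, retries=2, show_status_message=False)
s.logger.info('project dir: %s', s.dir)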