Example 1
    def read(self, url='', post='', file_name=None):
        key = file_name if file_name else self.make_key(url, post)
        return common.get_file(os.path.join(self.location, key))
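A minimal usage sketch, assuming the surrounding Cache class is constructed with its storage directory (as the later examples do with Cache(cache_path)) and that make_key derives a file name from the url/post pair; the URL and file names below are illustrative only:

    # Hypothetical usage of Cache.read() above; names are illustrative.
    cache = Cache('cache')

    # Read a cached response by request; the key is derived via make_key(url, post).
    html = cache.read(url='http://example.com/page')

    # Or read a cache file by its explicit name, skipping key generation.
    html = cache.read(file_name='page.htm')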
Example 2
    def __init__(self, **options):

        _dir = os.path.dirname(
            sys.executable) if 'python' not in sys.executable.lower(
            ) else os.path.dirname(os.path.join(os.getcwd(), sys.argv[0]))

        self.config = dict(dir=_dir,
                           use_cache=True,
                           cache_path="cache",
                           use_proxy=True,
                           use_cookie=True,
                           timeout=45,
                           delay=0.1,
                           retries=0,
                           parse_log=True,
                           show_status_message=True,
                           max_redirects=3,
                           use_default_logging=True,
                           log_file='log.txt',
                           log_post=False,
                           log_headers=False)

        self.config.update(options)

        #expose important attributes
        self.dir = self.config.get('dir')
        if not os.path.exists(self.dir): os.makedirs(self.dir)

        #load settings from local settings.txt
        if os.path.exists(self.join_path('settings.txt')):
            self.config.update(
                json.loads(common.get_file(self.join_path('settings.txt'))))

        #create cache object
        cache_path = os.path.join(self.dir, self.config['cache_path'])
        self.cache = Cache(cache_path)
        """ logging settings """

        if self.config['use_default_logging']:
            _log_file_path = self.join_path(
                self.config['log_file']
            ) if self.config['log_file'] is not None else None

            # if _log_file_path:
            logging_config.set_default(log_file=_log_file_path, preserve=False)

        self.logger = logging.getLogger('scrapex')

        if self.config['show_status_message']:

            self.logger.info('start')

        atexit.register(self.__del__)

        self.proxy_manager = http.ProxyManager(
            proxy_file=self.join_path(self.config.get('proxy_file'))
            if self.config.get('proxy_file') else None,
            proxy_auth=self.config.get('proxy_auth'))

        self.client = http.Client(scraper=self)

        #create an async downloader for this scraper
        # self.downloader = Downloader(scraper=self, cc=3)

        #set flags
        self.writingflag = False

        #init the output db
        self.outdb = {}

        self._time_start = time.time()
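Note the override order above: keyword options are merged over the defaults, and a settings.txt placed next to the script is merged last, so it wins over both. The file is parsed with json.loads, so a minimal sketch of its contents (keys mirror the defaults dict; the values here are illustrative) would be:

    {
        "use_proxy": false,
        "timeout": 60,
        "log_file": "run.log"
    }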
Example 3
import sys, logging, time, json
import MySQLdb
from scrapex import Scraper, common
s = Scraper(show_status_message=False, use_logging_config=False, use_cache=True)
logger = logging.getLogger(__name__)

config = json.loads(common.get_file(s.join_path('config.txt')))

class DB(object):
	"""responsile for db operations"""
	def __init__(self):
		self.conn = None
		self.connect()
		
	def connect(self):
		try:
			self.conn = MySQLdb.connect(
				host=config['db']['host'],
				port=config['db']['port'],
				user=config['db']['user'],
				passwd=config['db']['password'],
				db=config['db']['dbname'])

			self.conn.autocommit(True)
		
		except Exception as e:
			logger.exception(e)
			raise

	def execute(self, sql, params=None, retryonfail=True):
		""" execute a SQL statement without fetching data """
Example 4
	def read(self, url='', post='', filename=None):
		key = filename if filename else self.make_key(url, post)
		return common.get_file(os.path.join(self.location, key))
Example 5
    def __init__(self, **options):

        _dir = os.path.dirname(
            sys.executable) if 'python' not in sys.executable.lower(
            ) else os.path.dirname(os.path.join(os.getcwd(), sys.argv[0]))

        self.config = dict(dir=_dir,
                           use_cache=True,
                           cache_path="cache",
                           use_proxy=True,
                           use_cookie=True,
                           timeout=45,
                           delay=0.1,
                           retries=0,
                           parse_log=True,
                           show_status_message=True,
                           max_redirects=3,
                           debug=True,
                           log_file='log.txt',
                           one_proxy=False)

        self.config.update(options)

        #expose important attributes
        self.dir = self.config.get('dir')
        if not os.path.exists(self.dir): os.makedirs(self.dir)

        #load settings from local settings.txt
        if os.path.exists(self.join_path('settings.txt')):
            self.config.update(
                json.loads(common.get_file(self.join_path('settings.txt'))))

        if self.config['use_cache']:
            cache_path = os.path.join(self.dir, self.config['cache_path'])

            self.cache = Cache(cache_path)
        else:
            self.cache = Cache('')
        """ logging settings """
        _log_file_path = self.join_path(
            self.config['log_file']
        ) if self.config['log_file'] is not None else None

        if self.config.get('use_logging_config') is not False:

            if os.path.exists(self.join_path('logging.config')):
                #use custom logging config
                logging.config.dictConfig(
                    json.loads(
                        common.get_file(self.join_path('logging.config'))))

            else:
                #use default logging config

                default_log_settings = logging_config.default_settings.copy()

                if _log_file_path:
                    default_log_settings['handlers']['file_handler'][
                        'filename'] = _log_file_path

                else:
                    # when log_file is set to None, disable the file handler
                    del default_log_settings['handlers']['file_handler']
                    del default_log_settings['loggers'][
                        'requests.packages.urllib3.connectionpool']

                    default_log_settings['root']['handlers'] = ['console']

                # if self.config.get('debug') is True:
                # 	default_log_settings['handlers']['console']['level'] = 'DEBUG'

                logging.config.dictConfig(default_log_settings)

            #clear the log
            if not self.config.get('preserve_log'):
                if _log_file_path is not None:
                    self.put_file(_log_file_path, '')

        self.logger = logging.getLogger(__name__)

        if self.config['show_status_message']:

            self.logger.info('start')

        atexit.register(self.__del__)

        if self.config.get('one_proxy') is True:
            self.proxy_manager = http.ProxyManager(
                proxy_file=self.join_path(self.config.get('proxy_file'))
                if self.config.get('proxy_file') else None,
                proxy_auth=self.config.get('proxy_auth'),
                one_proxy=True)
            self.logger.info('Selected proxy -> ' +
                             str(self.proxy_manager.proxies))
        else:
            self.proxy_manager = http.ProxyManager(
                proxy_file=self.join_path(self.config.get('proxy_file'))
                if self.config.get('proxy_file') else None,
                proxy_auth=self.config.get('proxy_auth'))

        self.client = http.Client(scraper=self)

        #create an async downloader for this scraper
        self.downloader = Downloader(scraper=self, cc=3)

        #set flags
        self.writingflag = False

        #init the output db
        self.outdb = {}

        self._time_start = time.time()
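This variant prefers a user-supplied logging.config file over the built-in defaults; since the file is passed to logging.config.dictConfig after json.loads, it must follow the standard dictConfig schema. A minimal sketch of such a file (console-only; contents illustrative):

    {
        "version": 1,
        "formatters": {"brief": {"format": "%(levelname)s %(message)s"}},
        "handlers": {
            "console": {
                "class": "logging.StreamHandler",
                "formatter": "brief",
                "level": "INFO"
            }
        },
        "root": {"level": "INFO", "handlers": ["console"]}
    }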
Example 6
	def __init__(self, **options):

		_dir = os.path.dirname(sys.executable) if 'python' not in sys.executable.lower() else os.path.dirname(os.path.join(os.getcwd(), sys.argv[0]))

		self.config = dict(
			dir=_dir,
			use_cache=True,
			cache_path="cache",
			use_proxy=True,
			use_cookie=True,
			timeout=45,
			delay=0.1,
			retries=0,
			parse_log=True,
			show_status_message=True,
			max_redirects=3,
			use_default_logging=True,
			log_file='log.txt',
			log_post=False,
			log_headers=False)

		self.config.update(options)

		#expose important attributes
		self.dir = self.config.get('dir')
		if not os.path.exists(self.dir): os.makedirs(self.dir)

		#load settings from local settings.txt
		if os.path.exists(self.join_path('settings.txt')):
			self.config.update(json.loads(common.get_file(self.join_path('settings.txt'))))

		#create cache object
		cache_path = os.path.join(self.dir, self.config['cache_path'])
		self.cache = Cache(cache_path)

		""" logging settings """

		if self.config['use_default_logging']:
			_log_file_path = self.join_path(self.config['log_file']) if self.config['log_file'] is not None else None

			# if _log_file_path:
			logging_config.set_default(log_file=_log_file_path, preserve=False)

		self.logger = logging.getLogger('scrapex')


		if self.config['show_status_message']:

			self.logger.info('start')

		atexit.register(self.__del__)

		self.proxy_manager = http.ProxyManager(
			proxy_file=self.join_path(self.config.get('proxy_file')) if self.config.get('proxy_file') else None,
			proxy_auth=self.config.get('proxy_auth'))

		self.client = http.Client(scraper=self)

		#create an async downloader for this scraper
		# self.downloader = Downloader(scraper=self, cc=3)
		
		#set flags
		self.writingflag = False

		#init the output db
		self.outdb = {}

		self._time_start = time.time()