class SeleniumMiddleware(object):
    """Drive a Chrome browser through Selenium for a single page request.

    ``chrome_request`` starts a fresh Chrome instance on every call. On
    success the browser is returned still open; ``close`` is the only place
    in this class that shuts it down.
    """

    # Instance attributes created by ``init`` and removed again by ``close``.
    _SESSION_ATTRS = ('browser', 'file', 'timeout', 'load_timeout', 'wait')

    def __init__(self):
        self.settings = Settings()
        self.settings.CreateCommonSettings()

    def init(self, timeout=None, executable_path=None, proxy=None):
        """Start Chrome and store the per-session helpers on ``self``.

        :param timeout: value passed to ``set_page_load_timeout`` and to
            ``WebDriverWait``.
        :param executable_path: path handed to ``webdriver.Chrome``.
        :param proxy: optional ``host:port`` string; when given Chrome is
            started with ``--proxy-server=http://<proxy>``.
        """
        self.file = FileIOMiddleware()
        self.timeout = timeout

        chrome_options = webdriver.ChromeOptions()
        for flag in ('--disable-gpu', '--no-sandbox',
                     '--disable-dev-shm-usage'):
            chrome_options.add_argument(flag)
        if proxy is not None:
            chrome_options.add_argument(
                '--proxy-server=http://{0}'.format(proxy))

        self.browser = webdriver.Chrome(executable_path=executable_path,
                                        chrome_options=chrome_options)
        try:
            # The return value is stored only because ``close`` has always
            # expected a ``load_timeout`` attribute to exist.
            self.load_timeout = self.browser.set_page_load_timeout(
                self.timeout)
            self.wait = WebDriverWait(self.browser, self.timeout)
        except Exception:
            # Chrome is already running at this point; do not leave it
            # behind if the remaining setup fails.
            self.browser.quit()
            raise

    def close(self):
        """Shut the browser down and drop the per-session attributes.

        ``quit`` is always attempted, even when ``close`` on the window
        raises, and the attributes are removed in every case. Any error
        raised by the browser still propagates to the caller. Calling this
        method when no browser is attached is a no-op apart from the
        attribute cleanup.
        """
        browser = getattr(self, 'browser', None)
        try:
            if browser is not None:
                try:
                    browser.close()
                finally:
                    browser.quit()
        finally:
            for attr_name in self._SESSION_ATTRS:
                self.__dict__.pop(attr_name, None)
            browser = None
            gc.collect()

    def chrome_request(self, url, log_path, proxy):
        """Open ``url`` in a new Chrome instance and return the browser.

        :param url: address to load.
        :param log_path: path passed to the file logger.
        :param proxy: optional proxy, forwarded to ``init``.
        :returns: the Selenium browser object.

        NOTE(review): on ``TimeoutException`` the browser is closed and the
        closed handle is still returned, exactly as before. Callers cannot
        use it for anything -- confirm whether ``None`` would be preferable.
        """
        self.init(timeout=self.settings.SELENIUM_TIMEOUT,
                  executable_path=self.settings.CHROMEDRIVER_PATH,
                  proxy=proxy)
        try:
            self.file.logger(log_path,
                             'Starting Chrome for: {0}'.format(url))
            self.browser.get(url)
            return self.browser
        except TimeoutException:
            browser = self.browser
            self.file.logger(log_path,
                             'Chrome timeout for: {0}'.format(url))
            self.close()
            return browser
        except Exception:
            # Any other failure used to leave Chrome running with nobody
            # holding a reference to it; clean up, then re-raise unchanged.
            self.close()
            raise
class FileTransferMiddleware():
    """Upload files from a local directory to a remote host over SFTP."""

    def __init__(self):
        self.settings = Settings()
        self.settings.CreateCommonSettings()
        # Keep the logging module from reporting its own handler errors.
        logging.raiseExceptions = False

    def singleUpload(self, local_file_path, remote_file_path, host_name,
                     user_name, password, port):
        """Upload one file and delete the local copy once it is sent.

        Failures while transferring or deleting are printed, not raised.
        Failures while connecting still propagate, as before. The transport
        is now closed on every path; previously it was closed only after a
        successful ``put``.

        :param local_file_path: file to send; removed after a successful put.
        :param remote_file_path: destination path on the remote host.
        :param host_name: remote host.
        :param user_name: login name.
        :param password: login password.
        :param port: remote port.
        """
        connect_port = paramiko.Transport((host_name, port))
        try:
            connect_port.connect(username=user_name, password=password)
            sftp = paramiko.SFTPClient.from_transport(connect_port)
            try:
                print('start to transfer: {0}'.format(local_file_path))
                sftp.put(local_file_path, remote_file_path)
                print('finished to transfer: {0}'.format(local_file_path))
                print('start to delete: {0}'.format(local_file_path))
                os.remove(local_file_path)
                print('finished to delete: {0}'.format(local_file_path))
            except Exception as e:
                # Format the exception itself: ``e.message`` is empty for
                # multi-argument exceptions such as IOError.
                print('Exception to transfer: {0} for {1}'.format(
                    local_file_path, e))
        finally:
            connect_port.close()
            gc.collect()

    def startUpload(self, local_diractory, remote_diractory, processes,
                    host_name, user_name, password, port):
        """Upload every entry of ``local_diractory`` through a worker pool.

        Returns early (after printing a message) when the directory does not
        exist or is empty.

        NOTE(review): the ``apply_async`` results are never collected, so an
        exception escaping ``singleUpload`` inside a worker is not reported
        here. If ``Pool`` is a process pool under Python 2, the bound method
        may not even be picklable -- confirm which ``Pool`` is imported.

        :param local_diractory: directory whose entries are uploaded.
        :param remote_diractory: remote directory the entries are written to.
        :param processes: pool size handed to ``Pool``.
        :param host_name: remote host.
        :param user_name: login name.
        :param password: login password.
        :param port: remote port.
        """
        if not os.path.exists(local_diractory):
            print('{0} is not exits'.format(local_diractory))
            return

        files = os.listdir(local_diractory)
        if not files:
            print('No new file to upload in {0}'.format(local_diractory))
            return

        process = Pool(processes)
        try:
            for file_name in files:
                local_file_path = '{0}/{1}'.format(local_diractory,
                                                   file_name)
                remote_file_path = '{0}/{1}'.format(remote_diractory,
                                                    file_name)
                process.apply_async(
                    self.singleUpload,
                    args=(local_file_path, remote_file_path, host_name,
                          user_name, password, port))
        finally:
            # Always wind the pool down, even if scheduling a task failed.
            process.close()
            process.join()
        print('Done')
        del files, process
        gc.collect()
class RequestsMiddleware():
    """Fetch pages with ``requests`` and hand each response to a callback."""

    def __init__(self):
        self.settings = Settings()
        self.settings.CreateCommonSettings()

    def init(self, headers=None, host=None, referer=None):
        """Prepare the logger, the ``requests`` handle and the headers.

        When ``headers`` is given it is used as-is. Otherwise a default set
        is built from the settings, with ``host`` and ``referer`` filled in
        and a randomly chosen User-Agent.

        NOTE(review): this assumes the default-header assignments belong to
        the ``headers is None`` branch -- confirm against the original
        layout.

        :param headers: optional ready-made header dict.
        :param host: value for the ``Host`` header of the default set.
        :param referer: value for the ``Referer`` header of the default set.
        """
        self.file = FileIOMiddleware()
        self.requests = requests
        if headers is None:
            settings = self.settings
            headers = {
                'Accept': settings.ACCEPT,
                'Accept-Encoding': settings.ACCEPT_ENC0DING,
                'Accept-Language': settings.ACCEPT_LANGUAGE,
                'Cache-Control': settings.CACHE_CONTROL,
                'Connection': settings.CONNECTION,
                'Host': host,
                'Upgrade-Insecure-Requests':
                    settings.UPGRADE_INSECURE_REQUESTS,
                'Referer': referer,
                'Pragma': settings.PRAGMA,
                'User-Agent': random.choice(settings.USER_AGENTS),
            }
        self.headers = headers

    def requests_request(self, url, headers=None, host=None, referer=None):
        """GET ``url`` and return the response, or ``None`` on any error.

        Errors are written to ``settings.LOG_PATH`` and not raised.

        :param url: address to fetch.
        :param headers: optional header dict, forwarded to ``init``.
        :param host: forwarded to ``init``.
        :param referer: forwarded to ``init``.
        """
        self.init(headers=headers, host=host, referer=referer)
        try:
            self.file.logger(self.settings.LOG_PATH, 'Starting Requests')
            return self.requests.get(url=url, headers=self.headers)
        except Exception as e:
            # str(e) rather than e.message: the latter is empty for
            # multi-argument exceptions.
            self.file.logger(self.settings.LOG_PATH,
                             'Requests Timeout: {0}'.format(str(e)))
            return None

    def run_task(self, url_title=[], callback=callable, headers=None,
                 host=None):
        """Fetch one URL and pass the result to ``callback``.

        ``callback`` receives a dict with the keys ``response``,
        ``request_url`` and ``request_title``. An exception raised while
        building that dict or inside the callback is logged and printed,
        and the method returns without the ``End`` line.

        The method reads ``self.file`` and ``self.log_path``, which
        ``start_requests`` sets before scheduling tasks.

        :param url_title: ``(url, title)`` pair; the default list is never
            mutated.
        :param callback: callable taking the result dict. The default, the
            builtin ``callable``, merely returns ``False`` for a dict.
        :param headers: optional header dict for the request.
        :param host: ``Host`` header value for the default header set.
        """
        request_url = url_title[0]
        self.file.logger(self.log_path, 'Start: {0}'.format(request_url))
        print('Start: {0}'.format(request_url))
        response = self.requests_request(request_url, headers, host,
                                         request_url)
        try:
            callback({
                'response': response,
                'request_url': request_url,
                'request_title': url_title[1]
            })
        except Exception as e:
            failure = 'Exception: {0} for {1}'.format(e, request_url)
            self.file.logger(self.log_path, failure)
            print(failure)
            gc.collect()
            return

        # The request may have failed and produced None; fall back to the
        # requested URL instead of dereferencing ``response.url``.
        final_url = getattr(response, 'url', request_url)
        self.file.logger(self.log_path, 'End: {0}'.format(final_url))
        print('End: {0}'.format(final_url))
        # The old ``del self.requests_request`` tried to delete a class-level
        # method from the instance and always raised AttributeError.
        del response
        gc.collect()

    def start_requests(self, url_titles, processes, log_path, headers, host,
                       proxy, callback=callable):
        """Run ``run_task`` for every entry of ``url_titles`` in a pool.

        NOTE(review): ``proxy`` is stored on ``self.proxy`` but nothing in
        the visible code passes it to ``requests`` -- confirm whether that
        is intended.

        :param url_titles: iterable of ``(url, title)`` pairs.
        :param processes: pool size handed to ``Pool``.
        :param log_path: path passed to the file logger.
        :param headers: optional header dict used for every request.
        :param host: ``Host`` header value for the default header set.
        :param proxy: stored on the instance only.
        :param callback: callable invoked with each result dict.
        """
        self.file = FileIOMiddleware()
        self.content = []
        self.log_path = log_path
        self.proxy = proxy

        process = Pool(processes)
        for url_title in url_titles:
            process.apply_async(self.run_task,
                                args=(url_title, callback, headers, host))
        process.close()
        process.join()

        self.file.logger(self.log_path, 'Done')
        print('Done')
        del self.file, process
        gc.collect()