def run_command(self, id=-1, command=None, d=None, caller=None):
    """
    Sends a command to the proper MessagingQueue with the proper data object
    and command string. After a successful send, if the command is not QUIT,
    the HitParadeBotCommandProcessor listens on an incoming queue where the
    WebScraper replies with the success or failure of the command. The QUIT
    command is simply sent, and all queues associated with the id are destroyed.
    :param id: int id of the WebScraper.
    :param command: str command to perform.
    :param d: dict data dictionary of the command request.
    :param caller: Thread calling this action, usually the main thread. Used for logging purposes.
    :return: [2 values] command, value
    """
    MessagingQueue.send_msg(id=id, direction='in', cmd=command, d=d, caller=caller)
    val = None
    if command != 'QUIT':
        command, val = MessagingQueue.wait_for_msg(direction='out', id=id, caller=caller)
    print('command %s ' % command)
    return command, val
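A minimal usage sketch of the send/wait round trip above, assuming a HitParadeBotCommandProcessor instance named processor and a running WebScraper with id 3 (both hypothetical):

# Hypothetical round trip: send an OPEN command, then block on the outgoing
# queue until the WebScraper replies (a QUIT command would skip the wait).
command, value = processor.run_command(id=3,
                                       command='OPEN',
                                       d={'command': 'OPEN', 'scraper_url': 'https://example.com'},
                                       caller='MainThread')
print('scraper replied %s -> %s' % (command, str(value)))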
def respond(self, obj=None, command=None):
    """
    Sends a message to the main thread.
    :param obj: dict to send to the main thread.
    :param command: str command being used.
    :return:
    """
    MessagingQueue.send_msg(id=self.get_id(), direction='out', cmd=command, d=obj, caller=str(self.get_id()))
def scrape_data(self, id=-1, scraping_props=None, caller=None, **kwargs):
    """
    Atomic Scraping Method
    Command that tells a WebScraper to scrape data from a URL.
    :param id: int id of the WebScraper to command.
    :param scraping_props: dict of all properties to scrape.
    :param caller: str, usually the main thread.
    :return: [2 values] command, value
    """
    message_dict = dict()
    message_dict['scraper_logins'] = scraping_props.get('scraper_logins', [])
    message_dict['data_selectors'] = scraping_props.get('data_selectors', None)
    message_dict['force_refresh'] = scraping_props.get('force_refresh', False)
    message_dict['web_driver'] = scraping_props.get('web_driver', None)
    message_dict['command'] = 'SCRAPE'
    message_dict['type_id'] = 'Scraper'
    message_dict['unique_id'] = MessagingQueue.unique_id(cache_manager=self.cache_manager)
    message_dict.update(kwargs)
    return self.run_command(id=id, command='SCRAPE', d=message_dict, caller=caller)
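For illustration, a scraping_props dict matching the keys read above might look like the following; the selector strings and id are hypothetical:

# Only these four keys are read by scrape_data; anything else rides in **kwargs.
scraping_props = {
    'scraper_logins': [],                               # no login required for this page
    'data_selectors': ['div.headline', 'span.score'],   # CSS selectors to extract
    'force_refresh': True,                              # re-fetch even if cached
    'web_driver': None,                                 # let the scraper use its own driver
}
command, value = processor.scrape_data(id=3, scraping_props=scraping_props, caller='MainThread')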
def login(self, id=-1, selectors=None, login_page_selectors=None, login_button_selector=None, caller=None, **kwargs):
    """
    Atomic Scraping Method
    Sends the login command to a WebScraper object.
    :param id: int id of the WebScraper to send to.
    :param selectors: list[str] selectors for username/password.
    :param login_page_selectors: list[str] selectors that identify whether you are on the login page.
    :param login_button_selector: str selector for the button to click to log in.
    :param caller: str name of the calling Thread, usually the main thread.
    :return: [2 values] command, value
    """
    message_dict = dict()
    message_dict['selectors'] = selectors
    message_dict['login_page_selectors'] = login_page_selectors
    message_dict['login_button_selector'] = login_button_selector
    message_dict['command'] = 'LOGIN'
    message_dict['type_id'] = 'ScraperLogin'
    message_dict['unique_id'] = MessagingQueue.unique_id(cache_manager=self.cache_manager)
    message_dict.update(kwargs)
    return self.run_command(id=id, command='LOGIN', d=message_dict, caller=caller)
def execute_web_command(self, **kwargs):
    if self.command == 'CLICK_SCRAPE':
        el = self.action.selector
        for i in range(self.action.retry_count):
            if self.is_successful():
                break
            try:
                if isinstance(self.action.selector, str):
                    # Resolve the CSS selector to an element, optionally
                    # polling until it appears; start from None so a failed
                    # attempt is retried on the next iteration.
                    el = None
                    try:
                        if self.action.wait_for:
                            while el is None:
                                el = self.action.driver.find_element_by_css_selector(self.action.selector)
                                time.sleep(self.action.time_delay)
                        else:
                            el = self.action.driver.find_element_by_css_selector(self.action.selector)
                    except Exception:
                        traceback.print_exc()
                    if el is not None:
                        el.click()
                        self.successful = True
                elif el is not None:
                    # An element (not a selector string) was passed in directly.
                    el.click()
                    self.successful = True
                else:
                    print('element passed in is null.')
            except Exception:
                traceback.print_exc()
        success_value = el is not None and self.is_successful()
        if success_value:
            MessagingQueue.send_msg_nowait(id=self.action.web_driver.id,
                                           direction='meta',
                                           cmd='SUCCESS',
                                           d={'selector': self.action.selector},
                                           caller='WebScraperComponentScrapeCommand')
        return self.get_result_message(success_value)
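The retry loop above polls find_element_by_css_selector by hand; the same wait-then-click pattern can be sketched more declaratively with Selenium's explicit waits (an alternative sketch, not what the class itself uses):

# Sketch only: WebDriverWait expresses the wait_for/time_delay polling above.
# Assumes a live Selenium `driver`; selector and timeout are illustrative.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def click_when_present(driver, selector, timeout=10):
    # Blocks until the element exists in the DOM, then clicks it once.
    el = WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
    el.click()
    return el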
def open_url(self, id=-1, url=None, caller=None, **kwargs):
    """
    Atomic Scraping Method
    Command that tells a WebScraper to open a URL.
    :param id: int id of the WebScraper to command.
    :param url: str url to open.
    :param caller: str name of the calling Thread, usually the main thread.
    :return: [2 values] command, value
    """
    message_dict = dict()
    message_dict['command'] = 'OPEN'
    message_dict['type_id'] = 'ScraperAction'
    message_dict['scraper_url'] = url
    message_dict['unique_id'] = MessagingQueue.unique_id(cache_manager=self.cache_manager)
    message_dict.update(kwargs)
    return self.run_command(id=id, command='OPEN', d=message_dict, caller=caller)
def __init__(self, **kwargs):
    """
    Constructor of HitParadeBot, which IS-A Thread.
    :param kwargs: The following properties are pulled from kwargs:
        scrapers - list of active scrapers. Default is [].
        bot_data - dict of data for the bot to use. Default is {}.
        scraper_type - FirefoxWebScraper or ChromeWebScraper. Default is FirefoxWebScraper.
        start_url - url to start with. Default is None.
        memory_threshold - memory threshold above which the web scraper is rebooted if it uses too much memory.
        recurring - bool True if the bot runs in a loop for a certain amount of time, False otherwise.
        headless - bool True if the WebScraper should be headless, False if the browser can be shown. Default is False.
        timeout - int timeout in seconds of a url. Default is HitParadeBot.DEFAULT_TIMEOUT.
        retry - int number of times to retry a url or task. Default is HitParadeBot.DEFAULT_RETRY.
        output_connector - HitParadeOutput object to output retrieved or scraped data. Default pretty-prints to standard output.
        number_scrapers - int number of WebScrapers to launch. Default is HitParadeBot.DEFAULT_SCRAPER_COUNT.
        exit_on_exception - bool True to exit on the first exception, False to continue despite exceptions. Default is HitParadeBot.DEFAULT_EXIT_ON_EXCEPTION.
        sleep_time - int number of seconds to sleep on a recurring bot. Default is HitParadeBot.DEFAULT_SLEEP_TIME.
        hit_parade_command_processor - HitParadeCommandProcessor to run. Default is None.
    """
    Thread.__init__(self)
    self.__dict__ = dict(list(kwargs.items()) + list(self.__dict__.items()))
    self.scrapers = kwargs.get('scrapers', [])
    self.cache_manager = kwargs.get('cache_manager', None)
    self.scraper_type = kwargs.get('scraper_type', HitParadeBot.DEFAULT_SCRAPER_TYPE)
    self.cache_output_component_func = kwargs.get('cache_output_component_func', None)
    self.event_subscriptions = dict()
    bot_data = kwargs.get('bot_data', None)
    if bot_data is None:
        bot_data = getattr(self, 'bot_data', None)
    self.bot_data = bot_data if bot_data is not None else dict()
    self.state_storage_store_prop(prop='start_url', val=kwargs.get('start_url', None))
    self.process = psutil.Process(os.getpid())
    self.memory_threshold = kwargs.get('memory_threshold', HitParadeBot.DEFAULT_MEMORY_THRESHOLD)
    self.recurring = kwargs.get('recurring', True)
    self.is_init = True
    self.is_started = False
    self.timeout = kwargs.get('timeout', HitParadeBot.DEFAULT_TIMEOUT)
    self.retry = kwargs.get('retry', HitParadeBot.DEFAULT_RETRY)
    self.add_process_id = kwargs.get('add_process_id', HitParadeBot.DEFAULT_ADD_PROCESS_ID)
    self.id = MessagingQueue.unique_id(global_id=True, cache_manager=self.cache_manager)
    self.output_connector = self.cache_output_component_func(type_id=kwargs.get('output', 'HitParadeDefaultOuput'), **kwargs)
    self.number_scrapers = kwargs.get('number_scrapers', HitParadeBot.DEFAULT_SCRAPER_COUNT)
    self._stop_event = threading.Event()
    # Coerce string flags to bool; non-string values pass through unchanged.
    exit_on_exception = kwargs.get('exit_on_exception', HitParadeBot.DEFAULT_EXIT_ON_EXCEPTION)
    self.exit_on_exception = bool(exit_on_exception) if isinstance(exit_on_exception, str) else exit_on_exception
    listen_for_urls = kwargs.get('listen_for_urls', HitParadeBot.DEFAULT_LISTEN_FOR_URLS)
    self.listen_for_urls = bool(listen_for_urls) if isinstance(listen_for_urls, str) else listen_for_urls
    self.sleep_time = kwargs.get('sleep_time', HitParadeBot.DEFAULT_SLEEP_TIME)
    self.recursive_subscribe = kwargs.get('recursive_subscribe', HitParadeBot.DEFAULT_RECURSIVE_SUBSCRIBE)
    self.hit_parade_command_processor = kwargs.get('command_processor', None)
    self.subscribable_event = kwargs.get('subscribable_event', None)
    if isinstance(self.subscribable_event, str) or self.subscribable_event is None:
        self.subscribe_event = self.subscribable_event
    else:
        self.subscribe_event = self.subscribable_event.get('event', self.subscribable_event.get('name', None))
    self.publish_event = kwargs.get('publish_event', None)
    self.publish_to_event = kwargs.get('publish_to_event', None)
    self.bot_name = 'HitParadeBot[' + str(self.id) + ']'
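A hedged construction sketch using the kwargs documented above; the factory callable and cache manager are assumptions of the example, and every value is illustrative rather than a default:

# Illustrative only: make_output is a hypothetical stand-in for the
# cache_output_component_func callable the constructor expects.
bot = HitParadeBot(
    scrapers=[],
    bot_data={},
    scraper_type='FirefoxWebScraper',
    start_url='https://example.com',
    recurring=True,
    headless=False,
    timeout=30,
    retry=3,
    number_scrapers=1,
    exit_on_exception=False,
    sleep_time=60,
    cache_manager=cache_manager,               # assumed to exist in scope
    cache_output_component_func=make_output,   # hypothetical factory callable
)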
def run(self):
    exception_count = 0
    while not self.stopped() and (not self.state_storage_get_prop('exit_on_exception')
                                  or (self.state_storage_get_prop('exit_on_exception') and exception_count == 0)):
        # Acquire the lock.
        print('<<messaging lock acquire [%s] producer [%s]>>' % (self.id, self.state_storage_get_prop('producer_id')))
        self.message_lock.acquire()
        # Read a message from the cache.
        message = self.next_msg()
        # Check the message.
        if message and not isinstance(message['data'], int) and not message['data'].decode('utf-8').isdigit():
            # Send the message to the producer thread.
            MessagingQueue.send_msg(id=self.state_storage_get_prop('producer_id'),
                                    direction='in',
                                    cmd='SEND',
                                    d=message,
                                    caller=str(self.id))
            # Wait for the completion message from the thread.
            command, obj = MessagingQueue.wait_for_msg(id=self.id, direction='in', caller='Messaging')
        # Release the lock.
        print('<<messaging lock release [%s] producer [%s]>>' % (self.id, self.state_storage_get_prop('producer_id')))
        self.message_lock.release()
        print('setting available from id %s for producer %s ' % (self.id, self.state_storage_get_prop('producer_id')))
        self.set_available()
        # Sleep between iterations.
        time.sleep(self.sleep_time)
def __init__(self, **kwargs):
    Thread.__init__(self)
    self.__dict__ = dict(list(kwargs.items()) + list(self.__dict__.items()))
    self.__dict__[self.id_property] = MessagingQueue.unique_id(global_id=True, cache_manager=self.cache_manager)
    self.get_state_static_prop = kwargs.get('get_state_static_prop', None)
    self.store_state_static_prop = kwargs.get('store_state_static_prop', None)
    self.event_subscriptions = dict()
    self.subscribe_to_events()
    self.cache_output_component_func = kwargs.get('cache_output_component_func', None)
    print('********************************* HitParadeConsumerThread *********************************')
def __init__(self, **kwargs):
    Thread.__init__(self)
    self.__dict__ = dict(list(kwargs.items()) + list(self.__dict__.items()))
    self.__dict__[self.id_property] = MessagingQueue.unique_id(global_id=True, cache_manager=self.cache_manager)
    self.get_state_static_prop = kwargs.get('get_state_static_prop', None)
    self.store_state_static_prop = kwargs.get('store_state_static_prop', None)
    self.cache_output_component_func = kwargs.get('cache_output_component_func', None)
    self.transaction_processor = HitParadeTransactionManager(**kwargs)
    self.sleep_time = 30  # fixed at 30 seconds
    print('********************************* HitParadeTransactionThread *********************************')
def bot(**bot_data):
    hit_parade_scrape_bot = None
    if bot_data['bot.type'] == 'consumer':
        hit_parade_scrape_bot = HitParadeConsumerThread(**bot_data)
    elif bot_data['bot.type'] == 'producer':
        hit_parade_scrape_bot = HitParadeProducerBot(**bot_data)
    elif bot_data['bot.type'] == 'transaction':
        hit_parade_scrape_bot = HitParadeTransactionThread(**bot_data)
    elif bot_data['bot.type'] == 'publisher':
        bot_data['publisher_id'] = MessagingQueue.unique_id(global_id=True, cache_manager=bot_data['cache_manager'])
        hit_parade_scrape_bot = UrlPublisher(**bot_data)
    else:
        print('Error: bot type %s not recognized' % bot_data['bot.type'])
    if hit_parade_scrape_bot is not None:
        print('starting bot...')
        hit_parade_scrape_bot.start()
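For example, launching a consumer bot might look like this; the cache_manager value is assumed to exist in the caller's scope:

# 'bot.type' selects which thread class the bot() dispatcher starts.
# Non-identifier keys like 'bot.type' are legal when unpacked into **kwargs.
bot(**{
    'bot.type': 'consumer',
    'cache_manager': cache_manager,   # assumed to exist
})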
def quit(self, id=-1, caller=None, **kwargs):
    """
    Atomic Scraping Method
    Command that tells the WebScraper to quit and shut down.
    This command forces the MessagingQueues associated with the id to be destroyed.
    :param id: int id of the WebScraper.
    :param caller: str caller of the command, usually the main thread.
    :return: [2 values] command, value
    """
    message_dict = dict()
    message_dict['command'] = 'QUIT'
    message_dict['type_id'] = 'ScraperAction'
    message_dict['unique_id'] = MessagingQueue.unique_id(cache_manager=self.cache_manager)
    message_dict.update(kwargs)
    return self.run_command(id=id, command='QUIT', d=message_dict, caller=caller)
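Taken together, the atomic commands support a full session; a minimal sketch, assuming a command-processor instance named processor, a scraper with id 3, and entirely hypothetical selectors:

# Open the login page, log in, scrape one value, then tear down the queues.
processor.open_url(id=3, url='https://example.com/login', caller='MainThread')
processor.login(id=3,
                selectors=['#username', '#password'],
                login_page_selectors=['form#login'],
                login_button_selector='button[type=submit]',
                caller='MainThread')
command, value = processor.scrape_data(id=3,
                                       scraping_props={'data_selectors': ['div.account-balance']},
                                       caller='MainThread')
processor.quit(id=3, caller='MainThread')  # destroys all queues for id 3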
def __init__(self, **kwargs):
    """
    :param headless: bool True to run the browser headless, False to show the browser window.
    :param start_url: str start url.
    :param timeout: int timeout in seconds used for the driver's implicit wait.
    :param id: int id the web scraper should use. If it is -1, generate a new id from MessagingQueue.
    """
    # Initialize the thread.
    Thread.__init__(self)
    self.__dict__ = dict(list(kwargs.items()) + list(self.__dict__.items()))
    for k in kwargs.keys():
        if kwargs.get(k, None) and isinstance(kwargs.get(k, None), str):
            print(' %s ==> %s ' % (k, kwargs.get(k, None)))
    self.cache_manager = kwargs.get('cache_manager', None)
    self.create_driver()
    # self.driver.maximize_window()
    self.timeout = kwargs.get('timeout', None)
    self.driver.implicitly_wait(self.timeout)
    self.action = ActionChains(self.driver)
    if kwargs.get('id', -1) == -1:
        self.id = MessagingQueue.unique_id(global_id=True, cache_manager=self.cache_manager)
        print('unique id is now %s ' % str(self.id))
    else:
        self.id = kwargs.get('id', -1)
    # A configured start_url implies headless operation.
    if self.state_storage_get_prop('start_url'):
        self.set_headless()
    self.state_storage_store_prop(prop='start_url', val=kwargs.get('start_url', None))
    self.default_parser = kwargs.get('default_parser', None)
    self._stop_event = threading.Event()
    self.get_external_ip_adresss = kwargs.get('get_external_ip_adresss', None)
    self.ip = kwargs.get('ip', None)
    self.get_state_static_prop = kwargs.get('get_state_static_prop', None)
    self.store_state_static_prop = kwargs.get('store_state_static_prop', None)
    self.driver.get('https://www.google.com')
def create(type_id=None, **kwargs):
    kwargs['unique_id'] = MessagingQueue.unique_id(global_id=True, cache_manager=kwargs.get('cache_manager', None))
    command_value = kwargs.get('command', None)
    nocommand = kwargs.get('nocommand', False)
    if command_value is None or nocommand:
        # Resolve a string default_parser into a parser instance first.
        if kwargs.get('default_parser', None) is not None and isinstance(kwargs.get('default_parser', None), str):
            kwargs['default_parser'] = HitParadeFactories.create(kwargs.get('default_parser', 'BeautifulSoupParser'), **kwargs)
        elif (type_id is not None
              and HitParadeFactories.factory_command_mapping.get(type_id.upper().strip(), None) is not None
              and HitParadeFactories.FACTORIES.get(type_id, None) is None):
            # Import the mapped module on demand and cache its Factory under
            # both the type_id and the mapped class name.
            mapped_name = HitParadeFactories.factory_command_mapping.get(type_id.upper().strip(), None)
            HitParadeFactories.importit(mapped_name)
            HitParadeFactories.FACTORIES[type_id] = eval(mapped_name + '.Factory()')
            HitParadeFactories.FACTORIES[mapped_name] = eval(mapped_name + '.Factory()')
        elif HitParadeFactories.FACTORIES.get(type_id, None) is None:
            print('typeid is %s ' % type_id)
            HitParadeFactories.FACTORIES[type_id] = eval(type_id + '.Factory()')
        return HitParadeFactories.FACTORIES.get(type_id, None).create(**kwargs)
    else:
        # Command-driven creation: cache the Factory keyed by the command string.
        if command_value is not None and kwargs.get('command', None) not in HitParadeFactories.FACTORIES:
            HitParadeFactories.FACTORIES[command_value] = eval(HitParadeFactories.command_mapping.get(command_value, None) + '.Factory()')
        return HitParadeFactories.FACTORIES[command_value].create(**kwargs)
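A minimal sketch of calling the factory; the type_id is taken from the default_parser branch above, while the cache_manager value and the mapping entry are assumptions:

# With no 'command' in kwargs, create() resolves type_id through
# factory_command_mapping, imports the module on demand, caches the
# Factory instance, and delegates to its create().
parser = HitParadeFactories.create(type_id='BeautifulSoupParser',
                                   cache_manager=cache_manager,  # assumed to exist
                                   nocommand=True)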
def start_web_scraper(self, start_url=None):
    """
    Starts a web scraper.
    Started as a daemon thread and added to the active scrapers.
    :param start_url: url to open
    :return: WebScraper that was started
    """
    unique_id = MessagingQueue.unique_id(global_id=True, cache_manager=self.cache_manager)
    print('thread id is %s ' % unique_id)
    kwargs_v = dict()
    # Mirrors the scraper constructor: a configured start_url implies headless.
    kwargs_v['headless'] = self.state_storage_get_prop('start_url')
    kwargs_v['scraper_type'] = self.scraper_type
    kwargs_v['timeout'] = self.timeout
    kwargs_v['start_url'] = self.state_storage_get_prop('start_url')
    kwargs_v['chrome_binary'] = self.state_storage_get_prop('chrome_binary')
    kwargs_v['google_chrome_binary'] = self.state_storage_get_prop('google_chrome_binary')
    kwargs_v['cache_input_file'] = self.state_storage_get_prop('cache_input_file')
    kwargs_v['cache_manager'] = self.cache_manager
    kwargs_v['cache_output_component_func'] = self.cache_output_component_func
    kwargs_v['ip'] = self.ip
    kwargs_v['get_external_ip_addresss'] = self.get_external_ip_addresss
    kwargs_v['id'] = unique_id
    kwargs_v['default_parser'] = self.default_parser
    # Copy through any state_* properties so the scraper shares state config.
    for k in self.__dict__.keys():
        if 'state_' in k:
            kwargs_v[k] = self.__dict__[k]
    s = self.cache_output_component_func(type_id=self.scraper_type, **kwargs_v)
    self.default_parser.driver = s.driver
    s.daemon = True
    self.scrapers.append(s)
    HitParadeBot.GLOBAL_SCRAPERS.append(s)
    self.driver = s.driver
    self.web_driver = s
    s.start()
    return s
def run(self):
    """ Run the thread """
    try:
        if self.state_storage_get_prop('start_url') is not None:
            self.driver.get(self.state_storage_get_prop('start_url'))
    except Exception:
        traceback.print_exc()
        print('error opening start url of %s ' % self.state_storage_get_prop('start_url'))
    ERROR_MESSAGE = False
    QUIT = False
    print('[%s] Scraper running...' % str(self.get_id()))
    scraper_component = None
    last_command = None
    id_value = self.get_id() if self.get_id() is not None else MessagingQueue.unique_id(global_id=True, cache_manager=self.cache_manager)
    while not (ERROR_MESSAGE or QUIT) and not self.stopped():
        print('[%s] Thread message loop ' % str(id_value))
        command, obj = MessagingQueue.wait_for_msg(id=self.id, direction='in', caller=str(id_value))
        if obj and command:
            print('<<acquire producer scraper lock>>')
            obj['driver'] = self.driver
            obj['get_state_static_prop'] = self.get_state_static_prop
            obj['store_state_static_prop'] = self.store_state_static_prop
            obj['id'] = self.id
            obj['web_driver'] = self
            type_id_value = obj.get('type_id', None)
            del obj['type_id']
            obj['cache_manager'] = self.cache_manager
            obj['ip'] = self.ip
            obj['get_external_ip_addresss'] = self.get_external_ip_adresss
            print('ip is %s ' % self.ip)
            obj['open_url'] = False
            obj['default_parser'] = self.default_parser
            obj['nocommand'] = True
            for k in self.__dict__.keys():
                if 'storage_' in k:
                    obj[k] = self.__dict__[k]
            if scraper_component is None:
                scraper_component = self.cache_manager.cache_output_component_func(type_id=type_id_value, **obj)
            else:
                scraper_component.reset(**obj)
            print('[%s] :: command in %s with message %s ' % (str(self.get_id()), command, str(obj)))
            if not self.stopped() or command == 'QUIT':
                print('[%s] Thread command: either thread has been stopped or command is QUIT {%s} ' % (str(self.get_id()), command))
                response_object = scraper_component.exec(**obj)
                if response_object is None:
                    print('response object is none.')
                    print(command)
                    print(obj)
                    self.respond(obj=obj, command=command)
                else:
                    self.respond(obj=response_object, command=command)
            else:
                print('no longer active...quitting...')
            print('<<release producer scraper lock>>')
    if not self.stopped():
        print('This Web Scraper %s is no longer active. Shutting down. ' % self.id)
        q = self.quit()
        print('id[%s] quitting.....[%s]' % (str(self.id), str(q)))
        MessagingQueue.quit(id=self.get_id(), caller=str(self.get_id()))