Esempio n. 1
0
    def run_command(self, id=-1, command=None, d=None, caller=None):
        """
        Push a command onto the 'in' queue of a WebScraper and collect its answer.

        The request is sent with MessagingQueue.send_msg.  For every command
        other than 'QUIT' the method then blocks on MessagingQueue.wait_for_msg
        for the 'out' direction of the same id and returns the pair it yields.
        For 'QUIT' no reply is awaited: the command string itself is returned
        together with a None value.

        :param id: int ID of the webscraper.
        :param command: str Command to perform.
        :param d: dict data dictionary of the command request
        :param caller: identifier of the caller, forwarded to MessagingQueue.
        :return: [2 values] command, value
        """
        MessagingQueue.send_msg(id=id, direction='in', cmd=command, d=d,
                                caller=caller)

        # Defaults used when no reply is awaited (the QUIT case).
        reply_command, reply_value = command, None
        if command != 'QUIT':
            reply_command, reply_value = MessagingQueue.wait_for_msg(
                direction='out', id=id, caller=caller)

        print('command %s  ' % reply_command)
        return reply_command, reply_value
Esempio n. 2
0
 def respond(self, obj=None, command=None):
     """
     Publish a reply on the 'out' direction queue that belongs to this object's id.

     The id returned by get_id() is used both as the queue id and, converted
     to str, as the caller label.
     :param obj: dict payload to send
     :param command: str command the payload answers.
     :return: None
     """
     queue_id = self.get_id()
     caller_label = str(self.get_id())
     MessagingQueue.send_msg(id=queue_id, direction='out', cmd=command,
                             d=obj, caller=caller_label)
Esempio n. 3
0
 def scrape_data(self, id=-1, scraping_props=None, caller=None, **kwargs):
     """
     Atomic Scraping Method
     Builds a 'SCRAPE' request and sends it through run_command.

     The request carries the 'scraper_logins', 'data_selectors',
     'force_refresh' and 'web_driver' entries of scraping_props (with
     defaults [], None, False and None), the fixed 'command' and 'type_id'
     markers, and a fresh 'unique_id'.  Extra keyword arguments are merged in
     last and may therefore override any of those entries.

     :param id: int id of the WebScraper to command.
     :param scraping_props: dict all properties to scrape.  None is treated as an empty dict.
     :param caller: str usually main thread
     :param kwargs: additional entries merged into the request.
     :return:   [2 values] command, value
     """
     # The parameter defaults to None; without this guard the default call
     # failed with AttributeError on the first .get().
     props = scraping_props if scraping_props is not None else {}
     message_dict = {
         'scraper_logins': props.get('scraper_logins', []),
         'data_selectors': props.get('data_selectors', None),
         'force_refresh': props.get('force_refresh', False),
         'web_driver': props.get('web_driver', None),
         'command': 'SCRAPE',
         'type_id': 'Scraper',
         'unique_id': MessagingQueue.unique_id(
             cache_manager=self.cache_manager),
     }
     message_dict.update(kwargs)
     return self.run_command(id=id,
                             command='SCRAPE',
                             d=message_dict,
                             caller=caller)
Esempio n. 4
0
    def login(self, id=-1, selectors=None, login_page_selectors=None,
              login_button_selector=None, caller=None, **kwargs):
        """
        Atomic Scraping Method
        Assembles a 'LOGIN' request and hands it to run_command.

        The three selector arguments are placed in the request unchanged,
        alongside the fixed 'command' / 'type_id' markers and a fresh
        'unique_id'.  Extra keyword arguments are merged in last and can
        override any of those entries.

        :param id: int id of the WebScraper to send to.
        :param selectors: list[str] list of selectors for username/password
        :param login_page_selectors: list[str] list of selectors to identify if you are on the login page.
        :param login_button_selector: str selector indicating the button to click to login.
        :param caller: str of the calling Thread - usually the Main thread.
        :return:  [2 values] command, value
        """
        request = {
            'selectors': selectors,
            'login_page_selectors': login_page_selectors,
            'login_button_selector': login_button_selector,
            'command': 'LOGIN',
            'type_id': 'ScraperLogin',
            'unique_id': MessagingQueue.unique_id(
                cache_manager=self.cache_manager),
        }
        request.update(kwargs)
        return self.run_command(id=id, command='LOGIN', d=request,
                                caller=caller)
Esempio n. 5
0
 def execute_web_command(self, **kwargs):
     """
     Click the action's target when the command is 'CLICK_SCRAPE' and report the outcome.

     For 'CLICK_SCRAPE' the target (self.action.selector) is tried
     self.action.retry_count times.  A str selector is resolved through
     self.action.driver.find_element_by_css_selector before clicking; any
     other non-None selector is clicked directly.  Exceptions raised while
     locating or clicking are printed and swallowed.  For every other command
     nothing is clicked and the result is unsuccessful.

     On success a 'SUCCESS' message is posted on the 'meta' direction for the
     web driver id.

     :param kwargs: accepted for interface compatibility; not used here.
     :return: whatever self.get_result_message returns for the success flag.
     """
     # Bound up front: previously any command other than CLICK_SCRAPE reached
     # the success check below with `el` undefined (UnboundLocalError).
     el = None
     if self.command == 'CLICK_SCRAPE':
         el = self.action.selector
         # NOTE(review): the loop does not stop after a successful click, so
         # the element is clicked on every iteration - confirm this is intended.
         for i in range(self.action.retry_count):
             if el is not None:
                 try:
                     if isinstance(self.action.selector, str):
                         try:
                             if self.action.wait_for:
                                 # NOTE(review): `el` is never None when this
                                 # point is reached, so the polling loop body
                                 # does not run and `el` stays the selector
                                 # string - confirm the intended wait logic.
                                 while el is None:
                                     el = self.action.driver.find_element_by_css_selector(
                                         self.action.selector)
                                     time.sleep(self.action.time_delay)
                             else:
                                 el = self.action.driver.find_element_by_css_selector(
                                     self.action.selector)
                         except Exception:
                             # Best effort: report the lookup failure and carry on.
                             traceback.print_exc()
                         if el is not None:
                             el.click()
                             self.successful = True
                     else:
                         if el is not None:
                             el.click()
                             self.successful = True
                         else:
                             print('element passed in is null.')
                 except Exception:
                     # Best effort: a failed click is reported, not raised.
                     traceback.print_exc()
     success_value = el is not None and self.is_successful()
     if success_value:
         MessagingQueue.send_msg_nowait(
             id=self.action.web_driver.id,
             direction='meta',
             cmd='SUCCESS',
             d={'selector': self.action.selector},
             caller='WebScraperComponentScrapeCommand')
     return self.get_result_message(success_value)
Esempio n. 6
0
 def open_url(self, id=-1, url=None, caller=None, **kwargs):
     """
     Assemble an 'OPEN' request for the given url and hand it to run_command.

     Extra keyword arguments are merged into the request last and can
     override the entries built here.
     :param id: int id of the WebScraper to command.
     :param url: value stored under 'scraper_url' in the request.
     :param caller: forwarded to run_command.
     :return: whatever run_command returns.
     """
     request = {
         'command': 'OPEN',
         'type_id': 'ScraperAction',
         'scraper_url': url,
         'unique_id': MessagingQueue.unique_id(
             cache_manager=self.cache_manager),
     }
     request.update(kwargs)
     return self.run_command(id=id, command='OPEN', d=request,
                             caller=caller)
Esempio n. 7
0
 def __init__(self, **kwargs):
     """
     Constructor HitParadeBot which is a Thread
     IS-A Thread

     Every keyword argument is first copied onto the instance (attributes
     already set by Thread.__init__ win on a name clash); the keys below are
     then read explicitly.
     :param kwargs: The following properties are pulled from kwargs
     scrapers - list of active scrapers.  Default is []
     cache_manager - passed to MessagingQueue.unique_id.  Default is None
     scraper_type - Default is HitParadeBot.DEFAULT_SCRAPER_TYPE
     cache_output_component_func - callable used to build the output connector.  It is called unconditionally, so it must be supplied.
     bot_data - dict of data for the bot to use.  Falls back to an existing bot_data attribute, then to {}
     start_url - url to start with, stored through state_storage_store_prop.  Default is None
     memory_threshold - Default is HitParadeBot.DEFAULT_MEMORY_THRESHOLD
     recurring - Default is True
     timeout - Default is HitParadeBot.DEFAULT_TIMEOUT
     retry - Default is HitParadeBot.DEFAULT_RETRY
     add_process_id - Default is HitParadeBot.DEFAULT_ADD_PROCESS_ID
     output - type_id handed to cache_output_component_func.  Default is 'HitParadeDefaultOuput'
     number_scrapers - Default is HitParadeBot.DEFAULT_SCRAPER_COUNT
     exit_on_exception - bool, or a str such as 'true' / 'false'.  Default is HitParadeBot.DEFAULT_EXIT_ON_EXCEPTION
     listen_for_urls - Default is False
     sleep_time - Default is HitParadeBot.DEFAULT_SLEEP_TIME
     recursive_subscribe - Default is HitParadeBot.DEFAULT_RECURSIVE_SUBSCRIBE
     command_processor - stored as hit_parade_command_processor.  Default is None
     subscribable_event - str, or a mapping with an 'event' or 'name' entry.  Default is None
     publish_event, publish_to_event - Default is None
     """
     def _as_bool(value):
         # bool('False') is True, so textual flags need explicit parsing.
         # Non-string values are passed through untouched.
         if isinstance(value, str):
             return value.strip().lower() not in ('', '0', 'false', 'no', 'off', 'none')
         return value

     Thread.__init__(self)
     # kwargs become attributes; entries created by Thread.__init__ take precedence.
     self.__dict__ = dict(list(kwargs.items()) + list(self.__dict__.items()))
     self.scrapers = kwargs.get('scrapers', [])
     self.cache_manager = kwargs.get('cache_manager', None)
     self.scraper_type = kwargs.get('scraper_type', HitParadeBot.DEFAULT_SCRAPER_TYPE)
     self.cache_output_component_func = kwargs.get('cache_output_component_func', None)
     self.event_subscriptions = dict()
     # getattr keeps the fallback to a pre-existing attribute without raising
     # AttributeError when neither kwargs nor the class provide bot_data.
     bot_data = kwargs.get('bot_data', None)
     if bot_data is None:
         bot_data = getattr(self, 'bot_data', None)
     self.bot_data = bot_data if bot_data is not None else dict()
     self.state_storage_store_prop(prop='start_url', val=kwargs.get('start_url', None))
     self.process = psutil.Process(os.getpid())
     self.memory_threshold = kwargs.get('memory_threshold', HitParadeBot.DEFAULT_MEMORY_THRESHOLD)
     self.recurring = kwargs.get('recurring', True)
     self.is_init = True
     self.is_started = False
     self.timeout = kwargs.get('timeout', HitParadeBot.DEFAULT_TIMEOUT)
     self.retry = kwargs.get('retry', HitParadeBot.DEFAULT_RETRY)
     self.add_process_id = kwargs.get('add_process_id', HitParadeBot.DEFAULT_ADD_PROCESS_ID)
     self.id = MessagingQueue.unique_id(global_id=True, cache_manager=self.cache_manager)
     self.output_connector = self.cache_output_component_func(type_id=kwargs.get('output', 'HitParadeDefaultOuput'), **kwargs)
     self.number_scrapers = kwargs.get('number_scrapers', HitParadeBot.DEFAULT_SCRAPER_COUNT)
     self._stop_event = threading.Event()
     self.exit_on_exception = _as_bool(kwargs.get('exit_on_exception', HitParadeBot.DEFAULT_EXIT_ON_EXCEPTION))
     self.sleep_time = kwargs.get('sleep_time', HitParadeBot.DEFAULT_SLEEP_TIME)
     self.recursive_subscribe = kwargs.get('recursive_subscribe', HitParadeBot.DEFAULT_RECURSIVE_SUBSCRIBE)
     self.hit_parade_command_processor = kwargs.get('command_processor', None)
     self.subscribable_event = kwargs.get('subscribable_event', None)
     # subscribable_event defaults to None; calling .get on it used to raise
     # AttributeError, so None is now passed through like a plain string.
     if self.subscribable_event is None or isinstance(self.subscribable_event, str):
         self.subscribe_event = self.subscribable_event
     else:
         self.subscribe_event = self.subscribable_event.get(
             'event', self.subscribable_event.get('name', None))
     self.publish_event = kwargs.get('publish_event', None)
     self.publish_to_event = kwargs.get('publish_to_event', None)
     self.bot_name = 'HitParadeBot[' + str(self.id) + ']'
     # NOTE(review): the value is stored exactly as passed (no str -> bool
     # conversion) and defaults to False rather than
     # HitParadeBot.DEFAULT_LISTEN_FOR_URLS - confirm this is intended.
     self.listen_for_urls = kwargs.get('listen_for_urls', False)
 def run(self):
     """
     Messaging loop: forward cached messages to the producer until stopped.

     Each iteration takes self.message_lock, reads the next message via
     self.next_msg() and, when its 'data' entry is neither an int nor a
     digit-only byte string, sends it to the producer ('SEND' on the 'in'
     direction of the stored 'producer_id') and waits for a message on this
     object's own 'in' direction.  The lock is then released, the object
     marks itself available and sleeps for self.sleep_time seconds.

     The lock is released in a ``finally`` clause so that an exception raised
     while reading or forwarding a message can no longer leave it held.
     """
     # NOTE(review): exception_count is never incremented, so the
     # 'exit_on_exception' part of the loop condition never ends the loop.
     exception_count = 0
     while not self.stopped() and (
             not self.state_storage_get_prop('exit_on_exception')
             or exception_count == 0):
         print('<<messaging lock acquire [%s] producer [%s]>>' %
               (self.id, self.state_storage_get_prop('producer_id')))
         self.message_lock.acquire()
         try:
             # read message from cache
             message = self.next_msg()
             # forward only messages whose payload is not a bare number
             if message and not isinstance(
                     message['data'],
                     int) and not message['data'].decode('utf-8').isdigit():
                 MessagingQueue.send_msg(
                     id=self.state_storage_get_prop('producer_id'),
                     direction='in',
                     cmd='SEND',
                     d=message,
                     caller=str(self.id))
                 # block until a message arrives on this object's own queue;
                 # its content is not used here
                 MessagingQueue.wait_for_msg(id=self.id,
                                             direction='in',
                                             caller='Messaging')
         finally:
             print('<<messaging lock release [%s] producer [%s]>>' %
                   (self.id, self.state_storage_get_prop('producer_id')))
             self.message_lock.release()
         print('setting available from id %s for producer %s ' %
               (self.id, self.state_storage_get_prop('producer_id')))
         self.set_available()
         time.sleep(self.sleep_time)
Esempio n. 9
0
 def __init__(self, **kwargs):
     """
     Set up the consumer thread from keyword arguments.

     All kwargs are copied onto the instance (attributes created by
     Thread.__init__ win on a name clash).  A fresh global id is stored under
     the attribute named by self.id_property, the state accessor callables
     are taken from kwargs, event subscriptions are initialised and
     subscribe_to_events() is invoked before the output-component factory is
     stored.
     :param kwargs: read here: 'get_state_static_prop',
         'store_state_static_prop', 'cache_output_component_func'.
         self.id_property and self.cache_manager must be resolvable after the
         merge (presumably supplied via kwargs or the class - verify against caller).
     """
     Thread.__init__(self)
     merged_state = dict(kwargs)
     merged_state.update(self.__dict__)
     self.__dict__ = merged_state

     fresh_id = MessagingQueue.unique_id(global_id=True,
                                         cache_manager=self.cache_manager)
     self.__dict__[self.id_property] = fresh_id

     self.get_state_static_prop = kwargs.get('get_state_static_prop')
     self.store_state_static_prop = kwargs.get('store_state_static_prop')
     self.event_subscriptions = {}
     self.subscribe_to_events()
     self.cache_output_component_func = kwargs.get(
         'cache_output_component_func')
     print('*********************************  HitParadeConsumerThread  *********************************')
Esempio n. 10
0
 def __init__(self, **kwargs):
     """
     Set up the transaction thread from keyword arguments.

     All kwargs are copied onto the instance (attributes created by
     Thread.__init__ win on a name clash).  A fresh global id is stored under
     the attribute named by self.id_property, the state accessor callables
     and the output-component factory are taken from kwargs, a
     HitParadeTransactionManager is built from the same kwargs and the sleep
     interval is fixed at 30 seconds.
     :param kwargs: read here: 'get_state_static_prop',
         'store_state_static_prop', 'cache_output_component_func'; the whole
         mapping is also forwarded to HitParadeTransactionManager.
     """
     Thread.__init__(self)
     merged_state = dict(kwargs)
     merged_state.update(self.__dict__)
     self.__dict__ = merged_state

     fresh_id = MessagingQueue.unique_id(global_id=True,
                                         cache_manager=self.cache_manager)
     self.__dict__[self.id_property] = fresh_id

     self.get_state_static_prop = kwargs.get('get_state_static_prop')
     self.store_state_static_prop = kwargs.get('store_state_static_prop')
     self.cache_output_component_func = kwargs.get(
         'cache_output_component_func')
     self.transaction_processor = HitParadeTransactionManager(**kwargs)
     # fixed polling interval, in seconds
     self.sleep_time = 30
     print('*********************************  HitParadeTransactionThread  *********************************')
 def bot(**bot_data):
     """
     Build the bot selected by the 'bot.type' entry and start it.

     Recognised types are 'consumer', 'producer', 'transaction' and
     'publisher'.  For 'publisher' a fresh 'publisher_id' is added to the
     data before construction.  An unrecognised type only prints an error.
     :param bot_data: keyword data forwarded to the selected bot class; must
         contain 'bot.type' (and 'cache_manager' for publishers).
     :return: None
     """
     kind = bot_data['bot.type']
     if kind == 'consumer':
         worker = HitParadeConsumerThread(**bot_data)
     elif kind == 'producer':
         worker = HitParadeProducerBot(**bot_data)
     elif kind == 'transaction':
         worker = HitParadeTransactionThread(**bot_data)
     elif kind == 'publisher':
         bot_data['publisher_id'] = MessagingQueue.unique_id(
             global_id=True, cache_manager=bot_data['cache_manager'])
         worker = UrlPublisher(**bot_data)
     else:
         print('Error bot type %s not recogized' % kind)
         return
     if worker is not None:
         print('starting bot...')
         worker.start()
Esempio n. 12
0
    def quit(self, id=-1, caller=None, **kwargs):
        """
        Atomic Scraping Method
        Assembles a 'QUIT' request and hands it to run_command.

        Extra keyword arguments are merged into the request last and can
        override the entries built here.

        :param id: int id of the WebScraper
        :param caller: str caller of the command usually Main Thread
        :return:   [2 values] command, value
        """
        request = {
            'command': 'QUIT',
            'type_id': 'ScraperAction',
            'unique_id': MessagingQueue.unique_id(
                cache_manager=self.cache_manager),
        }
        request.update(kwargs)
        return self.run_command(id=id, command='QUIT', d=request,
                                caller=caller)
Esempio n. 13
0
    def __init__(self, **kwargs):
        """
        Set up the web scraper thread: create the driver, apply the timeout and store settings.

        All kwargs are copied onto the instance (attributes created by
        Thread.__init__ win on a name clash).  String-valued kwargs are
        printed.  The driver is created through self.create_driver() and,
        at the end, navigated to 'https://www.google.com'.

        :param cache_manager: passed to MessagingQueue.unique_id.
        :param timeout: implicit wait handed to the driver; skipped when None.
        :param id: int id the web scraper should use.  If it is -1 (or absent), a new id is generated from MessagingQueue
        :param start_url: str start url, stored through state_storage_store_prop
        :param default_parser: stored as-is.
        :param ip: stored as-is.
        :param get_external_ip_adresss: callable stored on the instance; the
            spelling 'get_external_ip_addresss' is accepted as well.
        :param get_state_static_prop: stored as-is.
        :param store_state_static_prop: stored as-is.
        """
        # Initialize the thread
        Thread.__init__(self)
        self.__dict__ = dict(
            list(kwargs.items()) + list(self.__dict__.items()))
        for k, v in kwargs.items():
            if v and isinstance(v, str):
                print(' %s ==> %s ' % (k, v))
        self.cache_manager = kwargs.get('cache_manager', None)
        self.create_driver()
        self.timeout = kwargs.get('timeout', None)
        # Without a configured timeout there is nothing to apply; passing
        # None to implicitly_wait fails instead of meaning "no wait".
        if self.timeout is not None:
            self.driver.implicitly_wait(self.timeout)
        self.action = ActionChains(self.driver)
        if kwargs.get('id', -1) == -1:
            self.id = MessagingQueue.unique_id(
                global_id=True, cache_manager=self.cache_manager)
            print('unique id is now %s ' % str(self.id))
        else:
            self.id = kwargs.get('id', -1)
        # NOTE(review): headless mode is keyed on the stored 'start_url'
        # property, read before this constructor stores it - confirm that a
        # 'headless' property was not meant here.
        if self.state_storage_get_prop('start_url'):
            self.set_headless()
        self.state_storage_store_prop(prop='start_url',
                                      val=kwargs.get('start_url', None))
        self.default_parser = kwargs.get('default_parser', None)
        self._stop_event = threading.Event()
        # Creators pass this callable under 'get_external_ip_addresss' (two
        # d's) while this class historically read 'get_external_ip_adresss'
        # (one d), which silently dropped it.  Accept both spellings; the
        # attribute name stays unchanged for existing readers.
        ip_lookup = kwargs.get('get_external_ip_adresss', None)
        if ip_lookup is None:
            ip_lookup = kwargs.get('get_external_ip_addresss', None)
        self.get_external_ip_adresss = ip_lookup
        self.ip = kwargs.get('ip', None)
        self.get_state_static_prop = kwargs.get('get_state_static_prop', None)
        self.store_state_static_prop = kwargs.get('store_state_static_prop',
                                                  None)
        self.driver.get('https://www.google.com')
Esempio n. 14
0
 def create(type_id=None, **kwargs):
     """
     Create an object through the factory registered for a command or a type id.

     A fresh 'unique_id' is always added to kwargs.  When kwargs carries a
     'command' and 'nocommand' is falsy, the factory for that command is
     looked up (and lazily registered from HitParadeFactories.command_mapping)
     and used.  Otherwise the factory for type_id is used; it is lazily
     registered either from HitParadeFactories.factory_command_mapping
     (keyed by the upper-cased, stripped type_id) or from type_id itself.
     A 'default_parser' given as a str is first replaced by the object
     created for that parser name.

     :param type_id: str name of the factory to use when no command applies.
     :param kwargs: forwarded to the selected factory's create().
     :return: whatever the selected factory's create() returns.
     """
     # SECURITY NOTE: factories are instantiated with eval() on names taken
     # from type_id / the mapping tables.  Never pass untrusted input here.
     kwargs['unique_id'] = MessagingQueue.unique_id(
         global_id=True, cache_manager=kwargs.get('cache_manager', None))
     command_value = kwargs.get('command', None)
     nocommand = kwargs.get('nocommand', False)

     if command_value is not None and not nocommand:
         if command_value not in HitParadeFactories.FACTORIES:
             HitParadeFactories.FACTORIES[command_value] = eval(
                 HitParadeFactories.command_mapping.get(
                     command_value, None) + '.Factory()')
         return HitParadeFactories.FACTORIES[command_value].create(**kwargs)

     parser_name = kwargs.get('default_parser', None)
     if isinstance(parser_name, str):
         # Drop 'default_parser' for the nested call: forwarding it unchanged
         # made the nested call take this same branch again, recursing forever.
         parser_kwargs = {
             k: v for k, v in kwargs.items() if k != 'default_parser'
         }
         kwargs['default_parser'] = HitParadeFactories.create(
             parser_name, **parser_kwargs)

     # Registration is deliberately not chained to the parser branch with
     # elif: otherwise a str parser left type_id unregistered and the final
     # lookup returned None.
     if HitParadeFactories.FACTORIES.get(type_id, None) is None:
         mapped_name = None
         if type_id is not None:
             mapped_name = HitParadeFactories.factory_command_mapping.get(
                 type_id.upper().strip(), None)
         if mapped_name is not None:
             HitParadeFactories.importit(mapped_name)
             # Two separate factory instances are registered, one under each key.
             HitParadeFactories.FACTORIES[type_id] = eval(
                 mapped_name + '.Factory()')
             HitParadeFactories.FACTORIES[mapped_name] = eval(
                 mapped_name + '.Factory()')
         else:
             print('typeid is %s ' % type_id)
             HitParadeFactories.FACTORIES[type_id] = eval(type_id +
                                                          '.Factory()')
     return HitParadeFactories.FACTORIES.get(type_id,
                                             None).create(**kwargs)
Esempio n. 15
0
 def start_web_scraper( self, start_url=None ):
     """
     Create, register and start a new web scraper thread.

     The scraper is built through self.cache_output_component_func with
     self.scraper_type as type_id, flagged as a daemon thread, appended to
     self.scrapers and HitParadeBot.GLOBAL_SCRAPERS, and started.  Its
     driver is also assigned to self.default_parser.driver and self.driver,
     and the scraper itself to self.web_driver.  Every instance attribute
     whose name contains 'state_' is forwarded to the scraper constructor.

     :param start_url: accepted but not used here; the url handed to the
         scraper comes from the stored 'start_url' property.
     :return: WebScraper that was started
     """
     unique_id = MessagingQueue.unique_id(global_id=True, cache_manager=self.cache_manager)
     print('thread id is %s ' % unique_id)
     kwargs_v = {
         # NOTE(review): 'headless' is filled from the stored 'start_url'
         # property - confirm that a 'headless' property was not meant.
         'headless': self.state_storage_get_prop('start_url'),
         'scraper_type': self.scraper_type,
         'timeout': self.timeout,
         'start_url': self.state_storage_get_prop('start_url'),
         'chrome_binary': self.state_storage_get_prop('chrome_binary'),
         'google_chrome_binary': self.state_storage_get_prop('google_chrome_binary'),
         'cache_input_file': self.state_storage_get_prop('cache_input_file'),
         'cache_manager': self.cache_manager,
         'cache_output_component_func': self.cache_output_component_func,
         'ip': self.ip,
         'get_external_ip_addresss': self.get_external_ip_addresss,
         'id': unique_id,
         'default_parser': self.default_parser,
     }
     kwargs_v.update(
         {k: v for k, v in self.__dict__.items() if 'state_' in k})
     s = self.cache_output_component_func(type_id=self.scraper_type, **kwargs_v)
     self.default_parser.driver = s.driver
     # Thread.setDaemon() is deprecated; the daemon attribute is the
     # supported way to flag a thread as daemon before start().
     s.daemon = True
     self.scrapers.append(s)
     HitParadeBot.GLOBAL_SCRAPERS.append(s)
     self.driver = s.driver
     self.web_driver = s
     s.start()
     return s
Esempio n. 16
0
    def run(self):
        """
        Run the thread: open the start url, then serve commands until stopped.

        Each loop iteration waits for a message on this scraper's 'in'
        direction.  The received dict is enriched with the driver, state
        accessors, ids, cache manager, ip settings, default parser and every
        instance attribute whose name contains 'storage_'.  On the first
        command a component is created via
        self.cache_manager.cache_output_component_func using the message's
        'type_id'; on later commands the same component is reset with the
        new dict.  The component is executed and its result (or the enriched
        dict when the result is None) is sent back through respond().

        After the loop, self.quit() is called and the queues for this id are
        shut down through MessagingQueue.quit.
        """
        try:
            if self.state_storage_get_prop('start_url') is not None:
                self.driver.get(self.state_storage_get_prop('start_url'))
        except Exception:
            # Best effort: a failing start url must not prevent the command loop.
            traceback.print_exc()
            print('error opening start url of %s ' %
                  self.state_storage_get_prop('start_url'))
        # NOTE(review): neither flag is ever set to True below, so the loop
        # only ends once stopped() returns True - confirm whether QUIT /
        # error replies were meant to set them.
        ERROR_MESSAGE = False
        QUIT = False
        print('[%s] Scraper running...' % str(self.get_id()))
        scraper_component = None
        id_value = self.get_id()
        if id_value is None:
            id_value = MessagingQueue.unique_id(
                global_id=True, cache_manager=self.cache_manager)
        while not (ERROR_MESSAGE or QUIT) and not self.stopped():
            print('[%s] Thread message loop ' % str(id_value))
            command, obj = MessagingQueue.wait_for_msg(id=self.id,
                                                       direction='in',
                                                       caller=str(id_value))
            if obj and command:
                print('<<acquire producer scraper lock>>')
                obj['driver'] = self.driver
                obj['get_state_static_prop'] = self.get_state_static_prop
                obj['store_state_static_prop'] = self.store_state_static_prop
                obj['id'] = self.id
                obj['web_driver'] = self
                # pop() instead of get() + del: a message without 'type_id'
                # used to raise KeyError on the delete.
                type_id_value = obj.pop('type_id', None)
                obj['cache_manager'] = self.cache_manager
                obj['ip'] = self.ip
                obj['get_external_ip_addresss'] = self.get_external_ip_adresss
                print('ip is %s ' % self.ip)
                obj['open_url'] = False
                obj['default_parser'] = self.default_parser
                obj['nocommand'] = True
                for k, v in self.__dict__.items():
                    if 'storage_' in k:
                        obj[k] = v
                # NOTE(review): the component created for the first message's
                # type_id is reused (reset) for all later messages, whatever
                # their type_id - confirm this is intended.
                if scraper_component is None:
                    scraper_component = self.cache_manager.cache_output_component_func(
                        type_id=type_id_value, **obj)
                else:
                    scraper_component.reset(**obj)

                print('[%s] :: command in %s with message %s ' %
                      (str(self.get_id()), command, str(obj)))
                if not self.stopped() or command == 'QUIT':
                    print(
                        '[%s] Thread command either thread has been stopped or command is QUIT {%s} '
                        % (str(self.get_id()), command))
                    response_object = scraper_component.exec(**obj)
                    if response_object is None:
                        print('response object is none.')
                        print(command)
                        print(obj)
                        # no result: echo the enriched request back
                        self.respond(obj=obj, command=command)
                    else:
                        self.respond(obj=response_object, command=command)
                else:
                    print('no longer active...quitting...')
            print('<<release producer scraper lock>>')
        if not self.stopped():
            print('This Web Scraper %s is no longer active.  Shutting down. ' %
                  self.id)

        q = self.quit()
        print('id[%s] quitting.....[%s]' % (str(self.id), str(q)))
        MessagingQueue.quit(id=self.get_id(), caller=str(self.get_id()))