Example #1
    def __init__(self, config, *args, captcha_lock=None, browser_num=1, **kwargs):
        """Create a new SelScraper thread Instance.

        Args:
            captcha_lock: To sync captcha solving (stdin)
            proxy: Optional, if set, use the proxy to route all scrapign through it.
            browser_num: A unique, semantic number for each thread.
        """
        self.search_input = None

        threading.Thread.__init__(self)
        SearchEngineScrape.__init__(self, config, *args, **kwargs)

        self.browser_type = self.config.get('sel_browser', 'chrome').lower()
        self.browser_num = browser_num
        self.captcha_lock = captcha_lock
        self.scrape_method = 'selenium'

        self.xvfb_display = self.config.get('xvfb_display', None)

        self.search_param_values = self._get_search_param_values()

        # get the base search url based on the search engine.
        self.base_search_url = get_base_search_url_by_search_engine(self.config, self.search_engine_name, self.scrape_method)
        super().instance_creation_info(self.__class__.__name__)
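
A minimal usage sketch for the constructor above, assuming the SelScraper class from Example #1 and a plain config dict; any further keyword arguments required by SearchEngineScrape are omitted here:

    import threading

    config = {'sel_browser': 'chrome', 'xvfb_display': None}
    captcha_lock = threading.Lock()  # shared by all threads to serialize captcha solving
    scraper = SelScraper(config, captcha_lock=captcha_lock, browser_num=1)
    scraper.start()  # threading.Thread entry point; runs the scrape in its own thread
    scraper.join()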
Example #2
    def __init__(self, *args, captcha_lock=None, browser_num=1, **kwargs):
        """Create a new SelScraper thread Instance.

        Args:
            captcha_lock: To sync captcha solving (stdin)
            proxy: Optional, if set, use the proxy to route all scrapign through it.
            browser_num: A unique, semantic number for each thread.
        """
        self.search_input = None

        threading.Thread.__init__(self)
        SearchEngineScrape.__init__(self, *args, **kwargs)

        self.browser_type = Config['SELENIUM'].get('sel_browser', 'chrome').lower()
        self.browser_num = browser_num
        self.captcha_lock = captcha_lock
        self.scrape_method = 'selenium'

        self.xvfb_display = Config['SELENIUM'].get('xvfb_display', None)

        self.search_param_values = self._get_search_param_values()

        # get the base search url based on the search engine.
        self.base_search_url = get_base_search_url_by_search_engine(self.search_engine_name, self.scrape_method)
        super().instance_creation_info(self.__class__.__name__)
Example #3
    def __init__(self, config, *args, time_offset=0.0, **kwargs):
        """Initialize an HttScrape object to scrape over blocking http.

        HttpScrape inherits from SearchEngineScrape
        and from threading.Timer.
        """
        threading.Timer.__init__(self, time_offset, self.search)
        SearchEngineScrape.__init__(self, config, *args, **kwargs)

        # Bind the requests module to this instance so that each
        # instance may have its own proxy.
        self.requests = __import__('requests')

        # initialize the GET parameters for the search request
        self.search_params = {}

        # initialize the HTTP headers of the search request
        # to some base values that Mozilla uses in its requests.
        # The Host and User-Agent fields need to be set additionally.
        self.headers = headers

        # the mode
        self.scrape_method = 'http'

        # get the base search url based on the search engine.
        self.base_search_url = get_base_search_url_by_search_engine(self.config, self.search_engine_name, self.scrape_method)

        super().instance_creation_info(self.__class__.__name__)

        if self.search_engine_name == 'blekko':
            logger.critical('blekko does not support http mode.')
            self.startable = False
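
Because Example #3 initializes threading.Timer with (time_offset, self.search), calling start() schedules exactly one run of self.search() after time_offset seconds. A sketch, assuming the HttpScrape class above and a config accepted by SearchEngineScrape:

    scraper = HttpScrape(config, time_offset=2.0)
    if scraper.startable:   # False for engines without HTTP support, e.g. blekko
        scraper.start()     # fires scraper.search() after 2.0 seconds
        scraper.join()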
Example #4
    def __init__(self, config, *args, time_offset=0.0, **kwargs):
        """Initialize an HttScrape object to scrape over blocking http.

        HttpScrape inherits from SearchEngineScrape
        and from threading.Timer.
        """
        threading.Timer.__init__(self, time_offset, self.search)
        SearchEngineScrape.__init__(self, config, *args, **kwargs)

        # Bind the requests module to this instance so that each
        # instance may have its own proxy.
        self.requests = __import__('requests')

        # initialize the GET parameters for the search request
        self.search_params = {}

        # initialize the HTTP headers of the search request
        # to some base values that Mozilla uses in its requests.
        # The Host and User-Agent fields need to be set additionally.
        self.headers = headers

        # the mode
        self.scrape_method = 'http'

        # get the base search url based on the search engine.
        self.base_search_url = get_base_search_url_by_search_engine(
            self.config, self.search_engine_name, self.scrape_method)

        super().instance_creation_info(self.__class__.__name__)

        if self.search_engine_name == 'blekko':
            logger.critical('blekko does not support http mode.')
            self.startable = False
Example #5
    def __init__(self,
                 config,
                 query='',
                 page_number=1,
                 search_engine='google',
                 scrape_method='http-async'):
        """
        """
        self.config = config
        self.query = query
        self.page_number = page_number
        self.search_engine_name = search_engine
        self.search_type = 'normal'
        self.scrape_method = scrape_method
        self.requested_at = None
        self.requested_by = 'localhost'
        self.parser = get_parser_by_search_engine(self.search_engine_name)
        self.base_search_url = get_base_search_url_by_search_engine(
            self.config, self.search_engine_name, 'http')
        self.params = get_GET_params_for_search_engine(
            self.query, self.search_engine_name, search_type=self.search_type)
        self.headers = headers
        self.status = 'successful'

        self.num_results_per_page = int(config['num_results_per_page'])
        self.startRecord = self.num_results_per_page * (self.page_number - 1) + 1
        self.stringStartRecord = "&first=" + str(self.startRecord)
Example #6
def check_detection(config, search_engine_name):
    """
    Checks whether the search engine specified by search_engine_name
    blocked us.
    """
    status = ''
    chromedriver = config.get('chromedriver_path', '/usr/bin/chromedriver')

    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    options.add_argument('window-size=1200x600')

    browser = webdriver.Chrome(chrome_options=options,
                               executable_path=chromedriver)

    if search_engine_name == 'google':
        url = get_base_search_url_by_search_engine(config, 'google',
                                                   'selenium')
        browser.get(url)

        def check(browser, status):
            needles = SearchEngineScrape.malicious_request_needles['google']

            if needles['inurl'] in browser.current_url and needles[
                    'inhtml'] in browser.page_source:
                status += 'Google is asking for a captcha! '
                code = 'DETECTED'
            else:
                status += 'No captcha prompt detected. '
                code = 'UNDETECTED'

            # Every code path of check_detection returns through check(),
            # so quit the browser here to avoid leaking the Chrome process.
            browser.quit()
            return (code, status)

        search_input = None
        try:
            search_input = WebDriverWait(browser, 5).until(
                EC.visibility_of_element_located((By.NAME, 'q')))
            status += 'Got a search input field. '
        except TimeoutException:
            status += 'No search input field located after 5 seconds. '
            return check(browser, status)

        try:
            # random query
            search_input.send_keys('President of Finland' + Keys.ENTER)
            status += 'Google Search successful! '
        except WebDriverException:
            status += 'Cannot make a google search! '
            return check(browser, status)

        return check(browser, status)

    else:
        browser.quit()
        raise NotImplementedError(
            'Detection check only implemented for Google right now.')
Example #7
def check_detection(config, search_engine_name):
    """
    Checks whether the search engine specified by search_engine_name
    blocked us.
    """
    status = ''
    chromedriver = config.get('chromedriver_path', '/usr/bin/chromedriver')

    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    options.add_argument('window-size=1200x600')

    browser = webdriver.Chrome(chrome_options=options, executable_path=chromedriver)

    if search_engine_name == 'google':
        url = get_base_search_url_by_search_engine(config, 'google', 'selenium')
        browser.get(url)

        def check(browser, status):
            needles = SearchEngineScrape.malicious_request_needles['google']

            if needles['inurl'] in browser.current_url and needles['inhtml'] in browser.page_source:
                status += 'Google is asking for a captcha! '
                code = 'DETECTED'
            else:
                status += 'No captcha prompt detected. '
                code = 'UNDETECTED'

            # Every code path of check_detection returns through check(),
            # so quit the browser here to avoid leaking the Chrome process.
            browser.quit()
            return (code, status)

        search_input = None
        try:
            search_input = WebDriverWait(browser, 5).until(
                EC.visibility_of_element_located((By.NAME, 'q')))
            status += 'Got a search input field. '
        except TimeoutException:
            status += 'No search input field located after 5 seconds. '
            return check(browser, status)

        try:
            # random query
            search_input.send_keys('President of Finland' + Keys.ENTER)
            status += 'Google Search successful! '
        except WebDriverException:
            status += 'Cannot make a google search! '
            return check(browser, status)

        return check(browser, status)

    else:
        browser.quit()
        raise NotImplementedError('Detection check only implemented for Google right now.')
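
How check_detection might be invoked (a sketch; the config only needs chromedriver_path, and the function returns the (code, status) pair produced by the inner check()):

    config = {'chromedriver_path': '/usr/bin/chromedriver'}
    code, status = check_detection(config, 'google')
    print(code)    # 'DETECTED' or 'UNDETECTED'
    print(status)  # accumulated human-readable trace of the check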
Example #8
    def __init__(self, query='', page_number=1, search_engine='google', **kwargs):
        self.query = query
        self.page_number = page_number
        self.search_engine_name = search_engine
        self.search_type = 'normal'
        self.scrape_method = 'http-async'
        self.requested_at = None
        self.requested_by = ''
        self.parser = get_parser_by_search_engine(self.search_engine_name)
        self.base_search_url = get_base_search_url_by_search_engine(self.search_engine_name, 'http')
        self.params = get_GET_params_for_search_engine(self.query, self.search_engine_name, search_type=self.search_type)
        self.headers = headers
Example #9
    def __init__(self, config, query='', page_number=1, search_engine='google', scrape_method='http-async'):
        """
        Initialize a request object for one page of results from one search engine.
        """
        self.config = config
        self.query = query
        self.page_number = page_number
        self.search_engine_name = search_engine
        self.search_type = 'normal'
        self.scrape_method = scrape_method
        self.requested_at = None
        self.requested_by = 'localhost'
        self.parser = get_parser_by_search_engine(self.search_engine_name)
        self.base_search_url = get_base_search_url_by_search_engine(self.config, self.search_engine_name, 'http')
        self.params = get_GET_params_for_search_engine(self.query, self.search_engine_name,
                                                       search_type=self.search_type)
        self.headers = headers
        self.status = 'successful'
Example #10
    def __init__(self, query='', page_number=1, search_engine='google', **kwargs):
        """
        @todo: **kwargs doesn't seem to be used; check whether any caller passes additional keyword arguments and remove it.
        """
        self.query = query
        self.page_number = page_number
        self.search_engine_name = search_engine
        self.search_type = 'normal'
        self.scrape_method = 'http-async'
        self.requested_at = None
        self.requested_by = 'localhost'
        self.parser = get_parser_by_search_engine(self.search_engine_name)
        self.base_search_url = get_base_search_url_by_search_engine(self.search_engine_name, 'http')
        self.params = get_GET_params_for_search_engine(self.query, self.search_engine_name,
                                                       search_type=self.search_type)
        self.headers = headers
        self.status = 'successful'
Example #11
    def __init__(self,
                 config,
                 query='',
                 page_number=1,
                 search_engine='google',
                 scrape_method='http-async'):
        """
        Initialize a request object for one page of results from one search engine.
        """
        self.config = config
        self.query = query
        self.page_number = page_number
        self.search_engine_name = search_engine
        self.search_type = 'normal'
        self.scrape_method = scrape_method
        self.requested_at = None
        self.requested_by = 'localhost'
        self.parser = get_parser_by_search_engine(self.search_engine_name)
        self.base_search_url = get_base_search_url_by_search_engine(
            self.config, self.search_engine_name, 'http')
        self.params = get_GET_params_for_search_engine(
            self.query, self.search_engine_name, search_type=self.search_type)
        self.headers = headers
        self.status = 'successful'
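
A request object like the one in Example #11 bundles everything needed for one search-engine page fetch. A hedged sketch of dispatching it synchronously with the requests library (the class name SearchRequest is hypothetical; in 'http-async' mode the real project presumably uses an asynchronous HTTP client instead):

    import requests

    request = SearchRequest(config, query='python', page_number=1)  # hypothetical class name
    response = requests.get(request.base_search_url, params=request.params,
                            headers=request.headers)
    html = response.text  # would be handed to request.parser for result extraction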