Example #1
    def __init__(self, config, *args, captcha_lock=None, browser_num=1, **kwargs):
        """Create a new SelScraper thread Instance.

        Args:
            captcha_lock: To sync captcha solving (stdin)
            proxy: Optional, if set, use the proxy to route all scraping through it.
            browser_num: A unique, semantic number for each thread.
        """
        self.search_input = None

        threading.Thread.__init__(self)
        SearchEngineScrape.__init__(self, config, *args, **kwargs)

        self.browser_type = self.config.get('sel_browser', 'chrome').lower()
        self.browser_mode = self.config.get('browser_mode', 'headless').lower()
        self.browser_num = browser_num
        self.captcha_lock = captcha_lock
        self.scrape_method = 'selenium'

        # number of tabs per instance
        self.number_of_tabs = self.config.get('num_tabs', 1)

        self.xvfb_display = self.config.get('xvfb_display', None)

        self.search_param_values = self._get_search_param_values()

        self.user_agent = random_user_agent()

        # get the base search url based on the search engine.
        self.base_search_url = get_base_search_url_by_search_engine(self.config, self.search_engine_name, self.scrape_method)
        super().instance_creation_info(self.__class__.__name__)
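The captcha_lock passed in above exists to serialize interactive captcha solving on stdin across browser threads. A minimal, self-contained sketch of that pattern, not taken from the scraper itself; the helper name and prompt text are illustrative only:

import threading

captcha_lock = threading.Lock()

def solve_captcha_interactively(browser_num):
    # Hypothetical helper: only one thread at a time may prompt on stdin,
    # otherwise prompts from several browsers would interleave.
    with captcha_lock:
        return input('Solve the captcha shown in browser {}: '.format(browser_num))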
Example #2
    def search(self, rand=True, timeout=15):
        """The actual search for the search engine.

        When raising StopScrapingException, the scraper will stop.

        When returning False, the scraper tries to continue with the next keyword.
        """

        success = True

        self.build_search()
        #"tbs=cdr%3A1%2Ccd_min%3A2016%2Ccd_max%3A2015&ei=kdsMWtn0NumD0gLdvYWQDA&"
        '''
        self.headers['User-Agent'] = random_user_agent(only_desktop=True)
        super().detection_prevention_sleep()
        super().keyword_info()
        print self.base_search_url + urlencode(self.search_params)
        sys.exit()
        '''

        if rand:
            self.headers['User-Agent'] = random_user_agent(only_desktop=True)

        try:
            super().detection_prevention_sleep()
            super().keyword_info()

            request = self.requests.get(self.base_search_url +
                                        urlencode(self.search_params),
                                        headers=self.headers,
                                        timeout=timeout)

            self.requested_at = datetime.datetime.utcnow()
            self.html = request.text

            logger.debug(
                '[HTTP] - {url}, headers={headers}, params={params}'.format(
                    url=request.url,
                    headers=self.headers,
                    params=self.search_params))

        except self.requests.ConnectionError as ce:
            self.status = 'Network problem occurred {}'.format(ce)
            success = False
        except self.requests.Timeout as te:
            self.status = 'Connection timeout {}'.format(te)
            success = False
        except self.requests.exceptions.RequestException as e:
            # In case of any http networking exception that wasn't caught
            # in the actual request, just end the worker.
            self.status = 'Stopping scraping because {}'.format(e)
        else:
            if not request.ok:
                self.handle_request_denied(request.status_code)
                success = False

        super().after_search()

        return success
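As the docstring spells out, search() signals a hard stop by raising StopScrapingException and a soft failure by returning False. A sketch of a caller loop that honors this contract; the attribute name query and the parse_serp() hook are hypothetical placeholders, and only StopScrapingException comes from the docstring:

def run_keywords(scraper, keywords):
    for keyword in keywords:
        scraper.query = keyword          # hypothetical attribute name
        try:
            ok = scraper.search()
        except StopScrapingException:    # the exception named in the docstring
            break                        # stop this worker entirely
        if not ok:
            continue                     # skip this keyword, move on to the next
        scraper.parse_serp()             # hypothetical post-processing hook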
Example #3
    def search(self, rand=True, timeout=15):
        """The actual search for the search engine.

        When raising StopScrapingException, the scraper will stop.

        When returning False, the scraper tries to continue with the next keyword.
        """

        success = True

        self.build_search()

        if rand:
            self.headers['User-Agent'] = random_user_agent(only_desktop=True)

        try:
            super().detection_prevention_sleep()
            super().keyword_info()

            if self.proxy is not None:
                request = self.requests.get(self.base_search_url + urlencode(self.search_params),
                    headers=self.headers, timeout=timeout,
                    proxies={self.proxy.proto: '{}://{}:{}'.format(self.proxy.proto, self.proxy.host, self.proxy.port)})
            else:
                request = self.requests.get(self.base_search_url + urlencode(self.search_params),
                                            headers=self.headers, timeout=timeout)

            self.requested_at = datetime.datetime.utcnow()
            self.html = request.text

            logger.debug('[HTTP] - {url}, headers={headers}, params={params}'.format(
                url=request.url,
                headers=self.headers,
                params=self.search_params))

        except self.requests.ConnectionError as ce:
            self.status = 'Network problem occurred {}'.format(ce)
            logger.error(self.status)
            success = False
            exit(1)
        except self.requests.Timeout as te:
            self.status = 'Connection timeout {}'.format(te)
            logger.error(self.status)
            success = False
            exit(1)
        except self.requests.exceptions.RequestException as e:
            # In case of any http networking exception that wasn't caught
            # in the actual request, just end the worker.
            self.status = 'Stopping scraping because {}'.format(e)
            logger.error(self.status)
            exit(1)
        else:
            if not request.ok:
                self.handle_request_denied(request.status_code)
                success = False

        super().after_search()

        return success
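Note that Example #3 keys the proxies mapping by the proxy's own protocol, so requests only routes URLs of that same scheme through it. A short illustration of the shape with placeholder values, not code from the example; covering both schemes needs both keys:

# A proxy object with proto='http', host='127.0.0.1', port=8080 yields:
proxies = {'http': 'http://127.0.0.1:8080'}

# An https:// search URL would bypass that mapping; to proxy both schemes:
proxies = {
    'http': 'http://127.0.0.1:8080',
    'https': 'http://127.0.0.1:8080',
}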
Example #4
    def search(self, rand=True, timeout=30):
        """The actual search for the search engine.

        When raising StopScrapingException, the scraper will stop.

        When returning False, the scraper tries to continue with the next keyword.
        """

        success = True
        self.html = ''
        self.status = 'successful'

        self.build_search()

        if rand:
            self.headers['User-Agent'] = random_user_agent(only_desktop=True)

        try:
            super().detection_prevention_sleep()
            super().keyword_info()

            proxy_string = '%s:%s@%s:%s' % (self.proxy.username, self.proxy.password, self.proxy.host, self.proxy.port)
            proxies = dict(http='http://'+proxy_string, https='https://'+proxy_string)
            request = self.requests.get(self.base_search_url + urlencode(self.search_params),
                                        headers=self.headers, timeout=timeout, proxies=proxies)

            self.requested_at = datetime.datetime.utcnow()
            self.html = request.text

            logger.debug('[HTTP] - {url}, headers={headers}, params={params}'.format(
                url=request.url,
                headers=self.headers,
                params=self.search_params))

        except self.requests.ConnectionError as ce:
            self.status = 'Network problem occurred {}'.format(ce)
            success = False
        except self.requests.Timeout as te:
            # logger.warning('{name}: request timeout'.format(name=self.name))
            self.status = 'Connection timeout {}'.format(te)
            success = False
        except self.requests.exceptions.RequestException as e:
            # In case of any http networking exception that wasn't caught
            # in the actual request, just end the worker.
            self.status = 'Stopping scraping because {}'.format(e)
        else:
            if not request.ok:
                self.handle_request_denied(request.status_code)
                success = False

        super().after_search()

        return success
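Example #4 always routes through an authenticated proxy by embedding the credentials in the proxy URL. A self-contained sketch of that requests proxies shape, using placeholder credentials and a placeholder target URL:

import requests

proxy_string = '{}:{}@{}:{}'.format('user', 'secret', '127.0.0.1', 8080)  # placeholders
proxies = {
    'http': 'http://' + proxy_string,
    'https': 'https://' + proxy_string,   # mirrors the example's scheme choice
}
response = requests.get('https://example.com/', headers={'User-Agent': 'test'},
                        proxies=proxies, timeout=30)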
Example #5
    def _get_PhantomJS(self):
        try:
            service_args = []

            if self.proxy:
                service_args.extend([
                    '--proxy={}:{}'.format(self.proxy.host, self.proxy.port),
                    '--proxy-type={}'.format(self.proxy.proto),
                ])

                if self.proxy.username and self.proxy.password:
                    service_args.append(
                        '--proxy-auth={}:{}'.format(self.proxy.username, self.proxy.password)
                    )

            dcap = dict(DesiredCapabilities.PHANTOMJS)
            dcap["phantomjs.page.settings.userAgent"] = random_user_agent(only_desktop=True)
            self.webdriver = webdriver.PhantomJS(executable_path=self.config['executable_path'], service_args=service_args, desired_capabilities=dcap)
            return True
        except WebDriverException as e:
            logger.error(e)
        return False
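PhantomJS support has been removed from recent Selenium releases, so _get_PhantomJS() only works with an older driver stack. A rough headless-Chrome equivalent is sketched below as an alternative, not as code from the project; the proxy and user-agent values are placeholders, and note that --proxy-server cannot carry credentials, so authenticated proxies need a different mechanism:

import logging
from selenium import webdriver
from selenium.common.exceptions import WebDriverException

logger = logging.getLogger(__name__)

def get_headless_chrome(proxy_host=None, proxy_port=None, user_agent=None):
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    if user_agent:
        options.add_argument('--user-agent={}'.format(user_agent))
    if proxy_host and proxy_port:
        # Unauthenticated proxy only; Chrome ignores credentials in this flag.
        options.add_argument('--proxy-server=http://{}:{}'.format(proxy_host, proxy_port))
    try:
        return webdriver.Chrome(options=options)
    except WebDriverException as e:
        logger.error(e)
        return None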
Example #6
    def __init__(self,
                 config,
                 *args,
                 captcha_lock=None,
                 browser_num=1,
                 **kwargs):
        """Create a new SelScraper thread Instance.

        Args:
            captcha_lock: To sync captcha solving (stdin)
            proxy: Optional, if set, use the proxy to route all scraping through it.
            browser_num: A unique, semantic number for each thread.
        """
        self.search_input = None

        threading.Thread.__init__(self)
        SearchEngineScrape.__init__(self, config, *args, **kwargs)

        self.browser_type = self.config.get('sel_browser', 'chrome').lower()
        self.browser_mode = self.config.get('browser_mode', 'headless').lower()
        self.browser_num = browser_num
        self.captcha_lock = captcha_lock
        self.scrape_method = 'selenium'

        # number of tabs per instance
        self.number_of_tabs = self.config.get('num_tabs', 1)

        self.xvfb_display = self.config.get('xvfb_display', None)

        self.search_param_values = self._get_search_param_values()

        self.user_agent = random_user_agent()

        # get the base search url based on the search engine.
        self.base_search_url = get_base_search_url_by_search_engine(
            self.config, self.search_engine_name, self.scrape_method)
        super().instance_creation_info(self.__class__.__name__)
Example #7
def put_in_database():

    # This block removes the database created in the previous scrape. It was easier to do this
    # than to remove the database code from this software.
    try:
        os.remove("google_scraper.db")
        print("\nSQLite database Removed!")
    except OSError:
        print("\nNo database to remove")

    global counter
    counter += 1
    print("The current count is: ", counter)

    try:
        search = scrape_with_config(config)
    except GoogleSearchError as e:
        print(e)
        return

    # Connect to database
    db = pymysql.connect(
        user=aws_username,  # Username of AWS database
        passwd=aws_password,  # AWS Database password
        host=aws_host,  # AWS Instance
        port=3306,
        database='google_scraper',
        charset='utf8mb4')
    cur = db.cursor()

    target_directory = './images/'
    try:
        os.mkdir(target_directory)
    except FileExistsError:
        pass

    try:
        for serp in search.serps:

            # Create a new record into serp table
            sql = "INSERT INTO serp (search_engine_name, scrape_method, requested_at, search_query) VALUES (%s, %s, %s, %s)"
            cur.execute(sql, (serp.search_engine_name, serp.scrape_method,
                              serp.requested_at, serp.query))

            serp_id = cur.lastrowid
            print("\nserp_id is:  ", serp_id)

            db.commit()

            for link in serp.links:

                if link.has_image:
                    link.has_image = 1
                else:
                    link.has_image = 0

                if link.link_type == 'image_box':

                    height = re.compile(r'height:(\d+)')
                    h = height.search(link.image_dims)
                    image_height = int(h.group(1))

                    width = re.compile(r'width:(\d+)')
                    w = width.search(link.image_dims)
                    image_width = int(w.group(1))

                    # IMAGE SAVE LOOP
                    if counter % 24 == 0:
                        print("\nStarting image save loop...:   ")
                        baseURL = 'https://www.google.com'
                        #print("Creating imgURL ...   ")

                        # This `imgURL` link should take you to the Google Image Search result with the image in question "selected" or expanded
                        imgURL = baseURL + link.link
                        print("Creating 'candidate' ...   \n", imgURL)

                        # Create image output file name
                        candidate = serp.query.replace(' ', '_')
                        print(candidate)
                        print("Creating candidate file name ....   ")
                        imageFile = '{}_{}_{}_{}.jpg'.format(serp_id, link.id, candidate, link.rank)
                        print(imageFile)

                        # Initialize the PhantomJS headless browser
                        #http://www.marinamele.com/selenium-tutorial-web-scraping-with-selenium-and-python
                        #http://stackoverflow.com/questions/15388057/extract-link-from-xpath-using-selenium-webdriver-and-python
                        dcap = dict(DesiredCapabilities.PHANTOMJS)
                        dcap["phantomjs.page.settings.userAgent"] = random_user_agent(only_desktop=True)

                        driver = webdriver.PhantomJS('./phantomjs',
                                                     desired_capabilities=dcap)
                        driver.set_window_size(1020, 550)
                        driver.wait = WebDriverWait(driver, 5)

                        # Use the try statement to continue collecting images even when a collection fails.
                        # If you want scraping to quit when image collection fails, take out the try statement.
                        try:
                            driver.get(imgURL)

                            # The current XPath frequently changes, so keep an eye on it. If your images suddenly fail to download, this is the first place to check.
                            element = driver.wait.until(
                                EC.presence_of_element_located((
                                    By.XPATH,
                                    '//*[@id="irc_cc"]/div[2]/div[1]/div[2]/div[2]/a/img'
                                )))  # this is the current one.
                            time.sleep(4)  #4
                            src = element.get_attribute('src')
                            print("This is the 'src':   ", src)
                            time.sleep(4)  #4

                            # Create a directory named using the current serp_id
                            # There will be one serp_id per keyword search
                            current_directory = str(serp_id) + '/'
                            path = os.path.join(target_directory,
                                                current_directory)
                            if not os.path.exists(path):
                                os.makedirs(path)

                            image_path = os.path.join(path, imageFile)

                            with open(image_path, 'wb') as q:
                                # get image from the link
                                res = requests.get(src)
                                # write the image to file in chunks
                                for chunk in res.iter_content(100000):
                                    q.write(chunk)

                        except Exception as e:
                            # Image collection has recently been inconsistently buggy.
                            # If the image download fails, take a screenshot and save
                            # it to the main directory.
                            driver.save_screenshot(imageFile + '.png')

                        driver.close()
                        driver.quit()

                    else:
                        image_path = None

                else:
                    image_height = None
                    image_width = None
                    image_path = None

                # Create a new record into search_engine_results table
                sql = "INSERT INTO search_engine_results (link, title, snippet, visible_link, rank, link_type, serp_id, has_image, image_dims, image_height, image_width, news_date, news_source, image_path, top_stories) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
                cur.execute(sql,
                            (link.link, link.title, link.snippet,
                             link.visible_link, link.rank, link.link_type,
                             serp_id, link.has_image, link.image_dims,
                             image_height, image_width, link.news_date,
                             link.news_source, image_path, link.top_stories))

                db.commit()

    finally:
        cur.close()
        db.close()
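The image-saving branch above interleaves Selenium navigation, directory creation, and the actual byte transfer. A self-contained sketch of just the download step, separated out; the stream=True flag and the makedirs(exist_ok=True) call are additions for robustness, not part of the original:

import os
import requests

def download_image(src, directory, filename, chunk_size=100000):
    """Stream an image URL to disk in chunks and return the saved path."""
    os.makedirs(directory, exist_ok=True)
    image_path = os.path.join(directory, filename)
    res = requests.get(src, stream=True)       # avoid loading the whole image into memory
    with open(image_path, 'wb') as q:
        for chunk in res.iter_content(chunk_size):
            q.write(chunk)
    return image_path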
Example #8
    def search(self, rand=True, timeout=15):
        """The actual search for the search engine.

        When raising StopScrapingException, the scraper will stop.

        When returning False, the scraper tries to continue with the next keyword.
        """
        import requests
        success = True

        self.build_search()

        if rand:
            self.headers['User-Agent'] = random_user_agent(only_desktop=True)

        try:
            super().detection_prevention_sleep()
            super().keyword_info()
            try:
                rurl = self.base_search_url + urlencode(self.search_params)
            except UnicodeError:
                return False
            if self.search_engine_name == 'google' and self.config.get(
                    'strict'):
                rurl = rurl + '&tbs=li:1'
            proxy_chain_ips = self.config.get('proxy_chain_ips')
            if proxy_chain_ips != "local":
                proxy_chain_ips = proxy_chain_ips.split(',')
            else:
                proxy_chain_ips = []

            proxies_ch = []
            for proxy in proxy_chain_ips:
                proxies_ch.append({
                    'http': 'http://' + proxy,
                    'https': 'http://' + proxy,
                })

            request = None
            x = 0

            while x < 100:
                if len(proxies_ch):
                    proxies = choice(proxies_ch)
                else:
                    proxies = {}

                session = requests.Session()
                # Ignore proxy settings inherited from the environment.
                session.trust_env = False

                try:
                    if not self.config.get('check_proxies'):
                        request = session.get(rurl,
                                              headers=self.headers,
                                              timeout=timeout)
                    else:
                        request = session.get(rurl,
                                              proxies=proxies,
                                              headers=self.headers,
                                              timeout=timeout)
                except self.requests.exceptions.RequestException:
                    # Covers ConnectionError and Timeout as well: try the next proxy.
                    request = None
                    x += 1
                    continue

                logger.warning(str(request.status_code))
                if request.status_code == 200:
                    break
                request.close()
                x += 1

            if request is None:
                # Every attempt failed at the network level; let the outer
                # handler below record the status.
                raise self.requests.exceptions.RequestException(
                    'no successful response after {} attempts'.format(x))

            self.requested_at = datetime.datetime.utcnow()
            self.html = request.text

            logger.debug(
                '[HTTP] - {url}, headers={headers}, params={params}'.format(
                    url=request.url,
                    headers=self.headers,
                    params=self.search_params))

        except self.requests.ConnectionError as ce:
            self.status = 'Network problem occurred {}'.format(ce)
            success = False
        except self.requests.Timeout as te:
            self.status = 'Connection timeout {}'.format(te)
            success = False
        except self.requests.exceptions.RequestException as e:
            # In case of any http networking exception that wasn't caught
            # in the actual request, just end the worker.
            self.status = 'Stopping scraping because {}'.format(e)
        else:
            if not request.ok:
                self.handle_request_denied(request.status_code)
                success = False

        super().after_search()

        return success
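The proxy-chain loop in Example #8 retries up to 100 times through randomly chosen proxies until it gets an HTTP 200. A simplified, self-contained sketch of that retry pattern; the function name, argument list, and return-None-on-failure behavior are choices made for illustration, not the example's exact behavior:

import random
import requests

def get_with_proxy_rotation(url, proxy_pool, headers=None, timeout=15, max_attempts=100):
    """Try the URL through randomly chosen proxies until one returns HTTP 200."""
    for _ in range(max_attempts):
        proxies = random.choice(proxy_pool) if proxy_pool else {}
        session = requests.Session()
        session.trust_env = False            # ignore proxy settings from the environment
        try:
            response = session.get(url, proxies=proxies, headers=headers, timeout=timeout)
        except requests.RequestException:
            continue                          # network error: try another proxy
        if response.status_code == 200:
            return response
        response.close()
    return None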
Example #9
    def search(self, rand=True, timeout=15):
        """The actual search for the search engine.

        When raising StopScrapingException, the scraper will stop.

        When returning False, the scraper tries to continue with the next keyword.
        """

        success = True

        self.build_search()

        if rand:
            self.headers['User-Agent'] = random_user_agent(only_desktop=True)

        try:
            super().detection_prevention_sleep()
            super().keyword_info()

            proxy_url = self.format_proxy(self.proxy)

            request = self.requests.get(
                self.base_search_url + urlencode(self.search_params),
                proxies=None if self.proxy is None else {
                    'http': proxy_url,
                    'https': proxy_url
                },
                headers=self.headers,
                timeout=timeout)

            self.requested_at = datetime.datetime.utcnow()
            self.html = request.text

            logger.debug(
                '[HTTP] - {url}, headers={headers}, params={params}'.format(
                    url=request.url,
                    headers=self.headers,
                    params=self.search_params))

        except self.requests.ConnectionError as ce:
            self.status = 'Network problem occurred {}'.format(ce)
            success = False
        except self.requests.Timeout as te:
            self.status = 'Connection timeout {}'.format(te)
            success = False
        except self.requests.exceptions.RequestException as e:
            # In case of any http networking exception that wasn't caught
            # in the actual request, just end the worker.
            self.status = 'Stopping scraping because {}'.format(e)
        else:
            if re.search(r'capt?cha', request.url, re.I):
                logger.debug('captcha')
                success = False

            if not request.ok:
                self.handle_request_denied(request.status_code)
                success = False

        super().after_search()

        return success